Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * HORIZONTAL TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * VERTICAL TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000D, * CARRIAGE RETURN */
 151     0, 0, 1, 0, 0, 1, 0, 0,
 152     0, 0, 0, 0, 0, 0, 0, 0,
 153 /*         0x001C, * FILE SEPARATOR */
 154 /*         0x001D, * GROUP SEPARATOR */
 155 /*         0x001E, * RECORD SEPARATOR */
 156     0, 0, 0, 0, 1, 1, 1, 0,
 157     0, 0, 0, 0, 0, 0, 0, 0,
 158     0, 0, 0, 0, 0, 0, 0, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163     0, 0, 0, 0, 0, 0, 0, 0,
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177     return 0x10FFFF;
 178 #else
 179     /* This is actually an illegal character, so it should
 180        not be passed to unichr. */
 181     return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
 187 /* stuff to implement simple "bloom filters" for Unicode characters.
 188    to keep things simple, we use a single bitmask, using the least 5
 189    bits from each unicode characters as the bit index. */
 190
 191 /* the linebreak mask is set up by Unicode_Init below */
 192
 193 #define BLOOM_MASK unsigned long
 194
 195 static BLOOM_MASK bloom_linebreak;
 196
 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199 #define BLOOM_LINEBREAK(ch)                                             \
 200     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 201      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204 {
 205     /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207     long mask;
 208     Py_ssize_t i;
 209
 210     mask = 0;
 211     for (i = 0; i < len; i++)
 212         mask |= (1 << (ptr[i] & 0x1F));
 213
 214     return mask;
 215 }
 216
 217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218 {
 219     Py_ssize_t i;
 220
 221     for (i = 0; i < setlen; i++)
 222         if (set[i] == chr)
 223             return 1;
 224
 225     return 0;
 226 }
 227
 228 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231 /* --- Unicode Object ----------------------------------------------------- */
 232
 233 static
 234 int unicode_resize(register PyUnicodeObject *unicode,
 235                    Py_ssize_t length)
 236 {
 237     void *oldstr;
 238
 239     /* Shortcut if there's nothing much to do. */
 240     if (unicode->length == length)
 241         goto reset;
 242
 243     /* Resizing shared object (unicode_empty or single character
 244        objects) in-place is not allowed. Use PyUnicode_Resize()
 245        instead ! */
 246
 247     if (unicode == unicode_empty ||
 248         (unicode->length == 1 &&
 249          unicode->str[0] < 256U &&
 250          unicode_latin1[unicode->str[0]] == unicode)) {
 251         PyErr_SetString(PyExc_SystemError,
 252                         "can't resize shared unicode objects");
 253         return -1;
 254     }
 255
 256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257        The overallocation is also used by fastsearch, which assumes that it's
 258        safe to look at str[length] (without making any assumptions about what
 259        it contains). */
 260
 261     oldstr = unicode->str;
 262     unicode->str = PyObject_REALLOC(unicode->str,
 263                                     sizeof(Py_UNICODE) * (length + 1));
 264     if (!unicode->str) {
 265         unicode->str = (Py_UNICODE *)oldstr;
 266         PyErr_NoMemory();
 267         return -1;
 268     }
 269     unicode->str[length] = 0;
 270     unicode->length = length;
 271
 272   reset:
 273     /* Reset the object caches */
 274     if (unicode->defenc) {
 275         Py_DECREF(unicode->defenc);
 276         unicode->defenc = NULL;
 277     }
 278     unicode->hash = -1;
 279
 280     return 0;
 281 }
 282
 283 /* We allocate one more byte to make sure the string is
 284    Ux0000 terminated -- XXX is this needed ?
 285
 286    XXX This allocator could further be enhanced by assuring that the
 287    free list never reduces its size below 1.
 288
 289 */
 290
 291 static
 292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293 {
 294     register PyUnicodeObject *unicode;
 295
 296     /* Optimization for empty strings */
 297     if (length == 0 && unicode_empty != NULL) {
 298         Py_INCREF(unicode_empty);
 299         return unicode_empty;
 300     }
 301
 302     /* Ensure we won't overflow the size. */
 303     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 304         return (PyUnicodeObject *)PyErr_NoMemory();
 305     }
 306
 307     /* Unicode freelist & memory allocation */
 308     if (free_list) {
 309         unicode = free_list;
 310         free_list = *(PyUnicodeObject **)unicode;
 311         numfree--;
 312         if (unicode->str) {
 313             /* Keep-Alive optimization: we only upsize the buffer,
 314                never downsize it. */
 315             if ((unicode->length < length) &&
 316                 unicode_resize(unicode, length) < 0) {
 317                 PyObject_DEL(unicode->str);
 318                 unicode->str = NULL;
 319             }
 320         }
 321         else {
 322             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 323             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 324         }
 325         PyObject_INIT(unicode, &PyUnicode_Type);
 326     }
 327     else {
 328         size_t new_size;
 329         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 330         if (unicode == NULL)
 331             return NULL;
 332         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 333         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 334     }
 335
 336     if (!unicode->str) {
 337         PyErr_NoMemory();
 338         goto onError;
 339     }
 340     /* Initialize the first element to guard against cases where
 341      * the caller fails before initializing str -- unicode_resize()
 342      * reads str[0], and the Keep-Alive optimization can keep memory
 343      * allocated for str alive across a call to unicode_dealloc(unicode).
 344      * We don't want unicode_resize to read uninitialized memory in
 345      * that case.
 346      */
 347     unicode->str[0] = 0;
 348     unicode->str[length] = 0;
 349     unicode->length = length;
 350     unicode->hash = -1;
 351     unicode->defenc = NULL;
 352     return unicode;
 353
 354   onError:
 355     /* XXX UNREF/NEWREF interface should be more symmetrical */
 356     _Py_DEC_REFTOTAL;
 357     _Py_ForgetReference((PyObject *)unicode);
 358     PyObject_Del(unicode);
 359     return NULL;
 360 }
 361
 362 static
 363 void unicode_dealloc(register PyUnicodeObject *unicode)
 364 {
 365     if (PyUnicode_CheckExact(unicode) &&
 366         numfree < PyUnicode_MAXFREELIST) {
 367         /* Keep-Alive optimization */
 368         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 369             PyObject_DEL(unicode->str);
 370             unicode->str = NULL;
 371             unicode->length = 0;
 372         }
 373         if (unicode->defenc) {
 374             Py_DECREF(unicode->defenc);
 375             unicode->defenc = NULL;
 376         }
 377         /* Add to free list */
 378         *(PyUnicodeObject **)unicode = free_list;
 379         free_list = unicode;
 380         numfree++;
 381     }
 382     else {
 383         PyObject_DEL(unicode->str);
 384         Py_XDECREF(unicode->defenc);
 385         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 386     }
 387 }
 388
 389 static
 390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 391 {
 392     register PyUnicodeObject *v;
 393
 394     /* Argument checks */
 395     if (unicode == NULL) {
 396         PyErr_BadInternalCall();
 397         return -1;
 398     }
 399     v = *unicode;
 400     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 401         PyErr_BadInternalCall();
 402         return -1;
 403     }
 404
 405     /* Resizing unicode_empty and single character objects is not
 406        possible since these are being shared. We simply return a fresh
 407        copy with the same Unicode content. */
 408     if (v->length != length &&
 409         (v == unicode_empty || v->length == 1)) {
 410         PyUnicodeObject *w = _PyUnicode_New(length);
 411         if (w == NULL)
 412             return -1;
 413         Py_UNICODE_COPY(w->str, v->str,
 414                         length < v->length ? length : v->length);
 415         Py_DECREF(*unicode);
 416         *unicode = w;
 417         return 0;
 418     }
 419
 420     /* Note that we don't have to modify *unicode for unshared Unicode
 421        objects, since we can modify them in-place. */
 422     return unicode_resize(v, length);
 423 }
 424
 425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 426 {
 427     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 428 }
 429
 430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 431                                 Py_ssize_t size)
 432 {
 433     PyUnicodeObject *unicode;
 434
 435     /* If the Unicode data is known at construction time, we can apply
 436        some optimizations which share commonly used objects. */
 437     if (u != NULL) {
 438
 439         /* Optimization for empty strings */
 440         if (size == 0 && unicode_empty != NULL) {
 441             Py_INCREF(unicode_empty);
 442             return (PyObject *)unicode_empty;
 443         }
 444
 445         /* Single character Unicode objects in the Latin-1 range are
 446            shared when using this constructor */
 447         if (size == 1 && *u < 256) {
 448             unicode = unicode_latin1[*u];
 449             if (!unicode) {
 450                 unicode = _PyUnicode_New(1);
 451                 if (!unicode)
 452                     return NULL;
 453                 unicode->str[0] = *u;
 454                 unicode_latin1[*u] = unicode;
 455             }
 456             Py_INCREF(unicode);
 457             return (PyObject *)unicode;
 458         }
 459     }
 460
 461     unicode = _PyUnicode_New(size);
 462     if (!unicode)
 463         return NULL;
 464
 465     /* Copy the Unicode data into the new object */
 466     if (u != NULL)
 467         Py_UNICODE_COPY(unicode->str, u, size);
 468
 469     return (PyObject *)unicode;
 470 }
 471
 472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 473 {
 474     PyUnicodeObject *unicode;
 475
 476     if (size < 0) {
 477         PyErr_SetString(PyExc_SystemError,
 478                         "Negative size passed to PyUnicode_FromStringAndSize");
 479         return NULL;
 480     }
 481
 482     /* If the Unicode data is known at construction time, we can apply
 483        some optimizations which share commonly used objects.
 484        Also, this means the input must be UTF-8, so fall back to the
 485        UTF-8 decoder at the end. */
 486     if (u != NULL) {
 487
 488         /* Optimization for empty strings */
 489         if (size == 0 && unicode_empty != NULL) {
 490             Py_INCREF(unicode_empty);
 491             return (PyObject *)unicode_empty;
 492         }
 493
 494         /* Single characters are shared when using this constructor.
 495            Restrict to ASCII, since the input must be UTF-8. */
 496         if (size == 1 && Py_CHARMASK(*u) < 128) {
 497             unicode = unicode_latin1[Py_CHARMASK(*u)];
 498             if (!unicode) {
 499                 unicode = _PyUnicode_New(1);
 500                 if (!unicode)
 501                     return NULL;
 502                 unicode->str[0] = Py_CHARMASK(*u);
 503                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 504             }
 505             Py_INCREF(unicode);
 506             return (PyObject *)unicode;
 507         }
 508
 509         return PyUnicode_DecodeUTF8(u, size, NULL);
 510     }
 511
 512     unicode = _PyUnicode_New(size);
 513     if (!unicode)
 514         return NULL;
 515
 516     return (PyObject *)unicode;
 517 }
 518
 519 PyObject *PyUnicode_FromString(const char *u)
 520 {
 521     size_t size = strlen(u);
 522     if (size > PY_SSIZE_T_MAX) {
 523         PyErr_SetString(PyExc_OverflowError, "input too long");
 524         return NULL;
 525     }
 526
 527     return PyUnicode_FromStringAndSize(u, size);
 528 }
 529
 530 #ifdef HAVE_WCHAR_H
 531
 532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 533 # define CONVERT_WCHAR_TO_SURROGATES
 534 #endif
 535
 536 #ifdef CONVERT_WCHAR_TO_SURROGATES
 537
 538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 539    to convert from UTF32 to UTF16. */
 540
 541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 542                                  Py_ssize_t size)
 543 {
 544     PyUnicodeObject *unicode;
 545     register Py_ssize_t i;
 546     Py_ssize_t alloc;
 547     const wchar_t *orig_w;
 548
 549     if (w == NULL) {
 550         PyErr_BadInternalCall();
 551         return NULL;
 552     }
 553
 554     alloc = size;
 555     orig_w = w;
 556     for (i = size; i > 0; i--) {
 557         if (*w > 0xFFFF)
 558             alloc++;
 559         w++;
 560     }
 561     w = orig_w;
 562     unicode = _PyUnicode_New(alloc);
 563     if (!unicode)
 564         return NULL;
 565
 566     /* Copy the wchar_t data into the new object */
 567     {
 568         register Py_UNICODE *u;
 569         u = PyUnicode_AS_UNICODE(unicode);
 570         for (i = size; i > 0; i--) {
 571             if (*w > 0xFFFF) {
 572                 wchar_t ordinal = *w++;
 573                 ordinal -= 0x10000;
 574                 *u++ = 0xD800 | (ordinal >> 10);
 575                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 576             }
 577             else
 578                 *u++ = *w++;
 579         }
 580     }
 581     return (PyObject *)unicode;
 582 }
 583
 584 #else
 585
 586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 587                                  Py_ssize_t size)
 588 {
 589     PyUnicodeObject *unicode;
 590
 591     if (w == NULL) {
 592         PyErr_BadInternalCall();
 593         return NULL;
 594     }
 595
 596     unicode = _PyUnicode_New(size);
 597     if (!unicode)
 598         return NULL;
 599
 600     /* Copy the wchar_t data into the new object */
 601 #ifdef HAVE_USABLE_WCHAR_T
 602     memcpy(unicode->str, w, size * sizeof(wchar_t));
 603 #else
 604     {
 605         register Py_UNICODE *u;
 606         register Py_ssize_t i;
 607         u = PyUnicode_AS_UNICODE(unicode);
 608         for (i = size; i > 0; i--)
 609             *u++ = *w++;
 610     }
 611 #endif
 612
 613     return (PyObject *)unicode;
 614 }
 615
 616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 617
 618 #undef CONVERT_WCHAR_TO_SURROGATES
 619
 620 static void
 621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 622 {
 623     *fmt++ = '%';
 624     if (width) {
 625         if (zeropad)
 626             *fmt++ = '0';
 627         fmt += sprintf(fmt, "%d", width);
 628     }
 629     if (precision)
 630         fmt += sprintf(fmt, ".%d", precision);
 631     if (longflag)
 632         *fmt++ = 'l';
 633     else if (size_tflag) {
 634         char *f = PY_FORMAT_SIZE_T;
 635         while (*f)
 636             *fmt++ = *f++;
 637     }
 638     *fmt++ = c;
 639     *fmt = '\0';
 640 }
 641
 642 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 643
 644 PyObject *
 645 PyUnicode_FromFormatV(const char *format, va_list vargs)
 646 {
 647     va_list count;
 648     Py_ssize_t callcount = 0;
 649     PyObject **callresults = NULL;
 650     PyObject **callresult = NULL;
 651     Py_ssize_t n = 0;
 652     int width = 0;
 653     int precision = 0;
 654     int zeropad;
 655     const char* f;
 656     Py_UNICODE *s;
 657     PyObject *string;
 658     /* used by sprintf */
 659     char buffer[21];
 660     /* use abuffer instead of buffer, if we need more space
 661      * (which can happen if there's a format specifier with width). */
 662     char *abuffer = NULL;
 663     char *realbuffer;
 664     Py_ssize_t abuffersize = 0;
 665     char fmt[60]; /* should be enough for %0width.precisionld */
 666     const char *copy;
 667
 668 #ifdef VA_LIST_IS_ARRAY
 669     Py_MEMCPY(count, vargs, sizeof(va_list));
 670 #else
 671 #ifdef  __va_copy
 672     __va_copy(count, vargs);
 673 #else
 674     count = vargs;
 675 #endif
 676 #endif
 677      /* step 1: count the number of %S/%R/%s format specifications
 678       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 679       * objects once during step 3 and put the result in an array) */
 680     for (f = format; *f; f++) {
 681          if (*f == '%') {
 682              if (*(f+1)=='%')
 683                  continue;
 684              if (*(f+1)=='S' || *(f+1)=='R')
 685                  ++callcount;
 686              while (isdigit((unsigned)*f))
 687                  width = (width*10) + *f++ - '0';
 688              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 689                  ;
 690              if (*f == 's')
 691                  ++callcount;
 692          }
 693     }
 694     /* step 2: allocate memory for the results of
 695      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 696     if (callcount) {
 697         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 698         if (!callresults) {
 699             PyErr_NoMemory();
 700             return NULL;
 701         }
 702         callresult = callresults;
 703     }
 704     /* step 3: figure out how large a buffer we need */
 705     for (f = format; *f; f++) {
 706         if (*f == '%') {
 707             const char* p = f;
 708             width = 0;
 709             while (isdigit((unsigned)*f))
 710                 width = (width*10) + *f++ - '0';
 711             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 712                 ;
 713
 714             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 715              * they don't affect the amount of space we reserve.
 716              */
 717             if ((*f == 'l' || *f == 'z') &&
 718                 (f[1] == 'd' || f[1] == 'u'))
 719                 ++f;
 720
 721             switch (*f) {
 722             case 'c':
 723                 (void)va_arg(count, int);
 724                 /* fall through... */
 725             case '%':
 726                 n++;
 727                 break;
 728             case 'd': case 'u': case 'i': case 'x':
 729                 (void) va_arg(count, int);
 730                 /* 20 bytes is enough to hold a 64-bit
 731                    integer.  Decimal takes the most space.
 732                    This isn't enough for octal.
 733                    If a width is specified we need more
 734                    (which we allocate later). */
 735                 if (width < 20)
 736                     width = 20;
 737                 n += width;
 738                 if (abuffersize < width)
 739                     abuffersize = width;
 740                 break;
 741             case 's':
 742             {
 743                 /* UTF-8 */
 744                 const char *s = va_arg(count, const char*);
 745                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 746                 if (!str)
 747                     goto fail;
 748                 n += PyUnicode_GET_SIZE(str);
 749                 /* Remember the str and switch to the next slot */
 750                 *callresult++ = str;
 751                 break;
 752             }
 753             case 'U':
 754             {
 755                 PyObject *obj = va_arg(count, PyObject *);
 756                 assert(obj && PyUnicode_Check(obj));
 757                 n += PyUnicode_GET_SIZE(obj);
 758                 break;
 759             }
 760             case 'V':
 761             {
 762                 PyObject *obj = va_arg(count, PyObject *);
 763                 const char *str = va_arg(count, const char *);
 764                 assert(obj || str);
 765                 assert(!obj || PyUnicode_Check(obj));
 766                 if (obj)
 767                     n += PyUnicode_GET_SIZE(obj);
 768                 else
 769                     n += strlen(str);
 770                 break;
 771             }
 772             case 'S':
 773             {
 774                 PyObject *obj = va_arg(count, PyObject *);
 775                 PyObject *str;
 776                 assert(obj);
 777                 str = PyObject_Str(obj);
 778                 if (!str)
 779                     goto fail;
 780                 n += PyUnicode_GET_SIZE(str);
 781                 /* Remember the str and switch to the next slot */
 782                 *callresult++ = str;
 783                 break;
 784             }
 785             case 'R':
 786             {
 787                 PyObject *obj = va_arg(count, PyObject *);
 788                 PyObject *repr;
 789                 assert(obj);
 790                 repr = PyObject_Repr(obj);
 791                 if (!repr)
 792                     goto fail;
 793                 n += PyUnicode_GET_SIZE(repr);
 794                 /* Remember the repr and switch to the next slot */
 795                 *callresult++ = repr;
 796                 break;
 797             }
 798             case 'p':
 799                 (void) va_arg(count, int);
 800                 /* maximum 64-bit pointer representation:
 801                  * 0xffffffffffffffff
 802                  * so 19 characters is enough.
 803                  * XXX I count 18 -- what's the extra for?
 804                  */
 805                 n += 19;
 806                 break;
 807             default:
 808                 /* if we stumble upon an unknown
 809                    formatting code, copy the rest of
 810                    the format string to the output
 811                    string. (we cannot just skip the
 812                    code, since there's no way to know
 813                    what's in the argument list) */
 814                 n += strlen(p);
 815                 goto expand;
 816             }
 817         } else
 818             n++;
 819     }
 820   expand:
 821     if (abuffersize > 20) {
 822         abuffer = PyObject_Malloc(abuffersize);
 823         if (!abuffer) {
 824             PyErr_NoMemory();
 825             goto fail;
 826         }
 827         realbuffer = abuffer;
 828     }
 829     else
 830         realbuffer = buffer;
 831     /* step 4: fill the buffer */
 832     /* Since we've analyzed how much space we need for the worst case,
 833        we don't have to resize the string.
 834        There can be no errors beyond this point. */
 835     string = PyUnicode_FromUnicode(NULL, n);
 836     if (!string)
 837         goto fail;
 838
 839     s = PyUnicode_AS_UNICODE(string);
 840     callresult = callresults;
 841
 842     for (f = format; *f; f++) {
 843         if (*f == '%') {
 844             const char* p = f++;
 845             int longflag = 0;
 846             int size_tflag = 0;
 847             zeropad = (*f == '0');
 848             /* parse the width.precision part */
 849             width = 0;
 850             while (isdigit((unsigned)*f))
 851                 width = (width*10) + *f++ - '0';
 852             precision = 0;
 853             if (*f == '.') {
 854                 f++;
 855                 while (isdigit((unsigned)*f))
 856                     precision = (precision*10) + *f++ - '0';
 857             }
 858             /* handle the long flag, but only for %ld and %lu.
 859                others can be added when necessary. */
 860             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 861                 longflag = 1;
 862                 ++f;
 863             }
 864             /* handle the size_t flag. */
 865             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 866                 size_tflag = 1;
 867                 ++f;
 868             }
 869
 870             switch (*f) {
 871             case 'c':
 872                 *s++ = va_arg(vargs, int);
 873                 break;
 874             case 'd':
 875                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 876                 if (longflag)
 877                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 878                 else if (size_tflag)
 879                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 880                 else
 881                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 882                 appendstring(realbuffer);
 883                 break;
 884             case 'u':
 885                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 886                 if (longflag)
 887                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 888                 else if (size_tflag)
 889                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 890                 else
 891                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 892                 appendstring(realbuffer);
 893                 break;
 894             case 'i':
 895                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 896                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 897                 appendstring(realbuffer);
 898                 break;
 899             case 'x':
 900                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 901                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 902                 appendstring(realbuffer);
 903                 break;
 904             case 's':
 905             {
 906                 /* unused, since we already have the result */
 907                 (void) va_arg(vargs, char *);
 908                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 909                                 PyUnicode_GET_SIZE(*callresult));
 910                 s += PyUnicode_GET_SIZE(*callresult);
 911                 /* We're done with the unicode()/repr() => forget it */
 912                 Py_DECREF(*callresult);
 913                 /* switch to next unicode()/repr() result */
 914                 ++callresult;
 915                 break;
 916             }
 917             case 'U':
 918             {
 919                 PyObject *obj = va_arg(vargs, PyObject *);
 920                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 921                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 922                 s += size;
 923                 break;
 924             }
 925             case 'V':
 926             {
 927                 PyObject *obj = va_arg(vargs, PyObject *);
 928                 const char *str = va_arg(vargs, const char *);
 929                 if (obj) {
 930                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 931                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 932                     s += size;
 933                 } else {
 934                     appendstring(str);
 935                 }
 936                 break;
 937             }
 938             case 'S':
 939             case 'R':
 940             {
 941                 Py_UNICODE *ucopy;
 942                 Py_ssize_t usize;
 943                 Py_ssize_t upos;
 944                 /* unused, since we already have the result */
 945                 (void) va_arg(vargs, PyObject *);
 946                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 947                 usize = PyUnicode_GET_SIZE(*callresult);
 948                 for (upos = 0; upos<usize;)
 949                     *s++ = ucopy[upos++];
 950                 /* We're done with the unicode()/repr() => forget it */
 951                 Py_DECREF(*callresult);
 952                 /* switch to next unicode()/repr() result */
 953                 ++callresult;
 954                 break;
 955             }
 956             case 'p':
 957                 sprintf(buffer, "%p", va_arg(vargs, void*));
 958                 /* %p is ill-defined:  ensure leading 0x. */
 959                 if (buffer[1] == 'X')
 960                     buffer[1] = 'x';
 961                 else if (buffer[1] != 'x') {
 962                     memmove(buffer+2, buffer, strlen(buffer)+1);
 963                     buffer[0] = '0';
 964                     buffer[1] = 'x';
 965                 }
 966                 appendstring(buffer);
 967                 break;
 968             case '%':
 969                 *s++ = '%';
 970                 break;
 971             default:
 972                 appendstring(p);
 973                 goto end;
 974             }
 975         } else
 976             *s++ = *f;
 977     }
 978
 979   end:
 980     if (callresults)
 981         PyObject_Free(callresults);
 982     if (abuffer)
 983         PyObject_Free(abuffer);
 984     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 985     return string;
 986   fail:
 987     if (callresults) {
 988         PyObject **callresult2 = callresults;
 989         while (callresult2 < callresult) {
 990             Py_DECREF(*callresult2);
 991             ++callresult2;
 992         }
 993         PyObject_Free(callresults);
 994     }
 995     if (abuffer)
 996         PyObject_Free(abuffer);
 997     return NULL;
 998 }
 999
1000 #undef appendstring
1001
1002 PyObject *
1003 PyUnicode_FromFormat(const char *format, ...)
1004 {
1005     PyObject* ret;
1006     va_list vargs;
1007
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009     va_start(vargs, format);
1010 #else
1011     va_start(vargs);
1012 #endif
1013     ret = PyUnicode_FromFormatV(format, vargs);
1014     va_end(vargs);
1015     return ret;
1016 }
1017
1018 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1019                                 wchar_t *w,
1020                                 Py_ssize_t size)
1021 {
1022     if (unicode == NULL) {
1023         PyErr_BadInternalCall();
1024         return -1;
1025     }
1026
1027     /* If possible, try to copy the 0-termination as well */
1028     if (size > PyUnicode_GET_SIZE(unicode))
1029         size = PyUnicode_GET_SIZE(unicode) + 1;
1030
1031 #ifdef HAVE_USABLE_WCHAR_T
1032     memcpy(w, unicode->str, size * sizeof(wchar_t));
1033 #else
1034     {
1035         register Py_UNICODE *u;
1036         register Py_ssize_t i;
1037         u = PyUnicode_AS_UNICODE(unicode);
1038         for (i = size; i > 0; i--)
1039             *w++ = *u++;
1040     }
1041 #endif
1042
1043     if (size > PyUnicode_GET_SIZE(unicode))
1044         return PyUnicode_GET_SIZE(unicode);
1045     else
1046         return size;
1047 }
1048
1049 #endif
1050
1051 PyObject *PyUnicode_FromOrdinal(int ordinal)
1052 {
1053     Py_UNICODE s[1];
1054
1055 #ifdef Py_UNICODE_WIDE
1056     if (ordinal < 0 || ordinal > 0x10ffff) {
1057         PyErr_SetString(PyExc_ValueError,
1058                         "unichr() arg not in range(0x110000) "
1059                         "(wide Python build)");
1060         return NULL;
1061     }
1062 #else
1063     if (ordinal < 0 || ordinal > 0xffff) {
1064         PyErr_SetString(PyExc_ValueError,
1065                         "unichr() arg not in range(0x10000) "
1066                         "(narrow Python build)");
1067         return NULL;
1068     }
1069 #endif
1070
1071     s[0] = (Py_UNICODE)ordinal;
1072     return PyUnicode_FromUnicode(s, 1);
1073 }
1074
1075 PyObject *PyUnicode_FromObject(register PyObject *obj)
1076 {
1077     /* XXX Perhaps we should make this API an alias of
1078        PyObject_Unicode() instead ?! */
1079     if (PyUnicode_CheckExact(obj)) {
1080         Py_INCREF(obj);
1081         return obj;
1082     }
1083     if (PyUnicode_Check(obj)) {
1084         /* For a Unicode subtype that's not a Unicode object,
1085            return a true Unicode object with the same data. */
1086         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087                                      PyUnicode_GET_SIZE(obj));
1088     }
1089     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1090 }
1091
1092 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1093                                       const char *encoding,
1094                                       const char *errors)
1095 {
1096     const char *s = NULL;
1097     Py_ssize_t len;
1098     PyObject *v;
1099
1100     if (obj == NULL) {
1101         PyErr_BadInternalCall();
1102         return NULL;
1103     }
1104
1105 #if 0
1106     /* For b/w compatibility we also accept Unicode objects provided
1107        that no encodings is given and then redirect to
1108        PyObject_Unicode() which then applies the additional logic for
1109        Unicode subclasses.
1110
1111        NOTE: This API should really only be used for object which
1112        represent *encoded* Unicode !
1113
1114     */
1115     if (PyUnicode_Check(obj)) {
1116         if (encoding) {
1117             PyErr_SetString(PyExc_TypeError,
1118                             "decoding Unicode is not supported");
1119             return NULL;
1120         }
1121         return PyObject_Unicode(obj);
1122     }
1123 #else
1124     if (PyUnicode_Check(obj)) {
1125         PyErr_SetString(PyExc_TypeError,
1126                         "decoding Unicode is not supported");
1127         return NULL;
1128     }
1129 #endif
1130
1131     /* Coerce object */
1132     if (PyString_Check(obj)) {
1133         s = PyString_AS_STRING(obj);
1134         len = PyString_GET_SIZE(obj);
1135     }
1136     else if (PyByteArray_Check(obj)) {
1137         /* Python 2.x specific */
1138         PyErr_Format(PyExc_TypeError,
1139                      "decoding bytearray is not supported");
1140         return NULL;
1141     }
1142     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1143         /* Overwrite the error message with something more useful in
1144            case of a TypeError. */
1145         if (PyErr_ExceptionMatches(PyExc_TypeError))
1146             PyErr_Format(PyExc_TypeError,
1147                          "coercing to Unicode: need string or buffer, "
1148                          "%.80s found",
1149                          Py_TYPE(obj)->tp_name);
1150         goto onError;
1151     }
1152
1153     /* Convert to Unicode */
1154     if (len == 0) {
1155         Py_INCREF(unicode_empty);
1156         v = (PyObject *)unicode_empty;
1157     }
1158     else
1159         v = PyUnicode_Decode(s, len, encoding, errors);
1160
1161     return v;
1162
1163   onError:
1164     return NULL;
1165 }
1166
1167 PyObject *PyUnicode_Decode(const char *s,
1168                            Py_ssize_t size,
1169                            const char *encoding,
1170                            const char *errors)
1171 {
1172     PyObject *buffer = NULL, *unicode;
1173
1174     if (encoding == NULL)
1175         encoding = PyUnicode_GetDefaultEncoding();
1176
1177     /* Shortcuts for common default encodings */
1178     if (strcmp(encoding, "utf-8") == 0)
1179         return PyUnicode_DecodeUTF8(s, size, errors);
1180     else if (strcmp(encoding, "latin-1") == 0)
1181         return PyUnicode_DecodeLatin1(s, size, errors);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183     else if (strcmp(encoding, "mbcs") == 0)
1184         return PyUnicode_DecodeMBCS(s, size, errors);
1185 #endif
1186     else if (strcmp(encoding, "ascii") == 0)
1187         return PyUnicode_DecodeASCII(s, size, errors);
1188
1189     /* Decode via the codec registry */
1190     buffer = PyBuffer_FromMemory((void *)s, size);
1191     if (buffer == NULL)
1192         goto onError;
1193     unicode = PyCodec_Decode(buffer, encoding, errors);
1194     if (unicode == NULL)
1195         goto onError;
1196     if (!PyUnicode_Check(unicode)) {
1197         PyErr_Format(PyExc_TypeError,
1198                      "decoder did not return an unicode object (type=%.400s)",
1199                      Py_TYPE(unicode)->tp_name);
1200         Py_DECREF(unicode);
1201         goto onError;
1202     }
1203     Py_DECREF(buffer);
1204     return unicode;
1205
1206   onError:
1207     Py_XDECREF(buffer);
1208     return NULL;
1209 }
1210
1211 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212                                     const char *encoding,
1213                                     const char *errors)
1214 {
1215     PyObject *v;
1216
1217     if (!PyUnicode_Check(unicode)) {
1218         PyErr_BadArgument();
1219         goto onError;
1220     }
1221
1222     if (encoding == NULL)
1223         encoding = PyUnicode_GetDefaultEncoding();
1224
1225     /* Decode via the codec registry */
1226     v = PyCodec_Decode(unicode, encoding, errors);
1227     if (v == NULL)
1228         goto onError;
1229     return v;
1230
1231   onError:
1232     return NULL;
1233 }
1234
1235 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1236                            Py_ssize_t size,
1237                            const char *encoding,
1238                            const char *errors)
1239 {
1240     PyObject *v, *unicode;
1241
1242     unicode = PyUnicode_FromUnicode(s, size);
1243     if (unicode == NULL)
1244         return NULL;
1245     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246     Py_DECREF(unicode);
1247     return v;
1248 }
1249
1250 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251                                     const char *encoding,
1252                                     const char *errors)
1253 {
1254     PyObject *v;
1255
1256     if (!PyUnicode_Check(unicode)) {
1257         PyErr_BadArgument();
1258         goto onError;
1259     }
1260
1261     if (encoding == NULL)
1262         encoding = PyUnicode_GetDefaultEncoding();
1263
1264     /* Encode via the codec registry */
1265     v = PyCodec_Encode(unicode, encoding, errors);
1266     if (v == NULL)
1267         goto onError;
1268     return v;
1269
1270   onError:
1271     return NULL;
1272 }
1273
1274 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275                                     const char *encoding,
1276                                     const char *errors)
1277 {
1278     PyObject *v;
1279
1280     if (!PyUnicode_Check(unicode)) {
1281         PyErr_BadArgument();
1282         goto onError;
1283     }
1284
1285     if (encoding == NULL)
1286         encoding = PyUnicode_GetDefaultEncoding();
1287
1288     /* Shortcuts for common default encodings */
1289     if (errors == NULL) {
1290         if (strcmp(encoding, "utf-8") == 0)
1291             return PyUnicode_AsUTF8String(unicode);
1292         else if (strcmp(encoding, "latin-1") == 0)
1293             return PyUnicode_AsLatin1String(unicode);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295         else if (strcmp(encoding, "mbcs") == 0)
1296             return PyUnicode_AsMBCSString(unicode);
1297 #endif
1298         else if (strcmp(encoding, "ascii") == 0)
1299             return PyUnicode_AsASCIIString(unicode);
1300     }
1301
1302     /* Encode via the codec registry */
1303     v = PyCodec_Encode(unicode, encoding, errors);
1304     if (v == NULL)
1305         goto onError;
1306     if (!PyString_Check(v)) {
1307         PyErr_Format(PyExc_TypeError,
1308                      "encoder did not return a string object (type=%.400s)",
1309                      Py_TYPE(v)->tp_name);
1310         Py_DECREF(v);
1311         goto onError;
1312     }
1313     return v;
1314
1315   onError:
1316     return NULL;
1317 }
1318
1319 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1320                                             const char *errors)
1321 {
1322     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1323
1324     if (v)
1325         return v;
1326     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327     if (v && errors == NULL)
1328         ((PyUnicodeObject *)unicode)->defenc = v;
1329     return v;
1330 }
1331
1332 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1333 {
1334     if (!PyUnicode_Check(unicode)) {
1335         PyErr_BadArgument();
1336         goto onError;
1337     }
1338     return PyUnicode_AS_UNICODE(unicode);
1339
1340   onError:
1341     return NULL;
1342 }
1343
1344 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1345 {
1346     if (!PyUnicode_Check(unicode)) {
1347         PyErr_BadArgument();
1348         goto onError;
1349     }
1350     return PyUnicode_GET_SIZE(unicode);
1351
1352   onError:
1353     return -1;
1354 }
1355
1356 const char *PyUnicode_GetDefaultEncoding(void)
1357 {
1358     return unicode_default_encoding;
1359 }
1360
1361 int PyUnicode_SetDefaultEncoding(const char *encoding)
1362 {
1363     PyObject *v;
1364
1365     /* Make sure the encoding is valid. As side effect, this also
1366        loads the encoding into the codec registry cache. */
1367     v = _PyCodec_Lookup(encoding);
1368     if (v == NULL)
1369         goto onError;
1370     Py_DECREF(v);
1371     strncpy(unicode_default_encoding,
1372             encoding,
1373             sizeof(unicode_default_encoding));
1374     return 0;
1375
1376   onError:
1377     return -1;
1378 }
1379
1380 /* error handling callback helper:
1381    build arguments, call the callback and check the arguments,
1382    if no exception occurred, copy the replacement to the output
1383    and adjust various state variables.
1384    return 0 on success, -1 on error
1385 */
1386
1387 static
1388 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1389                                      const char *encoding, const char *reason,
1390                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1393 {
1394     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1395
1396     PyObject *restuple = NULL;
1397     PyObject *repunicode = NULL;
1398     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399     Py_ssize_t requiredsize;
1400     Py_ssize_t newpos;
1401     Py_UNICODE *repptr;
1402     Py_ssize_t repsize;
1403     int res = -1;
1404
1405     if (*errorHandler == NULL) {
1406         *errorHandler = PyCodec_LookupError(errors);
1407         if (*errorHandler == NULL)
1408             goto onError;
1409     }
1410
1411     if (*exceptionObject == NULL) {
1412         *exceptionObject = PyUnicodeDecodeError_Create(
1413             encoding, input, insize, *startinpos, *endinpos, reason);
1414         if (*exceptionObject == NULL)
1415             goto onError;
1416     }
1417     else {
1418         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419             goto onError;
1420         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421             goto onError;
1422         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423             goto onError;
1424     }
1425
1426     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427     if (restuple == NULL)
1428         goto onError;
1429     if (!PyTuple_Check(restuple)) {
1430         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1431         goto onError;
1432     }
1433     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1434         goto onError;
1435     if (newpos<0)
1436         newpos = insize+newpos;
1437     if (newpos<0 || newpos>insize) {
1438         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439         goto onError;
1440     }
1441
1442     /* need more space? (at least enough for what we
1443        have+the replacement+the rest of the string (starting
1444        at the new input position), so we won't have to check space
1445        when there are no errors in the rest of the string) */
1446     repptr = PyUnicode_AS_UNICODE(repunicode);
1447     repsize = PyUnicode_GET_SIZE(repunicode);
1448     requiredsize = *outpos + repsize + insize-newpos;
1449     if (requiredsize > outsize) {
1450         if (requiredsize<2*outsize)
1451             requiredsize = 2*outsize;
1452         if (_PyUnicode_Resize(output, requiredsize) < 0)
1453             goto onError;
1454         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1455     }
1456     *endinpos = newpos;
1457     *inptr = input + newpos;
1458     Py_UNICODE_COPY(*outptr, repptr, repsize);
1459     *outptr += repsize;
1460     *outpos += repsize;
1461     /* we made it! */
1462     res = 0;
1463
1464   onError:
1465     Py_XDECREF(restuple);
1466     return res;
1467 }
1468
1469 /* --- UTF-7 Codec -------------------------------------------------------- */
1470
1471 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1472
1473 /* Three simple macros defining base-64. */
1474
/* Is c a base-64 character?
 *
 * Spelled out as explicit ASCII ranges rather than isalnum(): isalnum()
 * depends on the current locale (so bytes >= 0x80 may be classified as
 * alphanumeric) and is undefined for arguments that are not
 * representable as unsigned char.  Note that c is evaluated more than
 * once. */

#define IS_BASE64(c)                            \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')
1479
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' -> 0..25,
 * 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62 and anything else
 * (i.e. '/', given that c is a base-64 character) -> 63. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
1487
/* Base-64 character for the bottom 6 bits of n (higher bits are
 * masked off). */

#define TO_BASE64(n)  \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 0x3f)))
1492
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
1500
/* Classification of the 128 ASCII codes for the UTF-7 encoder, which
 * treats a character differently depending on whether it belongs to
 * Set D, Set O, whitespace, or none of these (see RFC2152).  The table
 * is indexed by character code and holds:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00-0x0f: control codes; ht (9), nl (10), cr (13) are whitespace */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1f: control codes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2f: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4f: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6f: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
1534
/* ENCODE_DIRECT: true when character c may be written to the UTF-7
 * output as itself.  c must lie in 1..127; Set D characters (category
 * 0) always qualify, Set O characters (category 1) only when directO
 * is true, and whitespace (category 2) only when directWS is true.
 * RFC2152 makes it clear that the treatment of Set O and whitespace
 * varies between applications, hence the two flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directO) && (utf7_category[(c)] == 1)) ||       \
      ((directWS) && (utf7_category[(c)] == 2))))
1546
1547 PyObject *PyUnicode_DecodeUTF7(const char *s,
1548                                Py_ssize_t size,
1549                                const char *errors)
1550 {
1551     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1552 }
1553
1554 /* The decoder.  The only state we preserve is our read position,
1555  * i.e. how many characters we have consumed.  So if we end in the
1556  * middle of a shift sequence we have to back off the read position
1557  * and the output to the beginning of the sequence, otherwise we lose
1558  * all the shift state (seen bits, number of bits seen, high
1559  * surrogate). */
1560
1561 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1562                                        Py_ssize_t size,
1563                                        const char *errors,
1564                                        Py_ssize_t *consumed)
1565 {
1566     const char *starts = s;
1567     Py_ssize_t startinpos;
1568     Py_ssize_t endinpos;
1569     Py_ssize_t outpos;
1570     const char *e;
1571     PyUnicodeObject *unicode;
1572     Py_UNICODE *p;
1573     const char *errmsg = "";
1574     int inShift = 0;
1575     Py_UNICODE *shiftOutStart;
1576     unsigned int base64bits = 0;
1577     unsigned long base64buffer = 0;
1578     Py_UNICODE surrogate = 0;
1579     PyObject *errorHandler = NULL;
1580     PyObject *exc = NULL;
1581
1582     unicode = _PyUnicode_New(size);
1583     if (!unicode)
1584         return NULL;
1585     if (size == 0) {
1586         if (consumed)
1587             *consumed = 0;
1588         return (PyObject *)unicode;
1589     }
1590
1591     p = unicode->str;
1592     shiftOutStart = p;
1593     e = s + size;
1594
1595     while (s < e) {
1596         Py_UNICODE ch = (unsigned char) *s;
1597
1598         if (inShift) { /* in a base-64 section */
1599             if (IS_BASE64(ch)) { /* consume a base-64 character */
1600                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1601                 base64bits += 6;
1602                 s++;
1603                 if (base64bits >= 16) {
1604                     /* we have enough bits for a UTF-16 value */
1605                     Py_UNICODE outCh = (Py_UNICODE)
1606                                        (base64buffer >> (base64bits-16));
1607                     base64bits -= 16;
1608                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1609                     if (surrogate) {
1610                         /* expecting a second surrogate */
1611                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1612 #ifdef Py_UNICODE_WIDE
1613                             *p++ = (((surrogate & 0x3FF)<<10)
1614                                     | (outCh & 0x3FF)) + 0x10000;
1615 #else
1616                             *p++ = surrogate;
1617                             *p++ = outCh;
1618 #endif
1619                             surrogate = 0;
1620                         }
1621                         else {
1622                             surrogate = 0;
1623                             errmsg = "second surrogate missing";
1624                             goto utf7Error;
1625                         }
1626                     }
1627                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1628                         /* first surrogate */
1629                         surrogate = outCh;
1630                     }
1631                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1632                         errmsg = "unexpected second surrogate";
1633                         goto utf7Error;
1634                     }
1635                     else {
1636                         *p++ = outCh;
1637                     }
1638                 }
1639             }
1640             else { /* now leaving a base-64 section */
1641                 inShift = 0;
1642                 s++;
1643                 if (surrogate) {
1644                     errmsg = "second surrogate missing at end of shift sequence";
1645                     goto utf7Error;
1646                 }
1647                 if (base64bits > 0) { /* left-over bits */
1648                     if (base64bits >= 6) {
1649                         /* We've seen at least one base-64 character */
1650                         errmsg = "partial character in shift sequence";
1651                         goto utf7Error;
1652                     }
1653                     else {
1654                         /* Some bits remain; they should be zero */
1655                         if (base64buffer != 0) {
1656                             errmsg = "non-zero padding bits in shift sequence";
1657                             goto utf7Error;
1658                         }
1659                     }
1660                 }
1661                 if (ch != '-') {
1662                     /* '-' is absorbed; other terminating
1663                        characters are preserved */
1664                     *p++ = ch;
1665                 }
1666             }
1667         }
1668         else if ( ch == '+' ) {
1669             startinpos = s-starts;
1670             s++; /* consume '+' */
1671             if (s < e && *s == '-') { /* '+-' encodes '+' */
1672                 s++;
1673                 *p++ = '+';
1674             }
1675             else { /* begin base64-encoded section */
1676                 inShift = 1;
1677                 shiftOutStart = p;
1678                 base64bits = 0;
1679             }
1680         }
1681         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1682             *p++ = ch;
1683             s++;
1684         }
1685         else {
1686             startinpos = s-starts;
1687             s++;
1688             errmsg = "unexpected special character";
1689             goto utf7Error;
1690         }
1691         continue;
1692 utf7Error:
1693         outpos = p-PyUnicode_AS_UNICODE(unicode);
1694         endinpos = s-starts;
1695         if (unicode_decode_call_errorhandler(
1696                 errors, &errorHandler,
1697                 "utf7", errmsg,
1698                 starts, size, &startinpos, &endinpos, &exc, &s,
1699                 &unicode, &outpos, &p))
1700             goto onError;
1701     }
1702
1703     /* end of string */
1704
1705     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1706         /* if we're in an inconsistent state, that's an error */
1707         if (surrogate ||
1708                 (base64bits >= 6) ||
1709                 (base64bits > 0 && base64buffer != 0)) {
1710             outpos = p-PyUnicode_AS_UNICODE(unicode);
1711             endinpos = size;
1712             if (unicode_decode_call_errorhandler(
1713                     errors, &errorHandler,
1714                     "utf7", "unterminated shift sequence",
1715                     starts, size, &startinpos, &endinpos, &exc, &s,
1716                     &unicode, &outpos, &p))
1717                 goto onError;
1718         }
1719     }
1720
1721     /* return state */
1722     if (consumed) {
1723         if (inShift) {
1724             p = shiftOutStart; /* back off output */
1725             *consumed = startinpos;
1726         }
1727         else {
1728             *consumed = s-starts;
1729         }
1730     }
1731
1732     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1733         goto onError;
1734
1735     Py_XDECREF(errorHandler);
1736     Py_XDECREF(exc);
1737     return (PyObject *)unicode;
1738
1739   onError:
1740     Py_XDECREF(errorHandler);
1741     Py_XDECREF(exc);
1742     Py_DECREF(unicode);
1743     return NULL;
1744 }
1745
1746
1747 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1748                                Py_ssize_t size,
1749                                int base64SetO,
1750                                int base64WhiteSpace,
1751                                const char *errors)
1752 {
1753     PyObject *v;
1754     /* It might be possible to tighten this worst case */
1755     Py_ssize_t allocated = 8 * size;
1756     int inShift = 0;
1757     Py_ssize_t i = 0;
1758     unsigned int base64bits = 0;
1759     unsigned long base64buffer = 0;
1760     char * out;
1761     char * start;
1762
1763     if (allocated / 8 != size)
1764         return PyErr_NoMemory();
1765
1766     if (size == 0)
1767         return PyString_FromStringAndSize(NULL, 0);
1768
1769     v = PyString_FromStringAndSize(NULL, allocated);
1770     if (v == NULL)
1771         return NULL;
1772
1773     start = out = PyString_AS_STRING(v);
1774     for (;i < size; ++i) {
1775         Py_UNICODE ch = s[i];
1776
1777         if (inShift) {
1778             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1779                 /* shifting out */
1780                 if (base64bits) { /* output remaining bits */
1781                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1782                     base64buffer = 0;
1783                     base64bits = 0;
1784                 }
1785                 inShift = 0;
1786                 /* Characters not in the BASE64 set implicitly unshift the sequence
1787                    so no '-' is required, except if the character is itself a '-' */
1788                 if (IS_BASE64(ch) || ch == '-') {
1789                     *out++ = '-';
1790                 }
1791                 *out++ = (char) ch;
1792             }
1793             else {
1794                 goto encode_char;
1795             }
1796         }
1797         else { /* not in a shift sequence */
1798             if (ch == '+') {
1799                 *out++ = '+';
1800                         *out++ = '-';
1801             }
1802             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1803                 *out++ = (char) ch;
1804             }
1805             else {
1806                 *out++ = '+';
1807                 inShift = 1;
1808                 goto encode_char;
1809             }
1810         }
1811         continue;
1812 encode_char:
1813 #ifdef Py_UNICODE_WIDE
1814         if (ch >= 0x10000) {
1815             /* code first surrogate */
1816             base64bits += 16;
1817             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1818             while (base64bits >= 6) {
1819                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1820                 base64bits -= 6;
1821             }
1822             /* prepare second surrogate */
1823             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1824         }
1825 #endif
1826         base64bits += 16;
1827         base64buffer = (base64buffer << 16) | ch;
1828         while (base64bits >= 6) {
1829             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1830             base64bits -= 6;
1831         }
1832     }
1833     if (base64bits)
1834         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1835     if (inShift)
1836         *out++ = '-';
1837
1838     _PyString_Resize(&v, out - start);
1839     return v;
1840 }
1841
1842 #undef IS_BASE64
1843 #undef FROM_BASE64
1844 #undef TO_BASE64
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1847
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
1849
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence
   it introduces.  Zero marks a byte that cannot start a sequence.
   See RFC 2279 for details.

     0x00-0x7F  1   (ASCII)
     0x80-0xBF  0   (continuation bytes)
     0xC0-0xDF  2
     0xE0-0xEF  3
     0xF0-0xF7  4
     0xF8-0xFB  5
     0xFC-0xFD  6
     0xFE-0xFF  0

   The table is never modified at run time, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1871
1872 PyObject *PyUnicode_DecodeUTF8(const char *s,
1873                                Py_ssize_t size,
1874                                const char *errors)
1875 {
1876     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1877 }
1878
1879 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1880                                        Py_ssize_t size,
1881                                        const char *errors,
1882                                        Py_ssize_t *consumed)
1883 {
1884     const char *starts = s;
1885     int n;
1886     Py_ssize_t startinpos;
1887     Py_ssize_t endinpos;
1888     Py_ssize_t outpos;
1889     const char *e;
1890     PyUnicodeObject *unicode;
1891     Py_UNICODE *p;
1892     const char *errmsg = "";
1893     PyObject *errorHandler = NULL;
1894     PyObject *exc = NULL;
1895
1896     /* Note: size will always be longer than the resulting Unicode
1897        character count */
1898     unicode = _PyUnicode_New(size);
1899     if (!unicode)
1900         return NULL;
1901     if (size == 0) {
1902         if (consumed)
1903             *consumed = 0;
1904         return (PyObject *)unicode;
1905     }
1906
1907     /* Unpack UTF-8 encoded data */
1908     p = unicode->str;
1909     e = s + size;
1910
1911     while (s < e) {
1912         Py_UCS4 ch = (unsigned char)*s;
1913
1914         if (ch < 0x80) {
1915             *p++ = (Py_UNICODE)ch;
1916             s++;
1917             continue;
1918         }
1919
1920         n = utf8_code_length[ch];
1921
1922         if (s + n > e) {
1923             if (consumed)
1924                 break;
1925             else {
1926                 errmsg = "unexpected end of data";
1927                 startinpos = s-starts;
1928                 endinpos = size;
1929                 goto utf8Error;
1930             }
1931         }
1932
1933         switch (n) {
1934
1935         case 0:
1936             errmsg = "unexpected code byte";
1937             startinpos = s-starts;
1938             endinpos = startinpos+1;
1939             goto utf8Error;
1940
1941         case 1:
1942             errmsg = "internal error";
1943             startinpos = s-starts;
1944             endinpos = startinpos+1;
1945             goto utf8Error;
1946
1947         case 2:
1948             if ((s[1] & 0xc0) != 0x80) {
1949                 errmsg = "invalid data";
1950                 startinpos = s-starts;
1951                 endinpos = startinpos+2;
1952                 goto utf8Error;
1953             }
1954             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1955             if (ch < 0x80) {
1956                 startinpos = s-starts;
1957                 endinpos = startinpos+2;
1958                 errmsg = "illegal encoding";
1959                 goto utf8Error;
1960             }
1961             else
1962                 *p++ = (Py_UNICODE)ch;
1963             break;
1964
1965         case 3:
1966             if ((s[1] & 0xc0) != 0x80 ||
1967                 (s[2] & 0xc0) != 0x80) {
1968                 errmsg = "invalid data";
1969                 startinpos = s-starts;
1970                 endinpos = startinpos+3;
1971                 goto utf8Error;
1972             }
1973             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1974             if (ch < 0x0800) {
1975                 /* Note: UTF-8 encodings of surrogates are considered
1976                    legal UTF-8 sequences;
1977
1978                    XXX For wide builds (UCS-4) we should probably try
1979                    to recombine the surrogates into a single code
1980                    unit.
1981                 */
1982                 errmsg = "illegal encoding";
1983                 startinpos = s-starts;
1984                 endinpos = startinpos+3;
1985                 goto utf8Error;
1986             }
1987             else
1988                 *p++ = (Py_UNICODE)ch;
1989             break;
1990
1991         case 4:
1992             if ((s[1] & 0xc0) != 0x80 ||
1993                 (s[2] & 0xc0) != 0x80 ||
1994                 (s[3] & 0xc0) != 0x80) {
1995                 errmsg = "invalid data";
1996                 startinpos = s-starts;
1997                 endinpos = startinpos+4;
1998                 goto utf8Error;
1999             }
2000             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2001                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2002             /* validate and convert to UTF-16 */
2003             if ((ch < 0x10000)        /* minimum value allowed for 4
2004                                          byte encoding */
2005                 || (ch > 0x10ffff))   /* maximum value allowed for
2006                                          UTF-16 */
2007             {
2008                 errmsg = "illegal encoding";
2009                 startinpos = s-starts;
2010                 endinpos = startinpos+4;
2011                 goto utf8Error;
2012             }
2013 #ifdef Py_UNICODE_WIDE
2014             *p++ = (Py_UNICODE)ch;
2015 #else
2016             /*  compute and append the two surrogates: */
2017
2018             /*  translate from 10000..10FFFF to 0..FFFF */
2019             ch -= 0x10000;
2020
2021             /*  high surrogate = top 10 bits added to D800 */
2022             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2023
2024             /*  low surrogate = bottom 10 bits added to DC00 */
2025             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2026 #endif
2027             break;
2028
2029         default:
2030             /* Other sizes are only needed for UCS-4 */
2031             errmsg = "unsupported Unicode code range";
2032             startinpos = s-starts;
2033             endinpos = startinpos+n;
2034             goto utf8Error;
2035         }
2036         s += n;
2037         continue;
2038
2039       utf8Error:
2040         outpos = p-PyUnicode_AS_UNICODE(unicode);
2041         if (unicode_decode_call_errorhandler(
2042                 errors, &errorHandler,
2043                 "utf8", errmsg,
2044                 starts, size, &startinpos, &endinpos, &exc, &s,
2045                 &unicode, &outpos, &p))
2046             goto onError;
2047     }
2048     if (consumed)
2049         *consumed = s-starts;
2050
2051     /* Adjust length */
2052     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2053         goto onError;
2054
2055     Py_XDECREF(errorHandler);
2056     Py_XDECREF(exc);
2057     return (PyObject *)unicode;
2058
2059   onError:
2060     Py_XDECREF(errorHandler);
2061     Py_XDECREF(exc);
2062     Py_DECREF(unicode);
2063     return NULL;
2064 }
2065
2066 /* Allocation strategy:  if the string is short, convert into a stack buffer
2067    and allocate exactly as much space needed at the end.  Else allocate the
2068    maximum possible needed (4 result bytes per Unicode character), and return
2069    the excess memory at the end.
2070 */
2071 PyObject *
2072 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2073                      Py_ssize_t size,
2074                      const char *errors)
2075 {
2076 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2077
2078     Py_ssize_t i;           /* index into s of next input byte */
2079     PyObject *v;        /* result string object */
2080     char *p;            /* next free byte in output buffer */
2081     Py_ssize_t nallocated;  /* number of result bytes allocated */
2082     Py_ssize_t nneeded;        /* number of result bytes needed */
2083     char stackbuf[MAX_SHORT_UNICHARS * 4];
2084
2085     assert(s != NULL);
2086     assert(size >= 0);
2087
2088     if (size <= MAX_SHORT_UNICHARS) {
2089         /* Write into the stack buffer; nallocated can't overflow.
2090          * At the end, we'll allocate exactly as much heap space as it
2091          * turns out we need.
2092          */
2093         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2094         v = NULL;   /* will allocate after we're done */
2095         p = stackbuf;
2096     }
2097     else {
2098         /* Overallocate on the heap, and give the excess back at the end. */
2099         nallocated = size * 4;
2100         if (nallocated / 4 != size)  /* overflow! */
2101             return PyErr_NoMemory();
2102         v = PyString_FromStringAndSize(NULL, nallocated);
2103         if (v == NULL)
2104             return NULL;
2105         p = PyString_AS_STRING(v);
2106     }
2107
2108     for (i = 0; i < size;) {
2109         Py_UCS4 ch = s[i++];
2110
2111         if (ch < 0x80)
2112             /* Encode ASCII */
2113             *p++ = (char) ch;
2114
2115         else if (ch < 0x0800) {
2116             /* Encode Latin-1 */
2117             *p++ = (char)(0xc0 | (ch >> 6));
2118             *p++ = (char)(0x80 | (ch & 0x3f));
2119         }
2120         else {
2121             /* Encode UCS2 Unicode ordinals */
2122             if (ch < 0x10000) {
2123                 /* Special case: check for high surrogate */
2124                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2125                     Py_UCS4 ch2 = s[i];
2126                     /* Check for low surrogate and combine the two to
2127                        form a UCS4 value */
2128                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2129                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2130                         i++;
2131                         goto encodeUCS4;
2132                     }
2133                     /* Fall through: handles isolated high surrogates */
2134                 }
2135                 *p++ = (char)(0xe0 | (ch >> 12));
2136                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2137                 *p++ = (char)(0x80 | (ch & 0x3f));
2138                 continue;
2139             }
2140           encodeUCS4:
2141             /* Encode UCS4 Unicode ordinals */
2142             *p++ = (char)(0xf0 | (ch >> 18));
2143             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2144             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2145             *p++ = (char)(0x80 | (ch & 0x3f));
2146         }
2147     }
2148
2149     if (v == NULL) {
2150         /* This was stack allocated. */
2151         nneeded = p - stackbuf;
2152         assert(nneeded <= nallocated);
2153         v = PyString_FromStringAndSize(stackbuf, nneeded);
2154     }
2155     else {
2156         /* Cut back to size actually needed. */
2157         nneeded = p - PyString_AS_STRING(v);
2158         assert(nneeded <= nallocated);
2159         _PyString_Resize(&v, nneeded);
2160     }
2161     return v;
2162
2163 #undef MAX_SHORT_UNICHARS
2164 }
2165
2166 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2167 {
2168     if (!PyUnicode_Check(unicode)) {
2169         PyErr_BadArgument();
2170         return NULL;
2171     }
2172     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2173                                 PyUnicode_GET_SIZE(unicode),
2174                                 NULL);
2175 }
2176
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
2178
2179 PyObject *
2180 PyUnicode_DecodeUTF32(const char *s,
2181                       Py_ssize_t size,
2182                       const char *errors,
2183                       int *byteorder)
2184 {
2185     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2186 }
2187
2188 PyObject *
2189 PyUnicode_DecodeUTF32Stateful(const char *s,
2190                               Py_ssize_t size,
2191                               const char *errors,
2192                               int *byteorder,
2193                               Py_ssize_t *consumed)
2194 {
2195     const char *starts = s;
2196     Py_ssize_t startinpos;
2197     Py_ssize_t endinpos;
2198     Py_ssize_t outpos;
2199     PyUnicodeObject *unicode;
2200     Py_UNICODE *p;
2201 #ifndef Py_UNICODE_WIDE
2202     int i, pairs;
2203 #else
2204     const int pairs = 0;
2205 #endif
2206     const unsigned char *q, *e;
2207     int bo = 0;       /* assume native ordering by default */
2208     const char *errmsg = "";
2209     /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211     int iorder[] = {0, 1, 2, 3};
2212 #else
2213     int iorder[] = {3, 2, 1, 0};
2214 #endif
2215     PyObject *errorHandler = NULL;
2216     PyObject *exc = NULL;
2217     /* On narrow builds we split characters outside the BMP into two
2218        codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220     for (i = pairs = 0; i < size/4; i++)
2221         if (((Py_UCS4 *)s)[i] >= 0x10000)
2222             pairs++;
2223 #endif
2224
2225     /* This might be one to much, because of a BOM */
2226     unicode = _PyUnicode_New((size+3)/4+pairs);
2227     if (!unicode)
2228         return NULL;
2229     if (size == 0)
2230         return (PyObject *)unicode;
2231
2232     /* Unpack UTF-32 encoded data */
2233     p = unicode->str;
2234     q = (unsigned char *)s;
2235     e = q + size;
2236
2237     if (byteorder)
2238         bo = *byteorder;
2239
2240     /* Check for BOM marks (U+FEFF) in the input and adjust current
2241        byte order setting accordingly. In native mode, the leading BOM
2242        mark is skipped, in all other modes, it is copied to the output
2243        stream as-is (giving a ZWNBSP character). */
2244     if (bo == 0) {
2245         if (size >= 4) {
2246             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2247                 (q[iorder[1]] << 8) | q[iorder[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249             if (bom == 0x0000FEFF) {
2250                 q += 4;
2251                 bo = -1;
2252             }
2253             else if (bom == 0xFFFE0000) {
2254                 q += 4;
2255                 bo = 1;
2256             }
2257 #else
2258             if (bom == 0x0000FEFF) {
2259                 q += 4;
2260                 bo = 1;
2261             }
2262             else if (bom == 0xFFFE0000) {
2263                 q += 4;
2264                 bo = -1;
2265             }
2266 #endif
2267         }
2268     }
2269
2270     if (bo == -1) {
2271         /* force LE */
2272         iorder[0] = 0;
2273         iorder[1] = 1;
2274         iorder[2] = 2;
2275         iorder[3] = 3;
2276     }
2277     else if (bo == 1) {
2278         /* force BE */
2279         iorder[0] = 3;
2280         iorder[1] = 2;
2281         iorder[2] = 1;
2282         iorder[3] = 0;
2283     }
2284
2285     while (q < e) {
2286         Py_UCS4 ch;
2287         /* remaining bytes at the end? (size should be divisible by 4) */
2288         if (e-q<4) {
2289             if (consumed)
2290                 break;
2291             errmsg = "truncated data";
2292             startinpos = ((const char *)q)-starts;
2293             endinpos = ((const char *)e)-starts;
2294             goto utf32Error;
2295             /* The remaining input chars are ignored if the callback
2296                chooses to skip the input */
2297         }
2298         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2299             (q[iorder[1]] << 8) | q[iorder[0]];
2300
2301         if (ch >= 0x110000)
2302         {
2303             errmsg = "codepoint not in range(0x110000)";
2304             startinpos = ((const char *)q)-starts;
2305             endinpos = startinpos+4;
2306             goto utf32Error;
2307         }
2308 #ifndef Py_UNICODE_WIDE
2309         if (ch >= 0x10000)
2310         {
2311             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2312             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2313         }
2314         else
2315 #endif
2316             *p++ = ch;
2317         q += 4;
2318         continue;
2319       utf32Error:
2320         outpos = p-PyUnicode_AS_UNICODE(unicode);
2321         if (unicode_decode_call_errorhandler(
2322                 errors, &errorHandler,
2323                 "utf32", errmsg,
2324                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2325                 &unicode, &outpos, &p))
2326             goto onError;
2327     }
2328
2329     if (byteorder)
2330         *byteorder = bo;
2331
2332     if (consumed)
2333         *consumed = (const char *)q-starts;
2334
2335     /* Adjust length */
2336     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2337         goto onError;
2338
2339     Py_XDECREF(errorHandler);
2340     Py_XDECREF(exc);
2341     return (PyObject *)unicode;
2342
2343   onError:
2344     Py_DECREF(unicode);
2345     Py_XDECREF(errorHandler);
2346     Py_XDECREF(exc);
2347     return NULL;
2348 }
2349
2350 PyObject *
2351 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2352                       Py_ssize_t size,
2353                       const char *errors,
2354                       int byteorder)
2355 {
2356     PyObject *v;
2357     unsigned char *p;
2358     Py_ssize_t nsize, bytesize;
2359 #ifndef Py_UNICODE_WIDE
2360     Py_ssize_t i, pairs;
2361 #else
2362     const int pairs = 0;
2363 #endif
2364     /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366     int iorder[] = {0, 1, 2, 3};
2367 #else
2368     int iorder[] = {3, 2, 1, 0};
2369 #endif
2370
2371 #define STORECHAR(CH)                           \
2372     do {                                        \
2373         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2374         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2375         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2376         p[iorder[0]] = (CH) & 0xff;             \
2377         p += 4;                                 \
2378     } while(0)
2379
2380     /* In narrow builds we can output surrogate pairs as one codepoint,
2381        so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383     for (i = pairs = 0; i < size-1; i++)
2384         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386             pairs++;
2387 #endif
2388     nsize = (size - pairs + (byteorder == 0));
2389     bytesize = nsize * 4;
2390     if (bytesize / 4 != nsize)
2391         return PyErr_NoMemory();
2392     v = PyString_FromStringAndSize(NULL, bytesize);
2393     if (v == NULL)
2394         return NULL;
2395
2396     p = (unsigned char *)PyString_AS_STRING(v);
2397     if (byteorder == 0)
2398         STORECHAR(0xFEFF);
2399     if (size == 0)
2400         return v;
2401
2402     if (byteorder == -1) {
2403         /* force LE */
2404         iorder[0] = 0;
2405         iorder[1] = 1;
2406         iorder[2] = 2;
2407         iorder[3] = 3;
2408     }
2409     else if (byteorder == 1) {
2410         /* force BE */
2411         iorder[0] = 3;
2412         iorder[1] = 2;
2413         iorder[2] = 1;
2414         iorder[3] = 0;
2415     }
2416
2417     while (size-- > 0) {
2418         Py_UCS4 ch = *s++;
2419 #ifndef Py_UNICODE_WIDE
2420         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2421             Py_UCS4 ch2 = *s;
2422             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2423                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2424                 s++;
2425                 size--;
2426             }
2427         }
2428 #endif
2429         STORECHAR(ch);
2430     }
2431     return v;
2432 #undef STORECHAR
2433 }
2434
2435 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2436 {
2437     if (!PyUnicode_Check(unicode)) {
2438         PyErr_BadArgument();
2439         return NULL;
2440     }
2441     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2442                                  PyUnicode_GET_SIZE(unicode),
2443                                  NULL,
2444                                  0);
2445 }
2446
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
2448
2449 PyObject *
2450 PyUnicode_DecodeUTF16(const char *s,
2451                       Py_ssize_t size,
2452                       const char *errors,
2453                       int *byteorder)
2454 {
2455     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2456 }
2457
2458 PyObject *
2459 PyUnicode_DecodeUTF16Stateful(const char *s,
2460                               Py_ssize_t size,
2461                               const char *errors,
2462                               int *byteorder,
2463                               Py_ssize_t *consumed)
2464 {
2465     const char *starts = s;
2466     Py_ssize_t startinpos;
2467     Py_ssize_t endinpos;
2468     Py_ssize_t outpos;
2469     PyUnicodeObject *unicode;
2470     Py_UNICODE *p;
2471     const unsigned char *q, *e;
2472     int bo = 0;       /* assume native ordering by default */
2473     const char *errmsg = "";
2474     /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476     int ihi = 1, ilo = 0;
2477 #else
2478     int ihi = 0, ilo = 1;
2479 #endif
2480     PyObject *errorHandler = NULL;
2481     PyObject *exc = NULL;
2482
2483     /* Note: size will always be longer than the resulting Unicode
2484        character count */
2485     unicode = _PyUnicode_New(size);
2486     if (!unicode)
2487         return NULL;
2488     if (size == 0)
2489         return (PyObject *)unicode;
2490
2491     /* Unpack UTF-16 encoded data */
2492     p = unicode->str;
2493     q = (unsigned char *)s;
2494     e = q + size;
2495
2496     if (byteorder)
2497         bo = *byteorder;
2498
2499     /* Check for BOM marks (U+FEFF) in the input and adjust current
2500        byte order setting accordingly. In native mode, the leading BOM
2501        mark is skipped, in all other modes, it is copied to the output
2502        stream as-is (giving a ZWNBSP character). */
2503     if (bo == 0) {
2504         if (size >= 2) {
2505             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507             if (bom == 0xFEFF) {
2508                 q += 2;
2509                 bo = -1;
2510             }
2511             else if (bom == 0xFFFE) {
2512                 q += 2;
2513                 bo = 1;
2514             }
2515 #else
2516             if (bom == 0xFEFF) {
2517                 q += 2;
2518                 bo = 1;
2519             }
2520             else if (bom == 0xFFFE) {
2521                 q += 2;
2522                 bo = -1;
2523             }
2524 #endif
2525         }
2526     }
2527
2528     if (bo == -1) {
2529         /* force LE */
2530         ihi = 1;
2531         ilo = 0;
2532     }
2533     else if (bo == 1) {
2534         /* force BE */
2535         ihi = 0;
2536         ilo = 1;
2537     }
2538
2539     while (q < e) {
2540         Py_UNICODE ch;
2541         /* remaining bytes at the end? (size should be even) */
2542         if (e-q<2) {
2543             if (consumed)
2544                 break;
2545             errmsg = "truncated data";
2546             startinpos = ((const char *)q)-starts;
2547             endinpos = ((const char *)e)-starts;
2548             goto utf16Error;
2549             /* The remaining input chars are ignored if the callback
2550                chooses to skip the input */
2551         }
2552         ch = (q[ihi] << 8) | q[ilo];
2553
2554         q += 2;
2555
2556         if (ch < 0xD800 || ch > 0xDFFF) {
2557             *p++ = ch;
2558             continue;
2559         }
2560
2561         /* UTF-16 code pair: */
2562         if (q >= e) {
2563             errmsg = "unexpected end of data";
2564             startinpos = (((const char *)q)-2)-starts;
2565             endinpos = ((const char *)e)-starts;
2566             goto utf16Error;
2567         }
2568         if (0xD800 <= ch && ch <= 0xDBFF) {
2569             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2570             q += 2;
2571             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2573                 *p++ = ch;
2574                 *p++ = ch2;
2575 #else
2576                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2577 #endif
2578                 continue;
2579             }
2580             else {
2581                 errmsg = "illegal UTF-16 surrogate";
2582                 startinpos = (((const char *)q)-4)-starts;
2583                 endinpos = startinpos+2;
2584                 goto utf16Error;
2585             }
2586
2587         }
2588         errmsg = "illegal encoding";
2589         startinpos = (((const char *)q)-2)-starts;
2590         endinpos = startinpos+2;
2591         /* Fall through to report the error */
2592
2593       utf16Error:
2594         outpos = p-PyUnicode_AS_UNICODE(unicode);
2595         if (unicode_decode_call_errorhandler(
2596                 errors, &errorHandler,
2597                 "utf16", errmsg,
2598                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2599                 &unicode, &outpos, &p))
2600             goto onError;
2601     }
2602
2603     if (byteorder)
2604         *byteorder = bo;
2605
2606     if (consumed)
2607         *consumed = (const char *)q-starts;
2608
2609     /* Adjust length */
2610     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2611         goto onError;
2612
2613     Py_XDECREF(errorHandler);
2614     Py_XDECREF(exc);
2615     return (PyObject *)unicode;
2616
2617   onError:
2618     Py_DECREF(unicode);
2619     Py_XDECREF(errorHandler);
2620     Py_XDECREF(exc);
2621     return NULL;
2622 }
2623
2624 PyObject *
2625 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2626                       Py_ssize_t size,
2627                       const char *errors,
2628                       int byteorder)
2629 {
2630     PyObject *v;
2631     unsigned char *p;
2632     Py_ssize_t nsize, bytesize;
2633 #ifdef Py_UNICODE_WIDE
2634     Py_ssize_t i, pairs;
2635 #else
2636     const int pairs = 0;
2637 #endif
2638     /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640     int ihi = 1, ilo = 0;
2641 #else
2642     int ihi = 0, ilo = 1;
2643 #endif
2644
2645 #define STORECHAR(CH)                           \
2646     do {                                        \
2647         p[ihi] = ((CH) >> 8) & 0xff;            \
2648         p[ilo] = (CH) & 0xff;                   \
2649         p += 2;                                 \
2650     } while(0)
2651
2652 #ifdef Py_UNICODE_WIDE
2653     for (i = pairs = 0; i < size; i++)
2654         if (s[i] >= 0x10000)
2655             pairs++;
2656 #endif
2657     /* 2 * (size + pairs + (byteorder == 0)) */
2658     if (size > PY_SSIZE_T_MAX ||
2659         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2660         return PyErr_NoMemory();
2661     nsize = size + pairs + (byteorder == 0);
2662     bytesize = nsize * 2;
2663     if (bytesize / 2 != nsize)
2664         return PyErr_NoMemory();
2665     v = PyString_FromStringAndSize(NULL, bytesize);
2666     if (v == NULL)
2667         return NULL;
2668
2669     p = (unsigned char *)PyString_AS_STRING(v);
2670     if (byteorder == 0)
2671         STORECHAR(0xFEFF);
2672     if (size == 0)
2673         return v;
2674
2675     if (byteorder == -1) {
2676         /* force LE */
2677         ihi = 1;
2678         ilo = 0;
2679     }
2680     else if (byteorder == 1) {
2681         /* force BE */
2682         ihi = 0;
2683         ilo = 1;
2684     }
2685
2686     while (size-- > 0) {
2687         Py_UNICODE ch = *s++;
2688         Py_UNICODE ch2 = 0;
2689 #ifdef Py_UNICODE_WIDE
2690         if (ch >= 0x10000) {
2691             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692             ch  = 0xD800 | ((ch-0x10000) >> 10);
2693         }
2694 #endif
2695         STORECHAR(ch);
2696         if (ch2)
2697             STORECHAR(ch2);
2698     }
2699     return v;
2700 #undef STORECHAR
2701 }
2702
2703 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2704 {
2705     if (!PyUnicode_Check(unicode)) {
2706         PyErr_BadArgument();
2707         return NULL;
2708     }
2709     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2710                                  PyUnicode_GET_SIZE(unicode),
2711                                  NULL,
2712                                  0);
2713 }
2714
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
2716
2717 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2718
2719 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2720                                         Py_ssize_t size,
2721                                         const char *errors)
2722 {
2723     const char *starts = s;
2724     Py_ssize_t startinpos;
2725     Py_ssize_t endinpos;
2726     Py_ssize_t outpos;
2727     int i;
2728     PyUnicodeObject *v;
2729     Py_UNICODE *p;
2730     const char *end;
2731     char* message;
2732     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2733     PyObject *errorHandler = NULL;
2734     PyObject *exc = NULL;
2735
2736     /* Escaped strings will always be longer than the resulting
2737        Unicode string, so we start with size here and then reduce the
2738        length after conversion to the true value.
2739        (but if the error callback returns a long replacement string
2740        we'll have to allocate more space) */
2741     v = _PyUnicode_New(size);
2742     if (v == NULL)
2743         goto onError;
2744     if (size == 0)
2745         return (PyObject *)v;
2746
2747     p = PyUnicode_AS_UNICODE(v);
2748     end = s + size;
2749
2750     while (s < end) {
2751         unsigned char c;
2752         Py_UNICODE x;
2753         int digits;
2754
2755         /* Non-escape characters are interpreted as Unicode ordinals */
2756         if (*s != '\\') {
2757             *p++ = (unsigned char) *s++;
2758             continue;
2759         }
2760
2761         startinpos = s-starts;
2762         /* \ - Escapes */
2763         s++;
2764         c = *s++;
2765         if (s > end)
2766             c = '\0'; /* Invalid after \ */
2767         switch (c) {
2768
2769             /* \x escapes */
2770         case '\n': break;
2771         case '\\': *p++ = '\\'; break;
2772         case '\'': *p++ = '\''; break;
2773         case '\"': *p++ = '\"'; break;
2774         case 'b': *p++ = '\b'; break;
2775         case 'f': *p++ = '\014'; break; /* FF */
2776         case 't': *p++ = '\t'; break;
2777         case 'n': *p++ = '\n'; break;
2778         case 'r': *p++ = '\r'; break;
2779         case 'v': *p++ = '\013'; break; /* VT */
2780         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2781
2782             /* \OOO (octal) escapes */
2783         case '0': case '1': case '2': case '3':
2784         case '4': case '5': case '6': case '7':
2785             x = s[-1] - '0';
2786             if (s < end && '0' <= *s && *s <= '7') {
2787                 x = (x<<3) + *s++ - '0';
2788                 if (s < end && '0' <= *s && *s <= '7')
2789                     x = (x<<3) + *s++ - '0';
2790             }
2791             *p++ = x;
2792             break;
2793
2794             /* hex escapes */
2795             /* \xXX */
2796         case 'x':
2797             digits = 2;
2798             message = "truncated \\xXX escape";
2799             goto hexescape;
2800
2801             /* \uXXXX */
2802         case 'u':
2803             digits = 4;
2804             message = "truncated \\uXXXX escape";
2805             goto hexescape;
2806
2807             /* \UXXXXXXXX */
2808         case 'U':
2809             digits = 8;
2810             message = "truncated \\UXXXXXXXX escape";
2811         hexescape:
2812             chr = 0;
2813             outpos = p-PyUnicode_AS_UNICODE(v);
2814             if (s+digits>end) {
2815                 endinpos = size;
2816                 if (unicode_decode_call_errorhandler(
2817                         errors, &errorHandler,
2818                         "unicodeescape", "end of string in escape sequence",
2819                         starts, size, &startinpos, &endinpos, &exc, &s,
2820                         &v, &outpos, &p))
2821                     goto onError;
2822                 goto nextByte;
2823             }
2824             for (i = 0; i < digits; ++i) {
2825                 c = (unsigned char) s[i];
2826                 if (!isxdigit(c)) {
2827                     endinpos = (s+i+1)-starts;
2828                     if (unicode_decode_call_errorhandler(
2829                             errors, &errorHandler,
2830                             "unicodeescape", message,
2831                             starts, size, &startinpos, &endinpos, &exc, &s,
2832                             &v, &outpos, &p))
2833                         goto onError;
2834                     goto nextByte;
2835                 }
2836                 chr = (chr<<4) & ~0xF;
2837                 if (c >= '0' && c <= '9')
2838                     chr += c - '0';
2839                 else if (c >= 'a' && c <= 'f')
2840                     chr += 10 + c - 'a';
2841                 else
2842                     chr += 10 + c - 'A';
2843             }
2844             s += i;
2845             if (chr == 0xffffffff && PyErr_Occurred())
2846                 /* _decoding_error will have already written into the
2847                    target buffer. */
2848                 break;
2849         store:
2850             /* when we get here, chr is a 32-bit unicode character */
2851             if (chr <= 0xffff)
2852                 /* UCS-2 character */
2853                 *p++ = (Py_UNICODE) chr;
2854             else if (chr <= 0x10ffff) {
2855                 /* UCS-4 character. Either store directly, or as
2856                    surrogate pair. */
2857 #ifdef Py_UNICODE_WIDE
2858                 *p++ = chr;
2859 #else
2860                 chr -= 0x10000L;
2861                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2862                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2863 #endif
2864             } else {
2865                 endinpos = s-starts;
2866                 outpos = p-PyUnicode_AS_UNICODE(v);
2867                 if (unicode_decode_call_errorhandler(
2868                         errors, &errorHandler,
2869                         "unicodeescape", "illegal Unicode character",
2870                         starts, size, &startinpos, &endinpos, &exc, &s,
2871                         &v, &outpos, &p))
2872                     goto onError;
2873             }
2874             break;
2875
2876             /* \N{name} */
2877         case 'N':
2878             message = "malformed \\N character escape";
2879             if (ucnhash_CAPI == NULL) {
2880                 /* load the unicode data module */
2881                 PyObject *m, *api;
2882                 m = PyImport_ImportModuleNoBlock("unicodedata");
2883                 if (m == NULL)
2884                     goto ucnhashError;
2885                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2886                 Py_DECREF(m);
2887                 if (api == NULL)
2888                     goto ucnhashError;
2889                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2890                 Py_DECREF(api);
2891                 if (ucnhash_CAPI == NULL)
2892                     goto ucnhashError;
2893             }
2894             if (*s == '{') {
2895                 const char *start = s+1;
2896                 /* look for the closing brace */
2897                 while (*s != '}' && s < end)
2898                     s++;
2899                 if (s > start && s < end && *s == '}') {
2900                     /* found a name.  look it up in the unicode database */
2901                     message = "unknown Unicode character name";
2902                     s++;
2903                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904                         goto store;
2905                 }
2906             }
2907             endinpos = s-starts;
2908             outpos = p-PyUnicode_AS_UNICODE(v);
2909             if (unicode_decode_call_errorhandler(
2910                     errors, &errorHandler,
2911                     "unicodeescape", message,
2912                     starts, size, &startinpos, &endinpos, &exc, &s,
2913                     &v, &outpos, &p))
2914                 goto onError;
2915             break;
2916
2917         default:
2918             if (s > end) {
2919                 message = "\\ at end of string";
2920                 s--;
2921                 endinpos = s-starts;
2922                 outpos = p-PyUnicode_AS_UNICODE(v);
2923                 if (unicode_decode_call_errorhandler(
2924                         errors, &errorHandler,
2925                         "unicodeescape", message,
2926                         starts, size, &startinpos, &endinpos, &exc, &s,
2927                         &v, &outpos, &p))
2928                     goto onError;
2929             }
2930             else {
2931                 *p++ = '\\';
2932                 *p++ = (unsigned char)s[-1];
2933             }
2934             break;
2935         }
2936       nextByte:
2937         ;
2938     }
2939     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940         goto onError;
2941     Py_XDECREF(errorHandler);
2942     Py_XDECREF(exc);
2943     return (PyObject *)v;
2944
2945   ucnhashError:
2946     PyErr_SetString(
2947         PyExc_UnicodeError,
2948         "\\N escapes not supported (can't load unicodedata module)"
2949         );
2950     Py_XDECREF(v);
2951     Py_XDECREF(errorHandler);
2952     Py_XDECREF(exc);
2953     return NULL;
2954
2955   onError:
2956     Py_XDECREF(v);
2957     Py_XDECREF(errorHandler);
2958     Py_XDECREF(exc);
2959     return NULL;
2960 }
2961
/* unicodeescape_string(), defined after the findchar() helper below,
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2968
2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970                                              Py_ssize_t size,
2971                                              Py_UNICODE ch)
2972 {
2973     /* like wcschr, but doesn't stop at NULL characters */
2974
2975     while (size-- > 0) {
2976         if (*s == ch)
2977             return s;
2978         s++;
2979     }
2980
2981     return NULL;
2982 }
2983
2984 static
2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986                                Py_ssize_t size,
2987                                int quotes)
2988 {
2989     PyObject *repr;
2990     char *p;
2991
2992     static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994     const Py_ssize_t expandsize = 10;
2995 #else
2996     const Py_ssize_t expandsize = 6;
2997 #endif
2998
2999     /* XXX(nnorwitz): rather than over-allocating, it would be
3000        better to choose a different scheme.  Perhaps scan the
3001        first N-chars of the string and allocate based on that size.
3002     */
3003     /* Initial allocation is based on the longest-possible unichr
3004        escape.
3005
3006        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007        unichr, so in this case it's the longest unichr escape. In
3008        narrow (UTF-16) builds this is five chars per source unichr
3009        since there are two unichrs in the surrogate pair, so in narrow
3010        (UTF-16) builds it's not the longest unichr escape.
3011
3012        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013        so in the narrow (UTF-16) build case it's the longest unichr
3014        escape.
3015     */
3016
3017     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018         return PyErr_NoMemory();
3019
3020     repr = PyString_FromStringAndSize(NULL,
3021                                       2
3022                                       + expandsize*size
3023                                       + 1);
3024     if (repr == NULL)
3025         return NULL;
3026
3027     p = PyString_AS_STRING(repr);
3028
3029     if (quotes) {
3030         *p++ = 'u';
3031         *p++ = (findchar(s, size, '\'') &&
3032                 !findchar(s, size, '"')) ? '"' : '\'';
3033     }
3034     while (size-- > 0) {
3035         Py_UNICODE ch = *s++;
3036
3037         /* Escape quotes and backslashes */
3038         if ((quotes &&
3039              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040             *p++ = '\\';
3041             *p++ = (char) ch;
3042             continue;
3043         }
3044
3045 #ifdef Py_UNICODE_WIDE
3046         /* Map 21-bit characters to '\U00xxxxxx' */
3047         else if (ch >= 0x10000) {
3048             *p++ = '\\';
3049             *p++ = 'U';
3050             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057             *p++ = hexdigit[ch & 0x0000000F];
3058             continue;
3059         }
3060 #else
3061         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062         else if (ch >= 0xD800 && ch < 0xDC00) {
3063             Py_UNICODE ch2;
3064             Py_UCS4 ucs;
3065
3066             ch2 = *s++;
3067             size--;
3068             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070                 *p++ = '\\';
3071                 *p++ = 'U';
3072                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079                 *p++ = hexdigit[ucs & 0x0000000F];
3080                 continue;
3081             }
3082             /* Fall through: isolated surrogates are copied as-is */
3083             s--;
3084             size++;
3085         }
3086 #endif
3087
3088         /* Map 16-bit characters to '\uxxxx' */
3089         if (ch >= 256) {
3090             *p++ = '\\';
3091             *p++ = 'u';
3092             *p++ = hexdigit[(ch >> 12) & 0x000F];
3093             *p++ = hexdigit[(ch >> 8) & 0x000F];
3094             *p++ = hexdigit[(ch >> 4) & 0x000F];
3095             *p++ = hexdigit[ch & 0x000F];
3096         }
3097
3098         /* Map special whitespace to '\t', \n', '\r' */
3099         else if (ch == '\t') {
3100             *p++ = '\\';
3101             *p++ = 't';
3102         }
3103         else if (ch == '\n') {
3104             *p++ = '\\';
3105             *p++ = 'n';
3106         }
3107         else if (ch == '\r') {
3108             *p++ = '\\';
3109             *p++ = 'r';
3110         }
3111
3112         /* Map non-printable US ASCII to '\xhh' */
3113         else if (ch < ' ' || ch >= 0x7F) {
3114             *p++ = '\\';
3115             *p++ = 'x';
3116             *p++ = hexdigit[(ch >> 4) & 0x000F];
3117             *p++ = hexdigit[ch & 0x000F];
3118         }
3119
3120         /* Copy everything else as-is */
3121         else
3122             *p++ = (char) ch;
3123     }
3124     if (quotes)
3125         *p++ = PyString_AS_STRING(repr)[1];
3126
3127     *p = '\0';
3128     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3129     return repr;
3130 }
3131
3132 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3133                                         Py_ssize_t size)
3134 {
3135     return unicodeescape_string(s, size, 0);
3136 }
3137
3138 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3139 {
3140     if (!PyUnicode_Check(unicode)) {
3141         PyErr_BadArgument();
3142         return NULL;
3143     }
3144     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3145                                          PyUnicode_GET_SIZE(unicode));
3146 }
3147
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3149
3150 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3151                                            Py_ssize_t size,
3152                                            const char *errors)
3153 {
3154     const char *starts = s;
3155     Py_ssize_t startinpos;
3156     Py_ssize_t endinpos;
3157     Py_ssize_t outpos;
3158     PyUnicodeObject *v;
3159     Py_UNICODE *p;
3160     const char *end;
3161     const char *bs;
3162     PyObject *errorHandler = NULL;
3163     PyObject *exc = NULL;
3164
3165     /* Escaped strings will always be longer than the resulting
3166        Unicode string, so we start with size here and then reduce the
3167        length after conversion to the true value. (But decoding error
3168        handler might have to resize the string) */
3169     v = _PyUnicode_New(size);
3170     if (v == NULL)
3171         goto onError;
3172     if (size == 0)
3173         return (PyObject *)v;
3174     p = PyUnicode_AS_UNICODE(v);
3175     end = s + size;
3176     while (s < end) {
3177         unsigned char c;
3178         Py_UCS4 x;
3179         int i;
3180         int count;
3181
3182         /* Non-escape characters are interpreted as Unicode ordinals */
3183         if (*s != '\\') {
3184             *p++ = (unsigned char)*s++;
3185             continue;
3186         }
3187         startinpos = s-starts;
3188
3189         /* \u-escapes are only interpreted iff the number of leading
3190            backslashes if odd */
3191         bs = s;
3192         for (;s < end;) {
3193             if (*s != '\\')
3194                 break;
3195             *p++ = (unsigned char)*s++;
3196         }
3197         if (((s - bs) & 1) == 0 ||
3198             s >= end ||
3199             (*s != 'u' && *s != 'U')) {
3200             continue;
3201         }
3202         p--;
3203         count = *s=='u' ? 4 : 8;
3204         s++;
3205
3206         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207         outpos = p-PyUnicode_AS_UNICODE(v);
3208         for (x = 0, i = 0; i < count; ++i, ++s) {
3209             c = (unsigned char)*s;
3210             if (!isxdigit(c)) {
3211                 endinpos = s-starts;
3212                 if (unicode_decode_call_errorhandler(
3213                         errors, &errorHandler,
3214                         "rawunicodeescape", "truncated \\uXXXX",
3215                         starts, size, &startinpos, &endinpos, &exc, &s,
3216                         &v, &outpos, &p))
3217                     goto onError;
3218                 goto nextByte;
3219             }
3220             x = (x<<4) & ~0xF;
3221             if (c >= '0' && c <= '9')
3222                 x += c - '0';
3223             else if (c >= 'a' && c <= 'f')
3224                 x += 10 + c - 'a';
3225             else
3226                 x += 10 + c - 'A';
3227         }
3228         if (x <= 0xffff)
3229             /* UCS-2 character */
3230             *p++ = (Py_UNICODE) x;
3231         else if (x <= 0x10ffff) {
3232             /* UCS-4 character. Either store directly, or as
3233                surrogate pair. */
3234 #ifdef Py_UNICODE_WIDE
3235             *p++ = (Py_UNICODE) x;
3236 #else
3237             x -= 0x10000L;
3238             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3240 #endif
3241         } else {
3242             endinpos = s-starts;
3243             outpos = p-PyUnicode_AS_UNICODE(v);
3244             if (unicode_decode_call_errorhandler(
3245                     errors, &errorHandler,
3246                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247                     starts, size, &startinpos, &endinpos, &exc, &s,
3248                     &v, &outpos, &p))
3249                 goto onError;
3250         }
3251       nextByte:
3252         ;
3253     }
3254     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3255         goto onError;
3256     Py_XDECREF(errorHandler);
3257     Py_XDECREF(exc);
3258     return (PyObject *)v;
3259
3260   onError:
3261     Py_XDECREF(v);
3262     Py_XDECREF(errorHandler);
3263     Py_XDECREF(exc);
3264     return NULL;
3265 }
3266
3267 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3268                                            Py_ssize_t size)
3269 {
3270     PyObject *repr;
3271     char *p;
3272     char *q;
3273
3274     static const char *hexdigit = "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276     const Py_ssize_t expandsize = 10;
3277 #else
3278     const Py_ssize_t expandsize = 6;
3279 #endif
3280
3281     if (size > PY_SSIZE_T_MAX / expandsize)
3282         return PyErr_NoMemory();
3283
3284     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3285     if (repr == NULL)
3286         return NULL;
3287     if (size == 0)
3288         return repr;
3289
3290     p = q = PyString_AS_STRING(repr);
3291     while (size-- > 0) {
3292         Py_UNICODE ch = *s++;
3293 #ifdef Py_UNICODE_WIDE
3294         /* Map 32-bit characters to '\Uxxxxxxxx' */
3295         if (ch >= 0x10000) {
3296             *p++ = '\\';
3297             *p++ = 'U';
3298             *p++ = hexdigit[(ch >> 28) & 0xf];
3299             *p++ = hexdigit[(ch >> 24) & 0xf];
3300             *p++ = hexdigit[(ch >> 20) & 0xf];
3301             *p++ = hexdigit[(ch >> 16) & 0xf];
3302             *p++ = hexdigit[(ch >> 12) & 0xf];
3303             *p++ = hexdigit[(ch >> 8) & 0xf];
3304             *p++ = hexdigit[(ch >> 4) & 0xf];
3305             *p++ = hexdigit[ch & 15];
3306         }
3307         else
3308 #else
3309             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310             if (ch >= 0xD800 && ch < 0xDC00) {
3311                 Py_UNICODE ch2;
3312                 Py_UCS4 ucs;
3313
3314                 ch2 = *s++;
3315                 size--;
3316                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318                     *p++ = '\\';
3319                     *p++ = 'U';
3320                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3321                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3322                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3323                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3324                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3325                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3326                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3327                     *p++ = hexdigit[ucs & 0xf];
3328                     continue;
3329                 }
3330                 /* Fall through: isolated surrogates are copied as-is */
3331                 s--;
3332                 size++;
3333             }
3334 #endif
3335         /* Map 16-bit characters to '\uxxxx' */
3336         if (ch >= 256) {
3337             *p++ = '\\';
3338             *p++ = 'u';
3339             *p++ = hexdigit[(ch >> 12) & 0xf];
3340             *p++ = hexdigit[(ch >> 8) & 0xf];
3341             *p++ = hexdigit[(ch >> 4) & 0xf];
3342             *p++ = hexdigit[ch & 15];
3343         }
3344         /* Copy everything else as-is */
3345         else
3346             *p++ = (char) ch;
3347     }
3348     *p = '\0';
3349     _PyString_Resize(&repr, p - q);
3350     return repr;
3351 }
3352
3353 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3354 {
3355     if (!PyUnicode_Check(unicode)) {
3356         PyErr_BadArgument();
3357         return NULL;
3358     }
3359     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3360                                             PyUnicode_GET_SIZE(unicode));
3361 }
3362
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3364
3365 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3366                                            Py_ssize_t size,
3367                                            const char *errors)
3368 {
3369     const char *starts = s;
3370     Py_ssize_t startinpos;
3371     Py_ssize_t endinpos;
3372     Py_ssize_t outpos;
3373     PyUnicodeObject *v;
3374     Py_UNICODE *p;
3375     const char *end;
3376     const char *reason;
3377     PyObject *errorHandler = NULL;
3378     PyObject *exc = NULL;
3379
3380 #ifdef Py_UNICODE_WIDE
3381     Py_UNICODE unimax = PyUnicode_GetMax();
3382 #endif
3383
3384     /* XXX overflow detection missing */
3385     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3386     if (v == NULL)
3387         goto onError;
3388     if (PyUnicode_GetSize((PyObject *)v) == 0)
3389         return (PyObject *)v;
3390     p = PyUnicode_AS_UNICODE(v);
3391     end = s + size;
3392
3393     while (s < end) {
3394         memcpy(p, s, sizeof(Py_UNICODE));
3395         /* We have to sanity check the raw data, otherwise doom looms for
3396            some malformed UCS-4 data. */
3397         if (
3398 #ifdef Py_UNICODE_WIDE
3399             *p > unimax || *p < 0 ||
3400 #endif
3401             end-s < Py_UNICODE_SIZE
3402             )
3403         {
3404             startinpos = s - starts;
3405             if (end-s < Py_UNICODE_SIZE) {
3406                 endinpos = end-starts;
3407                 reason = "truncated input";
3408             }
3409             else {
3410                 endinpos = s - starts + Py_UNICODE_SIZE;
3411                 reason = "illegal code point (> 0x10FFFF)";
3412             }
3413             outpos = p - PyUnicode_AS_UNICODE(v);
3414             if (unicode_decode_call_errorhandler(
3415                     errors, &errorHandler,
3416                     "unicode_internal", reason,
3417                     starts, size, &startinpos, &endinpos, &exc, &s,
3418                     &v, &outpos, &p)) {
3419                 goto onError;
3420             }
3421         }
3422         else {
3423             p++;
3424             s += Py_UNICODE_SIZE;
3425         }
3426     }
3427
3428     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3429         goto onError;
3430     Py_XDECREF(errorHandler);
3431     Py_XDECREF(exc);
3432     return (PyObject *)v;
3433
3434   onError:
3435     Py_XDECREF(v);
3436     Py_XDECREF(errorHandler);
3437     Py_XDECREF(exc);
3438     return NULL;
3439 }
3440
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3442
3443 PyObject *PyUnicode_DecodeLatin1(const char *s,
3444                                  Py_ssize_t size,
3445                                  const char *errors)
3446 {
3447     PyUnicodeObject *v;
3448     Py_UNICODE *p;
3449
3450     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3451     if (size == 1) {
3452         Py_UNICODE r = *(unsigned char*)s;
3453         return PyUnicode_FromUnicode(&r, 1);
3454     }
3455
3456     v = _PyUnicode_New(size);
3457     if (v == NULL)
3458         goto onError;
3459     if (size == 0)
3460         return (PyObject *)v;
3461     p = PyUnicode_AS_UNICODE(v);
3462     while (size-- > 0)
3463         *p++ = (unsigned char)*s++;
3464     return (PyObject *)v;
3465
3466   onError:
3467     Py_XDECREF(v);
3468     return NULL;
3469 }
3470
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject **exceptionObject,
3473                                   const char *encoding,
3474                                   const Py_UNICODE *unicode, Py_ssize_t size,
3475                                   Py_ssize_t startpos, Py_ssize_t endpos,
3476                                   const char *reason)
3477 {
3478     if (*exceptionObject == NULL) {
3479         *exceptionObject = PyUnicodeEncodeError_Create(
3480             encoding, unicode, size, startpos, endpos, reason);
3481     }
3482     else {
3483         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3484             goto onError;
3485         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3486             goto onError;
3487         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3488             goto onError;
3489         return;
3490       onError:
3491         Py_DECREF(*exceptionObject);
3492         *exceptionObject = NULL;
3493     }
3494 }
3495
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject **exceptionObject,
3498                                    const char *encoding,
3499                                    const Py_UNICODE *unicode, Py_ssize_t size,
3500                                    Py_ssize_t startpos, Py_ssize_t endpos,
3501                                    const char *reason)
3502 {
3503     make_encode_exception(exceptionObject,
3504                           encoding, unicode, size, startpos, endpos, reason);
3505     if (*exceptionObject != NULL)
3506         PyCodec_StrictErrors(*exceptionObject);
3507 }
3508
3509 /* error handling callback helper:
3510    build arguments, call the callback and check the arguments,
3511    put the result into newpos and return the replacement string, which
3512    has to be freed by the caller */
3513 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3514                                                   PyObject **errorHandler,
3515                                                   const char *encoding, const char *reason,
3516                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3517                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3518                                                   Py_ssize_t *newpos)
3519 {
3520     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3521
3522     PyObject *restuple;
3523     PyObject *resunicode;
3524
3525     if (*errorHandler == NULL) {
3526         *errorHandler = PyCodec_LookupError(errors);
3527         if (*errorHandler == NULL)
3528             return NULL;
3529     }
3530
3531     make_encode_exception(exceptionObject,
3532                           encoding, unicode, size, startpos, endpos, reason);
3533     if (*exceptionObject == NULL)
3534         return NULL;
3535
3536     restuple = PyObject_CallFunctionObjArgs(
3537         *errorHandler, *exceptionObject, NULL);
3538     if (restuple == NULL)
3539         return NULL;
3540     if (!PyTuple_Check(restuple)) {
3541         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3542         Py_DECREF(restuple);
3543         return NULL;
3544     }
3545     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3546                           &resunicode, newpos)) {
3547         Py_DECREF(restuple);
3548         return NULL;
3549     }
3550     if (*newpos<0)
3551         *newpos = size+*newpos;
3552     if (*newpos<0 || *newpos>size) {
3553         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3554         Py_DECREF(restuple);
3555         return NULL;
3556     }
3557     Py_INCREF(resunicode);
3558     Py_DECREF(restuple);
3559     return resunicode;
3560 }
3561
3562 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3563                                      Py_ssize_t size,
3564                                      const char *errors,
3565                                      int limit)
3566 {
3567     /* output object */
3568     PyObject *res;
3569     /* pointers to the beginning and end+1 of input */
3570     const Py_UNICODE *startp = p;
3571     const Py_UNICODE *endp = p + size;
3572     /* pointer to the beginning of the unencodable characters */
3573     /* const Py_UNICODE *badp = NULL; */
3574     /* pointer into the output */
3575     char *str;
3576     /* current output position */
3577     Py_ssize_t respos = 0;
3578     Py_ssize_t ressize;
3579     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3580     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581     PyObject *errorHandler = NULL;
3582     PyObject *exc = NULL;
3583     /* the following variable is used for caching string comparisons
3584      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585     int known_errorHandler = -1;
3586
3587     /* allocate enough for a simple encoding without
3588        replacements, if we need more, we'll resize */
3589     res = PyString_FromStringAndSize(NULL, size);
3590     if (res == NULL)
3591         goto onError;
3592     if (size == 0)
3593         return res;
3594     str = PyString_AS_STRING(res);
3595     ressize = size;
3596
3597     while (p<endp) {
3598         Py_UNICODE c = *p;
3599
3600         /* can we encode this? */
3601         if (c<limit) {
3602             /* no overflow check, because we know that the space is enough */
3603             *str++ = (char)c;
3604             ++p;
3605         }
3606         else {
3607             Py_ssize_t unicodepos = p-startp;
3608             Py_ssize_t requiredsize;
3609             PyObject *repunicode;
3610             Py_ssize_t repsize;
3611             Py_ssize_t newpos;
3612             Py_ssize_t respos;
3613             Py_UNICODE *uni2;
3614             /* startpos for collecting unencodable chars */
3615             const Py_UNICODE *collstart = p;
3616             const Py_UNICODE *collend = p;
3617             /* find all unecodable characters */
3618             while ((collend < endp) && ((*collend)>=limit))
3619                 ++collend;
3620             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621             if (known_errorHandler==-1) {
3622                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3623                     known_errorHandler = 1;
3624                 else if (!strcmp(errors, "replace"))
3625                     known_errorHandler = 2;
3626                 else if (!strcmp(errors, "ignore"))
3627                     known_errorHandler = 3;
3628                 else if (!strcmp(errors, "xmlcharrefreplace"))
3629                     known_errorHandler = 4;
3630                 else
3631                     known_errorHandler = 0;
3632             }
3633             switch (known_errorHandler) {
3634             case 1: /* strict */
3635                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3636                 goto onError;
3637             case 2: /* replace */
3638                 while (collstart++<collend)
3639                     *str++ = '?'; /* fall through */
3640             case 3: /* ignore */
3641                 p = collend;
3642                 break;
3643             case 4: /* xmlcharrefreplace */
3644                 respos = str-PyString_AS_STRING(res);
3645                 /* determine replacement size (temporarily (mis)uses p) */
3646                 for (p = collstart, repsize = 0; p < collend; ++p) {
3647                     if (*p<10)
3648                         repsize += 2+1+1;
3649                     else if (*p<100)
3650                         repsize += 2+2+1;
3651                     else if (*p<1000)
3652                         repsize += 2+3+1;
3653                     else if (*p<10000)
3654                         repsize += 2+4+1;
3655 #ifndef Py_UNICODE_WIDE
3656                     else
3657                         repsize += 2+5+1;
3658 #else
3659                     else if (*p<100000)
3660                         repsize += 2+5+1;
3661                     else if (*p<1000000)
3662                         repsize += 2+6+1;
3663                     else
3664                         repsize += 2+7+1;
3665 #endif
3666                 }
3667                 requiredsize = respos+repsize+(endp-collend);
3668                 if (requiredsize > ressize) {
3669                     if (requiredsize<2*ressize)
3670                         requiredsize = 2*ressize;
3671                     if (_PyString_Resize(&res, requiredsize))
3672                         goto onError;
3673                     str = PyString_AS_STRING(res) + respos;
3674                     ressize = requiredsize;
3675                 }
3676                 /* generate replacement (temporarily (mis)uses p) */
3677                 for (p = collstart; p < collend; ++p) {
3678                     str += sprintf(str, "&#%d;", (int)*p);
3679                 }
3680                 p = collend;
3681                 break;
3682             default:
3683                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3684                                                               encoding, reason, startp, size, &exc,
3685                                                               collstart-startp, collend-startp, &newpos);
3686                 if (repunicode == NULL)
3687                     goto onError;
3688                 /* need more space? (at least enough for what we have+the
3689                    replacement+the rest of the string, so we won't have to
3690                    check space for encodable characters) */
3691                 respos = str-PyString_AS_STRING(res);
3692                 repsize = PyUnicode_GET_SIZE(repunicode);
3693                 requiredsize = respos+repsize+(endp-collend);
3694                 if (requiredsize > ressize) {
3695                     if (requiredsize<2*ressize)
3696                         requiredsize = 2*ressize;
3697                     if (_PyString_Resize(&res, requiredsize)) {
3698                         Py_DECREF(repunicode);
3699                         goto onError;
3700                     }
3701                     str = PyString_AS_STRING(res) + respos;
3702                     ressize = requiredsize;
3703                 }
3704                 /* check if there is anything unencodable in the replacement
3705                    and copy it to the output */
3706                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3707                     c = *uni2;
3708                     if (c >= limit) {
3709                         raise_encode_exception(&exc, encoding, startp, size,
3710                                                unicodepos, unicodepos+1, reason);
3711                         Py_DECREF(repunicode);
3712                         goto onError;
3713                     }
3714                     *str = (char)c;
3715                 }
3716                 p = startp + newpos;
3717                 Py_DECREF(repunicode);
3718             }
3719         }
3720     }
3721     /* Resize if we allocated to much */
3722     respos = str-PyString_AS_STRING(res);
3723     if (respos<ressize)
3724         /* If this falls res will be NULL */
3725         _PyString_Resize(&res, respos);
3726     Py_XDECREF(errorHandler);
3727     Py_XDECREF(exc);
3728     return res;
3729
3730   onError:
3731     Py_XDECREF(res);
3732     Py_XDECREF(errorHandler);
3733     Py_XDECREF(exc);
3734     return NULL;
3735 }
3736
3737 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3738                                  Py_ssize_t size,
3739                                  const char *errors)
3740 {
3741     return unicode_encode_ucs1(p, size, errors, 256);
3742 }
3743
3744 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3745 {
3746     if (!PyUnicode_Check(unicode)) {
3747         PyErr_BadArgument();
3748         return NULL;
3749     }
3750     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3751                                   PyUnicode_GET_SIZE(unicode),
3752                                   NULL);
3753 }
3754
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3756
3757 PyObject *PyUnicode_DecodeASCII(const char *s,
3758                                 Py_ssize_t size,
3759                                 const char *errors)
3760 {
3761     const char *starts = s;
3762     PyUnicodeObject *v;
3763     Py_UNICODE *p;
3764     Py_ssize_t startinpos;
3765     Py_ssize_t endinpos;
3766     Py_ssize_t outpos;
3767     const char *e;
3768     PyObject *errorHandler = NULL;
3769     PyObject *exc = NULL;
3770
3771     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772     if (size == 1 && *(unsigned char*)s < 128) {
3773         Py_UNICODE r = *(unsigned char*)s;
3774         return PyUnicode_FromUnicode(&r, 1);
3775     }
3776
3777     v = _PyUnicode_New(size);
3778     if (v == NULL)
3779         goto onError;
3780     if (size == 0)
3781         return (PyObject *)v;
3782     p = PyUnicode_AS_UNICODE(v);
3783     e = s + size;
3784     while (s < e) {
3785         register unsigned char c = (unsigned char)*s;
3786         if (c < 128) {
3787             *p++ = c;
3788             ++s;
3789         }
3790         else {
3791             startinpos = s-starts;
3792             endinpos = startinpos + 1;
3793             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3794             if (unicode_decode_call_errorhandler(
3795                     errors, &errorHandler,
3796                     "ascii", "ordinal not in range(128)",
3797                     starts, size, &startinpos, &endinpos, &exc, &s,
3798                     &v, &outpos, &p))
3799                 goto onError;
3800         }
3801     }
3802     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3803         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3804             goto onError;
3805     Py_XDECREF(errorHandler);
3806     Py_XDECREF(exc);
3807     return (PyObject *)v;
3808
3809   onError:
3810     Py_XDECREF(v);
3811     Py_XDECREF(errorHandler);
3812     Py_XDECREF(exc);
3813     return NULL;
3814 }
3815
3816 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3817                                 Py_ssize_t size,
3818                                 const char *errors)
3819 {
3820     return unicode_encode_ucs1(p, size, errors, 128);
3821 }
3822
3823 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3824 {
3825     if (!PyUnicode_Check(unicode)) {
3826         PyErr_BadArgument();
3827         return NULL;
3828     }
3829     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3830                                  PyUnicode_GET_SIZE(unicode),
3831                                  NULL);
3832 }
3833
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3835
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
3837
3838 #if SIZEOF_INT < SIZEOF_SIZE_T
3839 #define NEED_RETRY
3840 #endif
3841
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843    a) it assumes an incomplete character consists of a single byte, and
3844    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845    encodings, see IsDBCSLeadByteEx documentation. */
3846
/* Return 1 if the byte at s[offset] is treated as the lead byte of a
   double-byte character, 0 otherwise.  A byte that IsDBCSLeadByte()
   accepts only counts when the character before it (found with
   CharPrev) does not swallow it: there is no previous character, the
   previous character does not start with a lead byte, or the previous
   character is a complete two-byte sequence. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3857
3858 /*
3859  * Decode MBCS string into unicode object. If 'final' is set, converts
3860  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3861  */
3862 static int decode_mbcs(PyUnicodeObject **v,
3863                        const char *s, /* MBCS string */
3864                        int size, /* sizeof MBCS string */
3865                        int final)
3866 {
3867     Py_UNICODE *p;
3868     Py_ssize_t n = 0;
3869     int usize = 0;
3870
3871     assert(size >= 0);
3872
3873     /* Skip trailing lead-byte unless 'final' is set */
3874     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3875         --size;
3876
3877     /* First get the size of the result */
3878     if (size > 0) {
3879         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3880         if (usize == 0) {
3881             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882             return -1;
3883         }
3884     }
3885
3886     if (*v == NULL) {
3887         /* Create unicode object */
3888         *v = _PyUnicode_New(usize);
3889         if (*v == NULL)
3890             return -1;
3891     }
3892     else {
3893         /* Extend unicode object */
3894         n = PyUnicode_GET_SIZE(*v);
3895         if (_PyUnicode_Resize(v, n + usize) < 0)
3896             return -1;
3897     }
3898
3899     /* Do the conversion */
3900     if (size > 0) {
3901         p = PyUnicode_AS_UNICODE(*v) + n;
3902         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3903             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3904             return -1;
3905         }
3906     }
3907
3908     return size;
3909 }
3910
3911 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3912                                        Py_ssize_t size,
3913                                        const char *errors,
3914                                        Py_ssize_t *consumed)
3915 {
3916     PyUnicodeObject *v = NULL;
3917     int done;
3918
3919     if (consumed)
3920         *consumed = 0;
3921
3922 #ifdef NEED_RETRY
3923   retry:
3924     if (size > INT_MAX)
3925         done = decode_mbcs(&v, s, INT_MAX, 0);
3926     else
3927 #endif
3928         done = decode_mbcs(&v, s, (int)size, !consumed);
3929
3930     if (done < 0) {
3931         Py_XDECREF(v);
3932         return NULL;
3933     }
3934
3935     if (consumed)
3936         *consumed += done;
3937
3938 #ifdef NEED_RETRY
3939     if (size > INT_MAX) {
3940         s += done;
3941         size -= done;
3942         goto retry;
3943     }
3944 #endif
3945
3946     return (PyObject *)v;
3947 }
3948
3949 PyObject *PyUnicode_DecodeMBCS(const char *s,
3950                                Py_ssize_t size,
3951                                const char *errors)
3952 {
3953     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3954 }
3955
3956 /*
3957  * Convert unicode into string object (MBCS).
3958  * Returns 0 if succeed, -1 otherwise.
3959  */
3960 static int encode_mbcs(PyObject **repr,
3961                        const Py_UNICODE *p, /* unicode */
3962                        int size) /* size of unicode */
3963 {
3964     int mbcssize = 0;
3965     Py_ssize_t n = 0;
3966
3967     assert(size >= 0);
3968
3969     /* First get the size of the result */
3970     if (size > 0) {
3971         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3972         if (mbcssize == 0) {
3973             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3974             return -1;
3975         }
3976     }
3977
3978     if (*repr == NULL) {
3979         /* Create string object */
3980         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3981         if (*repr == NULL)
3982             return -1;
3983     }
3984     else {
3985         /* Extend string object */
3986         n = PyString_Size(*repr);
3987         if (_PyString_Resize(repr, n + mbcssize) < 0)
3988             return -1;
3989     }
3990
3991     /* Do the conversion */
3992     if (size > 0) {
3993         char *s = PyString_AS_STRING(*repr) + n;
3994         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3995             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3996             return -1;
3997         }
3998     }
3999
4000     return 0;
4001 }
4002
4003 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4004                                Py_ssize_t size,
4005                                const char *errors)
4006 {
4007     PyObject *repr = NULL;
4008     int ret;
4009
4010 #ifdef NEED_RETRY
4011   retry:
4012     if (size > INT_MAX)
4013         ret = encode_mbcs(&repr, p, INT_MAX);
4014     else
4015 #endif
4016         ret = encode_mbcs(&repr, p, (int)size);
4017
4018     if (ret < 0) {
4019         Py_XDECREF(repr);
4020         return NULL;
4021     }
4022
4023 #ifdef NEED_RETRY
4024     if (size > INT_MAX) {
4025         p += INT_MAX;
4026         size -= INT_MAX;
4027         goto retry;
4028     }
4029 #endif
4030
4031     return repr;
4032 }
4033
4034 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4035 {
4036     if (!PyUnicode_Check(unicode)) {
4037         PyErr_BadArgument();
4038         return NULL;
4039     }
4040     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4041                                 PyUnicode_GET_SIZE(unicode),
4042                                 NULL);
4043 }
4044
4045 #undef NEED_RETRY
4046
4047 #endif /* MS_WINDOWS */
4048
4049 /* --- Character Mapping Codec -------------------------------------------- */
4050
4051 PyObject *PyUnicode_DecodeCharmap(const char *s,
4052                                   Py_ssize_t size,
4053                                   PyObject *mapping,
4054                                   const char *errors)
4055 {
4056     const char *starts = s;
4057     Py_ssize_t startinpos;
4058     Py_ssize_t endinpos;
4059     Py_ssize_t outpos;
4060     const char *e;
4061     PyUnicodeObject *v;
4062     Py_UNICODE *p;
4063     Py_ssize_t extrachars = 0;
4064     PyObject *errorHandler = NULL;
4065     PyObject *exc = NULL;
4066     Py_UNICODE *mapstring = NULL;
4067     Py_ssize_t maplen = 0;
4068
4069     /* Default to Latin-1 */
4070     if (mapping == NULL)
4071         return PyUnicode_DecodeLatin1(s, size, errors);
4072
4073     v = _PyUnicode_New(size);
4074     if (v == NULL)
4075         goto onError;
4076     if (size == 0)
4077         return (PyObject *)v;
4078     p = PyUnicode_AS_UNICODE(v);
4079     e = s + size;
4080     if (PyUnicode_CheckExact(mapping)) {
4081         mapstring = PyUnicode_AS_UNICODE(mapping);
4082         maplen = PyUnicode_GET_SIZE(mapping);
4083         while (s < e) {
4084             unsigned char ch = *s;
4085             Py_UNICODE x = 0xfffe; /* illegal value */
4086
4087             if (ch < maplen)
4088                 x = mapstring[ch];
4089
4090             if (x == 0xfffe) {
4091                 /* undefined mapping */
4092                 outpos = p-PyUnicode_AS_UNICODE(v);
4093                 startinpos = s-starts;
4094                 endinpos = startinpos+1;
4095                 if (unicode_decode_call_errorhandler(
4096                         errors, &errorHandler,
4097                         "charmap", "character maps to <undefined>",
4098                         starts, size, &startinpos, &endinpos, &exc, &s,
4099                         &v, &outpos, &p)) {
4100                     goto onError;
4101                 }
4102                 continue;
4103             }
4104             *p++ = x;
4105             ++s;
4106         }
4107     }
4108     else {
4109         while (s < e) {
4110             unsigned char ch = *s;
4111             PyObject *w, *x;
4112
4113             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114             w = PyInt_FromLong((long)ch);
4115             if (w == NULL)
4116                 goto onError;
4117             x = PyObject_GetItem(mapping, w);
4118             Py_DECREF(w);
4119             if (x == NULL) {
4120                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4121                     /* No mapping found means: mapping is undefined. */
4122                     PyErr_Clear();
4123                     x = Py_None;
4124                     Py_INCREF(x);
4125                 } else
4126                     goto onError;
4127             }
4128
4129             /* Apply mapping */
4130             if (PyInt_Check(x)) {
4131                 long value = PyInt_AS_LONG(x);
4132                 if (value < 0 || value > 65535) {
4133                     PyErr_SetString(PyExc_TypeError,
4134                                     "character mapping must be in range(65536)");
4135                     Py_DECREF(x);
4136                     goto onError;
4137                 }
4138                 *p++ = (Py_UNICODE)value;
4139             }
4140             else if (x == Py_None) {
4141                 /* undefined mapping */
4142                 outpos = p-PyUnicode_AS_UNICODE(v);
4143                 startinpos = s-starts;
4144                 endinpos = startinpos+1;
4145                 if (unicode_decode_call_errorhandler(
4146                         errors, &errorHandler,
4147                         "charmap", "character maps to <undefined>",
4148                         starts, size, &startinpos, &endinpos, &exc, &s,
4149                         &v, &outpos, &p)) {
4150                     Py_DECREF(x);
4151                     goto onError;
4152                 }
4153                 Py_DECREF(x);
4154                 continue;
4155             }
4156             else if (PyUnicode_Check(x)) {
4157                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4158
4159                 if (targetsize == 1)
4160                     /* 1-1 mapping */
4161                     *p++ = *PyUnicode_AS_UNICODE(x);
4162
4163                 else if (targetsize > 1) {
4164                     /* 1-n mapping */
4165                     if (targetsize > extrachars) {
4166                         /* resize first */
4167                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4168                         Py_ssize_t needed = (targetsize - extrachars) + \
4169                             (targetsize << 2);
4170                         extrachars += needed;
4171                         /* XXX overflow detection missing */
4172                         if (_PyUnicode_Resize(&v,
4173                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4174                             Py_DECREF(x);
4175                             goto onError;
4176                         }
4177                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4178                     }
4179                     Py_UNICODE_COPY(p,
4180                                     PyUnicode_AS_UNICODE(x),
4181                                     targetsize);
4182                     p += targetsize;
4183                     extrachars -= targetsize;
4184                 }
4185                 /* 1-0 mapping: skip the character */
4186             }
4187             else {
4188                 /* wrong return value */
4189                 PyErr_SetString(PyExc_TypeError,
4190                                 "character mapping must return integer, None or unicode");
4191                 Py_DECREF(x);
4192                 goto onError;
4193             }
4194             Py_DECREF(x);
4195             ++s;
4196         }
4197     }
4198     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4199         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4200             goto onError;
4201     Py_XDECREF(errorHandler);
4202     Py_XDECREF(exc);
4203     return (PyObject *)v;
4204
4205   onError:
4206     Py_XDECREF(errorHandler);
4207     Py_XDECREF(exc);
4208     Py_XDECREF(v);
4209     return NULL;
4210 }
4211
4212 /* Charmap encoding: the lookup table */
4213
4214 struct encoding_map{
4215     PyObject_HEAD
4216     unsigned char level1[32];
4217     int count2, count3;
4218     unsigned char level23[1];
4219 };
4220
4221 static PyObject*
4222 encoding_map_size(PyObject *obj, PyObject* args)
4223 {
4224     struct encoding_map *map = (struct encoding_map*)obj;
4225     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4226                           128*map->count3);
4227 }
4228
4229 static PyMethodDef encoding_map_methods[] = {
4230     {"size", encoding_map_size, METH_NOARGS,
4231      PyDoc_STR("Return the size (in bytes) of this object") },
4232     { 0 }
4233 };
4234
4235 static void
4236 encoding_map_dealloc(PyObject* o)
4237 {
4238     PyObject_FREE(o);
4239 }
4240
4241 static PyTypeObject EncodingMapType = {
4242     PyVarObject_HEAD_INIT(NULL, 0)
4243     "EncodingMap",          /*tp_name*/
4244     sizeof(struct encoding_map),   /*tp_basicsize*/
4245     0,                      /*tp_itemsize*/
4246     /* methods */
4247     encoding_map_dealloc,   /*tp_dealloc*/
4248     0,                      /*tp_print*/
4249     0,                      /*tp_getattr*/
4250     0,                      /*tp_setattr*/
4251     0,                      /*tp_compare*/
4252     0,                      /*tp_repr*/
4253     0,                      /*tp_as_number*/
4254     0,                      /*tp_as_sequence*/
4255     0,                      /*tp_as_mapping*/
4256     0,                      /*tp_hash*/
4257     0,                      /*tp_call*/
4258     0,                      /*tp_str*/
4259     0,                      /*tp_getattro*/
4260     0,                      /*tp_setattro*/
4261     0,                      /*tp_as_buffer*/
4262     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4263     0,                      /*tp_doc*/
4264     0,                      /*tp_traverse*/
4265     0,                      /*tp_clear*/
4266     0,                      /*tp_richcompare*/
4267     0,                      /*tp_weaklistoffset*/
4268     0,                      /*tp_iter*/
4269     0,                      /*tp_iternext*/
4270     encoding_map_methods,   /*tp_methods*/
4271     0,                      /*tp_members*/
4272     0,                      /*tp_getset*/
4273     0,                      /*tp_base*/
4274     0,                      /*tp_dict*/
4275     0,                      /*tp_descr_get*/
4276     0,                      /*tp_descr_set*/
4277     0,                      /*tp_dictoffset*/
4278     0,                      /*tp_init*/
4279     0,                      /*tp_alloc*/
4280     0,                      /*tp_new*/
4281     0,                      /*tp_free*/
4282     0,                      /*tp_is_gc*/
4283 };
4284
4285 PyObject*
4286 PyUnicode_BuildEncodingMap(PyObject* string)
4287 {
4288     Py_UNICODE *decode;
4289     PyObject *result;
4290     struct encoding_map *mresult;
4291     int i;
4292     int need_dict = 0;
4293     unsigned char level1[32];
4294     unsigned char level2[512];
4295     unsigned char *mlevel1, *mlevel2, *mlevel3;
4296     int count2 = 0, count3 = 0;
4297
4298     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4299         PyErr_BadArgument();
4300         return NULL;
4301     }
4302     decode = PyUnicode_AS_UNICODE(string);
4303     memset(level1, 0xFF, sizeof level1);
4304     memset(level2, 0xFF, sizeof level2);
4305
4306     /* If there isn't a one-to-one mapping of NULL to \0,
4307        or if there are non-BMP characters, we need to use
4308        a mapping dictionary. */
4309     if (decode[0] != 0)
4310         need_dict = 1;
4311     for (i = 1; i < 256; i++) {
4312         int l1, l2;
4313         if (decode[i] == 0
4314 #ifdef Py_UNICODE_WIDE
4315             || decode[i] > 0xFFFF
4316 #endif
4317             ) {
4318             need_dict = 1;
4319             break;
4320         }
4321         if (decode[i] == 0xFFFE)
4322             /* unmapped character */
4323             continue;
4324         l1 = decode[i] >> 11;
4325         l2 = decode[i] >> 7;
4326         if (level1[l1] == 0xFF)
4327             level1[l1] = count2++;
4328         if (level2[l2] == 0xFF)
4329             level2[l2] = count3++;
4330     }
4331
4332     if (count2 >= 0xFF || count3 >= 0xFF)
4333         need_dict = 1;
4334
4335     if (need_dict) {
4336         PyObject *result = PyDict_New();
4337         PyObject *key, *value;
4338         if (!result)
4339             return NULL;
4340         for (i = 0; i < 256; i++) {
4341             key = value = NULL;
4342             key = PyInt_FromLong(decode[i]);
4343             value = PyInt_FromLong(i);
4344             if (!key || !value)
4345                 goto failed1;
4346             if (PyDict_SetItem(result, key, value) == -1)
4347                 goto failed1;
4348             Py_DECREF(key);
4349             Py_DECREF(value);
4350         }
4351         return result;
4352       failed1:
4353         Py_XDECREF(key);
4354         Py_XDECREF(value);
4355         Py_DECREF(result);
4356         return NULL;
4357     }
4358
4359     /* Create a three-level trie */
4360     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4361                              16*count2 + 128*count3 - 1);
4362     if (!result)
4363         return PyErr_NoMemory();
4364     PyObject_Init(result, &EncodingMapType);
4365     mresult = (struct encoding_map*)result;
4366     mresult->count2 = count2;
4367     mresult->count3 = count3;
4368     mlevel1 = mresult->level1;
4369     mlevel2 = mresult->level23;
4370     mlevel3 = mresult->level23 + 16*count2;
4371     memcpy(mlevel1, level1, 32);
4372     memset(mlevel2, 0xFF, 16*count2);
4373     memset(mlevel3, 0, 128*count3);
4374     count3 = 0;
4375     for (i = 1; i < 256; i++) {
4376         int o1, o2, o3, i2, i3;
4377         if (decode[i] == 0xFFFE)
4378             /* unmapped character */
4379             continue;
4380         o1 = decode[i]>>11;
4381         o2 = (decode[i]>>7) & 0xF;
4382         i2 = 16*mlevel1[o1] + o2;
4383         if (mlevel2[i2] == 0xFF)
4384             mlevel2[i2] = count3++;
4385         o3 = decode[i] & 0x7F;
4386         i3 = 128*mlevel2[i2] + o3;
4387         mlevel3[i3] = i;
4388     }
4389     return result;
4390 }
4391
4392 static int
4393 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4394 {
4395     struct encoding_map *map = (struct encoding_map*)mapping;
4396     int l1 = c>>11;
4397     int l2 = (c>>7) & 0xF;
4398     int l3 = c & 0x7F;
4399     int i;
4400
4401 #ifdef Py_UNICODE_WIDE
4402     if (c > 0xFFFF) {
4403         return -1;
4404     }
4405 #endif
4406     if (c == 0)
4407         return 0;
4408     /* level 1*/
4409     i = map->level1[l1];
4410     if (i == 0xFF) {
4411         return -1;
4412     }
4413     /* level 2*/
4414     i = map->level23[16*i+l2];
4415     if (i == 0xFF) {
4416         return -1;
4417     }
4418     /* level 3 */
4419     i = map->level23[16*map->count2 + 128*i + l3];
4420     if (i == 0) {
4421         return -1;
4422     }
4423     return i;
4424 }
4425
4426 /* Lookup the character ch in the mapping. If the character
4427    can't be found, Py_None is returned (or NULL, if another
4428    error occurred). */
4429 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4430 {
4431     PyObject *w = PyInt_FromLong((long)c);
4432     PyObject *x;
4433
4434     if (w == NULL)
4435         return NULL;
4436     x = PyObject_GetItem(mapping, w);
4437     Py_DECREF(w);
4438     if (x == NULL) {
4439         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4440             /* No mapping found means: mapping is undefined. */
4441             PyErr_Clear();
4442             x = Py_None;
4443             Py_INCREF(x);
4444             return x;
4445         } else
4446             return NULL;
4447     }
4448     else if (x == Py_None)
4449         return x;
4450     else if (PyInt_Check(x)) {
4451         long value = PyInt_AS_LONG(x);
4452         if (value < 0 || value > 255) {
4453             PyErr_SetString(PyExc_TypeError,
4454                             "character mapping must be in range(256)");
4455             Py_DECREF(x);
4456             return NULL;
4457         }
4458         return x;
4459     }
4460     else if (PyString_Check(x))
4461         return x;
4462     else {
4463         /* wrong return value */
4464         PyErr_SetString(PyExc_TypeError,
4465                         "character mapping must return integer, None or str");
4466         Py_DECREF(x);
4467         return NULL;
4468     }
4469 }
4470
4471 static int
4472 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4473 {
4474     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4475     /* exponentially overallocate to minimize reallocations */
4476     if (requiredsize < 2*outsize)
4477         requiredsize = 2*outsize;
4478     if (_PyString_Resize(outobj, requiredsize)) {
4479         return 0;
4480     }
4481     return 1;
4482 }
4483
/* Outcome of encoding a single character through a character mapping */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* the lookup or an output resize failed */
} charmapencode_result;
/* Look up the character, put the result in the output string and adjust
   various state variables. Reallocate the output string if not enough
   space is available. Return enc_SUCCESS if the mapped value was written
   to the output buffer, enc_FAILED if the mapping was undefined (in which
   case no character was written), or enc_EXCEPTION if the lookup or a
   reallocation failed. */
4493 static
4494 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4495                                           PyObject **outobj, Py_ssize_t *outpos)
4496 {
4497     PyObject *rep;
4498     char *outstart;
4499     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4500
4501     if (Py_TYPE(mapping) == &EncodingMapType) {
4502         int res = encoding_map_lookup(c, mapping);
4503         Py_ssize_t requiredsize = *outpos+1;
4504         if (res == -1)
4505             return enc_FAILED;
4506         if (outsize<requiredsize)
4507             if (!charmapencode_resize(outobj, outpos, requiredsize))
4508                 return enc_EXCEPTION;
4509         outstart = PyString_AS_STRING(*outobj);
4510         outstart[(*outpos)++] = (char)res;
4511         return enc_SUCCESS;
4512     }
4513
4514     rep = charmapencode_lookup(c, mapping);
4515     if (rep==NULL)
4516         return enc_EXCEPTION;
4517     else if (rep==Py_None) {
4518         Py_DECREF(rep);
4519         return enc_FAILED;
4520     } else {
4521         if (PyInt_Check(rep)) {
4522             Py_ssize_t requiredsize = *outpos+1;
4523             if (outsize<requiredsize)
4524                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4525                     Py_DECREF(rep);
4526                     return enc_EXCEPTION;
4527                 }
4528             outstart = PyString_AS_STRING(*outobj);
4529             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4530         }
4531         else {
4532             const char *repchars = PyString_AS_STRING(rep);
4533             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4534             Py_ssize_t requiredsize = *outpos+repsize;
4535             if (outsize<requiredsize)
4536                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4537                     Py_DECREF(rep);
4538                     return enc_EXCEPTION;
4539                 }
4540             outstart = PyString_AS_STRING(*outobj);
4541             memcpy(outstart + *outpos, repchars, repsize);
4542             *outpos += repsize;
4543         }
4544     }
4545     Py_DECREF(rep);
4546     return enc_SUCCESS;
4547 }
4548
4549 /* handle an error in PyUnicode_EncodeCharmap
4550    Return 0 on success, -1 on error */
4551 static
4552 int charmap_encoding_error(
4553     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4554     PyObject **exceptionObject,
4555     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4556     PyObject **res, Py_ssize_t *respos)
4557 {
4558     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4559     Py_ssize_t repsize;
4560     Py_ssize_t newpos;
4561     Py_UNICODE *uni2;
4562     /* startpos for collecting unencodable chars */
4563     Py_ssize_t collstartpos = *inpos;
4564     Py_ssize_t collendpos = *inpos+1;
4565     Py_ssize_t collpos;
4566     char *encoding = "charmap";
4567     char *reason = "character maps to <undefined>";
4568     charmapencode_result x;
4569
4570     /* find all unencodable characters */
4571     while (collendpos < size) {
4572         PyObject *rep;
4573         if (Py_TYPE(mapping) == &EncodingMapType) {
4574             int res = encoding_map_lookup(p[collendpos], mapping);
4575             if (res != -1)
4576                 break;
4577             ++collendpos;
4578             continue;
4579         }
4580
4581         rep = charmapencode_lookup(p[collendpos], mapping);
4582         if (rep==NULL)
4583             return -1;
4584         else if (rep!=Py_None) {
4585             Py_DECREF(rep);
4586             break;
4587         }
4588         Py_DECREF(rep);
4589         ++collendpos;
4590     }
4591     /* cache callback name lookup
4592      * (if not done yet, i.e. it's the first error) */
4593     if (*known_errorHandler==-1) {
4594         if ((errors==NULL) || (!strcmp(errors, "strict")))
4595             *known_errorHandler = 1;
4596         else if (!strcmp(errors, "replace"))
4597             *known_errorHandler = 2;
4598         else if (!strcmp(errors, "ignore"))
4599             *known_errorHandler = 3;
4600         else if (!strcmp(errors, "xmlcharrefreplace"))
4601             *known_errorHandler = 4;
4602         else
4603             *known_errorHandler = 0;
4604     }
4605     switch (*known_errorHandler) {
4606     case 1: /* strict */
4607         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4608         return -1;
4609     case 2: /* replace */
4610         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4611             x = charmapencode_output('?', mapping, res, respos);
4612             if (x==enc_EXCEPTION) {
4613                 return -1;
4614             }
4615             else if (x==enc_FAILED) {
4616                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617                 return -1;
4618             }
4619         }
4620         /* fall through */
4621     case 3: /* ignore */
4622         *inpos = collendpos;
4623         break;
4624     case 4: /* xmlcharrefreplace */
4625         /* generate replacement (temporarily (mis)uses p) */
4626         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4627             char buffer[2+29+1+1];
4628             char *cp;
4629             sprintf(buffer, "&#%d;", (int)p[collpos]);
4630             for (cp = buffer; *cp; ++cp) {
4631                 x = charmapencode_output(*cp, mapping, res, respos);
4632                 if (x==enc_EXCEPTION)
4633                     return -1;
4634                 else if (x==enc_FAILED) {
4635                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4636                     return -1;
4637                 }
4638             }
4639         }
4640         *inpos = collendpos;
4641         break;
4642     default:
4643         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4644                                                       encoding, reason, p, size, exceptionObject,
4645                                                       collstartpos, collendpos, &newpos);
4646         if (repunicode == NULL)
4647             return -1;
4648         /* generate replacement  */
4649         repsize = PyUnicode_GET_SIZE(repunicode);
4650         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4651             x = charmapencode_output(*uni2, mapping, res, respos);
4652             if (x==enc_EXCEPTION) {
4653                 return -1;
4654             }
4655             else if (x==enc_FAILED) {
4656                 Py_DECREF(repunicode);
4657                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658                 return -1;
4659             }
4660         }
4661         *inpos = newpos;
4662         Py_DECREF(repunicode);
4663     }
4664     return 0;
4665 }
4666
4667 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4668                                   Py_ssize_t size,
4669                                   PyObject *mapping,
4670                                   const char *errors)
4671 {
4672     /* output object */
4673     PyObject *res = NULL;
4674     /* current input position */
4675     Py_ssize_t inpos = 0;
4676     /* current output position */
4677     Py_ssize_t respos = 0;
4678     PyObject *errorHandler = NULL;
4679     PyObject *exc = NULL;
4680     /* the following variable is used for caching string comparisons
4681      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682      * 3=ignore, 4=xmlcharrefreplace */
4683     int known_errorHandler = -1;
4684
4685     /* Default to Latin-1 */
4686     if (mapping == NULL)
4687         return PyUnicode_EncodeLatin1(p, size, errors);
4688
4689     /* allocate enough for a simple encoding without
4690        replacements, if we need more, we'll resize */
4691     res = PyString_FromStringAndSize(NULL, size);
4692     if (res == NULL)
4693         goto onError;
4694     if (size == 0)
4695         return res;
4696
4697     while (inpos<size) {
4698         /* try to encode it */
4699         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4700         if (x==enc_EXCEPTION) /* error */
4701             goto onError;
4702         if (x==enc_FAILED) { /* unencodable character */
4703             if (charmap_encoding_error(p, size, &inpos, mapping,
4704                                        &exc,
4705                                        &known_errorHandler, &errorHandler, errors,
4706                                        &res, &respos)) {
4707                 goto onError;
4708             }
4709         }
4710         else
4711             /* done with this character => adjust input position */
4712             ++inpos;
4713     }
4714
4715     /* Resize if we allocated to much */
4716     if (respos<PyString_GET_SIZE(res)) {
4717         if (_PyString_Resize(&res, respos))
4718             goto onError;
4719     }
4720     Py_XDECREF(exc);
4721     Py_XDECREF(errorHandler);
4722     return res;
4723
4724   onError:
4725     Py_XDECREF(res);
4726     Py_XDECREF(exc);
4727     Py_XDECREF(errorHandler);
4728     return NULL;
4729 }
4730
4731 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4732                                     PyObject *mapping)
4733 {
4734     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4735         PyErr_BadArgument();
4736         return NULL;
4737     }
4738     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4739                                    PyUnicode_GET_SIZE(unicode),
4740                                    mapping,
4741                                    NULL);
4742 }
4743
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject **exceptionObject,
4746                                      const Py_UNICODE *unicode, Py_ssize_t size,
4747                                      Py_ssize_t startpos, Py_ssize_t endpos,
4748                                      const char *reason)
4749 {
4750     if (*exceptionObject == NULL) {
4751         *exceptionObject = PyUnicodeTranslateError_Create(
4752             unicode, size, startpos, endpos, reason);
4753     }
4754     else {
4755         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4756             goto onError;
4757         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4758             goto onError;
4759         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4760             goto onError;
4761         return;
4762       onError:
4763         Py_DECREF(*exceptionObject);
4764         *exceptionObject = NULL;
4765     }
4766 }
4767
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject **exceptionObject,
4770                                       const Py_UNICODE *unicode, Py_ssize_t size,
4771                                       Py_ssize_t startpos, Py_ssize_t endpos,
4772                                       const char *reason)
4773 {
4774     make_translate_exception(exceptionObject,
4775                              unicode, size, startpos, endpos, reason);
4776     if (*exceptionObject != NULL)
4777         PyCodec_StrictErrors(*exceptionObject);
4778 }
4779
4780 /* error handling callback helper:
4781    build arguments, call the callback and check the arguments,
4782    put the result into newpos and return the replacement string, which
4783    has to be freed by the caller */
4784 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4785                                                      PyObject **errorHandler,
4786                                                      const char *reason,
4787                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4788                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4789                                                      Py_ssize_t *newpos)
4790 {
4791     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4792
4793     Py_ssize_t i_newpos;
4794     PyObject *restuple;
4795     PyObject *resunicode;
4796
4797     if (*errorHandler == NULL) {
4798         *errorHandler = PyCodec_LookupError(errors);
4799         if (*errorHandler == NULL)
4800             return NULL;
4801     }
4802
4803     make_translate_exception(exceptionObject,
4804                              unicode, size, startpos, endpos, reason);
4805     if (*exceptionObject == NULL)
4806         return NULL;
4807
4808     restuple = PyObject_CallFunctionObjArgs(
4809         *errorHandler, *exceptionObject, NULL);
4810     if (restuple == NULL)
4811         return NULL;
4812     if (!PyTuple_Check(restuple)) {
4813         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4814         Py_DECREF(restuple);
4815         return NULL;
4816     }
4817     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4818                           &resunicode, &i_newpos)) {
4819         Py_DECREF(restuple);
4820         return NULL;
4821     }
4822     if (i_newpos<0)
4823         *newpos = size+i_newpos;
4824     else
4825         *newpos = i_newpos;
4826     if (*newpos<0 || *newpos>size) {
4827         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4828         Py_DECREF(restuple);
4829         return NULL;
4830     }
4831     Py_INCREF(resunicode);
4832     Py_DECREF(restuple);
4833     return resunicode;
4834 }
4835
4836 /* Lookup the character ch in the mapping and put the result in result,
4837    which must be decrefed by the caller.
4838    Return 0 on success, -1 on error */
4839 static
4840 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4841 {
4842     PyObject *w = PyInt_FromLong((long)c);
4843     PyObject *x;
4844
4845     if (w == NULL)
4846         return -1;
4847     x = PyObject_GetItem(mapping, w);
4848     Py_DECREF(w);
4849     if (x == NULL) {
4850         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4851             /* No mapping found means: use 1:1 mapping. */
4852             PyErr_Clear();
4853             *result = NULL;
4854             return 0;
4855         } else
4856             return -1;
4857     }
4858     else if (x == Py_None) {
4859         *result = x;
4860         return 0;
4861     }
4862     else if (PyInt_Check(x)) {
4863         long value = PyInt_AS_LONG(x);
4864         long max = PyUnicode_GetMax();
4865         if (value < 0 || value > max) {
4866             PyErr_Format(PyExc_TypeError,
4867                          "character mapping must be in range(0x%lx)", max+1);
4868             Py_DECREF(x);
4869             return -1;
4870         }
4871         *result = x;
4872         return 0;
4873     }
4874     else if (PyUnicode_Check(x)) {
4875         *result = x;
4876         return 0;
4877     }
4878     else {
4879         /* wrong return value */
4880         PyErr_SetString(PyExc_TypeError,
4881                         "character mapping must return integer, None or unicode");
4882         Py_DECREF(x);
4883         return -1;
4884     }
4885 }
4886 /* ensure that *outobj is at least requiredsize characters long,
4887    if not reallocate and adjust various state variables.
4888    Return 0 on success, -1 on error */
4889 static
4890 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4891                                Py_ssize_t requiredsize)
4892 {
4893     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4894     if (requiredsize > oldsize) {
4895         /* remember old output position */
4896         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4897         /* exponentially overallocate to minimize reallocations */
4898         if (requiredsize < 2 * oldsize)
4899             requiredsize = 2 * oldsize;
4900         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4901             return -1;
4902         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4903     }
4904     return 0;
4905 }
4906 /* lookup the character, put the result in the output string and adjust
4907    various state variables. Return a new reference to the object that
4908    was put in the output buffer in *result, or Py_None, if the mapping was
4909    undefined (in which case no character was written).
4910    The called must decref result.
4911    Return 0 on success, -1 on error. */
4912 static
4913 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4914                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4915                             PyObject **res)
4916 {
4917     if (charmaptranslate_lookup(*curinp, mapping, res))
4918         return -1;
4919     if (*res==NULL) {
4920         /* not found => default to 1:1 mapping */
4921         *(*outp)++ = *curinp;
4922     }
4923     else if (*res==Py_None)
4924         ;
4925     else if (PyInt_Check(*res)) {
4926         /* no overflow check, because we know that the space is enough */
4927         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4928     }
4929     else if (PyUnicode_Check(*res)) {
4930         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4931         if (repsize==1) {
4932             /* no overflow check, because we know that the space is enough */
4933             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4934         }
4935         else if (repsize!=0) {
4936             /* more than one character */
4937             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4938                 (insize - (curinp-startinp)) +
4939                 repsize - 1;
4940             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4941                 return -1;
4942             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4943             *outp += repsize;
4944         }
4945     }
4946     else
4947         return -1;
4948     return 0;
4949 }
4950
4951 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4952                                      Py_ssize_t size,
4953                                      PyObject *mapping,
4954                                      const char *errors)
4955 {
4956     /* output object */
4957     PyObject *res = NULL;
4958     /* pointers to the beginning and end+1 of input */
4959     const Py_UNICODE *startp = p;
4960     const Py_UNICODE *endp = p + size;
4961     /* pointer into the output */
4962     Py_UNICODE *str;
4963     /* current output position */
4964     Py_ssize_t respos = 0;
4965     char *reason = "character maps to <undefined>";
4966     PyObject *errorHandler = NULL;
4967     PyObject *exc = NULL;
4968     /* the following variable is used for caching string comparisons
4969      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970      * 3=ignore, 4=xmlcharrefreplace */
4971     int known_errorHandler = -1;
4972
4973     if (mapping == NULL) {
4974         PyErr_BadArgument();
4975         return NULL;
4976     }
4977
4978     /* allocate enough for a simple 1:1 translation without
4979        replacements, if we need more, we'll resize */
4980     res = PyUnicode_FromUnicode(NULL, size);
4981     if (res == NULL)
4982         goto onError;
4983     if (size == 0)
4984         return res;
4985     str = PyUnicode_AS_UNICODE(res);
4986
4987     while (p<endp) {
4988         /* try to encode it */
4989         PyObject *x = NULL;
4990         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4991             Py_XDECREF(x);
4992             goto onError;
4993         }
4994         Py_XDECREF(x);
4995         if (x!=Py_None) /* it worked => adjust input pointer */
4996             ++p;
4997         else { /* untranslatable character */
4998             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4999             Py_ssize_t repsize;
5000             Py_ssize_t newpos;
5001             Py_UNICODE *uni2;
5002             /* startpos for collecting untranslatable chars */
5003             const Py_UNICODE *collstart = p;
5004             const Py_UNICODE *collend = p+1;
5005             const Py_UNICODE *coll;
5006
5007             /* find all untranslatable characters */
5008             while (collend < endp) {
5009                 if (charmaptranslate_lookup(*collend, mapping, &x))
5010                     goto onError;
5011                 Py_XDECREF(x);
5012                 if (x!=Py_None)
5013                     break;
5014                 ++collend;
5015             }
5016             /* cache callback name lookup
5017              * (if not done yet, i.e. it's the first error) */
5018             if (known_errorHandler==-1) {
5019                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5020                     known_errorHandler = 1;
5021                 else if (!strcmp(errors, "replace"))
5022                     known_errorHandler = 2;
5023                 else if (!strcmp(errors, "ignore"))
5024                     known_errorHandler = 3;
5025                 else if (!strcmp(errors, "xmlcharrefreplace"))
5026                     known_errorHandler = 4;
5027                 else
5028                     known_errorHandler = 0;
5029             }
5030             switch (known_errorHandler) {
5031             case 1: /* strict */
5032                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5033                 goto onError;
5034             case 2: /* replace */
5035                 /* No need to check for space, this is a 1:1 replacement */
5036                 for (coll = collstart; coll<collend; ++coll)
5037                     *str++ = '?';
5038                 /* fall through */
5039             case 3: /* ignore */
5040                 p = collend;
5041                 break;
5042             case 4: /* xmlcharrefreplace */
5043                 /* generate replacement (temporarily (mis)uses p) */
5044                 for (p = collstart; p < collend; ++p) {
5045                     char buffer[2+29+1+1];
5046                     char *cp;
5047                     sprintf(buffer, "&#%d;", (int)*p);
5048                     if (charmaptranslate_makespace(&res, &str,
5049                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5050                         goto onError;
5051                     for (cp = buffer; *cp; ++cp)
5052                         *str++ = *cp;
5053                 }
5054                 p = collend;
5055                 break;
5056             default:
5057                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5058                                                                  reason, startp, size, &exc,
5059                                                                  collstart-startp, collend-startp, &newpos);
5060                 if (repunicode == NULL)
5061                     goto onError;
5062                 /* generate replacement  */
5063                 repsize = PyUnicode_GET_SIZE(repunicode);
5064                 if (charmaptranslate_makespace(&res, &str,
5065                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5066                     Py_DECREF(repunicode);
5067                     goto onError;
5068                 }
5069                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5070                     *str++ = *uni2;
5071                 p = startp + newpos;
5072                 Py_DECREF(repunicode);
5073             }
5074         }
5075     }
5076     /* Resize if we allocated to much */
5077     respos = str-PyUnicode_AS_UNICODE(res);
5078     if (respos<PyUnicode_GET_SIZE(res)) {
5079         if (PyUnicode_Resize(&res, respos) < 0)
5080             goto onError;
5081     }
5082     Py_XDECREF(exc);
5083     Py_XDECREF(errorHandler);
5084     return res;
5085
5086   onError:
5087     Py_XDECREF(res);
5088     Py_XDECREF(exc);
5089     Py_XDECREF(errorHandler);
5090     return NULL;
5091 }
5092
5093 PyObject *PyUnicode_Translate(PyObject *str,
5094                               PyObject *mapping,
5095                               const char *errors)
5096 {
5097     PyObject *result;
5098
5099     str = PyUnicode_FromObject(str);
5100     if (str == NULL)
5101         goto onError;
5102     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5103                                         PyUnicode_GET_SIZE(str),
5104                                         mapping,
5105                                         errors);
5106     Py_DECREF(str);
5107     return result;
5108
5109   onError:
5110     Py_XDECREF(str);
5111     return NULL;
5112 }
5113
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5115
5116 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5117                             Py_ssize_t length,
5118                             char *output,
5119                             const char *errors)
5120 {
5121     Py_UNICODE *p, *end;
5122     PyObject *errorHandler = NULL;
5123     PyObject *exc = NULL;
5124     const char *encoding = "decimal";
5125     const char *reason = "invalid decimal Unicode string";
5126     /* the following variable is used for caching string comparisons
5127      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128     int known_errorHandler = -1;
5129
5130     if (output == NULL) {
5131         PyErr_BadArgument();
5132         return -1;
5133     }
5134
5135     p = s;
5136     end = s + length;
5137     while (p < end) {
5138         register Py_UNICODE ch = *p;
5139         int decimal;
5140         PyObject *repunicode;
5141         Py_ssize_t repsize;
5142         Py_ssize_t newpos;
5143         Py_UNICODE *uni2;
5144         Py_UNICODE *collstart;
5145         Py_UNICODE *collend;
5146
5147         if (Py_UNICODE_ISSPACE(ch)) {
5148             *output++ = ' ';
5149             ++p;
5150             continue;
5151         }
5152         decimal = Py_UNICODE_TODECIMAL(ch);
5153         if (decimal >= 0) {
5154             *output++ = '0' + decimal;
5155             ++p;
5156             continue;
5157         }
5158         if (0 < ch && ch < 256) {
5159             *output++ = (char)ch;
5160             ++p;
5161             continue;
5162         }
5163         /* All other characters are considered unencodable */
5164         collstart = p;
5165         collend = p+1;
5166         while (collend < end) {
5167             if ((0 < *collend && *collend < 256) ||
5168                 !Py_UNICODE_ISSPACE(*collend) ||
5169                 Py_UNICODE_TODECIMAL(*collend))
5170                 break;
5171         }
5172         /* cache callback name lookup
5173          * (if not done yet, i.e. it's the first error) */
5174         if (known_errorHandler==-1) {
5175             if ((errors==NULL) || (!strcmp(errors, "strict")))
5176                 known_errorHandler = 1;
5177             else if (!strcmp(errors, "replace"))
5178                 known_errorHandler = 2;
5179             else if (!strcmp(errors, "ignore"))
5180                 known_errorHandler = 3;
5181             else if (!strcmp(errors, "xmlcharrefreplace"))
5182                 known_errorHandler = 4;
5183             else
5184                 known_errorHandler = 0;
5185         }
5186         switch (known_errorHandler) {
5187         case 1: /* strict */
5188             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5189             goto onError;
5190         case 2: /* replace */
5191             for (p = collstart; p < collend; ++p)
5192                 *output++ = '?';
5193             /* fall through */
5194         case 3: /* ignore */
5195             p = collend;
5196             break;
5197         case 4: /* xmlcharrefreplace */
5198             /* generate replacement (temporarily (mis)uses p) */
5199             for (p = collstart; p < collend; ++p)
5200                 output += sprintf(output, "&#%d;", (int)*p);
5201             p = collend;
5202             break;
5203         default:
5204             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5205                                                           encoding, reason, s, length, &exc,
5206                                                           collstart-s, collend-s, &newpos);
5207             if (repunicode == NULL)
5208                 goto onError;
5209             /* generate replacement  */
5210             repsize = PyUnicode_GET_SIZE(repunicode);
5211             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5212                 Py_UNICODE ch = *uni2;
5213                 if (Py_UNICODE_ISSPACE(ch))
5214                     *output++ = ' ';
5215                 else {
5216                     decimal = Py_UNICODE_TODECIMAL(ch);
5217                     if (decimal >= 0)
5218                         *output++ = '0' + decimal;
5219                     else if (0 < ch && ch < 256)
5220                         *output++ = (char)ch;
5221                     else {
5222                         Py_DECREF(repunicode);
5223                         raise_encode_exception(&exc, encoding,
5224                                                s, length, collstart-s, collend-s, reason);
5225                         goto onError;
5226                     }
5227                 }
5228             }
5229             p = s + newpos;
5230             Py_DECREF(repunicode);
5231         }
5232     }
5233     /* 0-terminate the output string */
5234     *output++ = '\0';
5235     Py_XDECREF(exc);
5236     Py_XDECREF(errorHandler);
5237     return 0;
5238
5239   onError:
5240     Py_XDECREF(exc);
5241     Py_XDECREF(errorHandler);
5242     return -1;
5243 }
5244
5245 /* --- Helpers ------------------------------------------------------------ */
5246
5247 #include "stringlib/unicodedefs.h"
5248
5249 #define FROM_UNICODE
5250
5251 #include "stringlib/fastsearch.h"
5252
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
5256
/* Normalize the slice bounds held in the variables `start` and `end`
   of the expanding scope against (obj)->length: a negative bound has
   the length added once and is then floored at 0, and `end` is capped
   at the length.  `start` is not capped.  Expands to several
   statements and evaluates obj more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5269
5270 Py_ssize_t PyUnicode_Count(PyObject *str,
5271                            PyObject *substr,
5272                            Py_ssize_t start,
5273                            Py_ssize_t end)
5274 {
5275     Py_ssize_t result;
5276     PyUnicodeObject* str_obj;
5277     PyUnicodeObject* sub_obj;
5278
5279     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5280     if (!str_obj)
5281         return -1;
5282     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5283     if (!sub_obj) {
5284         Py_DECREF(str_obj);
5285         return -1;
5286     }
5287
5288     FIX_START_END(str_obj);
5289
5290     result = stringlib_count(
5291         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5292         );
5293
5294     Py_DECREF(sub_obj);
5295     Py_DECREF(str_obj);
5296
5297     return result;
5298 }
5299
5300 Py_ssize_t PyUnicode_Find(PyObject *str,
5301                           PyObject *sub,
5302                           Py_ssize_t start,
5303                           Py_ssize_t end,
5304                           int direction)
5305 {
5306     Py_ssize_t result;
5307
5308     str = PyUnicode_FromObject(str);
5309     if (!str)
5310         return -2;
5311     sub = PyUnicode_FromObject(sub);
5312     if (!sub) {
5313         Py_DECREF(str);
5314         return -2;
5315     }
5316
5317     if (direction > 0)
5318         result = stringlib_find_slice(
5319             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5320             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5321             start, end
5322             );
5323     else
5324         result = stringlib_rfind_slice(
5325             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5326             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5327             start, end
5328             );
5329
5330     Py_DECREF(str);
5331     Py_DECREF(sub);
5332
5333     return result;
5334 }
5335
5336 static
5337 int tailmatch(PyUnicodeObject *self,
5338               PyUnicodeObject *substring,
5339               Py_ssize_t start,
5340               Py_ssize_t end,
5341               int direction)
5342 {
5343     if (substring->length == 0)
5344         return 1;
5345
5346     FIX_START_END(self);
5347
5348     end -= substring->length;
5349     if (end < start)
5350         return 0;
5351
5352     if (direction > 0) {
5353         if (Py_UNICODE_MATCH(self, end, substring))
5354             return 1;
5355     } else {
5356         if (Py_UNICODE_MATCH(self, start, substring))
5357             return 1;
5358     }
5359
5360     return 0;
5361 }
5362
5363 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5364                                PyObject *substr,
5365                                Py_ssize_t start,
5366                                Py_ssize_t end,
5367                                int direction)
5368 {
5369     Py_ssize_t result;
5370
5371     str = PyUnicode_FromObject(str);
5372     if (str == NULL)
5373         return -1;
5374     substr = PyUnicode_FromObject(substr);
5375     if (substr == NULL) {
5376         Py_DECREF(str);
5377         return -1;
5378     }
5379
5380     result = tailmatch((PyUnicodeObject *)str,
5381                        (PyUnicodeObject *)substr,
5382                        start, end, direction);
5383     Py_DECREF(str);
5384     Py_DECREF(substr);
5385     return result;
5386 }
5387
5388 /* Apply fixfct filter to the Unicode object self and return a
5389    reference to the modified object */
5390
5391 static
5392 PyObject *fixup(PyUnicodeObject *self,
5393                 int (*fixfct)(PyUnicodeObject *s))
5394 {
5395
5396     PyUnicodeObject *u;
5397
5398     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5399     if (u == NULL)
5400         return NULL;
5401
5402     Py_UNICODE_COPY(u->str, self->str, self->length);
5403
5404     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5405         /* fixfct should return TRUE if it modified the buffer. If
5406            FALSE, return a reference to the original buffer instead
5407            (to save space, not time) */
5408         Py_INCREF(self);
5409         Py_DECREF(u);
5410         return (PyObject*) self;
5411     }
5412     return (PyObject*) u;
5413 }
5414
5415 static
5416 int fixupper(PyUnicodeObject *self)
5417 {
5418     Py_ssize_t len = self->length;
5419     Py_UNICODE *s = self->str;
5420     int status = 0;
5421
5422     while (len-- > 0) {
5423         register Py_UNICODE ch;
5424
5425         ch = Py_UNICODE_TOUPPER(*s);
5426         if (ch != *s) {
5427             status = 1;
5428             *s = ch;
5429         }
5430         s++;
5431     }
5432
5433     return status;
5434 }
5435
5436 static
5437 int fixlower(PyUnicodeObject *self)
5438 {
5439     Py_ssize_t len = self->length;
5440     Py_UNICODE *s = self->str;
5441     int status = 0;
5442
5443     while (len-- > 0) {
5444         register Py_UNICODE ch;
5445
5446         ch = Py_UNICODE_TOLOWER(*s);
5447         if (ch != *s) {
5448             status = 1;
5449             *s = ch;
5450         }
5451         s++;
5452     }
5453
5454     return status;
5455 }
5456
5457 static
5458 int fixswapcase(PyUnicodeObject *self)
5459 {
5460     Py_ssize_t len = self->length;
5461     Py_UNICODE *s = self->str;
5462     int status = 0;
5463
5464     while (len-- > 0) {
5465         if (Py_UNICODE_ISUPPER(*s)) {
5466             *s = Py_UNICODE_TOLOWER(*s);
5467             status = 1;
5468         } else if (Py_UNICODE_ISLOWER(*s)) {
5469             *s = Py_UNICODE_TOUPPER(*s);
5470             status = 1;
5471         }
5472         s++;
5473     }
5474
5475     return status;
5476 }
5477
5478 static
5479 int fixcapitalize(PyUnicodeObject *self)
5480 {
5481     Py_ssize_t len = self->length;
5482     Py_UNICODE *s = self->str;
5483     int status = 0;
5484
5485     if (len == 0)
5486         return 0;
5487     if (Py_UNICODE_ISLOWER(*s)) {
5488         *s = Py_UNICODE_TOUPPER(*s);
5489         status = 1;
5490     }
5491     s++;
5492     while (--len > 0) {
5493         if (Py_UNICODE_ISUPPER(*s)) {
5494             *s = Py_UNICODE_TOLOWER(*s);
5495             status = 1;
5496         }
5497         s++;
5498     }
5499     return status;
5500 }
5501
5502 static
5503 int fixtitle(PyUnicodeObject *self)
5504 {
5505     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506     register Py_UNICODE *e;
5507     int previous_is_cased;
5508
5509     /* Shortcut for single character strings */
5510     if (PyUnicode_GET_SIZE(self) == 1) {
5511         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512         if (*p != ch) {
5513             *p = ch;
5514             return 1;
5515         }
5516         else
5517             return 0;
5518     }
5519
5520     e = p + PyUnicode_GET_SIZE(self);
5521     previous_is_cased = 0;
5522     for (; p < e; p++) {
5523         register const Py_UNICODE ch = *p;
5524
5525         if (previous_is_cased)
5526             *p = Py_UNICODE_TOLOWER(ch);
5527         else
5528             *p = Py_UNICODE_TOTITLE(ch);
5529
5530         if (Py_UNICODE_ISLOWER(ch) ||
5531             Py_UNICODE_ISUPPER(ch) ||
5532             Py_UNICODE_ISTITLE(ch))
5533             previous_is_cased = 1;
5534         else
5535             previous_is_cased = 0;
5536     }
5537     return 1;
5538 }
5539
/* Join the items of seq into a single Unicode object, inserting
 * separator between consecutive items.
 *
 * separator may be NULL, in which case a single blank (' ') is used.
 * Every item must be a str or unicode object (anything else raises
 * TypeError) and is converted with PyUnicode_FromObject().
 *
 * Returns a new reference, or NULL with an exception set on failure
 * (conversion error, TypeError for a bad item, OverflowError if the
 * result length overflows, or a failed allocation/resize).
 */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *internal_separator = NULL;
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_ssize_t res_alloc = 100;  /* allocated length of res, in the units
                                    passed to _PyUnicode_New() (not bytes) */
    Py_ssize_t res_used;         /* used length of res, same units */
    Py_UNICODE *res_p;       /* pointer to the next free slot in res's
                                string area */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
    PyObject *item;
    Py_ssize_t i;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* Grrrr.  A codec may be invoked to convert str objects to
     * Unicode, and so it's possible to call back into Python code
     * during PyUnicode_FromObject(), and so it's possible for a sick
     * codec to change the size of fseq (if seq is a list).  Therefore
     * we have to keep refetching the size -- can't assume seqlen
     * is invariant.
     */
    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
        goto Done;
    }
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(fseq, 0);
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }

    /* At least two items to join, or one that isn't exact Unicode. */
    if (seqlen > 1) {
        /* Set up sep and seplen -- they're needed. */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            internal_separator = PyUnicode_FromObject(separator);
            if (internal_separator == NULL)
                goto onError;
            sep = PyUnicode_AS_UNICODE(internal_separator);
            seplen = PyUnicode_GET_SIZE(internal_separator);
            /* In case PyUnicode_FromObject() mutated seq. */
            seqlen = PySequence_Fast_GET_SIZE(fseq);
        }
    }

    /* Get space. */
    res = _PyUnicode_New(res_alloc);
    if (res == NULL)
        goto onError;
    res_p = PyUnicode_AS_UNICODE(res);
    res_used = 0;

    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        Py_ssize_t new_res_used;

        /* Borrowed reference at this point. */
        item = PySequence_Fast_GET_ITEM(fseq, i);
        /* Convert item to Unicode. */
        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected string or Unicode,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        item = PyUnicode_FromObject(item);
        if (item == NULL)
            goto onError;
        /* We own a reference to item from here on. */

        /* In case PyUnicode_FromObject() mutated seq. */
        seqlen = PySequence_Fast_GET_SIZE(fseq);

        /* Make sure we have enough space for the separator and the item. */
        itemlen = PyUnicode_GET_SIZE(item);
        new_res_used = res_used + itemlen;
        if (new_res_used < 0)
            goto Overflow;
        /* A separator follows every item except the last one. */
        if (i < seqlen - 1) {
            new_res_used += seplen;
            if (new_res_used < 0)
                goto Overflow;
        }
        if (new_res_used > res_alloc) {
            /* double allocated size until it's big enough */
            do {
                res_alloc += res_alloc;
                if (res_alloc <= 0)
                    goto Overflow;
            } while (new_res_used > res_alloc);
            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
                Py_DECREF(item);
                goto onError;
            }
            /* The resize may have changed res; recompute the write
               pointer from the new buffer. */
            res_p = PyUnicode_AS_UNICODE(res) + res_used;
        }

        /* Copy item, and maybe the separator. */
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
        if (i < seqlen - 1) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_DECREF(item);
        res_used = new_res_used;
    }

    /* Shrink res to match the used area; this probably can't fail,
     * but it's cheap to check.
     */
    if (_PyUnicode_Resize(&res, res_used) < 0)
        goto onError;

  Done:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    return (PyObject *)res;

  Overflow:
    /* Only reached from inside the loop, after item has been converted,
       so the owned reference to item must be dropped here. */
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    Py_DECREF(item);
    /* fall through */

  onError:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}
5688
5689 static
5690 PyUnicodeObject *pad(PyUnicodeObject *self,
5691                      Py_ssize_t left,
5692                      Py_ssize_t right,
5693                      Py_UNICODE fill)
5694 {
5695     PyUnicodeObject *u;
5696
5697     if (left < 0)
5698         left = 0;
5699     if (right < 0)
5700         right = 0;
5701
5702     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5703         Py_INCREF(self);
5704         return self;
5705     }
5706
5707     if (left > PY_SSIZE_T_MAX - self->length ||
5708         right > PY_SSIZE_T_MAX - (left + self->length)) {
5709         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710         return NULL;
5711     }
5712     u = _PyUnicode_New(left + self->length + right);
5713     if (u) {
5714         if (left)
5715             Py_UNICODE_FILL(u->str, fill, left);
5716         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717         if (right)
5718             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5719     }
5720
5721     return u;
5722 }
5723
/* Append the slice data[left:right] to the result list as a new unicode
 * object.
 *
 * The macro relies on names supplied by the enclosing function:
 *   - `str`     : a PyObject * scratch variable,
 *   - `list`    : the list being built,
 *   - `onError` : a label that is jumped to if creating the slice or
 *                 appending it fails.
 * The temporary reference held in `str` is released on both the success
 * and the failure path of the append.
 *
 * NOTE: this expands to several statements; use it only as a full
 * statement inside a braced block.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
5734
5735 static
5736 PyObject *split_whitespace(PyUnicodeObject *self,
5737                            PyObject *list,
5738                            Py_ssize_t maxcount)
5739 {
5740     register Py_ssize_t i;
5741     register Py_ssize_t j;
5742     Py_ssize_t len = self->length;
5743     PyObject *str;
5744     register const Py_UNICODE *buf = self->str;
5745
5746     for (i = j = 0; i < len; ) {
5747         /* find a token */
5748         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5749             i++;
5750         j = i;
5751         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5752             i++;
5753         if (j < i) {
5754             if (maxcount-- <= 0)
5755                 break;
5756             SPLIT_APPEND(buf, j, i);
5757             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5758                 i++;
5759             j = i;
5760         }
5761     }
5762     if (j < len) {
5763         SPLIT_APPEND(buf, j, len);
5764     }
5765     return list;
5766
5767   onError:
5768     Py_DECREF(list);
5769     return NULL;
5770 }
5771
5772 PyObject *PyUnicode_Splitlines(PyObject *string,
5773                                int keepends)
5774 {
5775     register Py_ssize_t i;
5776     register Py_ssize_t j;
5777     Py_ssize_t len;
5778     PyObject *list;
5779     PyObject *str;
5780     Py_UNICODE *data;
5781
5782     string = PyUnicode_FromObject(string);
5783     if (string == NULL)
5784         return NULL;
5785     data = PyUnicode_AS_UNICODE(string);
5786     len = PyUnicode_GET_SIZE(string);
5787
5788     list = PyList_New(0);
5789     if (!list)
5790         goto onError;
5791
5792     for (i = j = 0; i < len; ) {
5793         Py_ssize_t eol;
5794
5795         /* Find a line and append it */
5796         while (i < len && !BLOOM_LINEBREAK(data[i]))
5797             i++;
5798
5799         /* Skip the line break reading CRLF as one line break */
5800         eol = i;
5801         if (i < len) {
5802             if (data[i] == '\r' && i + 1 < len &&
5803                 data[i+1] == '\n')
5804                 i += 2;
5805             else
5806                 i++;
5807             if (keepends)
5808                 eol = i;
5809         }
5810         SPLIT_APPEND(data, j, eol);
5811         j = i;
5812     }
5813     if (j < len) {
5814         SPLIT_APPEND(data, j, len);
5815     }
5816
5817     Py_DECREF(string);
5818     return list;
5819
5820   onError:
5821     Py_XDECREF(list);
5822     Py_DECREF(string);
5823     return NULL;
5824 }
5825
5826 static
5827 PyObject *split_char(PyUnicodeObject *self,
5828                      PyObject *list,
5829                      Py_UNICODE ch,
5830                      Py_ssize_t maxcount)
5831 {
5832     register Py_ssize_t i;
5833     register Py_ssize_t j;
5834     Py_ssize_t len = self->length;
5835     PyObject *str;
5836     register const Py_UNICODE *buf = self->str;
5837
5838     for (i = j = 0; i < len; ) {
5839         if (buf[i] == ch) {
5840             if (maxcount-- <= 0)
5841                 break;
5842             SPLIT_APPEND(buf, j, i);
5843             i = j = i + 1;
5844         } else
5845             i++;
5846     }
5847     if (j <= len) {
5848         SPLIT_APPEND(buf, j, len);
5849     }
5850     return list;
5851
5852   onError:
5853     Py_DECREF(list);
5854     return NULL;
5855 }
5856
5857 static
5858 PyObject *split_substring(PyUnicodeObject *self,
5859                           PyObject *list,
5860                           PyUnicodeObject *substring,
5861                           Py_ssize_t maxcount)
5862 {
5863     register Py_ssize_t i;
5864     register Py_ssize_t j;
5865     Py_ssize_t len = self->length;
5866     Py_ssize_t sublen = substring->length;
5867     PyObject *str;
5868
5869     for (i = j = 0; i <= len - sublen; ) {
5870         if (Py_UNICODE_MATCH(self, i, substring)) {
5871             if (maxcount-- <= 0)
5872                 break;
5873             SPLIT_APPEND(self->str, j, i);
5874             i = j = i + sublen;
5875         } else
5876             i++;
5877     }
5878     if (j <= len) {
5879         SPLIT_APPEND(self->str, j, len);
5880     }
5881     return list;
5882
5883   onError:
5884     Py_DECREF(list);
5885     return NULL;
5886 }
5887
5888 static
5889 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5890                             PyObject *list,
5891                             Py_ssize_t maxcount)
5892 {
5893     register Py_ssize_t i;
5894     register Py_ssize_t j;
5895     Py_ssize_t len = self->length;
5896     PyObject *str;
5897     register const Py_UNICODE *buf = self->str;
5898
5899     for (i = j = len - 1; i >= 0; ) {
5900         /* find a token */
5901         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5902             i--;
5903         j = i;
5904         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5905             i--;
5906         if (j > i) {
5907             if (maxcount-- <= 0)
5908                 break;
5909             SPLIT_APPEND(buf, i + 1, j + 1);
5910             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5911                 i--;
5912             j = i;
5913         }
5914     }
5915     if (j >= 0) {
5916         SPLIT_APPEND(buf, 0, j + 1);
5917     }
5918     if (PyList_Reverse(list) < 0)
5919         goto onError;
5920     return list;
5921
5922   onError:
5923     Py_DECREF(list);
5924     return NULL;
5925 }
5926
5927 static
5928 PyObject *rsplit_char(PyUnicodeObject *self,
5929                       PyObject *list,
5930                       Py_UNICODE ch,
5931                       Py_ssize_t maxcount)
5932 {
5933     register Py_ssize_t i;
5934     register Py_ssize_t j;
5935     Py_ssize_t len = self->length;
5936     PyObject *str;
5937     register const Py_UNICODE *buf = self->str;
5938
5939     for (i = j = len - 1; i >= 0; ) {
5940         if (buf[i] == ch) {
5941             if (maxcount-- <= 0)
5942                 break;
5943             SPLIT_APPEND(buf, i + 1, j + 1);
5944             j = i = i - 1;
5945         } else
5946             i--;
5947     }
5948     if (j >= -1) {
5949         SPLIT_APPEND(buf, 0, j + 1);
5950     }
5951     if (PyList_Reverse(list) < 0)
5952         goto onError;
5953     return list;
5954
5955   onError:
5956     Py_DECREF(list);
5957     return NULL;
5958 }
5959
5960 static
5961 PyObject *rsplit_substring(PyUnicodeObject *self,
5962                            PyObject *list,
5963                            PyUnicodeObject *substring,
5964                            Py_ssize_t maxcount)
5965 {
5966     register Py_ssize_t i;
5967     register Py_ssize_t j;
5968     Py_ssize_t len = self->length;
5969     Py_ssize_t sublen = substring->length;
5970     PyObject *str;
5971
5972     for (i = len - sublen, j = len; i >= 0; ) {
5973         if (Py_UNICODE_MATCH(self, i, substring)) {
5974             if (maxcount-- <= 0)
5975                 break;
5976             SPLIT_APPEND(self->str, i + sublen, j);
5977             j = i;
5978             i -= sublen;
5979         } else
5980             i--;
5981     }
5982     if (j >= 0) {
5983         SPLIT_APPEND(self->str, 0, j);
5984     }
5985     if (PyList_Reverse(list) < 0)
5986         goto onError;
5987     return list;
5988
5989   onError:
5990     Py_DECREF(list);
5991     return NULL;
5992 }
5993
5994 #undef SPLIT_APPEND
5995
5996 static
5997 PyObject *split(PyUnicodeObject *self,
5998                 PyUnicodeObject *substring,
5999                 Py_ssize_t maxcount)
6000 {
6001     PyObject *list;
6002
6003     if (maxcount < 0)
6004         maxcount = PY_SSIZE_T_MAX;
6005
6006     list = PyList_New(0);
6007     if (!list)
6008         return NULL;
6009
6010     if (substring == NULL)
6011         return split_whitespace(self,list,maxcount);
6012
6013     else if (substring->length == 1)
6014         return split_char(self,list,substring->str[0],maxcount);
6015
6016     else if (substring->length == 0) {
6017         Py_DECREF(list);
6018         PyErr_SetString(PyExc_ValueError, "empty separator");
6019         return NULL;
6020     }
6021     else
6022         return split_substring(self,list,substring,maxcount);
6023 }
6024
6025 static
6026 PyObject *rsplit(PyUnicodeObject *self,
6027                  PyUnicodeObject *substring,
6028                  Py_ssize_t maxcount)
6029 {
6030     PyObject *list;
6031
6032     if (maxcount < 0)
6033         maxcount = PY_SSIZE_T_MAX;
6034
6035     list = PyList_New(0);
6036     if (!list)
6037         return NULL;
6038
6039     if (substring == NULL)
6040         return rsplit_whitespace(self,list,maxcount);
6041
6042     else if (substring->length == 1)
6043         return rsplit_char(self,list,substring->str[0],maxcount);
6044
6045     else if (substring->length == 0) {
6046         Py_DECREF(list);
6047         PyErr_SetString(PyExc_ValueError, "empty separator");
6048         return NULL;
6049     }
6050     else
6051         return rsplit_substring(self,list,substring,maxcount);
6052 }
6053
6054 static
6055 PyObject *replace(PyUnicodeObject *self,
6056                   PyUnicodeObject *str1,
6057                   PyUnicodeObject *str2,
6058                   Py_ssize_t maxcount)
6059 {
6060     PyUnicodeObject *u;
6061
6062     if (maxcount < 0)
6063         maxcount = PY_SSIZE_T_MAX;
6064
6065     if (str1->length == str2->length) {
6066         /* same length */
6067         Py_ssize_t i;
6068         if (str1->length == 1) {
6069             /* replace characters */
6070             Py_UNICODE u1, u2;
6071             if (!findchar(self->str, self->length, str1->str[0]))
6072                 goto nothing;
6073             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6074             if (!u)
6075                 return NULL;
6076             Py_UNICODE_COPY(u->str, self->str, self->length);
6077             u1 = str1->str[0];
6078             u2 = str2->str[0];
6079             for (i = 0; i < u->length; i++)
6080                 if (u->str[i] == u1) {
6081                     if (--maxcount < 0)
6082                         break;
6083                     u->str[i] = u2;
6084                 }
6085         } else {
6086             i = fastsearch(
6087                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6088                 );
6089             if (i < 0)
6090                 goto nothing;
6091             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6092             if (!u)
6093                 return NULL;
6094             Py_UNICODE_COPY(u->str, self->str, self->length);
6095             while (i <= self->length - str1->length)
6096                 if (Py_UNICODE_MATCH(self, i, str1)) {
6097                     if (--maxcount < 0)
6098                         break;
6099                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6100                     i += str1->length;
6101                 } else
6102                     i++;
6103         }
6104     } else {
6105
6106         Py_ssize_t n, i, j, e;
6107         Py_ssize_t product, new_size, delta;
6108         Py_UNICODE *p;
6109
6110         /* replace strings */
6111         n = stringlib_count(self->str, self->length, str1->str, str1->length);
6112         if (n > maxcount)
6113             n = maxcount;
6114         if (n == 0)
6115             goto nothing;
6116         /* new_size = self->length + n * (str2->length - str1->length)); */
6117         delta = (str2->length - str1->length);
6118         if (delta == 0) {
6119             new_size = self->length;
6120         } else {
6121             product = n * (str2->length - str1->length);
6122             if ((product / (str2->length - str1->length)) != n) {
6123                 PyErr_SetString(PyExc_OverflowError,
6124                                 "replace string is too long");
6125                 return NULL;
6126             }
6127             new_size = self->length + product;
6128             if (new_size < 0) {
6129                 PyErr_SetString(PyExc_OverflowError,
6130                                 "replace string is too long");
6131                 return NULL;
6132             }
6133         }
6134         u = _PyUnicode_New(new_size);
6135         if (!u)
6136             return NULL;
6137         i = 0;
6138         p = u->str;
6139         e = self->length - str1->length;
6140         if (str1->length > 0) {
6141             while (n-- > 0) {
6142                 /* look for next match */
6143                 j = i;
6144                 while (j <= e) {
6145                     if (Py_UNICODE_MATCH(self, j, str1))
6146                         break;
6147                     j++;
6148                 }
6149                 if (j > i) {
6150                     if (j > e)
6151                         break;
6152                     /* copy unchanged part [i:j] */
6153                     Py_UNICODE_COPY(p, self->str+i, j-i);
6154                     p += j - i;
6155                 }
6156                 /* copy substitution string */
6157                 if (str2->length > 0) {
6158                     Py_UNICODE_COPY(p, str2->str, str2->length);
6159                     p += str2->length;
6160                 }
6161                 i = j + str1->length;
6162             }
6163             if (i < self->length)
6164                 /* copy tail [i:] */
6165                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6166         } else {
6167             /* interleave */
6168             while (n > 0) {
6169                 Py_UNICODE_COPY(p, str2->str, str2->length);
6170                 p += str2->length;
6171                 if (--n <= 0)
6172                     break;
6173                 *p++ = self->str[i++];
6174             }
6175             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6176         }
6177     }
6178     return (PyObject *) u;
6179
6180   nothing:
6181     /* nothing to replace; return original string (when possible) */
6182     if (PyUnicode_CheckExact(self)) {
6183         Py_INCREF(self);
6184         return (PyObject *) self;
6185     }
6186     return PyUnicode_FromUnicode(self->str, self->length);
6187 }
6188
6189 /* --- Unicode Object Methods --------------------------------------------- */
6190
6191 PyDoc_STRVAR(title__doc__,
6192              "S.title() -> unicode\n\
6193 \n\
6194 Return a titlecased version of S, i.e. words start with title case\n\
6195 characters, all remaining cased characters have lower case.");
6196
6197 static PyObject*
6198 unicode_title(PyUnicodeObject *self)
6199 {
6200     return fixup(self, fixtitle);
6201 }
6202
6203 PyDoc_STRVAR(capitalize__doc__,
6204              "S.capitalize() -> unicode\n\
6205 \n\
6206 Return a capitalized version of S, i.e. make the first character\n\
6207 have upper case.");
6208
6209 static PyObject*
6210 unicode_capitalize(PyUnicodeObject *self)
6211 {
6212     return fixup(self, fixcapitalize);
6213 }
6214
#if 0
/* Disabled code: everything up to the matching #endif is excluded from
   compilation by the #if 0 above. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word with fixcapitalize and
   join the words with a single blank (PyUnicode_Join with a NULL
   separator).  Returns NULL if the split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Drop the old word, then store the capitalized one in its slot. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Reached on success as well; on the error path item is NULL here. */
  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6252
6253 /* Argument converter.  Coerces to a single unicode character */
6254
6255 static int
6256 convert_uc(PyObject *obj, void *addr)
6257 {
6258     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6259     PyObject *uniobj;
6260     Py_UNICODE *unistr;
6261
6262     uniobj = PyUnicode_FromObject(obj);
6263     if (uniobj == NULL) {
6264         PyErr_SetString(PyExc_TypeError,
6265                         "The fill character cannot be converted to Unicode");
6266         return 0;
6267     }
6268     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6269         PyErr_SetString(PyExc_TypeError,
6270                         "The fill character must be exactly one character long");
6271         Py_DECREF(uniobj);
6272         return 0;
6273     }
6274     unistr = PyUnicode_AS_UNICODE(uniobj);
6275     *fillcharloc = unistr[0];
6276     Py_DECREF(uniobj);
6277     return 1;
6278 }
6279
6280 PyDoc_STRVAR(center__doc__,
6281              "S.center(width[, fillchar]) -> unicode\n\
6282 \n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6285
6286 static PyObject *
6287 unicode_center(PyUnicodeObject *self, PyObject *args)
6288 {
6289     Py_ssize_t marg, left;
6290     Py_ssize_t width;
6291     Py_UNICODE fillchar = ' ';
6292
6293     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6294         return NULL;
6295
6296     if (self->length >= width && PyUnicode_CheckExact(self)) {
6297         Py_INCREF(self);
6298         return (PyObject*) self;
6299     }
6300
6301     marg = width - self->length;
6302     left = marg / 2 + (marg & width & 1);
6303
6304     return (PyObject*) pad(self, left, marg - left, fillchar);
6305 }
6306
6307 #if 0
6308
6309 /* This code should go into some future Unicode collation support
6310    module. The basic comparison should compare ordinals on a naive
6311    basis (this is what Java does and thus Jython too). */
6312
6313 /* speedy UTF-16 code point order comparison */
6314 /* gleaned from: */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6316
6317 static short utf16Fixup[32] =
6318 {
6319     0, 0, 0, 0, 0, 0, 0, 0,
6320     0, 0, 0, 0, 0, 0, 0, 0,
6321     0, 0, 0, 0, 0, 0, 0, 0,
6322     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6323 };
6324
6325 static int
6326 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6327 {
6328     Py_ssize_t len1, len2;
6329
6330     Py_UNICODE *s1 = str1->str;
6331     Py_UNICODE *s2 = str2->str;
6332
6333     len1 = str1->length;
6334     len2 = str2->length;
6335
6336     while (len1 > 0 && len2 > 0) {
6337         Py_UNICODE c1, c2;
6338
6339         c1 = *s1++;
6340         c2 = *s2++;
6341
6342         if (c1 > (1<<11) * 26)
6343             c1 += utf16Fixup[c1>>11];
6344         if (c2 > (1<<11) * 26)
6345             c2 += utf16Fixup[c2>>11];
6346         /* now c1 and c2 are in UTF-32-compatible order */
6347
6348         if (c1 != c2)
6349             return (c1 < c2) ? -1 : 1;
6350
6351         len1--; len2--;
6352     }
6353
6354     return (len1 < len2) ? -1 : (len1 != len2);
6355 }
6356
6357 #else
6358
6359 static int
6360 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6361 {
6362     register Py_ssize_t len1, len2;
6363
6364     Py_UNICODE *s1 = str1->str;
6365     Py_UNICODE *s2 = str2->str;
6366
6367     len1 = str1->length;
6368     len2 = str2->length;
6369
6370     while (len1 > 0 && len2 > 0) {
6371         Py_UNICODE c1, c2;
6372
6373         c1 = *s1++;
6374         c2 = *s2++;
6375
6376         if (c1 != c2)
6377             return (c1 < c2) ? -1 : 1;
6378
6379         len1--; len2--;
6380     }
6381
6382     return (len1 < len2) ? -1 : (len1 != len2);
6383 }
6384
6385 #endif
6386
6387 int PyUnicode_Compare(PyObject *left,
6388                       PyObject *right)
6389 {
6390     PyUnicodeObject *u = NULL, *v = NULL;
6391     int result;
6392
6393     /* Coerce the two arguments */
6394     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6395     if (u == NULL)
6396         goto onError;
6397     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6398     if (v == NULL)
6399         goto onError;
6400
6401     /* Shortcut for empty or interned objects */
6402     if (v == u) {
6403         Py_DECREF(u);
6404         Py_DECREF(v);
6405         return 0;
6406     }
6407
6408     result = unicode_compare(u, v);
6409
6410     Py_DECREF(u);
6411     Py_DECREF(v);
6412     return result;
6413
6414   onError:
6415     Py_XDECREF(u);
6416     Py_XDECREF(v);
6417     return -1;
6418 }
6419
6420 PyObject *PyUnicode_RichCompare(PyObject *left,
6421                                 PyObject *right,
6422                                 int op)
6423 {
6424     int result;
6425
6426     result = PyUnicode_Compare(left, right);
6427     if (result == -1 && PyErr_Occurred())
6428         goto onError;
6429
6430     /* Convert the return value to a Boolean */
6431     switch (op) {
6432     case Py_EQ:
6433         result = (result == 0);
6434         break;
6435     case Py_NE:
6436         result = (result != 0);
6437         break;
6438     case Py_LE:
6439         result = (result <= 0);
6440         break;
6441     case Py_GE:
6442         result = (result >= 0);
6443         break;
6444     case Py_LT:
6445         result = (result == -1);
6446         break;
6447     case Py_GT:
6448         result = (result == 1);
6449         break;
6450     }
6451     return PyBool_FromLong(result);
6452
6453   onError:
6454
6455     /* Standard case
6456
6457        Type errors mean that PyUnicode_FromObject() could not convert
6458        one of the arguments (usually the right hand side) to Unicode,
6459        ie. we can't handle the comparison request. However, it is
6460        possible that the other object knows a comparison method, which
6461        is why we return Py_NotImplemented to give the other object a
6462        chance.
6463
6464     */
6465     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6466         PyErr_Clear();
6467         Py_INCREF(Py_NotImplemented);
6468         return Py_NotImplemented;
6469     }
6470     if (op != Py_EQ && op != Py_NE)
6471         return NULL;
6472
6473     /* Equality comparison.
6474
6475        This is a special case: we silence any PyExc_UnicodeDecodeError
6476        and instead turn it into a PyErr_UnicodeWarning.
6477
6478     */
6479     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6480         return NULL;
6481     PyErr_Clear();
6482     if (PyErr_Warn(PyExc_UnicodeWarning,
6483                    (op == Py_EQ) ?
6484                    "Unicode equal comparison "
6485                    "failed to convert both arguments to Unicode - "
6486                    "interpreting them as being unequal" :
6487                    "Unicode unequal comparison "
6488                    "failed to convert both arguments to Unicode - "
6489                    "interpreting them as being unequal"
6490             ) < 0)
6491         return NULL;
6492     result = (op == Py_NE);
6493     return PyBool_FromLong(result);
6494 }
6495
6496 int PyUnicode_Contains(PyObject *container,
6497                        PyObject *element)
6498 {
6499     PyObject *str, *sub;
6500     int result;
6501
6502     /* Coerce the two arguments */
6503     sub = PyUnicode_FromObject(element);
6504     if (!sub) {
6505         PyErr_SetString(PyExc_TypeError,
6506                         "'in <string>' requires string as left operand");
6507         return -1;
6508     }
6509
6510     str = PyUnicode_FromObject(container);
6511     if (!str) {
6512         Py_DECREF(sub);
6513         return -1;
6514     }
6515
6516     result = stringlib_contains_obj(str, sub);
6517
6518     Py_DECREF(str);
6519     Py_DECREF(sub);
6520
6521     return result;
6522 }
6523
6524 /* Concat to string or Unicode object giving a new Unicode object. */
6525
6526 PyObject *PyUnicode_Concat(PyObject *left,
6527                            PyObject *right)
6528 {
6529     PyUnicodeObject *u = NULL, *v = NULL, *w;
6530
6531     /* Coerce the two arguments */
6532     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6533     if (u == NULL)
6534         goto onError;
6535     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6536     if (v == NULL)
6537         goto onError;
6538
6539     /* Shortcuts */
6540     if (v == unicode_empty) {
6541         Py_DECREF(v);
6542         return (PyObject *)u;
6543     }
6544     if (u == unicode_empty) {
6545         Py_DECREF(u);
6546         return (PyObject *)v;
6547     }
6548
6549     /* Concat the two Unicode strings */
6550     w = _PyUnicode_New(u->length + v->length);
6551     if (w == NULL)
6552         goto onError;
6553     Py_UNICODE_COPY(w->str, u->str, u->length);
6554     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6555
6556     Py_DECREF(u);
6557     Py_DECREF(v);
6558     return (PyObject *)w;
6559
6560   onError:
6561     Py_XDECREF(u);
6562     Py_XDECREF(v);
6563     return NULL;
6564 }
6565
6566 PyDoc_STRVAR(count__doc__,
6567              "S.count(sub[, start[, end]]) -> int\n\
6568 \n\
6569 Return the number of non-overlapping occurrences of substring sub in\n\
6570 Unicode string S[start:end].  Optional arguments start and end are\n\
6571 interpreted as in slice notation.");
6572
6573 static PyObject *
6574 unicode_count(PyUnicodeObject *self, PyObject *args)
6575 {
6576     PyUnicodeObject *substring;
6577     Py_ssize_t start = 0;
6578     Py_ssize_t end = PY_SSIZE_T_MAX;
6579     PyObject *result;
6580
6581     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6582                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6583         return NULL;
6584
6585     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6586         (PyObject *)substring);
6587     if (substring == NULL)
6588         return NULL;
6589
6590     FIX_START_END(self);
6591
6592     result = PyInt_FromSsize_t(
6593         stringlib_count(self->str + start, end - start,
6594                         substring->str, substring->length)
6595         );
6596
6597     Py_DECREF(substring);
6598
6599     return result;
6600 }
6601
6602 PyDoc_STRVAR(encode__doc__,
6603              "S.encode([encoding[,errors]]) -> string or unicode\n\
6604 \n\
6605 Encodes S using the codec registered for encoding. encoding defaults\n\
6606 to the default encoding. errors may be given to set a different error\n\
6607 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6608 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6609 'xmlcharrefreplace' as well as any other name registered with\n\
6610 codecs.register_error that can handle UnicodeEncodeErrors.");
6611
6612 static PyObject *
6613 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6614 {
6615     static char *kwlist[] = {"encoding", "errors", 0};
6616     char *encoding = NULL;
6617     char *errors = NULL;
6618     PyObject *v;
6619
6620     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6621                                      kwlist, &encoding, &errors))
6622         return NULL;
6623     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6624     if (v == NULL)
6625         goto onError;
6626     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6627         PyErr_Format(PyExc_TypeError,
6628                      "encoder did not return a string/unicode object "
6629                      "(type=%.400s)",
6630                      Py_TYPE(v)->tp_name);
6631         Py_DECREF(v);
6632         return NULL;
6633     }
6634     return v;
6635
6636   onError:
6637     return NULL;
6638 }
6639
6640 PyDoc_STRVAR(decode__doc__,
6641              "S.decode([encoding[,errors]]) -> string or unicode\n\
6642 \n\
6643 Decodes S using the codec registered for encoding. encoding defaults\n\
6644 to the default encoding. errors may be given to set a different error\n\
6645 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6646 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6647 as well as any other name registerd with codecs.register_error that is\n\
6648 able to handle UnicodeDecodeErrors.");
6649
6650 static PyObject *
6651 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6652 {
6653     static char *kwlist[] = {"encoding", "errors", 0};
6654     char *encoding = NULL;
6655     char *errors = NULL;
6656     PyObject *v;
6657
6658     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6659                                      kwlist, &encoding, &errors))
6660         return NULL;
6661     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6662     if (v == NULL)
6663         goto onError;
6664     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6665         PyErr_Format(PyExc_TypeError,
6666                      "decoder did not return a string/unicode object "
6667                      "(type=%.400s)",
6668                      Py_TYPE(v)->tp_name);
6669         Py_DECREF(v);
6670         return NULL;
6671     }
6672     return v;
6673
6674   onError:
6675     return NULL;
6676 }
6677
6678 PyDoc_STRVAR(expandtabs__doc__,
6679              "S.expandtabs([tabsize]) -> unicode\n\
6680 \n\
6681 Return a copy of S where all tab characters are expanded using spaces.\n\
6682 If tabsize is not given, a tab size of 8 characters is assumed.");
6683
6684 static PyObject*
6685 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6686 {
6687     Py_UNICODE *e;
6688     Py_UNICODE *p;
6689     Py_UNICODE *q;
6690     Py_UNICODE *qe;
6691     Py_ssize_t i, j, incr;
6692     PyUnicodeObject *u;
6693     int tabsize = 8;
6694
6695     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6696         return NULL;
6697
6698     /* First pass: determine size of output string */
6699     i = 0; /* chars up to and including most recent \n or \r */
6700     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6701     e = self->str + self->length; /* end of input */
6702     for (p = self->str; p < e; p++)
6703         if (*p == '\t') {
6704             if (tabsize > 0) {
6705                 incr = tabsize - (j % tabsize); /* cannot overflow */
6706                 if (j > PY_SSIZE_T_MAX - incr)
6707                     goto overflow1;
6708                 j += incr;
6709             }
6710         }
6711         else {
6712             if (j > PY_SSIZE_T_MAX - 1)
6713                 goto overflow1;
6714             j++;
6715             if (*p == '\n' || *p == '\r') {
6716                 if (i > PY_SSIZE_T_MAX - j)
6717                     goto overflow1;
6718                 i += j;
6719                 j = 0;
6720             }
6721         }
6722
6723     if (i > PY_SSIZE_T_MAX - j)
6724         goto overflow1;
6725
6726     /* Second pass: create output string and fill it */
6727     u = _PyUnicode_New(i + j);
6728     if (!u)
6729         return NULL;
6730
6731     j = 0; /* same as in first pass */
6732     q = u->str; /* next output char */
6733     qe = u->str + u->length; /* end of output */
6734
6735     for (p = self->str; p < e; p++)
6736         if (*p == '\t') {
6737             if (tabsize > 0) {
6738                 i = tabsize - (j % tabsize);
6739                 j += i;
6740                 while (i--) {
6741                     if (q >= qe)
6742                         goto overflow2;
6743                     *q++ = ' ';
6744                 }
6745             }
6746         }
6747         else {
6748             if (q >= qe)
6749                 goto overflow2;
6750             *q++ = *p;
6751             j++;
6752             if (*p == '\n' || *p == '\r')
6753                 j = 0;
6754         }
6755
6756     return (PyObject*) u;
6757
6758   overflow2:
6759     Py_DECREF(u);
6760   overflow1:
6761     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6762     return NULL;
6763 }
6764
6765 PyDoc_STRVAR(find__doc__,
6766              "S.find(sub [,start [,end]]) -> int\n\
6767 \n\
6768 Return the lowest index in S where substring sub is found,\n\
6769 such that sub is contained within s[start:end].  Optional\n\
6770 arguments start and end are interpreted as in slice notation.\n\
6771 \n\
6772 Return -1 on failure.");
6773
6774 static PyObject *
6775 unicode_find(PyUnicodeObject *self, PyObject *args)
6776 {
6777     PyObject *substring;
6778     Py_ssize_t start;
6779     Py_ssize_t end;
6780     Py_ssize_t result;
6781
6782     if (!_ParseTupleFinds(args, &substring, &start, &end))
6783         return NULL;
6784
6785     result = stringlib_find_slice(
6786         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6787         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6788         start, end
6789         );
6790
6791     Py_DECREF(substring);
6792
6793     return PyInt_FromSsize_t(result);
6794 }
6795
6796 static PyObject *
6797 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6798 {
6799     if (index < 0 || index >= self->length) {
6800         PyErr_SetString(PyExc_IndexError, "string index out of range");
6801         return NULL;
6802     }
6803
6804     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6805 }
6806
6807 static long
6808 unicode_hash(PyUnicodeObject *self)
6809 {
6810     /* Since Unicode objects compare equal to their ASCII string
6811        counterparts, they should use the individual character values
6812        as basis for their hash value.  This is needed to assure that
6813        strings and Unicode objects behave in the same way as
6814        dictionary keys. */
6815
6816     register Py_ssize_t len;
6817     register Py_UNICODE *p;
6818     register long x;
6819
6820     if (self->hash != -1)
6821         return self->hash;
6822     len = PyUnicode_GET_SIZE(self);
6823     p = PyUnicode_AS_UNICODE(self);
6824     x = *p << 7;
6825     while (--len >= 0)
6826         x = (1000003*x) ^ *p++;
6827     x ^= PyUnicode_GET_SIZE(self);
6828     if (x == -1)
6829         x = -2;
6830     self->hash = x;
6831     return x;
6832 }
6833
6834 PyDoc_STRVAR(index__doc__,
6835              "S.index(sub [,start [,end]]) -> int\n\
6836 \n\
6837 Like S.find() but raise ValueError when the substring is not found.");
6838
6839 static PyObject *
6840 unicode_index(PyUnicodeObject *self, PyObject *args)
6841 {
6842     Py_ssize_t result;
6843     PyObject *substring;
6844     Py_ssize_t start;
6845     Py_ssize_t end;
6846
6847     if (!_ParseTupleFinds(args, &substring, &start, &end))
6848         return NULL;
6849
6850     result = stringlib_find_slice(
6851         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6852         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6853         start, end
6854         );
6855
6856     Py_DECREF(substring);
6857
6858     if (result < 0) {
6859         PyErr_SetString(PyExc_ValueError, "substring not found");
6860         return NULL;
6861     }
6862
6863     return PyInt_FromSsize_t(result);
6864 }
6865
6866 PyDoc_STRVAR(islower__doc__,
6867              "S.islower() -> bool\n\
6868 \n\
6869 Return True if all cased characters in S are lowercase and there is\n\
6870 at least one cased character in S, False otherwise.");
6871
6872 static PyObject*
6873 unicode_islower(PyUnicodeObject *self)
6874 {
6875     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6876     register const Py_UNICODE *e;
6877     int cased;
6878
6879     /* Shortcut for single character strings */
6880     if (PyUnicode_GET_SIZE(self) == 1)
6881         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6882
6883     /* Special case for empty strings */
6884     if (PyUnicode_GET_SIZE(self) == 0)
6885         return PyBool_FromLong(0);
6886
6887     e = p + PyUnicode_GET_SIZE(self);
6888     cased = 0;
6889     for (; p < e; p++) {
6890         register const Py_UNICODE ch = *p;
6891
6892         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6893             return PyBool_FromLong(0);
6894         else if (!cased && Py_UNICODE_ISLOWER(ch))
6895             cased = 1;
6896     }
6897     return PyBool_FromLong(cased);
6898 }
6899
6900 PyDoc_STRVAR(isupper__doc__,
6901              "S.isupper() -> bool\n\
6902 \n\
6903 Return True if all cased characters in S are uppercase and there is\n\
6904 at least one cased character in S, False otherwise.");
6905
6906 static PyObject*
6907 unicode_isupper(PyUnicodeObject *self)
6908 {
6909     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6910     register const Py_UNICODE *e;
6911     int cased;
6912
6913     /* Shortcut for single character strings */
6914     if (PyUnicode_GET_SIZE(self) == 1)
6915         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6916
6917     /* Special case for empty strings */
6918     if (PyUnicode_GET_SIZE(self) == 0)
6919         return PyBool_FromLong(0);
6920
6921     e = p + PyUnicode_GET_SIZE(self);
6922     cased = 0;
6923     for (; p < e; p++) {
6924         register const Py_UNICODE ch = *p;
6925
6926         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6927             return PyBool_FromLong(0);
6928         else if (!cased && Py_UNICODE_ISUPPER(ch))
6929             cased = 1;
6930     }
6931     return PyBool_FromLong(cased);
6932 }
6933
6934 PyDoc_STRVAR(istitle__doc__,
6935              "S.istitle() -> bool\n\
6936 \n\
6937 Return True if S is a titlecased string and there is at least one\n\
6938 character in S, i.e. upper- and titlecase characters may only\n\
6939 follow uncased characters and lowercase characters only cased ones.\n\
6940 Return False otherwise.");
6941
6942 static PyObject*
6943 unicode_istitle(PyUnicodeObject *self)
6944 {
6945     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6946     register const Py_UNICODE *e;
6947     int cased, previous_is_cased;
6948
6949     /* Shortcut for single character strings */
6950     if (PyUnicode_GET_SIZE(self) == 1)
6951         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6952                                (Py_UNICODE_ISUPPER(*p) != 0));
6953
6954     /* Special case for empty strings */
6955     if (PyUnicode_GET_SIZE(self) == 0)
6956         return PyBool_FromLong(0);
6957
6958     e = p + PyUnicode_GET_SIZE(self);
6959     cased = 0;
6960     previous_is_cased = 0;
6961     for (; p < e; p++) {
6962         register const Py_UNICODE ch = *p;
6963
6964         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6965             if (previous_is_cased)
6966                 return PyBool_FromLong(0);
6967             previous_is_cased = 1;
6968             cased = 1;
6969         }
6970         else if (Py_UNICODE_ISLOWER(ch)) {
6971             if (!previous_is_cased)
6972                 return PyBool_FromLong(0);
6973             previous_is_cased = 1;
6974             cased = 1;
6975         }
6976         else
6977             previous_is_cased = 0;
6978     }
6979     return PyBool_FromLong(cased);
6980 }
6981
6982 PyDoc_STRVAR(isspace__doc__,
6983              "S.isspace() -> bool\n\
6984 \n\
6985 Return True if all characters in S are whitespace\n\
6986 and there is at least one character in S, False otherwise.");
6987
6988 static PyObject*
6989 unicode_isspace(PyUnicodeObject *self)
6990 {
6991     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6992     register const Py_UNICODE *e;
6993
6994     /* Shortcut for single character strings */
6995     if (PyUnicode_GET_SIZE(self) == 1 &&
6996         Py_UNICODE_ISSPACE(*p))
6997         return PyBool_FromLong(1);
6998
6999     /* Special case for empty strings */
7000     if (PyUnicode_GET_SIZE(self) == 0)
7001         return PyBool_FromLong(0);
7002
7003     e = p + PyUnicode_GET_SIZE(self);
7004     for (; p < e; p++) {
7005         if (!Py_UNICODE_ISSPACE(*p))
7006             return PyBool_FromLong(0);
7007     }
7008     return PyBool_FromLong(1);
7009 }
7010
7011 PyDoc_STRVAR(isalpha__doc__,
7012              "S.isalpha() -> bool\n\
7013 \n\
7014 Return True if all characters in S are alphabetic\n\
7015 and there is at least one character in S, False otherwise.");
7016
7017 static PyObject*
7018 unicode_isalpha(PyUnicodeObject *self)
7019 {
7020     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7021     register const Py_UNICODE *e;
7022
7023     /* Shortcut for single character strings */
7024     if (PyUnicode_GET_SIZE(self) == 1 &&
7025         Py_UNICODE_ISALPHA(*p))
7026         return PyBool_FromLong(1);
7027
7028     /* Special case for empty strings */
7029     if (PyUnicode_GET_SIZE(self) == 0)
7030         return PyBool_FromLong(0);
7031
7032     e = p + PyUnicode_GET_SIZE(self);
7033     for (; p < e; p++) {
7034         if (!Py_UNICODE_ISALPHA(*p))
7035             return PyBool_FromLong(0);
7036     }
7037     return PyBool_FromLong(1);
7038 }
7039
7040 PyDoc_STRVAR(isalnum__doc__,
7041              "S.isalnum() -> bool\n\
7042 \n\
7043 Return True if all characters in S are alphanumeric\n\
7044 and there is at least one character in S, False otherwise.");
7045
7046 static PyObject*
7047 unicode_isalnum(PyUnicodeObject *self)
7048 {
7049     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7050     register const Py_UNICODE *e;
7051
7052     /* Shortcut for single character strings */
7053     if (PyUnicode_GET_SIZE(self) == 1 &&
7054         Py_UNICODE_ISALNUM(*p))
7055         return PyBool_FromLong(1);
7056
7057     /* Special case for empty strings */
7058     if (PyUnicode_GET_SIZE(self) == 0)
7059         return PyBool_FromLong(0);
7060
7061     e = p + PyUnicode_GET_SIZE(self);
7062     for (; p < e; p++) {
7063         if (!Py_UNICODE_ISALNUM(*p))
7064             return PyBool_FromLong(0);
7065     }
7066     return PyBool_FromLong(1);
7067 }
7068
7069 PyDoc_STRVAR(isdecimal__doc__,
7070              "S.isdecimal() -> bool\n\
7071 \n\
7072 Return True if there are only decimal characters in S,\n\
7073 False otherwise.");
7074
7075 static PyObject*
7076 unicode_isdecimal(PyUnicodeObject *self)
7077 {
7078     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7079     register const Py_UNICODE *e;
7080
7081     /* Shortcut for single character strings */
7082     if (PyUnicode_GET_SIZE(self) == 1 &&
7083         Py_UNICODE_ISDECIMAL(*p))
7084         return PyBool_FromLong(1);
7085
7086     /* Special case for empty strings */
7087     if (PyUnicode_GET_SIZE(self) == 0)
7088         return PyBool_FromLong(0);
7089
7090     e = p + PyUnicode_GET_SIZE(self);
7091     for (; p < e; p++) {
7092         if (!Py_UNICODE_ISDECIMAL(*p))
7093             return PyBool_FromLong(0);
7094     }
7095     return PyBool_FromLong(1);
7096 }
7097
7098 PyDoc_STRVAR(isdigit__doc__,
7099              "S.isdigit() -> bool\n\
7100 \n\
7101 Return True if all characters in S are digits\n\
7102 and there is at least one character in S, False otherwise.");
7103
7104 static PyObject*
7105 unicode_isdigit(PyUnicodeObject *self)
7106 {
7107     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7108     register const Py_UNICODE *e;
7109
7110     /* Shortcut for single character strings */
7111     if (PyUnicode_GET_SIZE(self) == 1 &&
7112         Py_UNICODE_ISDIGIT(*p))
7113         return PyBool_FromLong(1);
7114
7115     /* Special case for empty strings */
7116     if (PyUnicode_GET_SIZE(self) == 0)
7117         return PyBool_FromLong(0);
7118
7119     e = p + PyUnicode_GET_SIZE(self);
7120     for (; p < e; p++) {
7121         if (!Py_UNICODE_ISDIGIT(*p))
7122             return PyBool_FromLong(0);
7123     }
7124     return PyBool_FromLong(1);
7125 }
7126
7127 PyDoc_STRVAR(isnumeric__doc__,
7128              "S.isnumeric() -> bool\n\
7129 \n\
7130 Return True if there are only numeric characters in S,\n\
7131 False otherwise.");
7132
7133 static PyObject*
7134 unicode_isnumeric(PyUnicodeObject *self)
7135 {
7136     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7137     register const Py_UNICODE *e;
7138
7139     /* Shortcut for single character strings */
7140     if (PyUnicode_GET_SIZE(self) == 1 &&
7141         Py_UNICODE_ISNUMERIC(*p))
7142         return PyBool_FromLong(1);
7143
7144     /* Special case for empty strings */
7145     if (PyUnicode_GET_SIZE(self) == 0)
7146         return PyBool_FromLong(0);
7147
7148     e = p + PyUnicode_GET_SIZE(self);
7149     for (; p < e; p++) {
7150         if (!Py_UNICODE_ISNUMERIC(*p))
7151             return PyBool_FromLong(0);
7152     }
7153     return PyBool_FromLong(1);
7154 }
7155
7156 PyDoc_STRVAR(join__doc__,
7157              "S.join(iterable) -> unicode\n\
7158 \n\
7159 Return a string which is the concatenation of the strings in the\n\
7160 iterable.  The separator between elements is S.");
7161
7162 static PyObject*
7163 unicode_join(PyObject *self, PyObject *data)
7164 {
7165     return PyUnicode_Join(self, data);
7166 }
7167
7168 static Py_ssize_t
7169 unicode_length(PyUnicodeObject *self)
7170 {
7171     return self->length;
7172 }
7173
7174 PyDoc_STRVAR(ljust__doc__,
7175              "S.ljust(width[, fillchar]) -> int\n\
7176 \n\
7177 Return S left-justified in a Unicode string of length width. Padding is\n\
7178 done using the specified fill character (default is a space).");
7179
7180 static PyObject *
7181 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7182 {
7183     Py_ssize_t width;
7184     Py_UNICODE fillchar = ' ';
7185
7186     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7187         return NULL;
7188
7189     if (self->length >= width && PyUnicode_CheckExact(self)) {
7190         Py_INCREF(self);
7191         return (PyObject*) self;
7192     }
7193
7194     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7195 }
7196
7197 PyDoc_STRVAR(lower__doc__,
7198              "S.lower() -> unicode\n\
7199 \n\
7200 Return a copy of the string S converted to lowercase.");
7201
7202 static PyObject*
7203 unicode_lower(PyUnicodeObject *self)
7204 {
7205     return fixup(self, fixlower);
7206 }
7207
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format for each strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
7216
7217 /* externally visible for str.strip(unicode) */
7218 PyObject *
7219 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7220 {
7221     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7222     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7223     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7224     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7225     Py_ssize_t i, j;
7226
7227     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7228
7229     i = 0;
7230     if (striptype != RIGHTSTRIP) {
7231         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7232             i++;
7233         }
7234     }
7235
7236     j = len;
7237     if (striptype != LEFTSTRIP) {
7238         do {
7239             j--;
7240         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7241         j++;
7242     }
7243
7244     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7245         Py_INCREF(self);
7246         return (PyObject*)self;
7247     }
7248     else
7249         return PyUnicode_FromUnicode(s+i, j-i);
7250 }
7251
7252
7253 static PyObject *
7254 do_strip(PyUnicodeObject *self, int striptype)
7255 {
7256     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7257     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7258
7259     i = 0;
7260     if (striptype != RIGHTSTRIP) {
7261         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7262             i++;
7263         }
7264     }
7265
7266     j = len;
7267     if (striptype != LEFTSTRIP) {
7268         do {
7269             j--;
7270         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7271         j++;
7272     }
7273
7274     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7275         Py_INCREF(self);
7276         return (PyObject*)self;
7277     }
7278     else
7279         return PyUnicode_FromUnicode(s+i, j-i);
7280 }
7281
7282
7283 static PyObject *
7284 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7285 {
7286     PyObject *sep = NULL;
7287
7288     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7289         return NULL;
7290
7291     if (sep != NULL && sep != Py_None) {
7292         if (PyUnicode_Check(sep))
7293             return _PyUnicode_XStrip(self, striptype, sep);
7294         else if (PyString_Check(sep)) {
7295             PyObject *res;
7296             sep = PyUnicode_FromObject(sep);
7297             if (sep==NULL)
7298                 return NULL;
7299             res = _PyUnicode_XStrip(self, striptype, sep);
7300             Py_DECREF(sep);
7301             return res;
7302         }
7303         else {
7304             PyErr_Format(PyExc_TypeError,
7305                          "%s arg must be None, unicode or str",
7306                          STRIPNAME(striptype));
7307             return NULL;
7308         }
7309     }
7310
7311     return do_strip(self, striptype);
7312 }
7313
7314
7315 PyDoc_STRVAR(strip__doc__,
7316              "S.strip([chars]) -> unicode\n\
7317 \n\
7318 Return a copy of the string S with leading and trailing\n\
7319 whitespace removed.\n\
7320 If chars is given and not None, remove characters in chars instead.\n\
7321 If chars is a str, it will be converted to unicode before stripping");
7322
7323 static PyObject *
7324 unicode_strip(PyUnicodeObject *self, PyObject *args)
7325 {
7326     if (PyTuple_GET_SIZE(args) == 0)
7327         return do_strip(self, BOTHSTRIP); /* Common case */
7328     else
7329         return do_argstrip(self, BOTHSTRIP, args);
7330 }
7331
7332
7333 PyDoc_STRVAR(lstrip__doc__,
7334              "S.lstrip([chars]) -> unicode\n\
7335 \n\
7336 Return a copy of the string S with leading whitespace removed.\n\
7337 If chars is given and not None, remove characters in chars instead.\n\
7338 If chars is a str, it will be converted to unicode before stripping");
7339
7340 static PyObject *
7341 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7342 {
7343     if (PyTuple_GET_SIZE(args) == 0)
7344         return do_strip(self, LEFTSTRIP); /* Common case */
7345     else
7346         return do_argstrip(self, LEFTSTRIP, args);
7347 }
7348
7349
7350 PyDoc_STRVAR(rstrip__doc__,
7351              "S.rstrip([chars]) -> unicode\n\
7352 \n\
7353 Return a copy of the string S with trailing whitespace removed.\n\
7354 If chars is given and not None, remove characters in chars instead.\n\
7355 If chars is a str, it will be converted to unicode before stripping");
7356
7357 static PyObject *
7358 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7359 {
7360     if (PyTuple_GET_SIZE(args) == 0)
7361         return do_strip(self, RIGHTSTRIP); /* Common case */
7362     else
7363         return do_argstrip(self, RIGHTSTRIP, args);
7364 }
7365
7366
7367 static PyObject*
7368 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7369 {
7370     PyUnicodeObject *u;
7371     Py_UNICODE *p;
7372     Py_ssize_t nchars;
7373     size_t nbytes;
7374
7375     if (len < 0)
7376         len = 0;
7377
7378     if (len == 1 && PyUnicode_CheckExact(str)) {
7379         /* no repeat, return original string */
7380         Py_INCREF(str);
7381         return (PyObject*) str;
7382     }
7383
7384     /* ensure # of chars needed doesn't overflow int and # of bytes
7385      * needed doesn't overflow size_t
7386      */
7387     nchars = len * str->length;
7388     if (len && nchars / len != str->length) {
7389         PyErr_SetString(PyExc_OverflowError,
7390                         "repeated string is too long");
7391         return NULL;
7392     }
7393     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7394     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7395         PyErr_SetString(PyExc_OverflowError,
7396                         "repeated string is too long");
7397         return NULL;
7398     }
7399     u = _PyUnicode_New(nchars);
7400     if (!u)
7401         return NULL;
7402
7403     p = u->str;
7404
7405     if (str->length == 1 && len > 0) {
7406         Py_UNICODE_FILL(p, str->str[0], len);
7407     } else {
7408         Py_ssize_t done = 0; /* number of characters copied this far */
7409         if (done < nchars) {
7410             Py_UNICODE_COPY(p, str->str, str->length);
7411             done = str->length;
7412         }
7413         while (done < nchars) {
7414             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7415             Py_UNICODE_COPY(p+done, p, n);
7416             done += n;
7417         }
7418     }
7419
7420     return (PyObject*) u;
7421 }
7422
7423 PyObject *PyUnicode_Replace(PyObject *obj,
7424                             PyObject *subobj,
7425                             PyObject *replobj,
7426                             Py_ssize_t maxcount)
7427 {
7428     PyObject *self;
7429     PyObject *str1;
7430     PyObject *str2;
7431     PyObject *result;
7432
7433     self = PyUnicode_FromObject(obj);
7434     if (self == NULL)
7435         return NULL;
7436     str1 = PyUnicode_FromObject(subobj);
7437     if (str1 == NULL) {
7438         Py_DECREF(self);
7439         return NULL;
7440     }
7441     str2 = PyUnicode_FromObject(replobj);
7442     if (str2 == NULL) {
7443         Py_DECREF(self);
7444         Py_DECREF(str1);
7445         return NULL;
7446     }
7447     result = replace((PyUnicodeObject *)self,
7448                      (PyUnicodeObject *)str1,
7449                      (PyUnicodeObject *)str2,
7450                      maxcount);
7451     Py_DECREF(self);
7452     Py_DECREF(str1);
7453     Py_DECREF(str2);
7454     return result;
7455 }
7456
7457 PyDoc_STRVAR(replace__doc__,
7458              "S.replace (old, new[, count]) -> unicode\n\
7459 \n\
7460 Return a copy of S with all occurrences of substring\n\
7461 old replaced by new.  If the optional argument count is\n\
7462 given, only the first count occurrences are replaced.");
7463
7464 static PyObject*
7465 unicode_replace(PyUnicodeObject *self, PyObject *args)
7466 {
7467     PyUnicodeObject *str1;
7468     PyUnicodeObject *str2;
7469     Py_ssize_t maxcount = -1;
7470     PyObject *result;
7471
7472     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7473         return NULL;
7474     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7475     if (str1 == NULL)
7476         return NULL;
7477     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7478     if (str2 == NULL) {
7479         Py_DECREF(str1);
7480         return NULL;
7481     }
7482
7483     result = replace(self, str1, str2, maxcount);
7484
7485     Py_DECREF(str1);
7486     Py_DECREF(str2);
7487     return result;
7488 }
7489
7490 static
7491 PyObject *unicode_repr(PyObject *unicode)
7492 {
7493     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7494                                 PyUnicode_GET_SIZE(unicode),
7495                                 1);
7496 }
7497
7498 PyDoc_STRVAR(rfind__doc__,
7499              "S.rfind(sub [,start [,end]]) -> int\n\
7500 \n\
7501 Return the highest index in S where substring sub is found,\n\
7502 such that sub is contained within s[start:end].  Optional\n\
7503 arguments start and end are interpreted as in slice notation.\n\
7504 \n\
7505 Return -1 on failure.");
7506
7507 static PyObject *
7508 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7509 {
7510     PyObject *substring;
7511     Py_ssize_t start;
7512     Py_ssize_t end;
7513     Py_ssize_t result;
7514
7515     if (!_ParseTupleFinds(args, &substring, &start, &end))
7516         return NULL;
7517
7518     result = stringlib_rfind_slice(
7519         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7520         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7521         start, end
7522         );
7523
7524     Py_DECREF(substring);
7525
7526     return PyInt_FromSsize_t(result);
7527 }
7528
7529 PyDoc_STRVAR(rindex__doc__,
7530              "S.rindex(sub [,start [,end]]) -> int\n\
7531 \n\
7532 Like S.rfind() but raise ValueError when the substring is not found.");
7533
7534 static PyObject *
7535 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7536 {
7537     PyObject *substring;
7538     Py_ssize_t start;
7539     Py_ssize_t end;
7540     Py_ssize_t result;
7541
7542     if (!_ParseTupleFinds(args, &substring, &start, &end))
7543         return NULL;
7544
7545     result = stringlib_rfind_slice(
7546         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7547         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7548         start, end
7549         );
7550
7551     Py_DECREF(substring);
7552
7553     if (result < 0) {
7554         PyErr_SetString(PyExc_ValueError, "substring not found");
7555         return NULL;
7556     }
7557     return PyInt_FromSsize_t(result);
7558 }
7559
7560 PyDoc_STRVAR(rjust__doc__,
7561              "S.rjust(width[, fillchar]) -> unicode\n\
7562 \n\
7563 Return S right-justified in a Unicode string of length width. Padding is\n\
7564 done using the specified fill character (default is a space).");
7565
7566 static PyObject *
7567 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7568 {
7569     Py_ssize_t width;
7570     Py_UNICODE fillchar = ' ';
7571
7572     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7573         return NULL;
7574
7575     if (self->length >= width && PyUnicode_CheckExact(self)) {
7576         Py_INCREF(self);
7577         return (PyObject*) self;
7578     }
7579
7580     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7581 }
7582
7583 static PyObject*
7584 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7585 {
7586     /* standard clamping */
7587     if (start < 0)
7588         start = 0;
7589     if (end < 0)
7590         end = 0;
7591     if (end > self->length)
7592         end = self->length;
7593     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7594         /* full slice, return original string */
7595         Py_INCREF(self);
7596         return (PyObject*) self;
7597     }
7598     if (start > end)
7599         start = end;
7600     /* copy slice */
7601     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7602                                              end - start);
7603 }
7604
7605 PyObject *PyUnicode_Split(PyObject *s,
7606                           PyObject *sep,
7607                           Py_ssize_t maxsplit)
7608 {
7609     PyObject *result;
7610
7611     s = PyUnicode_FromObject(s);
7612     if (s == NULL)
7613         return NULL;
7614     if (sep != NULL) {
7615         sep = PyUnicode_FromObject(sep);
7616         if (sep == NULL) {
7617             Py_DECREF(s);
7618             return NULL;
7619         }
7620     }
7621
7622     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7623
7624     Py_DECREF(s);
7625     Py_XDECREF(sep);
7626     return result;
7627 }
7628
7629 PyDoc_STRVAR(split__doc__,
7630              "S.split([sep [,maxsplit]]) -> list of strings\n\
7631 \n\
7632 Return a list of the words in S, using sep as the\n\
7633 delimiter string.  If maxsplit is given, at most maxsplit\n\
7634 splits are done. If sep is not specified or is None, any\n\
7635 whitespace string is a separator and empty strings are\n\
7636 removed from the result.");
7637
7638 static PyObject*
7639 unicode_split(PyUnicodeObject *self, PyObject *args)
7640 {
7641     PyObject *substring = Py_None;
7642     Py_ssize_t maxcount = -1;
7643
7644     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7645         return NULL;
7646
7647     if (substring == Py_None)
7648         return split(self, NULL, maxcount);
7649     else if (PyUnicode_Check(substring))
7650         return split(self, (PyUnicodeObject *)substring, maxcount);
7651     else
7652         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7653 }
7654
7655 PyObject *
7656 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7657 {
7658     PyObject* str_obj;
7659     PyObject* sep_obj;
7660     PyObject* out;
7661
7662     str_obj = PyUnicode_FromObject(str_in);
7663     if (!str_obj)
7664         return NULL;
7665     sep_obj = PyUnicode_FromObject(sep_in);
7666     if (!sep_obj) {
7667         Py_DECREF(str_obj);
7668         return NULL;
7669     }
7670
7671     out = stringlib_partition(
7672         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7673         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7674         );
7675
7676     Py_DECREF(sep_obj);
7677     Py_DECREF(str_obj);
7678
7679     return out;
7680 }
7681
7682
7683 PyObject *
7684 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7685 {
7686     PyObject* str_obj;
7687     PyObject* sep_obj;
7688     PyObject* out;
7689
7690     str_obj = PyUnicode_FromObject(str_in);
7691     if (!str_obj)
7692         return NULL;
7693     sep_obj = PyUnicode_FromObject(sep_in);
7694     if (!sep_obj) {
7695         Py_DECREF(str_obj);
7696         return NULL;
7697     }
7698
7699     out = stringlib_rpartition(
7700         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7701         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7702         );
7703
7704     Py_DECREF(sep_obj);
7705     Py_DECREF(str_obj);
7706
7707     return out;
7708 }
7709
7710 PyDoc_STRVAR(partition__doc__,
7711              "S.partition(sep) -> (head, sep, tail)\n\
7712 \n\
7713 Search for the separator sep in S, and return the part before it,\n\
7714 the separator itself, and the part after it.  If the separator is not\n\
7715 found, return S and two empty strings.");
7716
7717 static PyObject*
7718 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7719 {
7720     return PyUnicode_Partition((PyObject *)self, separator);
7721 }
7722
7723 PyDoc_STRVAR(rpartition__doc__,
7724              "S.rpartition(sep) -> (tail, sep, head)\n\
7725 \n\
7726 Search for the separator sep in S, starting at the end of S, and return\n\
7727 the part before it, the separator itself, and the part after it.  If the\n\
7728 separator is not found, return two empty strings and S.");
7729
7730 static PyObject*
7731 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7732 {
7733     return PyUnicode_RPartition((PyObject *)self, separator);
7734 }
7735
7736 PyObject *PyUnicode_RSplit(PyObject *s,
7737                            PyObject *sep,
7738                            Py_ssize_t maxsplit)
7739 {
7740     PyObject *result;
7741
7742     s = PyUnicode_FromObject(s);
7743     if (s == NULL)
7744         return NULL;
7745     if (sep != NULL) {
7746         sep = PyUnicode_FromObject(sep);
7747         if (sep == NULL) {
7748             Py_DECREF(s);
7749             return NULL;
7750         }
7751     }
7752
7753     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7754
7755     Py_DECREF(s);
7756     Py_XDECREF(sep);
7757     return result;
7758 }
7759
7760 PyDoc_STRVAR(rsplit__doc__,
7761              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7762 \n\
7763 Return a list of the words in S, using sep as the\n\
7764 delimiter string, starting at the end of the string and\n\
7765 working to the front.  If maxsplit is given, at most maxsplit\n\
7766 splits are done. If sep is not specified, any whitespace string\n\
7767 is a separator.");
7768
7769 static PyObject*
7770 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7771 {
7772     PyObject *substring = Py_None;
7773     Py_ssize_t maxcount = -1;
7774
7775     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7776         return NULL;
7777
7778     if (substring == Py_None)
7779         return rsplit(self, NULL, maxcount);
7780     else if (PyUnicode_Check(substring))
7781         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7782     else
7783         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7784 }
7785
7786 PyDoc_STRVAR(splitlines__doc__,
7787              "S.splitlines([keepends]) -> list of strings\n\
7788 \n\
7789 Return a list of the lines in S, breaking at line boundaries.\n\
7790 Line breaks are not included in the resulting list unless keepends\n\
7791 is given and true.");
7792
7793 static PyObject*
7794 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7795 {
7796     int keepends = 0;
7797
7798     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7799         return NULL;
7800
7801     return PyUnicode_Splitlines((PyObject *)self, keepends);
7802 }
7803
7804 static
7805 PyObject *unicode_str(PyUnicodeObject *self)
7806 {
7807     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7808 }
7809
7810 PyDoc_STRVAR(swapcase__doc__,
7811              "S.swapcase() -> unicode\n\
7812 \n\
7813 Return a copy of S with uppercase characters converted to lowercase\n\
7814 and vice versa.");
7815
7816 static PyObject*
7817 unicode_swapcase(PyUnicodeObject *self)
7818 {
7819     return fixup(self, fixswapcase);
7820 }
7821
7822 PyDoc_STRVAR(translate__doc__,
7823              "S.translate(table) -> unicode\n\
7824 \n\
7825 Return a copy of the string S, where all characters have been mapped\n\
7826 through the given translation table, which must be a mapping of\n\
7827 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7828 Unmapped characters are left untouched. Characters mapped to None\n\
7829 are deleted.");
7830
7831 static PyObject*
7832 unicode_translate(PyUnicodeObject *self, PyObject *table)
7833 {
7834     return PyUnicode_TranslateCharmap(self->str,
7835                                       self->length,
7836                                       table,
7837                                       "ignore");
7838 }
7839
7840 PyDoc_STRVAR(upper__doc__,
7841              "S.upper() -> unicode\n\
7842 \n\
7843 Return a copy of S converted to uppercase.");
7844
7845 static PyObject*
7846 unicode_upper(PyUnicodeObject *self)
7847 {
7848     return fixup(self, fixupper);
7849 }
7850
7851 PyDoc_STRVAR(zfill__doc__,
7852              "S.zfill(width) -> unicode\n\
7853 \n\
7854 Pad a numeric string S with zeros on the left, to fill a field\n\
7855 of the specified width. The string S is never truncated.");
7856
7857 static PyObject *
7858 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7859 {
7860     Py_ssize_t fill;
7861     PyUnicodeObject *u;
7862
7863     Py_ssize_t width;
7864     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7865         return NULL;
7866
7867     if (self->length >= width) {
7868         if (PyUnicode_CheckExact(self)) {
7869             Py_INCREF(self);
7870             return (PyObject*) self;
7871         }
7872         else
7873             return PyUnicode_FromUnicode(
7874                 PyUnicode_AS_UNICODE(self),
7875                 PyUnicode_GET_SIZE(self)
7876                 );
7877     }
7878
7879     fill = width - self->length;
7880
7881     u = pad(self, fill, 0, '0');
7882
7883     if (u == NULL)
7884         return NULL;
7885
7886     if (u->str[fill] == '+' || u->str[fill] == '-') {
7887         /* move sign to beginning of string */
7888         u->str[0] = u->str[fill];
7889         u->str[fill] = '0';
7890     }
7891
7892     return (PyObject*) u;
7893 }
7894
#if 0
/* Compiled out.  Returns the current value of numfree as an int object;
   its only (also compiled-out) user is the "freelistsize" entry in
   unicode_methods. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7902
7903 PyDoc_STRVAR(startswith__doc__,
7904              "S.startswith(prefix[, start[, end]]) -> bool\n\
7905 \n\
7906 Return True if S starts with the specified prefix, False otherwise.\n\
7907 With optional start, test S beginning at that position.\n\
7908 With optional end, stop comparing S at that position.\n\
7909 prefix can also be a tuple of strings to try.");
7910
7911 static PyObject *
7912 unicode_startswith(PyUnicodeObject *self,
7913                    PyObject *args)
7914 {
7915     PyObject *subobj;
7916     PyUnicodeObject *substring;
7917     Py_ssize_t start = 0;
7918     Py_ssize_t end = PY_SSIZE_T_MAX;
7919     int result;
7920
7921     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7922                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7923         return NULL;
7924     if (PyTuple_Check(subobj)) {
7925         Py_ssize_t i;
7926         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7927             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7928                 PyTuple_GET_ITEM(subobj, i));
7929             if (substring == NULL)
7930                 return NULL;
7931             result = tailmatch(self, substring, start, end, -1);
7932             Py_DECREF(substring);
7933             if (result) {
7934                 Py_RETURN_TRUE;
7935             }
7936         }
7937         /* nothing matched */
7938         Py_RETURN_FALSE;
7939     }
7940     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7941     if (substring == NULL)
7942         return NULL;
7943     result = tailmatch(self, substring, start, end, -1);
7944     Py_DECREF(substring);
7945     return PyBool_FromLong(result);
7946 }
7947
7948
7949 PyDoc_STRVAR(endswith__doc__,
7950              "S.endswith(suffix[, start[, end]]) -> bool\n\
7951 \n\
7952 Return True if S ends with the specified suffix, False otherwise.\n\
7953 With optional start, test S beginning at that position.\n\
7954 With optional end, stop comparing S at that position.\n\
7955 suffix can also be a tuple of strings to try.");
7956
7957 static PyObject *
7958 unicode_endswith(PyUnicodeObject *self,
7959                  PyObject *args)
7960 {
7961     PyObject *subobj;
7962     PyUnicodeObject *substring;
7963     Py_ssize_t start = 0;
7964     Py_ssize_t end = PY_SSIZE_T_MAX;
7965     int result;
7966
7967     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7968                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7969         return NULL;
7970     if (PyTuple_Check(subobj)) {
7971         Py_ssize_t i;
7972         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7973             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7974                 PyTuple_GET_ITEM(subobj, i));
7975             if (substring == NULL)
7976                 return NULL;
7977             result = tailmatch(self, substring, start, end, +1);
7978             Py_DECREF(substring);
7979             if (result) {
7980                 Py_RETURN_TRUE;
7981             }
7982         }
7983         Py_RETURN_FALSE;
7984     }
7985     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7986     if (substring == NULL)
7987         return NULL;
7988
7989     result = tailmatch(self, substring, start, end, +1);
7990     Py_DECREF(substring);
7991     return PyBool_FromLong(result);
7992 }
7993
7994
7995 /* Implements do_string_format, which is unicode because of stringlib */
7996 #include "stringlib/string_format.h"
7997
7998 PyDoc_STRVAR(format__doc__,
7999              "S.format(*args, **kwargs) -> unicode\n\
8000 \n\
8001 ");
8002
8003 static PyObject *
8004 unicode__format__(PyObject *self, PyObject *args)
8005 {
8006     PyObject *format_spec;
8007     PyObject *result = NULL;
8008     PyObject *tmp = NULL;
8009
8010     /* If 2.x, convert format_spec to the same type as value */
8011     /* This is to allow things like u''.format('') */
8012     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
8013         goto done;
8014     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
8015         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
8016                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
8017         goto done;
8018     }
8019     tmp = PyObject_Unicode(format_spec);
8020     if (tmp == NULL)
8021         goto done;
8022     format_spec = tmp;
8023
8024     result = _PyUnicode_FormatAdvanced(self,
8025                                        PyUnicode_AS_UNICODE(format_spec),
8026                                        PyUnicode_GET_SIZE(format_spec));
8027   done:
8028     Py_XDECREF(tmp);
8029     return result;
8030 }
8031
8032 PyDoc_STRVAR(p_format__doc__,
8033              "S.__format__(format_spec) -> unicode\n\
8034 \n\
8035 ");
8036
8037 static PyObject *
8038 unicode__sizeof__(PyUnicodeObject *v)
8039 {
8040     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
8041                              sizeof(Py_UNICODE) * (v->length + 1));
8042 }
8043
8044 PyDoc_STRVAR(sizeof__doc__,
8045              "S.__sizeof__() -> size of S in memory, in bytes\n\
8046 \n\
8047 ");
8048
8049 static PyObject *
8050 unicode_getnewargs(PyUnicodeObject *v)
8051 {
8052     return Py_BuildValue("(u#)", v->str, v->length);
8053 }
8054
8055
8056 static PyMethodDef unicode_methods[] = {
8057
8058     /* Order is according to common usage: often used methods should
8059        appear first, since lookup is done sequentially. */
8060
8061     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
8062     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8063     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8064     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8065     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8066     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8067     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8068     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8069     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8070     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8071     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8072     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8073     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8074     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8075     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8076     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8077     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
8078 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8079     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8080     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8081     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8082     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8083     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8084     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8085     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8086     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8087     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8088     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8089     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8090     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8091     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8092     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8093     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8094     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8095     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8096     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8097     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8098     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8099     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8100     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8101     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8102     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8103     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8104     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8105     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8106 #if 0
8107     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8108 #endif
8109
8110 #if 0
8111     /* This one is just used for debugging the implementation. */
8112     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8113 #endif
8114
8115     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
8116     {NULL, NULL}
8117 };
8118
8119 static PyObject *
8120 unicode_mod(PyObject *v, PyObject *w)
8121 {
8122     if (!PyUnicode_Check(v)) {
8123         Py_INCREF(Py_NotImplemented);
8124         return Py_NotImplemented;
8125     }
8126     return PyUnicode_Format(v, w);
8127 }
8128
8129 static PyNumberMethods unicode_as_number = {
8130     0,              /*nb_add*/
8131     0,              /*nb_subtract*/
8132     0,              /*nb_multiply*/
8133     0,              /*nb_divide*/
8134     unicode_mod,            /*nb_remainder*/
8135 };
8136
8137 static PySequenceMethods unicode_as_sequence = {
8138     (lenfunc) unicode_length,       /* sq_length */
8139     PyUnicode_Concat,           /* sq_concat */
8140     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
8141     (ssizeargfunc) unicode_getitem,     /* sq_item */
8142     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
8143     0,                  /* sq_ass_item */
8144     0,                  /* sq_ass_slice */
8145     PyUnicode_Contains,         /* sq_contains */
8146 };
8147
8148 static PyObject*
8149 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8150 {
8151     if (PyIndex_Check(item)) {
8152         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8153         if (i == -1 && PyErr_Occurred())
8154             return NULL;
8155         if (i < 0)
8156             i += PyUnicode_GET_SIZE(self);
8157         return unicode_getitem(self, i);
8158     } else if (PySlice_Check(item)) {
8159         Py_ssize_t start, stop, step, slicelength, cur, i;
8160         Py_UNICODE* source_buf;
8161         Py_UNICODE* result_buf;
8162         PyObject* result;
8163
8164         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8165                                  &start, &stop, &step, &slicelength) < 0) {
8166             return NULL;
8167         }
8168
8169         if (slicelength <= 0) {
8170             return PyUnicode_FromUnicode(NULL, 0);
8171         } else if (start == 0 && step == 1 && slicelength == self->length &&
8172                    PyUnicode_CheckExact(self)) {
8173             Py_INCREF(self);
8174             return (PyObject *)self;
8175         } else if (step == 1) {
8176             return PyUnicode_FromUnicode(self->str + start, slicelength);
8177         } else {
8178             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8179             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8180                                                        sizeof(Py_UNICODE));
8181
8182             if (result_buf == NULL)
8183                 return PyErr_NoMemory();
8184
8185             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8186                 result_buf[i] = source_buf[cur];
8187             }
8188
8189             result = PyUnicode_FromUnicode(result_buf, slicelength);
8190             PyObject_FREE(result_buf);
8191             return result;
8192         }
8193     } else {
8194         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8195         return NULL;
8196     }
8197 }
8198
8199 static PyMappingMethods unicode_as_mapping = {
8200     (lenfunc)unicode_length,        /* mp_length */
8201     (binaryfunc)unicode_subscript,  /* mp_subscript */
8202     (objobjargproc)0,           /* mp_ass_subscript */
8203 };
8204
8205 static Py_ssize_t
8206 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8207                           Py_ssize_t index,
8208                           const void **ptr)
8209 {
8210     if (index != 0) {
8211         PyErr_SetString(PyExc_SystemError,
8212                         "accessing non-existent unicode segment");
8213         return -1;
8214     }
8215     *ptr = (void *) self->str;
8216     return PyUnicode_GET_DATA_SIZE(self);
8217 }
8218
8219 static Py_ssize_t
8220 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8221                            const void **ptr)
8222 {
8223     PyErr_SetString(PyExc_TypeError,
8224                     "cannot use unicode as modifiable buffer");
8225     return -1;
8226 }
8227
8228 static int
8229 unicode_buffer_getsegcount(PyUnicodeObject *self,
8230                            Py_ssize_t *lenp)
8231 {
8232     if (lenp)
8233         *lenp = PyUnicode_GET_DATA_SIZE(self);
8234     return 1;
8235 }
8236
8237 static Py_ssize_t
8238 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8239                           Py_ssize_t index,
8240                           const void **ptr)
8241 {
8242     PyObject *str;
8243
8244     if (index != 0) {
8245         PyErr_SetString(PyExc_SystemError,
8246                         "accessing non-existent unicode segment");
8247         return -1;
8248     }
8249     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8250     if (str == NULL)
8251         return -1;
8252     *ptr = (void *) PyString_AS_STRING(str);
8253     return PyString_GET_SIZE(str);
8254 }
8255
8256 /* Helpers for PyUnicode_Format() */
8257
8258 static PyObject *
8259 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8260 {
8261     Py_ssize_t argidx = *p_argidx;
8262     if (argidx < arglen) {
8263         (*p_argidx)++;
8264         if (arglen < 0)
8265             return args;
8266         else
8267             return PyTuple_GetItem(args, argidx);
8268     }
8269     PyErr_SetString(PyExc_TypeError,
8270                     "not enough arguments for format string");
8271     return NULL;
8272 }
8273
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
8279
8280 static Py_ssize_t
8281 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8282 {
8283     register Py_ssize_t i;
8284     Py_ssize_t len = strlen(charbuffer);
8285     for (i = len - 1; i >= 0; i--)
8286         buffer[i] = (Py_UNICODE) charbuffer[i];
8287
8288     return len;
8289 }
8290
8291 static int
8292 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8293 {
8294     Py_ssize_t result;
8295
8296     PyOS_snprintf((char *)buffer, len, format, x);
8297     result = strtounicode(buffer, (char *)buffer);
8298     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8299 }
8300
8301 /* XXX To save some code duplication, formatfloat/long/int could have been
8302    shared with stringobject.c, converting from 8-bit to Unicode after the
8303    formatting is done. */
8304
8305 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8306
8307 static PyObject *
8308 formatfloat(PyObject *v, int flags, int prec, int type)
8309 {
8310     char *p;
8311     PyObject *result;
8312     double x;
8313
8314     x = PyFloat_AsDouble(v);
8315     if (x == -1.0 && PyErr_Occurred())
8316         return NULL;
8317
8318     if (prec < 0)
8319         prec = 6;
8320
8321     p = PyOS_double_to_string(x, type, prec,
8322                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8323     if (p == NULL)
8324         return NULL;
8325     result = PyUnicode_FromStringAndSize(p, strlen(p));
8326     PyMem_Free(p);
8327     return result;
8328 }
8329
8330 static PyObject*
8331 formatlong(PyObject *val, int flags, int prec, int type)
8332 {
8333     char *buf;
8334     int i, len;
8335     PyObject *str; /* temporary string object. */
8336     PyUnicodeObject *result;
8337
8338     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8339     if (!str)
8340         return NULL;
8341     result = _PyUnicode_New(len);
8342     if (!result) {
8343         Py_DECREF(str);
8344         return NULL;
8345     }
8346     for (i = 0; i < len; i++)
8347         result->str[i] = buf[i];
8348     result->str[len] = 0;
8349     Py_DECREF(str);
8350     return (PyObject*)result;
8351 }
8352
8353 static int
8354 formatint(Py_UNICODE *buf,
8355           size_t buflen,
8356           int flags,
8357           int prec,
8358           int type,
8359           PyObject *v)
8360 {
8361     /* fmt = '%#.' + `prec` + 'l' + `type`
8362      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8363      *                     + 1 + 1
8364      *                   = 24
8365      */
8366     char fmt[64]; /* plenty big enough! */
8367     char *sign;
8368     long x;
8369
8370     x = PyInt_AsLong(v);
8371     if (x == -1 && PyErr_Occurred())
8372         return -1;
8373     if (x < 0 && type == 'u') {
8374         type = 'd';
8375     }
8376     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8377         sign = "-";
8378     else
8379         sign = "";
8380     if (prec < 0)
8381         prec = 1;
8382
8383     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8384      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8385      */
8386     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8387         PyErr_SetString(PyExc_OverflowError,
8388                         "formatted integer is too long (precision too large?)");
8389         return -1;
8390     }
8391
8392     if ((flags & F_ALT) &&
8393         (type == 'x' || type == 'X')) {
8394         /* When converting under %#x or %#X, there are a number
8395          * of issues that cause pain:
8396          * - when 0 is being converted, the C standard leaves off
8397          *   the '0x' or '0X', which is inconsistent with other
8398          *   %#x/%#X conversions and inconsistent with Python's
8399          *   hex() function
8400          * - there are platforms that violate the standard and
8401          *   convert 0 with the '0x' or '0X'
8402          *   (Metrowerks, Compaq Tru64)
8403          * - there are platforms that give '0x' when converting
8404          *   under %#X, but convert 0 in accordance with the
8405          *   standard (OS/2 EMX)
8406          *
8407          * We can achieve the desired consistency by inserting our
8408          * own '0x' or '0X' prefix, and substituting %x/%X in place
8409          * of %#x/%#X.
8410          *
8411          * Note that this is the same approach as used in
8412          * formatint() in stringobject.c
8413          */
8414         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8415                       sign, type, prec, type);
8416     }
8417     else {
8418         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8419                       sign, (flags&F_ALT) ? "#" : "",
8420                       prec, type);
8421     }
8422     if (sign[0])
8423         return longtounicode(buf, buflen, fmt, -x);
8424     else
8425         return longtounicode(buf, buflen, fmt, x);
8426 }
8427
8428 static int
8429 formatchar(Py_UNICODE *buf,
8430            size_t buflen,
8431            PyObject *v)
8432 {
8433     /* presume that the buffer is at least 2 characters long */
8434     if (PyUnicode_Check(v)) {
8435         if (PyUnicode_GET_SIZE(v) != 1)
8436             goto onError;
8437         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8438     }
8439
8440     else if (PyString_Check(v)) {
8441         if (PyString_GET_SIZE(v) != 1)
8442             goto onError;
8443         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8444     }
8445
8446     else {
8447         /* Integer input truncated to a character */
8448         long x;
8449         x = PyInt_AsLong(v);
8450         if (x == -1 && PyErr_Occurred())
8451             goto onError;
8452 #ifdef Py_UNICODE_WIDE
8453         if (x < 0 || x > 0x10ffff) {
8454             PyErr_SetString(PyExc_OverflowError,
8455                             "%c arg not in range(0x110000) "
8456                             "(wide Python build)");
8457             return -1;
8458         }
8459 #else
8460         if (x < 0 || x > 0xffff) {
8461             PyErr_SetString(PyExc_OverflowError,
8462                             "%c arg not in range(0x10000) "
8463                             "(narrow Python build)");
8464             return -1;
8465         }
8466 #endif
8467         buf[0] = (Py_UNICODE) x;
8468     }
8469     buf[1] = '\0';
8470     return 1;
8471
8472   onError:
8473     PyErr_SetString(PyExc_TypeError,
8474                     "%c requires int or char");
8475     return -1;
8476 }
8477
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch buffer
   used to format ints and chars.  XXX This is a magic number.  Each
   formatting routine does its own bounds checking so the buffer cannot
   overflow, but a better solution may be to malloc a buffer of the
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
8487
8488 PyObject *PyUnicode_Format(PyObject *format,
8489                            PyObject *args)
8490 {
8491     Py_UNICODE *fmt, *res;
8492     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8493     int args_owned = 0;
8494     PyUnicodeObject *result = NULL;
8495     PyObject *dict = NULL;
8496     PyObject *uformat;
8497
8498     if (format == NULL || args == NULL) {
8499         PyErr_BadInternalCall();
8500         return NULL;
8501     }
8502     uformat = PyUnicode_FromObject(format);
8503     if (uformat == NULL)
8504         return NULL;
8505     fmt = PyUnicode_AS_UNICODE(uformat);
8506     fmtcnt = PyUnicode_GET_SIZE(uformat);
8507
8508     reslen = rescnt = fmtcnt + 100;
8509     result = _PyUnicode_New(reslen);
8510     if (result == NULL)
8511         goto onError;
8512     res = PyUnicode_AS_UNICODE(result);
8513
8514     if (PyTuple_Check(args)) {
8515         arglen = PyTuple_Size(args);
8516         argidx = 0;
8517     }
8518     else {
8519         arglen = -1;
8520         argidx = -2;
8521     }
8522     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8523         !PyObject_TypeCheck(args, &PyBaseString_Type))
8524         dict = args;
8525
8526     while (--fmtcnt >= 0) {
8527         if (*fmt != '%') {
8528             if (--rescnt < 0) {
8529                 rescnt = fmtcnt + 100;
8530                 reslen += rescnt;
8531                 if (_PyUnicode_Resize(&result, reslen) < 0)
8532                     goto onError;
8533                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8534                 --rescnt;
8535             }
8536             *res++ = *fmt++;
8537         }
8538         else {
8539             /* Got a format specifier */
8540             int flags = 0;
8541             Py_ssize_t width = -1;
8542             int prec = -1;
8543             Py_UNICODE c = '\0';
8544             Py_UNICODE fill;
8545             int isnumok;
8546             PyObject *v = NULL;
8547             PyObject *temp = NULL;
8548             Py_UNICODE *pbuf;
8549             Py_UNICODE sign;
8550             Py_ssize_t len;
8551             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8552
8553             fmt++;
8554             if (*fmt == '(') {
8555                 Py_UNICODE *keystart;
8556                 Py_ssize_t keylen;
8557                 PyObject *key;
8558                 int pcount = 1;
8559
8560                 if (dict == NULL) {
8561                     PyErr_SetString(PyExc_TypeError,
8562                                     "format requires a mapping");
8563                     goto onError;
8564                 }
8565                 ++fmt;
8566                 --fmtcnt;
8567                 keystart = fmt;
8568                 /* Skip over balanced parentheses */
8569                 while (pcount > 0 && --fmtcnt >= 0) {
8570                     if (*fmt == ')')
8571                         --pcount;
8572                     else if (*fmt == '(')
8573                         ++pcount;
8574                     fmt++;
8575                 }
8576                 keylen = fmt - keystart - 1;
8577                 if (fmtcnt < 0 || pcount > 0) {
8578                     PyErr_SetString(PyExc_ValueError,
8579                                     "incomplete format key");
8580                     goto onError;
8581                 }
8582 #if 0
8583                 /* keys are converted to strings using UTF-8 and
8584                    then looked up since Python uses strings to hold
8585                    variables names etc. in its namespaces and we
8586                    wouldn't want to break common idioms. */
8587                 key = PyUnicode_EncodeUTF8(keystart,
8588                                            keylen,
8589                                            NULL);
8590 #else
8591                 key = PyUnicode_FromUnicode(keystart, keylen);
8592 #endif
8593                 if (key == NULL)
8594                     goto onError;
8595                 if (args_owned) {
8596                     Py_DECREF(args);
8597                     args_owned = 0;
8598                 }
8599                 args = PyObject_GetItem(dict, key);
8600                 Py_DECREF(key);
8601                 if (args == NULL) {
8602                     goto onError;
8603                 }
8604                 args_owned = 1;
8605                 arglen = -1;
8606                 argidx = -2;
8607             }
8608             while (--fmtcnt >= 0) {
8609                 switch (c = *fmt++) {
8610                 case '-': flags |= F_LJUST; continue;
8611                 case '+': flags |= F_SIGN; continue;
8612                 case ' ': flags |= F_BLANK; continue;
8613                 case '#': flags |= F_ALT; continue;
8614                 case '0': flags |= F_ZERO; continue;
8615                 }
8616                 break;
8617             }
8618             if (c == '*') {
8619                 v = getnextarg(args, arglen, &argidx);
8620                 if (v == NULL)
8621                     goto onError;
8622                 if (!PyInt_Check(v)) {
8623                     PyErr_SetString(PyExc_TypeError,
8624                                     "* wants int");
8625                     goto onError;
8626                 }
8627                 width = PyInt_AsLong(v);
8628                 if (width < 0) {
8629                     flags |= F_LJUST;
8630                     width = -width;
8631                 }
8632                 if (--fmtcnt >= 0)
8633                     c = *fmt++;
8634             }
8635             else if (c >= '0' && c <= '9') {
8636                 width = c - '0';
8637                 while (--fmtcnt >= 0) {
8638                     c = *fmt++;
8639                     if (c < '0' || c > '9')
8640                         break;
8641                     if ((width*10) / 10 != width) {
8642                         PyErr_SetString(PyExc_ValueError,
8643                                         "width too big");
8644                         goto onError;
8645                     }
8646                     width = width*10 + (c - '0');
8647                 }
8648             }
8649             if (c == '.') {
8650                 prec = 0;
8651                 if (--fmtcnt >= 0)
8652                     c = *fmt++;
8653                 if (c == '*') {
8654                     v = getnextarg(args, arglen, &argidx);
8655                     if (v == NULL)
8656                         goto onError;
8657                     if (!PyInt_Check(v)) {
8658                         PyErr_SetString(PyExc_TypeError,
8659                                         "* wants int");
8660                         goto onError;
8661                     }
8662                     prec = PyInt_AsLong(v);
8663                     if (prec < 0)
8664                         prec = 0;
8665                     if (--fmtcnt >= 0)
8666                         c = *fmt++;
8667                 }
8668                 else if (c >= '0' && c <= '9') {
8669                     prec = c - '0';
8670                     while (--fmtcnt >= 0) {
8671                         c = Py_CHARMASK(*fmt++);
8672                         if (c < '0' || c > '9')
8673                             break;
8674                         if ((prec*10) / 10 != prec) {
8675                             PyErr_SetString(PyExc_ValueError,
8676                                             "prec too big");
8677                             goto onError;
8678                         }
8679                         prec = prec*10 + (c - '0');
8680                     }
8681                 }
8682             } /* prec */
8683             if (fmtcnt >= 0) {
8684                 if (c == 'h' || c == 'l' || c == 'L') {
8685                     if (--fmtcnt >= 0)
8686                         c = *fmt++;
8687                 }
8688             }
8689             if (fmtcnt < 0) {
8690                 PyErr_SetString(PyExc_ValueError,
8691                                 "incomplete format");
8692                 goto onError;
8693             }
8694             if (c != '%') {
8695                 v = getnextarg(args, arglen, &argidx);
8696                 if (v == NULL)
8697                     goto onError;
8698             }
8699             sign = 0;
8700             fill = ' ';
8701             switch (c) {
8702
8703             case '%':
8704                 pbuf = formatbuf;
8705                 /* presume that buffer length is at least 1 */
8706                 pbuf[0] = '%';
8707                 len = 1;
8708                 break;
8709
8710             case 's':
8711             case 'r':
8712                 if (PyUnicode_Check(v) && c == 's') {
8713                     temp = v;
8714                     Py_INCREF(temp);
8715                 }
8716                 else {
8717                     PyObject *unicode;
8718                     if (c == 's')
8719                         temp = PyObject_Unicode(v);
8720                     else
8721                         temp = PyObject_Repr(v);
8722                     if (temp == NULL)
8723                         goto onError;
8724                     if (PyUnicode_Check(temp))
8725                         /* nothing to do */;
8726                     else if (PyString_Check(temp)) {
8727                         /* convert to string to Unicode */
8728                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8729                                                    PyString_GET_SIZE(temp),
8730                                                    NULL,
8731                                                    "strict");
8732                         Py_DECREF(temp);
8733                         temp = unicode;
8734                         if (temp == NULL)
8735                             goto onError;
8736                     }
8737                     else {
8738                         Py_DECREF(temp);
8739                         PyErr_SetString(PyExc_TypeError,
8740                                         "%s argument has non-string str()");
8741                         goto onError;
8742                     }
8743                 }
8744                 pbuf = PyUnicode_AS_UNICODE(temp);
8745                 len = PyUnicode_GET_SIZE(temp);
8746                 if (prec >= 0 && len > prec)
8747                     len = prec;
8748                 break;
8749
8750             case 'i':
8751             case 'd':
8752             case 'u':
8753             case 'o':
8754             case 'x':
8755             case 'X':
8756                 if (c == 'i')
8757                     c = 'd';
8758                 isnumok = 0;
8759                 if (PyNumber_Check(v)) {
8760                     PyObject *iobj=NULL;
8761
8762                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8763                         iobj = v;
8764                         Py_INCREF(iobj);
8765                     }
8766                     else {
8767                         iobj = PyNumber_Int(v);
8768                         if (iobj==NULL) iobj = PyNumber_Long(v);
8769                     }
8770                     if (iobj!=NULL) {
8771                         if (PyInt_Check(iobj)) {
8772                             isnumok = 1;
8773                             pbuf = formatbuf;
8774                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8775                                             flags, prec, c, iobj);
8776                             Py_DECREF(iobj);
8777                             if (len < 0)
8778                                 goto onError;
8779                             sign = 1;
8780                         }
8781                         else if (PyLong_Check(iobj)) {
8782                             isnumok = 1;
8783                             temp = formatlong(iobj, flags, prec, c);
8784                             Py_DECREF(iobj);
8785                             if (!temp)
8786                                 goto onError;
8787                             pbuf = PyUnicode_AS_UNICODE(temp);
8788                             len = PyUnicode_GET_SIZE(temp);
8789                             sign = 1;
8790                         }
8791                         else {
8792                             Py_DECREF(iobj);
8793                         }
8794                     }
8795                 }
8796                 if (!isnumok) {
8797                     PyErr_Format(PyExc_TypeError,
8798                                  "%%%c format: a number is required, "
8799                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8800                     goto onError;
8801                 }
8802                 if (flags & F_ZERO)
8803                     fill = '0';
8804                 break;
8805
8806             case 'e':
8807             case 'E':
8808             case 'f':
8809             case 'F':
8810             case 'g':
8811             case 'G':
8812                 if (c == 'F')
8813                     c = 'f';
8814                 temp = formatfloat(v, flags, prec, c);
8815                 if (temp == NULL)
8816                     goto onError;
8817                 pbuf = PyUnicode_AS_UNICODE(temp);
8818                 len = PyUnicode_GET_SIZE(temp);
8819                 sign = 1;
8820                 if (flags & F_ZERO)
8821                     fill = '0';
8822                 break;
8823
8824             case 'c':
8825                 pbuf = formatbuf;
8826                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8827                 if (len < 0)
8828                     goto onError;
8829                 break;
8830
8831             default:
8832                 PyErr_Format(PyExc_ValueError,
8833                              "unsupported format character '%c' (0x%x) "
8834                              "at index %zd",
8835                              (31<=c && c<=126) ? (char)c : '?',
8836                              (int)c,
8837                              (Py_ssize_t)(fmt - 1 -
8838                                           PyUnicode_AS_UNICODE(uformat)));
8839                 goto onError;
8840             }
8841             if (sign) {
8842                 if (*pbuf == '-' || *pbuf == '+') {
8843                     sign = *pbuf++;
8844                     len--;
8845                 }
8846                 else if (flags & F_SIGN)
8847                     sign = '+';
8848                 else if (flags & F_BLANK)
8849                     sign = ' ';
8850                 else
8851                     sign = 0;
8852             }
8853             if (width < len)
8854                 width = len;
8855             if (rescnt - (sign != 0) < width) {
8856                 reslen -= rescnt;
8857                 rescnt = width + fmtcnt + 100;
8858                 reslen += rescnt;
8859                 if (reslen < 0) {
8860                     Py_XDECREF(temp);
8861                     PyErr_NoMemory();
8862                     goto onError;
8863                 }
8864                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8865                     Py_XDECREF(temp);
8866                     goto onError;
8867                 }
8868                 res = PyUnicode_AS_UNICODE(result)
8869                     + reslen - rescnt;
8870             }
8871             if (sign) {
8872                 if (fill != ' ')
8873                     *res++ = sign;
8874                 rescnt--;
8875                 if (width > len)
8876                     width--;
8877             }
8878             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8879                 assert(pbuf[0] == '0');
8880                 assert(pbuf[1] == c);
8881                 if (fill != ' ') {
8882                     *res++ = *pbuf++;
8883                     *res++ = *pbuf++;
8884                 }
8885                 rescnt -= 2;
8886                 width -= 2;
8887                 if (width < 0)
8888                     width = 0;
8889                 len -= 2;
8890             }
8891             if (width > len && !(flags & F_LJUST)) {
8892                 do {
8893                     --rescnt;
8894                     *res++ = fill;
8895                 } while (--width > len);
8896             }
8897             if (fill == ' ') {
8898                 if (sign)
8899                     *res++ = sign;
8900                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8901                     assert(pbuf[0] == '0');
8902                     assert(pbuf[1] == c);
8903                     *res++ = *pbuf++;
8904                     *res++ = *pbuf++;
8905                 }
8906             }
8907             Py_UNICODE_COPY(res, pbuf, len);
8908             res += len;
8909             rescnt -= len;
8910             while (--width >= len) {
8911                 --rescnt;
8912                 *res++ = ' ';
8913             }
8914             if (dict && (argidx < arglen) && c != '%') {
8915                 PyErr_SetString(PyExc_TypeError,
8916                                 "not all arguments converted during string formatting");
8917                 Py_XDECREF(temp);
8918                 goto onError;
8919             }
8920             Py_XDECREF(temp);
8921         } /* '%' */
8922     } /* until end */
8923     if (argidx < arglen && !dict) {
8924         PyErr_SetString(PyExc_TypeError,
8925                         "not all arguments converted during string formatting");
8926         goto onError;
8927     }
8928
8929     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8930         goto onError;
8931     if (args_owned) {
8932         Py_DECREF(args);
8933     }
8934     Py_DECREF(uformat);
8935     return (PyObject *)result;
8936
8937   onError:
8938     Py_XDECREF(result);
8939     Py_DECREF(uformat);
8940     if (args_owned) {
8941         Py_DECREF(args);
8942     }
8943     return NULL;
8944 }
8945
/* Buffer protocol table for unicode objects, wiring up the four
   unicode_buffer_* functions defined above. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* bf_getreadbuffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* bf_getwritebuffer */
    (segcountproc) unicode_buffer_getsegcount,      /* bf_getsegcount */
    (charbufferproc) unicode_buffer_getcharbuf,     /* bf_getcharbuffer */
};
8952
8953 static PyObject *
8954 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8955
8956 static PyObject *
8957 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8958 {
8959     PyObject *x = NULL;
8960     static char *kwlist[] = {"string", "encoding", "errors", 0};
8961     char *encoding = NULL;
8962     char *errors = NULL;
8963
8964     if (type != &PyUnicode_Type)
8965         return unicode_subtype_new(type, args, kwds);
8966     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8967                                      kwlist, &x, &encoding, &errors))
8968         return NULL;
8969     if (x == NULL)
8970         return (PyObject *)_PyUnicode_New(0);
8971     if (encoding == NULL && errors == NULL)
8972         return PyObject_Unicode(x);
8973     else
8974         return PyUnicode_FromEncodedObject(x, encoding, errors);
8975 }
8976
8977 static PyObject *
8978 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8979 {
8980     PyUnicodeObject *tmp, *pnew;
8981     Py_ssize_t n;
8982
8983     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8984     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8985     if (tmp == NULL)
8986         return NULL;
8987     assert(PyUnicode_Check(tmp));
8988     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8989     if (pnew == NULL) {
8990         Py_DECREF(tmp);
8991         return NULL;
8992     }
8993     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8994     if (pnew->str == NULL) {
8995         _Py_ForgetReference((PyObject *)pnew);
8996         PyObject_Del(pnew);
8997         Py_DECREF(tmp);
8998         return PyErr_NoMemory();
8999     }
9000     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9001     pnew->length = n;
9002     pnew->hash = tmp->hash;
9003     Py_DECREF(tmp);
9004     return (PyObject *)pnew;
9005 }
9006
/* Docstring of the unicode type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9013
/* Type object for unicode.  It derives from PyBaseString_Type, may itself
   be subclassed (Py_TPFLAGS_BASETYPE), and creates instances through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
9057
9058 /* Initialize the Unicode implementation */
9059
9060 void _PyUnicode_Init(void)
9061 {
9062     int i;
9063
9064     /* XXX - move this array to unicodectype.c ? */
9065     Py_UNICODE linebreak[] = {
9066         0x000A, /* LINE FEED */
9067         0x000D, /* CARRIAGE RETURN */
9068         0x001C, /* FILE SEPARATOR */
9069         0x001D, /* GROUP SEPARATOR */
9070         0x001E, /* RECORD SEPARATOR */
9071         0x0085, /* NEXT LINE */
9072         0x2028, /* LINE SEPARATOR */
9073         0x2029, /* PARAGRAPH SEPARATOR */
9074     };
9075
9076     /* Init the implementation */
9077     free_list = NULL;
9078     numfree = 0;
9079     unicode_empty = _PyUnicode_New(0);
9080     if (!unicode_empty)
9081         return;
9082
9083     strcpy(unicode_default_encoding, "ascii");
9084     for (i = 0; i < 256; i++)
9085         unicode_latin1[i] = NULL;
9086     if (PyType_Ready(&PyUnicode_Type) < 0)
9087         Py_FatalError("Can't initialize 'unicode'");
9088
9089     /* initialize the linebreak bloom filter */
9090     bloom_linebreak = make_bloom_mask(
9091         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9092         );
9093
9094     PyType_Ready(&EncodingMapType);
9095 }
9096
9097 /* Finalize the Unicode implementation */
9098
9099 int
9100 PyUnicode_ClearFreeList(void)
9101 {
9102     int freelist_size = numfree;
9103     PyUnicodeObject *u;
9104
9105     for (u = free_list; u != NULL;) {
9106         PyUnicodeObject *v = u;
9107         u = *(PyUnicodeObject **)u;
9108         if (v->str)
9109             PyObject_DEL(v->str);
9110         Py_XDECREF(v->defenc);
9111         PyObject_Del(v);
9112         numfree--;
9113     }
9114     free_list = NULL;
9115     assert(numfree == 0);
9116     return freelist_size;
9117 }
9118
9119 void
9120 _PyUnicode_Fini(void)
9121 {
9122     int i;
9123
9124     Py_XDECREF(unicode_empty);
9125     unicode_empty = NULL;
9126
9127     for (i = 0; i < 256; i++) {
9128         if (unicode_latin1[i]) {
9129             Py_DECREF(unicode_latin1[i]);
9130             unicode_latin1[i] = NULL;
9131         }
9132     }
9133     (void)PyUnicode_ClearFreeList();
9134 }
9135
9136 #ifdef __cplusplus
9137 }
9138 #endif
9139
9140
9141 /*
9142   Local variables:
9143   c-basic-offset: 4
9144   indent-tabs-mode: nil
9145   End:
9146 */