Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
/* Limit for the Unicode object free list: unicode_dealloc() only parks an
   object on the list while fewer than this many are already there. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   The size is compared against the object's length, i.e. it is counted
   in Py_UNICODE code units, not in bytes.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN gets defined,
   depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
/* Free list for Unicode objects.  free_list points at the most recently
   freed object; the first pointer-sized slot of every object on the list
   is reused to store the link to the next one.  numfree counts the
   objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance.  It may
   still be NULL, so users check for that before handing it out. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Slots are filled lazily, the first time the
   corresponding character is requested. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * HORIZONTAL TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * VERTICAL TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000D, * CARRIAGE RETURN */
 151     0, 0, 1, 0, 0, 1, 0, 0,
 152     0, 0, 0, 0, 0, 0, 0, 0,
 153 /*         0x001C, * FILE SEPARATOR */
 154 /*         0x001D, * GROUP SEPARATOR */
 155 /*         0x001E, * RECORD SEPARATOR */
 156     0, 0, 0, 0, 1, 1, 1, 0,
 157     0, 0, 0, 0, 0, 0, 0, 0,
 158     0, 0, 0, 0, 0, 0, 0, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163     0, 0, 0, 0, 0, 0, 0, 0,
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177     return 0x10FFFF;
 178 #else
 179     /* This is actually an illegal character, so it should
 180        not be passed to unichr. */
 181     return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
 187 /* stuff to implement simple "bloom filters" for Unicode characters.
 188    to keep things simple, we use a single bitmask, using the least 5
 189    bits from each unicode characters as the bit index. */
 190
 191 /* the linebreak mask is set up by Unicode_Init below */
 192
 193 #define BLOOM_MASK unsigned long
 194
 195 static BLOOM_MASK bloom_linebreak;
 196
 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199 #define BLOOM_LINEBREAK(ch)                                             \
 200     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 201      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204 {
 205     /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207     long mask;
 208     Py_ssize_t i;
 209
 210     mask = 0;
 211     for (i = 0; i < len; i++)
 212         mask |= (1 << (ptr[i] & 0x1F));
 213
 214     return mask;
 215 }
 216
 217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218 {
 219     Py_ssize_t i;
 220
 221     for (i = 0; i < setlen; i++)
 222         if (set[i] == chr)
 223             return 1;
 224
 225     return 0;
 226 }
 227
 228 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231 /* --- Unicode Object ----------------------------------------------------- */
 232
 233 static
 234 int unicode_resize(register PyUnicodeObject *unicode,
 235                    Py_ssize_t length)
 236 {
 237     void *oldstr;
 238
 239     /* Shortcut if there's nothing much to do. */
 240     if (unicode->length == length)
 241         goto reset;
 242
 243     /* Resizing shared object (unicode_empty or single character
 244        objects) in-place is not allowed. Use PyUnicode_Resize()
 245        instead ! */
 246
 247     if (unicode == unicode_empty ||
 248         (unicode->length == 1 &&
 249          unicode->str[0] < 256U &&
 250          unicode_latin1[unicode->str[0]] == unicode)) {
 251         PyErr_SetString(PyExc_SystemError,
 252                         "can't resize shared unicode objects");
 253         return -1;
 254     }
 255
 256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257        The overallocation is also used by fastsearch, which assumes that it's
 258        safe to look at str[length] (without making any assumptions about what
 259        it contains). */
 260
 261     oldstr = unicode->str;
 262     unicode->str = PyObject_REALLOC(unicode->str,
 263                                     sizeof(Py_UNICODE) * (length + 1));
 264     if (!unicode->str) {
 265         unicode->str = (Py_UNICODE *)oldstr;
 266         PyErr_NoMemory();
 267         return -1;
 268     }
 269     unicode->str[length] = 0;
 270     unicode->length = length;
 271
 272   reset:
 273     /* Reset the object caches */
 274     if (unicode->defenc) {
 275         Py_DECREF(unicode->defenc);
 276         unicode->defenc = NULL;
 277     }
 278     unicode->hash = -1;
 279
 280     return 0;
 281 }
 282
 283 /* We allocate one more byte to make sure the string is
 284    Ux0000 terminated -- XXX is this needed ?
 285
 286    XXX This allocator could further be enhanced by assuring that the
 287    free list never reduces its size below 1.
 288
 289 */
 290
 291 static
 292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293 {
 294     register PyUnicodeObject *unicode;
 295
 296     /* Optimization for empty strings */
 297     if (length == 0 && unicode_empty != NULL) {
 298         Py_INCREF(unicode_empty);
 299         return unicode_empty;
 300     }
 301
 302     /* Ensure we won't overflow the size. */
 303     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 304         return (PyUnicodeObject *)PyErr_NoMemory();
 305     }
 306
 307     /* Unicode freelist & memory allocation */
 308     if (free_list) {
 309         unicode = free_list;
 310         free_list = *(PyUnicodeObject **)unicode;
 311         numfree--;
 312         if (unicode->str) {
 313             /* Keep-Alive optimization: we only upsize the buffer,
 314                never downsize it. */
 315             if ((unicode->length < length) &&
 316                 unicode_resize(unicode, length) < 0) {
 317                 PyObject_DEL(unicode->str);
 318                 unicode->str = NULL;
 319             }
 320         }
 321         else {
 322             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 323             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 324         }
 325         PyObject_INIT(unicode, &PyUnicode_Type);
 326     }
 327     else {
 328         size_t new_size;
 329         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 330         if (unicode == NULL)
 331             return NULL;
 332         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 333         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 334     }
 335
 336     if (!unicode->str) {
 337         PyErr_NoMemory();
 338         goto onError;
 339     }
 340     /* Initialize the first element to guard against cases where
 341      * the caller fails before initializing str -- unicode_resize()
 342      * reads str[0], and the Keep-Alive optimization can keep memory
 343      * allocated for str alive across a call to unicode_dealloc(unicode).
 344      * We don't want unicode_resize to read uninitialized memory in
 345      * that case.
 346      */
 347     unicode->str[0] = 0;
 348     unicode->str[length] = 0;
 349     unicode->length = length;
 350     unicode->hash = -1;
 351     unicode->defenc = NULL;
 352     return unicode;
 353
 354   onError:
 355     /* XXX UNREF/NEWREF interface should be more symmetrical */
 356     _Py_DEC_REFTOTAL;
 357     _Py_ForgetReference((PyObject *)unicode);
 358     PyObject_Del(unicode);
 359     return NULL;
 360 }
 361
 362 static
 363 void unicode_dealloc(register PyUnicodeObject *unicode)
 364 {
 365     if (PyUnicode_CheckExact(unicode) &&
 366         numfree < PyUnicode_MAXFREELIST) {
 367         /* Keep-Alive optimization */
 368         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 369             PyObject_DEL(unicode->str);
 370             unicode->str = NULL;
 371             unicode->length = 0;
 372         }
 373         if (unicode->defenc) {
 374             Py_DECREF(unicode->defenc);
 375             unicode->defenc = NULL;
 376         }
 377         /* Add to free list */
 378         *(PyUnicodeObject **)unicode = free_list;
 379         free_list = unicode;
 380         numfree++;
 381     }
 382     else {
 383         PyObject_DEL(unicode->str);
 384         Py_XDECREF(unicode->defenc);
 385         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 386     }
 387 }
 388
 389 static
 390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 391 {
 392     register PyUnicodeObject *v;
 393
 394     /* Argument checks */
 395     if (unicode == NULL) {
 396         PyErr_BadInternalCall();
 397         return -1;
 398     }
 399     v = *unicode;
 400     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 401         PyErr_BadInternalCall();
 402         return -1;
 403     }
 404
 405     /* Resizing unicode_empty and single character objects is not
 406        possible since these are being shared. We simply return a fresh
 407        copy with the same Unicode content. */
 408     if (v->length != length &&
 409         (v == unicode_empty || v->length == 1)) {
 410         PyUnicodeObject *w = _PyUnicode_New(length);
 411         if (w == NULL)
 412             return -1;
 413         Py_UNICODE_COPY(w->str, v->str,
 414                         length < v->length ? length : v->length);
 415         Py_DECREF(*unicode);
 416         *unicode = w;
 417         return 0;
 418     }
 419
 420     /* Note that we don't have to modify *unicode for unshared Unicode
 421        objects, since we can modify them in-place. */
 422     return unicode_resize(v, length);
 423 }
 424
 425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 426 {
 427     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 428 }
 429
 430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 431                                 Py_ssize_t size)
 432 {
 433     PyUnicodeObject *unicode;
 434
 435     /* If the Unicode data is known at construction time, we can apply
 436        some optimizations which share commonly used objects. */
 437     if (u != NULL) {
 438
 439         /* Optimization for empty strings */
 440         if (size == 0 && unicode_empty != NULL) {
 441             Py_INCREF(unicode_empty);
 442             return (PyObject *)unicode_empty;
 443         }
 444
 445         /* Single character Unicode objects in the Latin-1 range are
 446            shared when using this constructor */
 447         if (size == 1 && *u < 256) {
 448             unicode = unicode_latin1[*u];
 449             if (!unicode) {
 450                 unicode = _PyUnicode_New(1);
 451                 if (!unicode)
 452                     return NULL;
 453                 unicode->str[0] = *u;
 454                 unicode_latin1[*u] = unicode;
 455             }
 456             Py_INCREF(unicode);
 457             return (PyObject *)unicode;
 458         }
 459     }
 460
 461     unicode = _PyUnicode_New(size);
 462     if (!unicode)
 463         return NULL;
 464
 465     /* Copy the Unicode data into the new object */
 466     if (u != NULL)
 467         Py_UNICODE_COPY(unicode->str, u, size);
 468
 469     return (PyObject *)unicode;
 470 }
 471
 472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 473 {
 474     PyUnicodeObject *unicode;
 475
 476     if (size < 0) {
 477         PyErr_SetString(PyExc_SystemError,
 478                         "Negative size passed to PyUnicode_FromStringAndSize");
 479         return NULL;
 480     }
 481
 482     /* If the Unicode data is known at construction time, we can apply
 483        some optimizations which share commonly used objects.
 484        Also, this means the input must be UTF-8, so fall back to the
 485        UTF-8 decoder at the end. */
 486     if (u != NULL) {
 487
 488         /* Optimization for empty strings */
 489         if (size == 0 && unicode_empty != NULL) {
 490             Py_INCREF(unicode_empty);
 491             return (PyObject *)unicode_empty;
 492         }
 493
 494         /* Single characters are shared when using this constructor.
 495            Restrict to ASCII, since the input must be UTF-8. */
 496         if (size == 1 && Py_CHARMASK(*u) < 128) {
 497             unicode = unicode_latin1[Py_CHARMASK(*u)];
 498             if (!unicode) {
 499                 unicode = _PyUnicode_New(1);
 500                 if (!unicode)
 501                     return NULL;
 502                 unicode->str[0] = Py_CHARMASK(*u);
 503                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 504             }
 505             Py_INCREF(unicode);
 506             return (PyObject *)unicode;
 507         }
 508
 509         return PyUnicode_DecodeUTF8(u, size, NULL);
 510     }
 511
 512     unicode = _PyUnicode_New(size);
 513     if (!unicode)
 514         return NULL;
 515
 516     return (PyObject *)unicode;
 517 }
 518
 519 PyObject *PyUnicode_FromString(const char *u)
 520 {
 521     size_t size = strlen(u);
 522     if (size > PY_SSIZE_T_MAX) {
 523         PyErr_SetString(PyExc_OverflowError, "input too long");
 524         return NULL;
 525     }
 526
 527     return PyUnicode_FromStringAndSize(u, size);
 528 }
 529
 530 #ifdef HAVE_WCHAR_H
 531
 532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 533 # define CONVERT_WCHAR_TO_SURROGATES
 534 #endif
 535
 536 #ifdef CONVERT_WCHAR_TO_SURROGATES
 537
 538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 539    to convert from UTF32 to UTF16. */
 540
 541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 542                                  Py_ssize_t size)
 543 {
 544     PyUnicodeObject *unicode;
 545     register Py_ssize_t i;
 546     Py_ssize_t alloc;
 547     const wchar_t *orig_w;
 548
 549     if (w == NULL) {
 550         PyErr_BadInternalCall();
 551         return NULL;
 552     }
 553
 554     alloc = size;
 555     orig_w = w;
 556     for (i = size; i > 0; i--) {
 557         if (*w > 0xFFFF)
 558             alloc++;
 559         w++;
 560     }
 561     w = orig_w;
 562     unicode = _PyUnicode_New(alloc);
 563     if (!unicode)
 564         return NULL;
 565
 566     /* Copy the wchar_t data into the new object */
 567     {
 568         register Py_UNICODE *u;
 569         u = PyUnicode_AS_UNICODE(unicode);
 570         for (i = size; i > 0; i--) {
 571             if (*w > 0xFFFF) {
 572                 wchar_t ordinal = *w++;
 573                 ordinal -= 0x10000;
 574                 *u++ = 0xD800 | (ordinal >> 10);
 575                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 576             }
 577             else
 578                 *u++ = *w++;
 579         }
 580     }
 581     return (PyObject *)unicode;
 582 }
 583
 584 #else
 585
 586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 587                                  Py_ssize_t size)
 588 {
 589     PyUnicodeObject *unicode;
 590
 591     if (w == NULL) {
 592         PyErr_BadInternalCall();
 593         return NULL;
 594     }
 595
 596     unicode = _PyUnicode_New(size);
 597     if (!unicode)
 598         return NULL;
 599
 600     /* Copy the wchar_t data into the new object */
 601 #ifdef HAVE_USABLE_WCHAR_T
 602     memcpy(unicode->str, w, size * sizeof(wchar_t));
 603 #else
 604     {
 605         register Py_UNICODE *u;
 606         register Py_ssize_t i;
 607         u = PyUnicode_AS_UNICODE(unicode);
 608         for (i = size; i > 0; i--)
 609             *u++ = *w++;
 610     }
 611 #endif
 612
 613     return (PyObject *)unicode;
 614 }
 615
 616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 617
 618 #undef CONVERT_WCHAR_TO_SURROGATES
 619
 620 static void
 621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 622 {
 623     *fmt++ = '%';
 624     if (width) {
 625         if (zeropad)
 626             *fmt++ = '0';
 627         fmt += sprintf(fmt, "%d", width);
 628     }
 629     if (precision)
 630         fmt += sprintf(fmt, ".%d", precision);
 631     if (longflag)
 632         *fmt++ = 'l';
 633     else if (size_tflag) {
 634         char *f = PY_FORMAT_SIZE_T;
 635         while (*f)
 636             *fmt++ = *f++;
 637     }
 638     *fmt++ = c;
 639     *fmt = '\0';
 640 }
 641
 642 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 643
 644 PyObject *
 645 PyUnicode_FromFormatV(const char *format, va_list vargs)
 646 {
 647     va_list count;
 648     Py_ssize_t callcount = 0;
 649     PyObject **callresults = NULL;
 650     PyObject **callresult = NULL;
 651     Py_ssize_t n = 0;
 652     int width = 0;
 653     int precision = 0;
 654     int zeropad;
 655     const char* f;
 656     Py_UNICODE *s;
 657     PyObject *string;
 658     /* used by sprintf */
 659     char buffer[21];
 660     /* use abuffer instead of buffer, if we need more space
 661      * (which can happen if there's a format specifier with width). */
 662     char *abuffer = NULL;
 663     char *realbuffer;
 664     Py_ssize_t abuffersize = 0;
 665     char fmt[60]; /* should be enough for %0width.precisionld */
 666     const char *copy;
 667
 668 #ifdef VA_LIST_IS_ARRAY
 669     Py_MEMCPY(count, vargs, sizeof(va_list));
 670 #else
 671 #ifdef  __va_copy
 672     __va_copy(count, vargs);
 673 #else
 674     count = vargs;
 675 #endif
 676 #endif
 677      /* step 1: count the number of %S/%R/%s format specifications
 678       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 679       * objects once during step 3 and put the result in an array) */
 680     for (f = format; *f; f++) {
 681          if (*f == '%') {
 682              if (*(f+1)=='%')
 683                  continue;
 684              if (*(f+1)=='S' || *(f+1)=='R')
 685                  ++callcount;
 686              while (isdigit((unsigned)*f))
 687                  width = (width*10) + *f++ - '0';
 688              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 689                  ;
 690              if (*f == 's')
 691                  ++callcount;
 692          }
 693     }
 694     /* step 2: allocate memory for the results of
 695      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 696     if (callcount) {
 697         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 698         if (!callresults) {
 699             PyErr_NoMemory();
 700             return NULL;
 701         }
 702         callresult = callresults;
 703     }
 704     /* step 3: figure out how large a buffer we need */
 705     for (f = format; *f; f++) {
 706         if (*f == '%') {
 707             const char* p = f;
 708             width = 0;
 709             while (isdigit((unsigned)*f))
 710                 width = (width*10) + *f++ - '0';
 711             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 712                 ;
 713
 714             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 715              * they don't affect the amount of space we reserve.
 716              */
 717             if ((*f == 'l' || *f == 'z') &&
 718                 (f[1] == 'd' || f[1] == 'u'))
 719                 ++f;
 720
 721             switch (*f) {
 722             case 'c':
 723                 (void)va_arg(count, int);
 724                 /* fall through... */
 725             case '%':
 726                 n++;
 727                 break;
 728             case 'd': case 'u': case 'i': case 'x':
 729                 (void) va_arg(count, int);
 730                 /* 20 bytes is enough to hold a 64-bit
 731                    integer.  Decimal takes the most space.
 732                    This isn't enough for octal.
 733                    If a width is specified we need more
 734                    (which we allocate later). */
 735                 if (width < 20)
 736                     width = 20;
 737                 n += width;
 738                 if (abuffersize < width)
 739                     abuffersize = width;
 740                 break;
 741             case 's':
 742             {
 743                 /* UTF-8 */
 744                 const char *s = va_arg(count, const char*);
 745                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 746                 if (!str)
 747                     goto fail;
 748                 n += PyUnicode_GET_SIZE(str);
 749                 /* Remember the str and switch to the next slot */
 750                 *callresult++ = str;
 751                 break;
 752             }
 753             case 'U':
 754             {
 755                 PyObject *obj = va_arg(count, PyObject *);
 756                 assert(obj && PyUnicode_Check(obj));
 757                 n += PyUnicode_GET_SIZE(obj);
 758                 break;
 759             }
 760             case 'V':
 761             {
 762                 PyObject *obj = va_arg(count, PyObject *);
 763                 const char *str = va_arg(count, const char *);
 764                 assert(obj || str);
 765                 assert(!obj || PyUnicode_Check(obj));
 766                 if (obj)
 767                     n += PyUnicode_GET_SIZE(obj);
 768                 else
 769                     n += strlen(str);
 770                 break;
 771             }
 772             case 'S':
 773             {
 774                 PyObject *obj = va_arg(count, PyObject *);
 775                 PyObject *str;
 776                 assert(obj);
 777                 str = PyObject_Str(obj);
 778                 if (!str)
 779                     goto fail;
 780                 n += PyUnicode_GET_SIZE(str);
 781                 /* Remember the str and switch to the next slot */
 782                 *callresult++ = str;
 783                 break;
 784             }
 785             case 'R':
 786             {
 787                 PyObject *obj = va_arg(count, PyObject *);
 788                 PyObject *repr;
 789                 assert(obj);
 790                 repr = PyObject_Repr(obj);
 791                 if (!repr)
 792                     goto fail;
 793                 n += PyUnicode_GET_SIZE(repr);
 794                 /* Remember the repr and switch to the next slot */
 795                 *callresult++ = repr;
 796                 break;
 797             }
 798             case 'p':
 799                 (void) va_arg(count, int);
 800                 /* maximum 64-bit pointer representation:
 801                  * 0xffffffffffffffff
 802                  * so 19 characters is enough.
 803                  * XXX I count 18 -- what's the extra for?
 804                  */
 805                 n += 19;
 806                 break;
 807             default:
 808                 /* if we stumble upon an unknown
 809                    formatting code, copy the rest of
 810                    the format string to the output
 811                    string. (we cannot just skip the
 812                    code, since there's no way to know
 813                    what's in the argument list) */
 814                 n += strlen(p);
 815                 goto expand;
 816             }
 817         } else
 818             n++;
 819     }
 820   expand:
 821     if (abuffersize > 20) {
 822         abuffer = PyObject_Malloc(abuffersize);
 823         if (!abuffer) {
 824             PyErr_NoMemory();
 825             goto fail;
 826         }
 827         realbuffer = abuffer;
 828     }
 829     else
 830         realbuffer = buffer;
 831     /* step 4: fill the buffer */
 832     /* Since we've analyzed how much space we need for the worst case,
 833        we don't have to resize the string.
 834        There can be no errors beyond this point. */
 835     string = PyUnicode_FromUnicode(NULL, n);
 836     if (!string)
 837         goto fail;
 838
 839     s = PyUnicode_AS_UNICODE(string);
 840     callresult = callresults;
 841
 842     for (f = format; *f; f++) {
 843         if (*f == '%') {
 844             const char* p = f++;
 845             int longflag = 0;
 846             int size_tflag = 0;
 847             zeropad = (*f == '0');
 848             /* parse the width.precision part */
 849             width = 0;
 850             while (isdigit((unsigned)*f))
 851                 width = (width*10) + *f++ - '0';
 852             precision = 0;
 853             if (*f == '.') {
 854                 f++;
 855                 while (isdigit((unsigned)*f))
 856                     precision = (precision*10) + *f++ - '0';
 857             }
 858             /* handle the long flag, but only for %ld and %lu.
 859                others can be added when necessary. */
 860             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 861                 longflag = 1;
 862                 ++f;
 863             }
 864             /* handle the size_t flag. */
 865             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 866                 size_tflag = 1;
 867                 ++f;
 868             }
 869
 870             switch (*f) {
 871             case 'c':
 872                 *s++ = va_arg(vargs, int);
 873                 break;
 874             case 'd':
 875                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 876                 if (longflag)
 877                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 878                 else if (size_tflag)
 879                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 880                 else
 881                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 882                 appendstring(realbuffer);
 883                 break;
 884             case 'u':
 885                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 886                 if (longflag)
 887                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 888                 else if (size_tflag)
 889                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 890                 else
 891                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 892                 appendstring(realbuffer);
 893                 break;
 894             case 'i':
 895                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 896                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 897                 appendstring(realbuffer);
 898                 break;
 899             case 'x':
 900                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 901                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 902                 appendstring(realbuffer);
 903                 break;
 904             case 's':
 905             {
 906                 /* unused, since we already have the result */
 907                 (void) va_arg(vargs, char *);
 908                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 909                                 PyUnicode_GET_SIZE(*callresult));
 910                 s += PyUnicode_GET_SIZE(*callresult);
 911                 /* We're done with the unicode()/repr() => forget it */
 912                 Py_DECREF(*callresult);
 913                 /* switch to next unicode()/repr() result */
 914                 ++callresult;
 915                 break;
 916             }
 917             case 'U':
 918             {
 919                 PyObject *obj = va_arg(vargs, PyObject *);
 920                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 921                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 922                 s += size;
 923                 break;
 924             }
 925             case 'V':
 926             {
 927                 PyObject *obj = va_arg(vargs, PyObject *);
 928                 const char *str = va_arg(vargs, const char *);
 929                 if (obj) {
 930                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 931                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 932                     s += size;
 933                 } else {
 934                     appendstring(str);
 935                 }
 936                 break;
 937             }
 938             case 'S':
 939             case 'R':
 940             {
 941                 Py_UNICODE *ucopy;
 942                 Py_ssize_t usize;
 943                 Py_ssize_t upos;
 944                 /* unused, since we already have the result */
 945                 (void) va_arg(vargs, PyObject *);
 946                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 947                 usize = PyUnicode_GET_SIZE(*callresult);
 948                 for (upos = 0; upos<usize;)
 949                     *s++ = ucopy[upos++];
 950                 /* We're done with the unicode()/repr() => forget it */
 951                 Py_DECREF(*callresult);
 952                 /* switch to next unicode()/repr() result */
 953                 ++callresult;
 954                 break;
 955             }
 956             case 'p':
 957                 sprintf(buffer, "%p", va_arg(vargs, void*));
 958                 /* %p is ill-defined:  ensure leading 0x. */
 959                 if (buffer[1] == 'X')
 960                     buffer[1] = 'x';
 961                 else if (buffer[1] != 'x') {
 962                     memmove(buffer+2, buffer, strlen(buffer)+1);
 963                     buffer[0] = '0';
 964                     buffer[1] = 'x';
 965                 }
 966                 appendstring(buffer);
 967                 break;
 968             case '%':
 969                 *s++ = '%';
 970                 break;
 971             default:
 972                 appendstring(p);
 973                 goto end;
 974             }
 975         } else
 976             *s++ = *f;
 977     }
 978
 979   end:
 980     if (callresults)
 981         PyObject_Free(callresults);
 982     if (abuffer)
 983         PyObject_Free(abuffer);
 984     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 985     return string;
 986   fail:
 987     if (callresults) {
 988         PyObject **callresult2 = callresults;
 989         while (callresult2 < callresult) {
 990             Py_DECREF(*callresult2);
 991             ++callresult2;
 992         }
 993         PyObject_Free(callresults);
 994     }
 995     if (abuffer)
 996         PyObject_Free(abuffer);
 997     return NULL;
 998 }
 999
1000 #undef appendstring
1001
1002 PyObject *
1003 PyUnicode_FromFormat(const char *format, ...)
1004 {
1005     PyObject* ret;
1006     va_list vargs;
1007
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009     va_start(vargs, format);
1010 #else
1011     va_start(vargs);
1012 #endif
1013     ret = PyUnicode_FromFormatV(format, vargs);
1014     va_end(vargs);
1015     return ret;
1016 }
1017
1018 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1019                                 wchar_t *w,
1020                                 Py_ssize_t size)
1021 {
1022     if (unicode == NULL) {
1023         PyErr_BadInternalCall();
1024         return -1;
1025     }
1026
1027     /* If possible, try to copy the 0-termination as well */
1028     if (size > PyUnicode_GET_SIZE(unicode))
1029         size = PyUnicode_GET_SIZE(unicode) + 1;
1030
1031 #ifdef HAVE_USABLE_WCHAR_T
1032     memcpy(w, unicode->str, size * sizeof(wchar_t));
1033 #else
1034     {
1035         register Py_UNICODE *u;
1036         register Py_ssize_t i;
1037         u = PyUnicode_AS_UNICODE(unicode);
1038         for (i = size; i > 0; i--)
1039             *w++ = *u++;
1040     }
1041 #endif
1042
1043     if (size > PyUnicode_GET_SIZE(unicode))
1044         return PyUnicode_GET_SIZE(unicode);
1045     else
1046         return size;
1047 }
1048
1049 #endif
1050
1051 PyObject *PyUnicode_FromOrdinal(int ordinal)
1052 {
1053     Py_UNICODE s[1];
1054
1055 #ifdef Py_UNICODE_WIDE
1056     if (ordinal < 0 || ordinal > 0x10ffff) {
1057         PyErr_SetString(PyExc_ValueError,
1058                         "unichr() arg not in range(0x110000) "
1059                         "(wide Python build)");
1060         return NULL;
1061     }
1062 #else
1063     if (ordinal < 0 || ordinal > 0xffff) {
1064         PyErr_SetString(PyExc_ValueError,
1065                         "unichr() arg not in range(0x10000) "
1066                         "(narrow Python build)");
1067         return NULL;
1068     }
1069 #endif
1070
1071     s[0] = (Py_UNICODE)ordinal;
1072     return PyUnicode_FromUnicode(s, 1);
1073 }
1074
1075 PyObject *PyUnicode_FromObject(register PyObject *obj)
1076 {
1077     /* XXX Perhaps we should make this API an alias of
1078        PyObject_Unicode() instead ?! */
1079     if (PyUnicode_CheckExact(obj)) {
1080         Py_INCREF(obj);
1081         return obj;
1082     }
1083     if (PyUnicode_Check(obj)) {
1084         /* For a Unicode subtype that's not a Unicode object,
1085            return a true Unicode object with the same data. */
1086         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087                                      PyUnicode_GET_SIZE(obj));
1088     }
1089     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1090 }
1091
1092 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1093                                       const char *encoding,
1094                                       const char *errors)
1095 {
1096     const char *s = NULL;
1097     Py_ssize_t len;
1098     PyObject *v;
1099
1100     if (obj == NULL) {
1101         PyErr_BadInternalCall();
1102         return NULL;
1103     }
1104
1105 #if 0
1106     /* For b/w compatibility we also accept Unicode objects provided
1107        that no encodings is given and then redirect to
1108        PyObject_Unicode() which then applies the additional logic for
1109        Unicode subclasses.
1110
1111        NOTE: This API should really only be used for object which
1112        represent *encoded* Unicode !
1113
1114     */
1115     if (PyUnicode_Check(obj)) {
1116         if (encoding) {
1117             PyErr_SetString(PyExc_TypeError,
1118                             "decoding Unicode is not supported");
1119             return NULL;
1120         }
1121         return PyObject_Unicode(obj);
1122     }
1123 #else
1124     if (PyUnicode_Check(obj)) {
1125         PyErr_SetString(PyExc_TypeError,
1126                         "decoding Unicode is not supported");
1127         return NULL;
1128     }
1129 #endif
1130
1131     /* Coerce object */
1132     if (PyString_Check(obj)) {
1133         s = PyString_AS_STRING(obj);
1134         len = PyString_GET_SIZE(obj);
1135     }
1136     else if (PyByteArray_Check(obj)) {
1137         /* Python 2.x specific */
1138         PyErr_Format(PyExc_TypeError,
1139                      "decoding bytearray is not supported");
1140         return NULL;
1141     }
1142     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1143         /* Overwrite the error message with something more useful in
1144            case of a TypeError. */
1145         if (PyErr_ExceptionMatches(PyExc_TypeError))
1146             PyErr_Format(PyExc_TypeError,
1147                          "coercing to Unicode: need string or buffer, "
1148                          "%.80s found",
1149                          Py_TYPE(obj)->tp_name);
1150         goto onError;
1151     }
1152
1153     /* Convert to Unicode */
1154     if (len == 0) {
1155         Py_INCREF(unicode_empty);
1156         v = (PyObject *)unicode_empty;
1157     }
1158     else
1159         v = PyUnicode_Decode(s, len, encoding, errors);
1160
1161     return v;
1162
1163   onError:
1164     return NULL;
1165 }
1166
1167 PyObject *PyUnicode_Decode(const char *s,
1168                            Py_ssize_t size,
1169                            const char *encoding,
1170                            const char *errors)
1171 {
1172     PyObject *buffer = NULL, *unicode;
1173
1174     if (encoding == NULL)
1175         encoding = PyUnicode_GetDefaultEncoding();
1176
1177     /* Shortcuts for common default encodings */
1178     if (strcmp(encoding, "utf-8") == 0)
1179         return PyUnicode_DecodeUTF8(s, size, errors);
1180     else if (strcmp(encoding, "latin-1") == 0)
1181         return PyUnicode_DecodeLatin1(s, size, errors);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183     else if (strcmp(encoding, "mbcs") == 0)
1184         return PyUnicode_DecodeMBCS(s, size, errors);
1185 #endif
1186     else if (strcmp(encoding, "ascii") == 0)
1187         return PyUnicode_DecodeASCII(s, size, errors);
1188
1189     /* Decode via the codec registry */
1190     buffer = PyBuffer_FromMemory((void *)s, size);
1191     if (buffer == NULL)
1192         goto onError;
1193     unicode = PyCodec_Decode(buffer, encoding, errors);
1194     if (unicode == NULL)
1195         goto onError;
1196     if (!PyUnicode_Check(unicode)) {
1197         PyErr_Format(PyExc_TypeError,
1198                      "decoder did not return an unicode object (type=%.400s)",
1199                      Py_TYPE(unicode)->tp_name);
1200         Py_DECREF(unicode);
1201         goto onError;
1202     }
1203     Py_DECREF(buffer);
1204     return unicode;
1205
1206   onError:
1207     Py_XDECREF(buffer);
1208     return NULL;
1209 }
1210
1211 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212                                     const char *encoding,
1213                                     const char *errors)
1214 {
1215     PyObject *v;
1216
1217     if (!PyUnicode_Check(unicode)) {
1218         PyErr_BadArgument();
1219         goto onError;
1220     }
1221
1222     if (encoding == NULL)
1223         encoding = PyUnicode_GetDefaultEncoding();
1224
1225     /* Decode via the codec registry */
1226     v = PyCodec_Decode(unicode, encoding, errors);
1227     if (v == NULL)
1228         goto onError;
1229     return v;
1230
1231   onError:
1232     return NULL;
1233 }
1234
1235 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1236                            Py_ssize_t size,
1237                            const char *encoding,
1238                            const char *errors)
1239 {
1240     PyObject *v, *unicode;
1241
1242     unicode = PyUnicode_FromUnicode(s, size);
1243     if (unicode == NULL)
1244         return NULL;
1245     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246     Py_DECREF(unicode);
1247     return v;
1248 }
1249
1250 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251                                     const char *encoding,
1252                                     const char *errors)
1253 {
1254     PyObject *v;
1255
1256     if (!PyUnicode_Check(unicode)) {
1257         PyErr_BadArgument();
1258         goto onError;
1259     }
1260
1261     if (encoding == NULL)
1262         encoding = PyUnicode_GetDefaultEncoding();
1263
1264     /* Encode via the codec registry */
1265     v = PyCodec_Encode(unicode, encoding, errors);
1266     if (v == NULL)
1267         goto onError;
1268     return v;
1269
1270   onError:
1271     return NULL;
1272 }
1273
1274 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275                                     const char *encoding,
1276                                     const char *errors)
1277 {
1278     PyObject *v;
1279
1280     if (!PyUnicode_Check(unicode)) {
1281         PyErr_BadArgument();
1282         goto onError;
1283     }
1284
1285     if (encoding == NULL)
1286         encoding = PyUnicode_GetDefaultEncoding();
1287
1288     /* Shortcuts for common default encodings */
1289     if (errors == NULL) {
1290         if (strcmp(encoding, "utf-8") == 0)
1291             return PyUnicode_AsUTF8String(unicode);
1292         else if (strcmp(encoding, "latin-1") == 0)
1293             return PyUnicode_AsLatin1String(unicode);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295         else if (strcmp(encoding, "mbcs") == 0)
1296             return PyUnicode_AsMBCSString(unicode);
1297 #endif
1298         else if (strcmp(encoding, "ascii") == 0)
1299             return PyUnicode_AsASCIIString(unicode);
1300     }
1301
1302     /* Encode via the codec registry */
1303     v = PyCodec_Encode(unicode, encoding, errors);
1304     if (v == NULL)
1305         goto onError;
1306     if (!PyString_Check(v)) {
1307         PyErr_Format(PyExc_TypeError,
1308                      "encoder did not return a string object (type=%.400s)",
1309                      Py_TYPE(v)->tp_name);
1310         Py_DECREF(v);
1311         goto onError;
1312     }
1313     return v;
1314
1315   onError:
1316     return NULL;
1317 }
1318
1319 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1320                                             const char *errors)
1321 {
1322     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1323
1324     if (v)
1325         return v;
1326     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327     if (v && errors == NULL)
1328         ((PyUnicodeObject *)unicode)->defenc = v;
1329     return v;
1330 }
1331
1332 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1333 {
1334     if (!PyUnicode_Check(unicode)) {
1335         PyErr_BadArgument();
1336         goto onError;
1337     }
1338     return PyUnicode_AS_UNICODE(unicode);
1339
1340   onError:
1341     return NULL;
1342 }
1343
1344 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1345 {
1346     if (!PyUnicode_Check(unicode)) {
1347         PyErr_BadArgument();
1348         goto onError;
1349     }
1350     return PyUnicode_GET_SIZE(unicode);
1351
1352   onError:
1353     return -1;
1354 }
1355
/* Return the name of the current default encoding.  The result is the
   unicode_default_encoding variable itself, not a copy, so the caller
   must not free it. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
1360
1361 int PyUnicode_SetDefaultEncoding(const char *encoding)
1362 {
1363     PyObject *v;
1364
1365     /* Make sure the encoding is valid. As side effect, this also
1366        loads the encoding into the codec registry cache. */
1367     v = _PyCodec_Lookup(encoding);
1368     if (v == NULL)
1369         goto onError;
1370     Py_DECREF(v);
1371     strncpy(unicode_default_encoding,
1372             encoding,
1373             sizeof(unicode_default_encoding));
1374     return 0;
1375
1376   onError:
1377     return -1;
1378 }
1379
1380 /* error handling callback helper:
1381    build arguments, call the callback and check the arguments,
1382    if no exception occurred, copy the replacement to the output
1383    and adjust various state variables.
1384    return 0 on success, -1 on error
1385 */
1386
1387 static
1388 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1389                                      const char *encoding, const char *reason,
1390                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1393 {
1394     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1395
1396     PyObject *restuple = NULL;
1397     PyObject *repunicode = NULL;
1398     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399     Py_ssize_t requiredsize;
1400     Py_ssize_t newpos;
1401     Py_UNICODE *repptr;
1402     Py_ssize_t repsize;
1403     int res = -1;
1404
1405     if (*errorHandler == NULL) {
1406         *errorHandler = PyCodec_LookupError(errors);
1407         if (*errorHandler == NULL)
1408             goto onError;
1409     }
1410
1411     if (*exceptionObject == NULL) {
1412         *exceptionObject = PyUnicodeDecodeError_Create(
1413             encoding, input, insize, *startinpos, *endinpos, reason);
1414         if (*exceptionObject == NULL)
1415             goto onError;
1416     }
1417     else {
1418         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419             goto onError;
1420         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421             goto onError;
1422         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423             goto onError;
1424     }
1425
1426     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427     if (restuple == NULL)
1428         goto onError;
1429     if (!PyTuple_Check(restuple)) {
1430         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1431         goto onError;
1432     }
1433     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1434         goto onError;
1435     if (newpos<0)
1436         newpos = insize+newpos;
1437     if (newpos<0 || newpos>insize) {
1438         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439         goto onError;
1440     }
1441
1442     /* need more space? (at least enough for what we
1443        have+the replacement+the rest of the string (starting
1444        at the new input position), so we won't have to check space
1445        when there are no errors in the rest of the string) */
1446     repptr = PyUnicode_AS_UNICODE(repunicode);
1447     repsize = PyUnicode_GET_SIZE(repunicode);
1448     requiredsize = *outpos + repsize + insize-newpos;
1449     if (requiredsize > outsize) {
1450         if (requiredsize<2*outsize)
1451             requiredsize = 2*outsize;
1452         if (_PyUnicode_Resize(output, requiredsize) < 0)
1453             goto onError;
1454         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1455     }
1456     *endinpos = newpos;
1457     *inptr = input + newpos;
1458     Py_UNICODE_COPY(*outptr, repptr, repsize);
1459     *outptr += repsize;
1460     *outpos += repsize;
1461     /* we made it! */
1462     res = 0;
1463
1464   onError:
1465     Py_XDECREF(restuple);
1466     return res;
1467 }
1468
1469 /* --- UTF-7 Codec -------------------------------------------------------- */
1470
1471 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1472
1473 /* Three simple macros defining base-64. */
1474
/* Is c a base-64 character?
   The test uses explicit ASCII ranges rather than isalnum(): isalnum()
   depends on the current locale (so bytes >= 0x80 could be accepted as
   base-64 digits) and is only defined for unsigned char values and
   EOF.  Note that c is evaluated more than once. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1479
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' give 0-25,
   'a'-'z' give 26-51, '0'-'9' give 52-61 and '+' gives 62.  Every
   other value, '/' included, gives 63, so c must already be known to
   be a base-64 character. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
1487
/* Base-64 character for the low 6 bits of n; higher bits of n are
   masked off. */

#define TO_BASE64(n)                                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                      \
      "abcdefghijklmnopqrstuvwxyz"                      \
      "0123456789+/")[(n) & 0x3f])
1492
/* DECODE_DIRECT: does this byte, found in a UTF-7 string outside a
 * base64 section, decode as itself?  Decoding is permissive: every
 * ASCII byte qualifies except '+', which starts a base64 string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
1500
1501 /* The UTF-7 encoder treats ASCII characters differently according to
1502  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1503  * the above).  See RFC2152.  This array identifies these different
1504  * sets:
1505  * 0 : "Set D"
1506  *     alphanumeric and '(),-./:?
1507  * 1 : "Set O"
1508  *     !"#$%&*;<=>@[]^_`{|}
1509  * 2 : "whitespace"
1510  *     ht nl cr sp
1511  * 3 : special (must be base64 encoded)
1512  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1513  */
1514
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   of the 128 ASCII codes, indexed by character code.  Each row below
   covers 16 consecutive codes, named in the comment above it. */
static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1534
/* ENCODE_DIRECT: may character c be written to the UTF-7 output as
 * itself?  Only codes 1..127 are candidates.  Set D characters
 * (category 0) always qualify; Set O characters (category 1) qualify
 * when directO is true and whitespace (category 2) when directWS is
 * true.  RFC2152 leaves both choices to the application, hence the
 * two flags.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (                                                  \
      (utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))            \
     ))
1546
/* Decode `size` bytes of UTF-7 data at `s`.  This is
   PyUnicode_DecodeUTF7Stateful() called with a NULL `consumed`
   argument; its result is returned unchanged. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
1553
1554 /* The decoder.  The only state we preserve is our read position,
1555  * i.e. how many characters we have consumed.  So if we end in the
1556  * middle of a shift sequence we have to back off the read position
1557  * and the output to the beginning of the sequence, otherwise we lose
1558  * all the shift state (seen bits, number of bits seen, high
1559  * surrogate). */
1560
1561 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1562                                        Py_ssize_t size,
1563                                        const char *errors,
1564                                        Py_ssize_t *consumed)
1565 {
1566     const char *starts = s;
1567     Py_ssize_t startinpos;
1568     Py_ssize_t endinpos;
1569     Py_ssize_t outpos;
1570     const char *e;
1571     PyUnicodeObject *unicode;
1572     Py_UNICODE *p;
1573     const char *errmsg = "";
1574     int inShift = 0;
1575     Py_UNICODE *shiftOutStart;
1576     unsigned int base64bits = 0;
1577     unsigned long base64buffer = 0;
1578     Py_UNICODE surrogate = 0;
1579     PyObject *errorHandler = NULL;
1580     PyObject *exc = NULL;
1581
1582     unicode = _PyUnicode_New(size);
1583     if (!unicode)
1584         return NULL;
1585     if (size == 0) {
1586         if (consumed)
1587             *consumed = 0;
1588         return (PyObject *)unicode;
1589     }
1590
1591     p = unicode->str;
1592     shiftOutStart = p;
1593     e = s + size;
1594
1595     while (s < e) {
1596         Py_UNICODE ch = (unsigned char) *s;
1597
1598         if (inShift) { /* in a base-64 section */
1599             if (IS_BASE64(ch)) { /* consume a base-64 character */
1600                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1601                 base64bits += 6;
1602                 s++;
1603                 if (base64bits >= 16) {
1604                     /* we have enough bits for a UTF-16 value */
1605                     Py_UNICODE outCh = (Py_UNICODE)
1606                                        (base64buffer >> (base64bits-16));
1607                     base64bits -= 16;
1608                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1609                     if (surrogate) {
1610                         /* expecting a second surrogate */
1611                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1612 #ifdef Py_UNICODE_WIDE
1613                             *p++ = (((surrogate & 0x3FF)<<10)
1614                                     | (outCh & 0x3FF)) + 0x10000;
1615 #else
1616                             *p++ = surrogate;
1617                             *p++ = outCh;
1618 #endif
1619                             surrogate = 0;
1620                         }
1621                         else {
1622                             surrogate = 0;
1623                             errmsg = "second surrogate missing";
1624                             goto utf7Error;
1625                         }
1626                     }
1627                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1628                         /* first surrogate */
1629                         surrogate = outCh;
1630                     }
1631                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1632                         errmsg = "unexpected second surrogate";
1633                         goto utf7Error;
1634                     }
1635                     else {
1636                         *p++ = outCh;
1637                     }
1638                 }
1639             }
1640             else { /* now leaving a base-64 section */
1641                 inShift = 0;
1642                 s++;
1643                 if (surrogate) {
1644                     errmsg = "second surrogate missing at end of shift sequence";
1645                     goto utf7Error;
1646                 }
1647                 if (base64bits > 0) { /* left-over bits */
1648                     if (base64bits >= 6) {
1649                         /* We've seen at least one base-64 character */
1650                         errmsg = "partial character in shift sequence";
1651                         goto utf7Error;
1652                     }
1653                     else {
1654                         /* Some bits remain; they should be zero */
1655                         if (base64buffer != 0) {
1656                             errmsg = "non-zero padding bits in shift sequence";
1657                             goto utf7Error;
1658                         }
1659                     }
1660                 }
1661                 if (ch != '-') {
1662                     /* '-' is absorbed; other terminating
1663                        characters are preserved */
1664                     *p++ = ch;
1665                 }
1666             }
1667         }
1668         else if ( ch == '+' ) {
1669             startinpos = s-starts;
1670             s++; /* consume '+' */
1671             if (s < e && *s == '-') { /* '+-' encodes '+' */
1672                 s++;
1673                 *p++ = '+';
1674             }
1675             else { /* begin base64-encoded section */
1676                 inShift = 1;
1677                 shiftOutStart = p;
1678                 base64bits = 0;
1679             }
1680         }
1681         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1682             *p++ = ch;
1683             s++;
1684         }
1685         else {
1686             startinpos = s-starts;
1687             s++;
1688             errmsg = "unexpected special character";
1689             goto utf7Error;
1690         }
1691         continue;
1692 utf7Error:
1693         outpos = p-PyUnicode_AS_UNICODE(unicode);
1694         endinpos = s-starts;
1695         if (unicode_decode_call_errorhandler(
1696                 errors, &errorHandler,
1697                 "utf7", errmsg,
1698                 starts, size, &startinpos, &endinpos, &exc, &s,
1699                 &unicode, &outpos, &p))
1700             goto onError;
1701     }
1702
1703     /* end of string */
1704
1705     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1706         /* if we're in an inconsistent state, that's an error */
1707         if (surrogate ||
1708                 (base64bits >= 6) ||
1709                 (base64bits > 0 && base64buffer != 0)) {
1710             outpos = p-PyUnicode_AS_UNICODE(unicode);
1711             endinpos = size;
1712             if (unicode_decode_call_errorhandler(
1713                     errors, &errorHandler,
1714                     "utf7", "unterminated shift sequence",
1715                     starts, size, &startinpos, &endinpos, &exc, &s,
1716                     &unicode, &outpos, &p))
1717                 goto onError;
1718         }
1719     }
1720
1721     /* return state */
1722     if (consumed) {
1723         if (inShift) {
1724             p = shiftOutStart; /* back off output */
1725             *consumed = startinpos;
1726         }
1727         else {
1728             *consumed = s-starts;
1729         }
1730     }
1731
1732     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1733         goto onError;
1734
1735     Py_XDECREF(errorHandler);
1736     Py_XDECREF(exc);
1737     return (PyObject *)unicode;
1738
1739   onError:
1740     Py_XDECREF(errorHandler);
1741     Py_XDECREF(exc);
1742     Py_DECREF(unicode);
1743     return NULL;
1744 }
1745
1746
/* Encode the `size` Py_UNICODE characters at `s` as UTF-7 and return
   the result as a string object.

   base64SetO       - when true, Set O characters are base64 encoded
                      instead of being written directly.
   base64WhiteSpace - when true, whitespace is base64 encoded instead
                      of being written directly.
   errors           - not referenced by this function.

   The output buffer is allocated at 8 bytes per input character and
   trimmed to the bytes actually written at the end.  Returns NULL if
   that size computation overflows or the allocation fails. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                               Py_ssize_t size,
                               int base64SetO,
                               int base64WhiteSpace,
                               const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    Py_ssize_t allocated = 8 * size;
    int inShift = 0;
    Py_ssize_t i = 0;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    /* Dividing back detects a multiplication that did not fit */
    if (allocated / 8 != size)
        return PyErr_NoMemory();

    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, allocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a shift sequence and encode ch inside it */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Append ch as 16 bits (two 16-bit surrogates for ch >= 0x10000
           on wide builds) to the bit buffer and emit every complete
           6-bit group; up to 5 bits stay buffered for the next round. */
#ifdef Py_UNICODE_WIDE
        if (ch >= 0x10000) {
            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
        }
#endif
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush leftover bits, left-aligned in a final base-64 character */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    /* A shift sequence still open at the end is closed explicitly */
    if (inShift)
        *out++ = '-';

    /* Trim to the bytes actually written; the return value of
       _PyString_Resize() is not checked here. */
    _PyString_Resize(&v, out - start);
    return v;
}
1841
1842 #undef IS_BASE64
1843 #undef FROM_BASE64
1844 #undef TO_BASE64
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1847
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
1849
/* Map a UTF-8 lead byte to the total length of the sequence it introduces.
   Zero means the byte is illegal as a prefix; see RFC 2279 for details.
   Read-only lookup data, hence const. */
static const char utf8_code_length[256] = {
    /* 0x00-0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1871
1872 PyObject *PyUnicode_DecodeUTF8(const char *s,
1873                                Py_ssize_t size,
1874                                const char *errors)
1875 {
1876     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1877 }
1878
1879 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1880                                        Py_ssize_t size,
1881                                        const char *errors,
1882                                        Py_ssize_t *consumed)
1883 {
1884     const char *starts = s;
1885     int n;
1886     Py_ssize_t startinpos;
1887     Py_ssize_t endinpos;
1888     Py_ssize_t outpos;
1889     const char *e;
1890     PyUnicodeObject *unicode;
1891     Py_UNICODE *p;
1892     const char *errmsg = "";
1893     PyObject *errorHandler = NULL;
1894     PyObject *exc = NULL;
1895
1896     /* Note: size will always be longer than the resulting Unicode
1897        character count */
1898     unicode = _PyUnicode_New(size);
1899     if (!unicode)
1900         return NULL;
1901     if (size == 0) {
1902         if (consumed)
1903             *consumed = 0;
1904         return (PyObject *)unicode;
1905     }
1906
1907     /* Unpack UTF-8 encoded data */
1908     p = unicode->str;
1909     e = s + size;
1910
1911     while (s < e) {
1912         Py_UCS4 ch = (unsigned char)*s;
1913
1914         if (ch < 0x80) {
1915             *p++ = (Py_UNICODE)ch;
1916             s++;
1917             continue;
1918         }
1919
1920         n = utf8_code_length[ch];
1921
1922         if (s + n > e) {
1923             if (consumed)
1924                 break;
1925             else {
1926                 errmsg = "unexpected end of data";
1927                 startinpos = s-starts;
1928                 endinpos = size;
1929                 goto utf8Error;
1930             }
1931         }
1932
1933         switch (n) {
1934
1935         case 0:
1936             errmsg = "unexpected code byte";
1937             startinpos = s-starts;
1938             endinpos = startinpos+1;
1939             goto utf8Error;
1940
1941         case 1:
1942             errmsg = "internal error";
1943             startinpos = s-starts;
1944             endinpos = startinpos+1;
1945             goto utf8Error;
1946
1947         case 2:
1948             if ((s[1] & 0xc0) != 0x80) {
1949                 errmsg = "invalid data";
1950                 startinpos = s-starts;
1951                 endinpos = startinpos+2;
1952                 goto utf8Error;
1953             }
1954             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1955             if (ch < 0x80) {
1956                 startinpos = s-starts;
1957                 endinpos = startinpos+2;
1958                 errmsg = "illegal encoding";
1959                 goto utf8Error;
1960             }
1961             else
1962                 *p++ = (Py_UNICODE)ch;
1963             break;
1964
1965         case 3:
1966             if ((s[1] & 0xc0) != 0x80 ||
1967                 (s[2] & 0xc0) != 0x80) {
1968                 errmsg = "invalid data";
1969                 startinpos = s-starts;
1970                 endinpos = startinpos+3;
1971                 goto utf8Error;
1972             }
1973             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1974             if (ch < 0x0800) {
1975                 /* Note: UTF-8 encodings of surrogates are considered
1976                    legal UTF-8 sequences;
1977
1978                    XXX For wide builds (UCS-4) we should probably try
1979                    to recombine the surrogates into a single code
1980                    unit.
1981                 */
1982                 errmsg = "illegal encoding";
1983                 startinpos = s-starts;
1984                 endinpos = startinpos+3;
1985                 goto utf8Error;
1986             }
1987             else
1988                 *p++ = (Py_UNICODE)ch;
1989             break;
1990
1991         case 4:
1992             if ((s[1] & 0xc0) != 0x80 ||
1993                 (s[2] & 0xc0) != 0x80 ||
1994                 (s[3] & 0xc0) != 0x80) {
1995                 errmsg = "invalid data";
1996                 startinpos = s-starts;
1997                 endinpos = startinpos+4;
1998                 goto utf8Error;
1999             }
2000             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2001                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2002             /* validate and convert to UTF-16 */
2003             if ((ch < 0x10000)        /* minimum value allowed for 4
2004                                          byte encoding */
2005                 || (ch > 0x10ffff))   /* maximum value allowed for
2006                                          UTF-16 */
2007             {
2008                 errmsg = "illegal encoding";
2009                 startinpos = s-starts;
2010                 endinpos = startinpos+4;
2011                 goto utf8Error;
2012             }
2013 #ifdef Py_UNICODE_WIDE
2014             *p++ = (Py_UNICODE)ch;
2015 #else
2016             /*  compute and append the two surrogates: */
2017
2018             /*  translate from 10000..10FFFF to 0..FFFF */
2019             ch -= 0x10000;
2020
2021             /*  high surrogate = top 10 bits added to D800 */
2022             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2023
2024             /*  low surrogate = bottom 10 bits added to DC00 */
2025             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2026 #endif
2027             break;
2028
2029         default:
2030             /* Other sizes are only needed for UCS-4 */
2031             errmsg = "unsupported Unicode code range";
2032             startinpos = s-starts;
2033             endinpos = startinpos+n;
2034             goto utf8Error;
2035         }
2036         s += n;
2037         continue;
2038
2039       utf8Error:
2040         outpos = p-PyUnicode_AS_UNICODE(unicode);
2041         if (unicode_decode_call_errorhandler(
2042                 errors, &errorHandler,
2043                 "utf8", errmsg,
2044                 starts, size, &startinpos, &endinpos, &exc, &s,
2045                 &unicode, &outpos, &p))
2046             goto onError;
2047     }
2048     if (consumed)
2049         *consumed = s-starts;
2050
2051     /* Adjust length */
2052     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2053         goto onError;
2054
2055     Py_XDECREF(errorHandler);
2056     Py_XDECREF(exc);
2057     return (PyObject *)unicode;
2058
2059   onError:
2060     Py_XDECREF(errorHandler);
2061     Py_XDECREF(exc);
2062     Py_DECREF(unicode);
2063     return NULL;
2064 }
2065
2066 /* Allocation strategy:  if the string is short, convert into a stack buffer
2067    and allocate exactly as much space needed at the end.  Else allocate the
2068    maximum possible needed (4 result bytes per Unicode character), and return
2069    the excess memory at the end.
2070 */
2071 PyObject *
2072 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2073                      Py_ssize_t size,
2074                      const char *errors)
2075 {
2076 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2077
2078     Py_ssize_t i;           /* index into s of next input byte */
2079     PyObject *v;        /* result string object */
2080     char *p;            /* next free byte in output buffer */
2081     Py_ssize_t nallocated;  /* number of result bytes allocated */
2082     Py_ssize_t nneeded;        /* number of result bytes needed */
2083     char stackbuf[MAX_SHORT_UNICHARS * 4];
2084
2085     assert(s != NULL);
2086     assert(size >= 0);
2087
2088     if (size <= MAX_SHORT_UNICHARS) {
2089         /* Write into the stack buffer; nallocated can't overflow.
2090          * At the end, we'll allocate exactly as much heap space as it
2091          * turns out we need.
2092          */
2093         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2094         v = NULL;   /* will allocate after we're done */
2095         p = stackbuf;
2096     }
2097     else {
2098         /* Overallocate on the heap, and give the excess back at the end. */
2099         nallocated = size * 4;
2100         if (nallocated / 4 != size)  /* overflow! */
2101             return PyErr_NoMemory();
2102         v = PyString_FromStringAndSize(NULL, nallocated);
2103         if (v == NULL)
2104             return NULL;
2105         p = PyString_AS_STRING(v);
2106     }
2107
2108     for (i = 0; i < size;) {
2109         Py_UCS4 ch = s[i++];
2110
2111         if (ch < 0x80)
2112             /* Encode ASCII */
2113             *p++ = (char) ch;
2114
2115         else if (ch < 0x0800) {
2116             /* Encode Latin-1 */
2117             *p++ = (char)(0xc0 | (ch >> 6));
2118             *p++ = (char)(0x80 | (ch & 0x3f));
2119         }
2120         else {
2121             /* Encode UCS2 Unicode ordinals */
2122             if (ch < 0x10000) {
2123                 /* Special case: check for high surrogate */
2124                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2125                     Py_UCS4 ch2 = s[i];
2126                     /* Check for low surrogate and combine the two to
2127                        form a UCS4 value */
2128                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2129                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2130                         i++;
2131                         goto encodeUCS4;
2132                     }
2133                     /* Fall through: handles isolated high surrogates */
2134                 }
2135                 *p++ = (char)(0xe0 | (ch >> 12));
2136                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2137                 *p++ = (char)(0x80 | (ch & 0x3f));
2138                 continue;
2139             }
2140           encodeUCS4:
2141             /* Encode UCS4 Unicode ordinals */
2142             *p++ = (char)(0xf0 | (ch >> 18));
2143             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2144             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2145             *p++ = (char)(0x80 | (ch & 0x3f));
2146         }
2147     }
2148
2149     if (v == NULL) {
2150         /* This was stack allocated. */
2151         nneeded = p - stackbuf;
2152         assert(nneeded <= nallocated);
2153         v = PyString_FromStringAndSize(stackbuf, nneeded);
2154     }
2155     else {
2156         /* Cut back to size actually needed. */
2157         nneeded = p - PyString_AS_STRING(v);
2158         assert(nneeded <= nallocated);
2159         _PyString_Resize(&v, nneeded);
2160     }
2161     return v;
2162
2163 #undef MAX_SHORT_UNICHARS
2164 }
2165
2166 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2167 {
2168     if (!PyUnicode_Check(unicode)) {
2169         PyErr_BadArgument();
2170         return NULL;
2171     }
2172     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2173                                 PyUnicode_GET_SIZE(unicode),
2174                                 NULL);
2175 }
2176
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
2178
2179 PyObject *
2180 PyUnicode_DecodeUTF32(const char *s,
2181                       Py_ssize_t size,
2182                       const char *errors,
2183                       int *byteorder)
2184 {
2185     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2186 }
2187
2188 PyObject *
2189 PyUnicode_DecodeUTF32Stateful(const char *s,
2190                               Py_ssize_t size,
2191                               const char *errors,
2192                               int *byteorder,
2193                               Py_ssize_t *consumed)
2194 {
2195     const char *starts = s;
2196     Py_ssize_t startinpos;
2197     Py_ssize_t endinpos;
2198     Py_ssize_t outpos;
2199     PyUnicodeObject *unicode;
2200     Py_UNICODE *p;
2201 #ifndef Py_UNICODE_WIDE
2202     int i, pairs;
2203 #else
2204     const int pairs = 0;
2205 #endif
2206     const unsigned char *q, *e;
2207     int bo = 0;       /* assume native ordering by default */
2208     const char *errmsg = "";
2209     /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211     int iorder[] = {0, 1, 2, 3};
2212 #else
2213     int iorder[] = {3, 2, 1, 0};
2214 #endif
2215     PyObject *errorHandler = NULL;
2216     PyObject *exc = NULL;
2217     /* On narrow builds we split characters outside the BMP into two
2218        codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220     for (i = pairs = 0; i < size/4; i++)
2221         if (((Py_UCS4 *)s)[i] >= 0x10000)
2222             pairs++;
2223 #endif
2224
2225     /* This might be one to much, because of a BOM */
2226     unicode = _PyUnicode_New((size+3)/4+pairs);
2227     if (!unicode)
2228         return NULL;
2229     if (size == 0)
2230         return (PyObject *)unicode;
2231
2232     /* Unpack UTF-32 encoded data */
2233     p = unicode->str;
2234     q = (unsigned char *)s;
2235     e = q + size;
2236
2237     if (byteorder)
2238         bo = *byteorder;
2239
2240     /* Check for BOM marks (U+FEFF) in the input and adjust current
2241        byte order setting accordingly. In native mode, the leading BOM
2242        mark is skipped, in all other modes, it is copied to the output
2243        stream as-is (giving a ZWNBSP character). */
2244     if (bo == 0) {
2245         if (size >= 4) {
2246             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2247                 (q[iorder[1]] << 8) | q[iorder[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249             if (bom == 0x0000FEFF) {
2250                 q += 4;
2251                 bo = -1;
2252             }
2253             else if (bom == 0xFFFE0000) {
2254                 q += 4;
2255                 bo = 1;
2256             }
2257 #else
2258             if (bom == 0x0000FEFF) {
2259                 q += 4;
2260                 bo = 1;
2261             }
2262             else if (bom == 0xFFFE0000) {
2263                 q += 4;
2264                 bo = -1;
2265             }
2266 #endif
2267         }
2268     }
2269
2270     if (bo == -1) {
2271         /* force LE */
2272         iorder[0] = 0;
2273         iorder[1] = 1;
2274         iorder[2] = 2;
2275         iorder[3] = 3;
2276     }
2277     else if (bo == 1) {
2278         /* force BE */
2279         iorder[0] = 3;
2280         iorder[1] = 2;
2281         iorder[2] = 1;
2282         iorder[3] = 0;
2283     }
2284
2285     while (q < e) {
2286         Py_UCS4 ch;
2287         /* remaining bytes at the end? (size should be divisible by 4) */
2288         if (e-q<4) {
2289             if (consumed)
2290                 break;
2291             errmsg = "truncated data";
2292             startinpos = ((const char *)q)-starts;
2293             endinpos = ((const char *)e)-starts;
2294             goto utf32Error;
2295             /* The remaining input chars are ignored if the callback
2296                chooses to skip the input */
2297         }
2298         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2299             (q[iorder[1]] << 8) | q[iorder[0]];
2300
2301         if (ch >= 0x110000)
2302         {
2303             errmsg = "codepoint not in range(0x110000)";
2304             startinpos = ((const char *)q)-starts;
2305             endinpos = startinpos+4;
2306             goto utf32Error;
2307         }
2308 #ifndef Py_UNICODE_WIDE
2309         if (ch >= 0x10000)
2310         {
2311             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2312             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2313         }
2314         else
2315 #endif
2316             *p++ = ch;
2317         q += 4;
2318         continue;
2319       utf32Error:
2320         outpos = p-PyUnicode_AS_UNICODE(unicode);
2321         if (unicode_decode_call_errorhandler(
2322                 errors, &errorHandler,
2323                 "utf32", errmsg,
2324                 starts, size, &startinpos, &endinpos, &exc, &s,
2325                 &unicode, &outpos, &p))
2326             goto onError;
2327     }
2328
2329     if (byteorder)
2330         *byteorder = bo;
2331
2332     if (consumed)
2333         *consumed = (const char *)q-starts;
2334
2335     /* Adjust length */
2336     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2337         goto onError;
2338
2339     Py_XDECREF(errorHandler);
2340     Py_XDECREF(exc);
2341     return (PyObject *)unicode;
2342
2343   onError:
2344     Py_DECREF(unicode);
2345     Py_XDECREF(errorHandler);
2346     Py_XDECREF(exc);
2347     return NULL;
2348 }
2349
2350 PyObject *
2351 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2352                       Py_ssize_t size,
2353                       const char *errors,
2354                       int byteorder)
2355 {
2356     PyObject *v;
2357     unsigned char *p;
2358     Py_ssize_t nsize, bytesize;
2359 #ifndef Py_UNICODE_WIDE
2360     Py_ssize_t i, pairs;
2361 #else
2362     const int pairs = 0;
2363 #endif
2364     /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366     int iorder[] = {0, 1, 2, 3};
2367 #else
2368     int iorder[] = {3, 2, 1, 0};
2369 #endif
2370
2371 #define STORECHAR(CH)                           \
2372     do {                                        \
2373         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2374         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2375         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2376         p[iorder[0]] = (CH) & 0xff;             \
2377         p += 4;                                 \
2378     } while(0)
2379
2380     /* In narrow builds we can output surrogate pairs as one codepoint,
2381        so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383     for (i = pairs = 0; i < size-1; i++)
2384         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386             pairs++;
2387 #endif
2388     nsize = (size - pairs + (byteorder == 0));
2389     bytesize = nsize * 4;
2390     if (bytesize / 4 != nsize)
2391         return PyErr_NoMemory();
2392     v = PyString_FromStringAndSize(NULL, bytesize);
2393     if (v == NULL)
2394         return NULL;
2395
2396     p = (unsigned char *)PyString_AS_STRING(v);
2397     if (byteorder == 0)
2398         STORECHAR(0xFEFF);
2399     if (size == 0)
2400         return v;
2401
2402     if (byteorder == -1) {
2403         /* force LE */
2404         iorder[0] = 0;
2405         iorder[1] = 1;
2406         iorder[2] = 2;
2407         iorder[3] = 3;
2408     }
2409     else if (byteorder == 1) {
2410         /* force BE */
2411         iorder[0] = 3;
2412         iorder[1] = 2;
2413         iorder[2] = 1;
2414         iorder[3] = 0;
2415     }
2416
2417     while (size-- > 0) {
2418         Py_UCS4 ch = *s++;
2419 #ifndef Py_UNICODE_WIDE
2420         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2421             Py_UCS4 ch2 = *s;
2422             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2423                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2424                 s++;
2425                 size--;
2426             }
2427         }
2428 #endif
2429         STORECHAR(ch);
2430     }
2431     return v;
2432 #undef STORECHAR
2433 }
2434
2435 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2436 {
2437     if (!PyUnicode_Check(unicode)) {
2438         PyErr_BadArgument();
2439         return NULL;
2440     }
2441     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2442                                  PyUnicode_GET_SIZE(unicode),
2443                                  NULL,
2444                                  0);
2445 }
2446
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
2448
2449 PyObject *
2450 PyUnicode_DecodeUTF16(const char *s,
2451                       Py_ssize_t size,
2452                       const char *errors,
2453                       int *byteorder)
2454 {
2455     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2456 }
2457
2458 PyObject *
2459 PyUnicode_DecodeUTF16Stateful(const char *s,
2460                               Py_ssize_t size,
2461                               const char *errors,
2462                               int *byteorder,
2463                               Py_ssize_t *consumed)
2464 {
2465     const char *starts = s;
2466     Py_ssize_t startinpos;
2467     Py_ssize_t endinpos;
2468     Py_ssize_t outpos;
2469     PyUnicodeObject *unicode;
2470     Py_UNICODE *p;
2471     const unsigned char *q, *e;
2472     int bo = 0;       /* assume native ordering by default */
2473     const char *errmsg = "";
2474     /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476     int ihi = 1, ilo = 0;
2477 #else
2478     int ihi = 0, ilo = 1;
2479 #endif
2480     PyObject *errorHandler = NULL;
2481     PyObject *exc = NULL;
2482
2483     /* Note: size will always be longer than the resulting Unicode
2484        character count */
2485     unicode = _PyUnicode_New(size);
2486     if (!unicode)
2487         return NULL;
2488     if (size == 0)
2489         return (PyObject *)unicode;
2490
2491     /* Unpack UTF-16 encoded data */
2492     p = unicode->str;
2493     q = (unsigned char *)s;
2494     e = q + size;
2495
2496     if (byteorder)
2497         bo = *byteorder;
2498
2499     /* Check for BOM marks (U+FEFF) in the input and adjust current
2500        byte order setting accordingly. In native mode, the leading BOM
2501        mark is skipped, in all other modes, it is copied to the output
2502        stream as-is (giving a ZWNBSP character). */
2503     if (bo == 0) {
2504         if (size >= 2) {
2505             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507             if (bom == 0xFEFF) {
2508                 q += 2;
2509                 bo = -1;
2510             }
2511             else if (bom == 0xFFFE) {
2512                 q += 2;
2513                 bo = 1;
2514             }
2515 #else
2516             if (bom == 0xFEFF) {
2517                 q += 2;
2518                 bo = 1;
2519             }
2520             else if (bom == 0xFFFE) {
2521                 q += 2;
2522                 bo = -1;
2523             }
2524 #endif
2525         }
2526     }
2527
2528     if (bo == -1) {
2529         /* force LE */
2530         ihi = 1;
2531         ilo = 0;
2532     }
2533     else if (bo == 1) {
2534         /* force BE */
2535         ihi = 0;
2536         ilo = 1;
2537     }
2538
2539     while (q < e) {
2540         Py_UNICODE ch;
2541         /* remaining bytes at the end? (size should be even) */
2542         if (e-q<2) {
2543             if (consumed)
2544                 break;
2545             errmsg = "truncated data";
2546             startinpos = ((const char *)q)-starts;
2547             endinpos = ((const char *)e)-starts;
2548             goto utf16Error;
2549             /* The remaining input chars are ignored if the callback
2550                chooses to skip the input */
2551         }
2552         ch = (q[ihi] << 8) | q[ilo];
2553
2554         q += 2;
2555
2556         if (ch < 0xD800 || ch > 0xDFFF) {
2557             *p++ = ch;
2558             continue;
2559         }
2560
2561         /* UTF-16 code pair: */
2562         if (q >= e) {
2563             errmsg = "unexpected end of data";
2564             startinpos = (((const char *)q)-2)-starts;
2565             endinpos = ((const char *)e)-starts;
2566             goto utf16Error;
2567         }
2568         if (0xD800 <= ch && ch <= 0xDBFF) {
2569             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2570             q += 2;
2571             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2573                 *p++ = ch;
2574                 *p++ = ch2;
2575 #else
2576                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2577 #endif
2578                 continue;
2579             }
2580             else {
2581                 errmsg = "illegal UTF-16 surrogate";
2582                 startinpos = (((const char *)q)-4)-starts;
2583                 endinpos = startinpos+2;
2584                 goto utf16Error;
2585             }
2586
2587         }
2588         errmsg = "illegal encoding";
2589         startinpos = (((const char *)q)-2)-starts;
2590         endinpos = startinpos+2;
2591         /* Fall through to report the error */
2592
2593       utf16Error:
2594         outpos = p-PyUnicode_AS_UNICODE(unicode);
2595         if (unicode_decode_call_errorhandler(
2596                 errors, &errorHandler,
2597                 "utf16", errmsg,
2598                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2599                 &unicode, &outpos, &p))
2600             goto onError;
2601     }
2602
2603     if (byteorder)
2604         *byteorder = bo;
2605
2606     if (consumed)
2607         *consumed = (const char *)q-starts;
2608
2609     /* Adjust length */
2610     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2611         goto onError;
2612
2613     Py_XDECREF(errorHandler);
2614     Py_XDECREF(exc);
2615     return (PyObject *)unicode;
2616
2617   onError:
2618     Py_DECREF(unicode);
2619     Py_XDECREF(errorHandler);
2620     Py_XDECREF(exc);
2621     return NULL;
2622 }
2623
2624 PyObject *
2625 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2626                       Py_ssize_t size,
2627                       const char *errors,
2628                       int byteorder)
2629 {
2630     PyObject *v;
2631     unsigned char *p;
2632     Py_ssize_t nsize, bytesize;
2633 #ifdef Py_UNICODE_WIDE
2634     Py_ssize_t i, pairs;
2635 #else
2636     const int pairs = 0;
2637 #endif
2638     /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640     int ihi = 1, ilo = 0;
2641 #else
2642     int ihi = 0, ilo = 1;
2643 #endif
2644
2645 #define STORECHAR(CH)                           \
2646     do {                                        \
2647         p[ihi] = ((CH) >> 8) & 0xff;            \
2648         p[ilo] = (CH) & 0xff;                   \
2649         p += 2;                                 \
2650     } while(0)
2651
2652 #ifdef Py_UNICODE_WIDE
2653     for (i = pairs = 0; i < size; i++)
2654         if (s[i] >= 0x10000)
2655             pairs++;
2656 #endif
2657     /* 2 * (size + pairs + (byteorder == 0)) */
2658     if (size > PY_SSIZE_T_MAX ||
2659         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2660         return PyErr_NoMemory();
2661     nsize = size + pairs + (byteorder == 0);
2662     bytesize = nsize * 2;
2663     if (bytesize / 2 != nsize)
2664         return PyErr_NoMemory();
2665     v = PyString_FromStringAndSize(NULL, bytesize);
2666     if (v == NULL)
2667         return NULL;
2668
2669     p = (unsigned char *)PyString_AS_STRING(v);
2670     if (byteorder == 0)
2671         STORECHAR(0xFEFF);
2672     if (size == 0)
2673         return v;
2674
2675     if (byteorder == -1) {
2676         /* force LE */
2677         ihi = 1;
2678         ilo = 0;
2679     }
2680     else if (byteorder == 1) {
2681         /* force BE */
2682         ihi = 0;
2683         ilo = 1;
2684     }
2685
2686     while (size-- > 0) {
2687         Py_UNICODE ch = *s++;
2688         Py_UNICODE ch2 = 0;
2689 #ifdef Py_UNICODE_WIDE
2690         if (ch >= 0x10000) {
2691             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692             ch  = 0xD800 | ((ch-0x10000) >> 10);
2693         }
2694 #endif
2695         STORECHAR(ch);
2696         if (ch2)
2697             STORECHAR(ch2);
2698     }
2699     return v;
2700 #undef STORECHAR
2701 }
2702
2703 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2704 {
2705     if (!PyUnicode_Check(unicode)) {
2706         PyErr_BadArgument();
2707         return NULL;
2708     }
2709     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2710                                  PyUnicode_GET_SIZE(unicode),
2711                                  NULL,
2712                                  0);
2713 }
2714
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
2716
/* File-local pointer to a _PyUnicode_Name_CAPI table, initially NULL.
   NOTE(review): presumably the character-name lookup API used for \N{...}
   escapes and filled in on first use; its users are outside this excerpt,
   so confirm against the unicode-escape decoder. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2718
2719 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2720                                         Py_ssize_t size,
2721                                         const char *errors)
2722 {
2723     const char *starts = s;
2724     Py_ssize_t startinpos;
2725     Py_ssize_t endinpos;
2726     Py_ssize_t outpos;
2727     int i;
2728     PyUnicodeObject *v;
2729     Py_UNICODE *p;
2730     const char *end;
2731     char* message;
2732     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2733     PyObject *errorHandler = NULL;
2734     PyObject *exc = NULL;
2735
2736     /* Escaped strings will always be longer than the resulting
2737        Unicode string, so we start with size here and then reduce the
2738        length after conversion to the true value.
2739        (but if the error callback returns a long replacement string
2740        we'll have to allocate more space) */
2741     v = _PyUnicode_New(size);
2742     if (v == NULL)
2743         goto onError;
2744     if (size == 0)
2745         return (PyObject *)v;
2746
2747     p = PyUnicode_AS_UNICODE(v);
2748     end = s + size;
2749
2750     while (s < end) {
2751         unsigned char c;
2752         Py_UNICODE x;
2753         int digits;
2754
2755         /* Non-escape characters are interpreted as Unicode ordinals */
2756         if (*s != '\\') {
2757             *p++ = (unsigned char) *s++;
2758             continue;
2759         }
2760
2761         startinpos = s-starts;
2762         /* \ - Escapes */
2763         s++;
2764         c = *s++;
2765         if (s > end)
2766             c = '\0'; /* Invalid after \ */
2767         switch (c) {
2768
2769             /* \x escapes */
2770         case '\n': break;
2771         case '\\': *p++ = '\\'; break;
2772         case '\'': *p++ = '\''; break;
2773         case '\"': *p++ = '\"'; break;
2774         case 'b': *p++ = '\b'; break;
2775         case 'f': *p++ = '\014'; break; /* FF */
2776         case 't': *p++ = '\t'; break;
2777         case 'n': *p++ = '\n'; break;
2778         case 'r': *p++ = '\r'; break;
2779         case 'v': *p++ = '\013'; break; /* VT */
2780         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2781
2782             /* \OOO (octal) escapes */
2783         case '0': case '1': case '2': case '3':
2784         case '4': case '5': case '6': case '7':
2785             x = s[-1] - '0';
2786             if (s < end && '0' <= *s && *s <= '7') {
2787                 x = (x<<3) + *s++ - '0';
2788                 if (s < end && '0' <= *s && *s <= '7')
2789                     x = (x<<3) + *s++ - '0';
2790             }
2791             *p++ = x;
2792             break;
2793
2794             /* hex escapes */
2795             /* \xXX */
2796         case 'x':
2797             digits = 2;
2798             message = "truncated \\xXX escape";
2799             goto hexescape;
2800
2801             /* \uXXXX */
2802         case 'u':
2803             digits = 4;
2804             message = "truncated \\uXXXX escape";
2805             goto hexescape;
2806
2807             /* \UXXXXXXXX */
2808         case 'U':
2809             digits = 8;
2810             message = "truncated \\UXXXXXXXX escape";
2811         hexescape:
2812             chr = 0;
2813             outpos = p-PyUnicode_AS_UNICODE(v);
2814             if (s+digits>end) {
2815                 endinpos = size;
2816                 if (unicode_decode_call_errorhandler(
2817                         errors, &errorHandler,
2818                         "unicodeescape", "end of string in escape sequence",
2819                         starts, size, &startinpos, &endinpos, &exc, &s,
2820                         &v, &outpos, &p))
2821                     goto onError;
2822                 goto nextByte;
2823             }
2824             for (i = 0; i < digits; ++i) {
2825                 c = (unsigned char) s[i];
2826                 if (!isxdigit(c)) {
2827                     endinpos = (s+i+1)-starts;
2828                     if (unicode_decode_call_errorhandler(
2829                             errors, &errorHandler,
2830                             "unicodeescape", message,
2831                             starts, size, &startinpos, &endinpos, &exc, &s,
2832                             &v, &outpos, &p))
2833                         goto onError;
2834                     goto nextByte;
2835                 }
2836                 chr = (chr<<4) & ~0xF;
2837                 if (c >= '0' && c <= '9')
2838                     chr += c - '0';
2839                 else if (c >= 'a' && c <= 'f')
2840                     chr += 10 + c - 'a';
2841                 else
2842                     chr += 10 + c - 'A';
2843             }
2844             s += i;
2845             if (chr == 0xffffffff && PyErr_Occurred())
2846                 /* _decoding_error will have already written into the
2847                    target buffer. */
2848                 break;
2849         store:
2850             /* when we get here, chr is a 32-bit unicode character */
2851             if (chr <= 0xffff)
2852                 /* UCS-2 character */
2853                 *p++ = (Py_UNICODE) chr;
2854             else if (chr <= 0x10ffff) {
2855                 /* UCS-4 character. Either store directly, or as
2856                    surrogate pair. */
2857 #ifdef Py_UNICODE_WIDE
2858                 *p++ = chr;
2859 #else
2860                 chr -= 0x10000L;
2861                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2862                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2863 #endif
2864             } else {
2865                 endinpos = s-starts;
2866                 outpos = p-PyUnicode_AS_UNICODE(v);
2867                 if (unicode_decode_call_errorhandler(
2868                         errors, &errorHandler,
2869                         "unicodeescape", "illegal Unicode character",
2870                         starts, size, &startinpos, &endinpos, &exc, &s,
2871                         &v, &outpos, &p))
2872                     goto onError;
2873             }
2874             break;
2875
2876             /* \N{name} */
2877         case 'N':
2878             message = "malformed \\N character escape";
2879             if (ucnhash_CAPI == NULL) {
2880                 /* load the unicode data module */
2881                 PyObject *m, *api;
2882                 m = PyImport_ImportModuleNoBlock("unicodedata");
2883                 if (m == NULL)
2884                     goto ucnhashError;
2885                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2886                 Py_DECREF(m);
2887                 if (api == NULL)
2888                     goto ucnhashError;
2889                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2890                 Py_DECREF(api);
2891                 if (ucnhash_CAPI == NULL)
2892                     goto ucnhashError;
2893             }
2894             if (*s == '{') {
2895                 const char *start = s+1;
2896                 /* look for the closing brace */
2897                 while (*s != '}' && s < end)
2898                     s++;
2899                 if (s > start && s < end && *s == '}') {
2900                     /* found a name.  look it up in the unicode database */
2901                     message = "unknown Unicode character name";
2902                     s++;
2903                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904                         goto store;
2905                 }
2906             }
2907             endinpos = s-starts;
2908             outpos = p-PyUnicode_AS_UNICODE(v);
2909             if (unicode_decode_call_errorhandler(
2910                     errors, &errorHandler,
2911                     "unicodeescape", message,
2912                     starts, size, &startinpos, &endinpos, &exc, &s,
2913                     &v, &outpos, &p))
2914                 goto onError;
2915             break;
2916
2917         default:
2918             if (s > end) {
2919                 message = "\\ at end of string";
2920                 s--;
2921                 endinpos = s-starts;
2922                 outpos = p-PyUnicode_AS_UNICODE(v);
2923                 if (unicode_decode_call_errorhandler(
2924                         errors, &errorHandler,
2925                         "unicodeescape", message,
2926                         starts, size, &startinpos, &endinpos, &exc, &s,
2927                         &v, &outpos, &p))
2928                     goto onError;
2929             }
2930             else {
2931                 *p++ = '\\';
2932                 *p++ = (unsigned char)s[-1];
2933             }
2934             break;
2935         }
2936       nextByte:
2937         ;
2938     }
2939     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940         goto onError;
2941     Py_XDECREF(errorHandler);
2942     Py_XDECREF(exc);
2943     return (PyObject *)v;
2944
2945   ucnhashError:
2946     PyErr_SetString(
2947         PyExc_UnicodeError,
2948         "\\N escapes not supported (can't load unicodedata module)"
2949         );
2950     Py_XDECREF(v);
2951     Py_XDECREF(errorHandler);
2952     Py_XDECREF(exc);
2953     return NULL;
2954
2955   onError:
2956     Py_XDECREF(v);
2957     Py_XDECREF(errorHandler);
2958     Py_XDECREF(exc);
2959     return NULL;
2960 }
2961
2962 /* Return a Unicode-Escape string version of the Unicode object.
2963
2964    If quotes is true, the string is enclosed in u"" or u'' quotes as
2965    appropriate.
2966
2967 */
2968
2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970                                              Py_ssize_t size,
2971                                              Py_UNICODE ch)
2972 {
2973     /* like wcschr, but doesn't stop at NULL characters */
2974
2975     while (size-- > 0) {
2976         if (*s == ch)
2977             return s;
2978         s++;
2979     }
2980
2981     return NULL;
2982 }
2983
2984 static
2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986                                Py_ssize_t size,
2987                                int quotes)
2988 {
2989     PyObject *repr;
2990     char *p;
2991
2992     static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994     const Py_ssize_t expandsize = 10;
2995 #else
2996     const Py_ssize_t expandsize = 6;
2997 #endif
2998
2999     /* XXX(nnorwitz): rather than over-allocating, it would be
3000        better to choose a different scheme.  Perhaps scan the
3001        first N-chars of the string and allocate based on that size.
3002     */
3003     /* Initial allocation is based on the longest-possible unichr
3004        escape.
3005
3006        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007        unichr, so in this case it's the longest unichr escape. In
3008        narrow (UTF-16) builds this is five chars per source unichr
3009        since there are two unichrs in the surrogate pair, so in narrow
3010        (UTF-16) builds it's not the longest unichr escape.
3011
3012        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013        so in the narrow (UTF-16) build case it's the longest unichr
3014        escape.
3015     */
3016
3017     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018         return PyErr_NoMemory();
3019
3020     repr = PyString_FromStringAndSize(NULL,
3021                                       2
3022                                       + expandsize*size
3023                                       + 1);
3024     if (repr == NULL)
3025         return NULL;
3026
3027     p = PyString_AS_STRING(repr);
3028
3029     if (quotes) {
3030         *p++ = 'u';
3031         *p++ = (findchar(s, size, '\'') &&
3032                 !findchar(s, size, '"')) ? '"' : '\'';
3033     }
3034     while (size-- > 0) {
3035         Py_UNICODE ch = *s++;
3036
3037         /* Escape quotes and backslashes */
3038         if ((quotes &&
3039              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040             *p++ = '\\';
3041             *p++ = (char) ch;
3042             continue;
3043         }
3044
3045 #ifdef Py_UNICODE_WIDE
3046         /* Map 21-bit characters to '\U00xxxxxx' */
3047         else if (ch >= 0x10000) {
3048             *p++ = '\\';
3049             *p++ = 'U';
3050             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057             *p++ = hexdigit[ch & 0x0000000F];
3058             continue;
3059         }
3060 #else
3061         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062         else if (ch >= 0xD800 && ch < 0xDC00) {
3063             Py_UNICODE ch2;
3064             Py_UCS4 ucs;
3065
3066             ch2 = *s++;
3067             size--;
3068             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070                 *p++ = '\\';
3071                 *p++ = 'U';
3072                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079                 *p++ = hexdigit[ucs & 0x0000000F];
3080                 continue;
3081             }
3082             /* Fall through: isolated surrogates are copied as-is */
3083             s--;
3084             size++;
3085         }
3086 #endif
3087
3088         /* Map 16-bit characters to '\uxxxx' */
3089         if (ch >= 256) {
3090             *p++ = '\\';
3091             *p++ = 'u';
3092             *p++ = hexdigit[(ch >> 12) & 0x000F];
3093             *p++ = hexdigit[(ch >> 8) & 0x000F];
3094             *p++ = hexdigit[(ch >> 4) & 0x000F];
3095             *p++ = hexdigit[ch & 0x000F];
3096         }
3097
3098         /* Map special whitespace to '\t', \n', '\r' */
3099         else if (ch == '\t') {
3100             *p++ = '\\';
3101             *p++ = 't';
3102         }
3103         else if (ch == '\n') {
3104             *p++ = '\\';
3105             *p++ = 'n';
3106         }
3107         else if (ch == '\r') {
3108             *p++ = '\\';
3109             *p++ = 'r';
3110         }
3111
3112         /* Map non-printable US ASCII to '\xhh' */
3113         else if (ch < ' ' || ch >= 0x7F) {
3114             *p++ = '\\';
3115             *p++ = 'x';
3116             *p++ = hexdigit[(ch >> 4) & 0x000F];
3117             *p++ = hexdigit[ch & 0x000F];
3118         }
3119
3120         /* Copy everything else as-is */
3121         else
3122             *p++ = (char) ch;
3123     }
3124     if (quotes)
3125         *p++ = PyString_AS_STRING(repr)[1];
3126
3127     *p = '\0';
3128     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3129     return repr;
3130 }
3131
3132 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3133                                         Py_ssize_t size)
3134 {
3135     return unicodeescape_string(s, size, 0);
3136 }
3137
3138 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3139 {
3140     if (!PyUnicode_Check(unicode)) {
3141         PyErr_BadArgument();
3142         return NULL;
3143     }
3144     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3145                                          PyUnicode_GET_SIZE(unicode));
3146 }
3147
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3149
3150 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3151                                            Py_ssize_t size,
3152                                            const char *errors)
3153 {
3154     const char *starts = s;
3155     Py_ssize_t startinpos;
3156     Py_ssize_t endinpos;
3157     Py_ssize_t outpos;
3158     PyUnicodeObject *v;
3159     Py_UNICODE *p;
3160     const char *end;
3161     const char *bs;
3162     PyObject *errorHandler = NULL;
3163     PyObject *exc = NULL;
3164
3165     /* Escaped strings will always be longer than the resulting
3166        Unicode string, so we start with size here and then reduce the
3167        length after conversion to the true value. (But decoding error
3168        handler might have to resize the string) */
3169     v = _PyUnicode_New(size);
3170     if (v == NULL)
3171         goto onError;
3172     if (size == 0)
3173         return (PyObject *)v;
3174     p = PyUnicode_AS_UNICODE(v);
3175     end = s + size;
3176     while (s < end) {
3177         unsigned char c;
3178         Py_UCS4 x;
3179         int i;
3180         int count;
3181
3182         /* Non-escape characters are interpreted as Unicode ordinals */
3183         if (*s != '\\') {
3184             *p++ = (unsigned char)*s++;
3185             continue;
3186         }
3187         startinpos = s-starts;
3188
3189         /* \u-escapes are only interpreted iff the number of leading
3190            backslashes if odd */
3191         bs = s;
3192         for (;s < end;) {
3193             if (*s != '\\')
3194                 break;
3195             *p++ = (unsigned char)*s++;
3196         }
3197         if (((s - bs) & 1) == 0 ||
3198             s >= end ||
3199             (*s != 'u' && *s != 'U')) {
3200             continue;
3201         }
3202         p--;
3203         count = *s=='u' ? 4 : 8;
3204         s++;
3205
3206         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207         outpos = p-PyUnicode_AS_UNICODE(v);
3208         for (x = 0, i = 0; i < count; ++i, ++s) {
3209             c = (unsigned char)*s;
3210             if (!isxdigit(c)) {
3211                 endinpos = s-starts;
3212                 if (unicode_decode_call_errorhandler(
3213                         errors, &errorHandler,
3214                         "rawunicodeescape", "truncated \\uXXXX",
3215                         starts, size, &startinpos, &endinpos, &exc, &s,
3216                         &v, &outpos, &p))
3217                     goto onError;
3218                 goto nextByte;
3219             }
3220             x = (x<<4) & ~0xF;
3221             if (c >= '0' && c <= '9')
3222                 x += c - '0';
3223             else if (c >= 'a' && c <= 'f')
3224                 x += 10 + c - 'a';
3225             else
3226                 x += 10 + c - 'A';
3227         }
3228         if (x <= 0xffff)
3229             /* UCS-2 character */
3230             *p++ = (Py_UNICODE) x;
3231         else if (x <= 0x10ffff) {
3232             /* UCS-4 character. Either store directly, or as
3233                surrogate pair. */
3234 #ifdef Py_UNICODE_WIDE
3235             *p++ = (Py_UNICODE) x;
3236 #else
3237             x -= 0x10000L;
3238             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3240 #endif
3241         } else {
3242             endinpos = s-starts;
3243             outpos = p-PyUnicode_AS_UNICODE(v);
3244             if (unicode_decode_call_errorhandler(
3245                     errors, &errorHandler,
3246                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247                     starts, size, &startinpos, &endinpos, &exc, &s,
3248                     &v, &outpos, &p))
3249                 goto onError;
3250         }
3251       nextByte:
3252         ;
3253     }
3254     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3255         goto onError;
3256     Py_XDECREF(errorHandler);
3257     Py_XDECREF(exc);
3258     return (PyObject *)v;
3259
3260   onError:
3261     Py_XDECREF(v);
3262     Py_XDECREF(errorHandler);
3263     Py_XDECREF(exc);
3264     return NULL;
3265 }
3266
3267 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3268                                            Py_ssize_t size)
3269 {
3270     PyObject *repr;
3271     char *p;
3272     char *q;
3273
3274     static const char *hexdigit = "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276     const Py_ssize_t expandsize = 10;
3277 #else
3278     const Py_ssize_t expandsize = 6;
3279 #endif
3280
3281     if (size > PY_SSIZE_T_MAX / expandsize)
3282         return PyErr_NoMemory();
3283
3284     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3285     if (repr == NULL)
3286         return NULL;
3287     if (size == 0)
3288         return repr;
3289
3290     p = q = PyString_AS_STRING(repr);
3291     while (size-- > 0) {
3292         Py_UNICODE ch = *s++;
3293 #ifdef Py_UNICODE_WIDE
3294         /* Map 32-bit characters to '\Uxxxxxxxx' */
3295         if (ch >= 0x10000) {
3296             *p++ = '\\';
3297             *p++ = 'U';
3298             *p++ = hexdigit[(ch >> 28) & 0xf];
3299             *p++ = hexdigit[(ch >> 24) & 0xf];
3300             *p++ = hexdigit[(ch >> 20) & 0xf];
3301             *p++ = hexdigit[(ch >> 16) & 0xf];
3302             *p++ = hexdigit[(ch >> 12) & 0xf];
3303             *p++ = hexdigit[(ch >> 8) & 0xf];
3304             *p++ = hexdigit[(ch >> 4) & 0xf];
3305             *p++ = hexdigit[ch & 15];
3306         }
3307         else
3308 #else
3309             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310             if (ch >= 0xD800 && ch < 0xDC00) {
3311                 Py_UNICODE ch2;
3312                 Py_UCS4 ucs;
3313
3314                 ch2 = *s++;
3315                 size--;
3316                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318                     *p++ = '\\';
3319                     *p++ = 'U';
3320                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3321                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3322                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3323                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3324                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3325                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3326                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3327                     *p++ = hexdigit[ucs & 0xf];
3328                     continue;
3329                 }
3330                 /* Fall through: isolated surrogates are copied as-is */
3331                 s--;
3332                 size++;
3333             }
3334 #endif
3335         /* Map 16-bit characters to '\uxxxx' */
3336         if (ch >= 256) {
3337             *p++ = '\\';
3338             *p++ = 'u';
3339             *p++ = hexdigit[(ch >> 12) & 0xf];
3340             *p++ = hexdigit[(ch >> 8) & 0xf];
3341             *p++ = hexdigit[(ch >> 4) & 0xf];
3342             *p++ = hexdigit[ch & 15];
3343         }
3344         /* Copy everything else as-is */
3345         else
3346             *p++ = (char) ch;
3347     }
3348     *p = '\0';
3349     _PyString_Resize(&repr, p - q);
3350     return repr;
3351 }
3352
3353 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3354 {
3355     if (!PyUnicode_Check(unicode)) {
3356         PyErr_BadArgument();
3357         return NULL;
3358     }
3359     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3360                                             PyUnicode_GET_SIZE(unicode));
3361 }
3362
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3364
3365 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3366                                            Py_ssize_t size,
3367                                            const char *errors)
3368 {
3369     const char *starts = s;
3370     Py_ssize_t startinpos;
3371     Py_ssize_t endinpos;
3372     Py_ssize_t outpos;
3373     PyUnicodeObject *v;
3374     Py_UNICODE *p;
3375     const char *end;
3376     const char *reason;
3377     PyObject *errorHandler = NULL;
3378     PyObject *exc = NULL;
3379
3380 #ifdef Py_UNICODE_WIDE
3381     Py_UNICODE unimax = PyUnicode_GetMax();
3382 #endif
3383
3384     /* XXX overflow detection missing */
3385     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3386     if (v == NULL)
3387         goto onError;
3388     if (PyUnicode_GetSize((PyObject *)v) == 0)
3389         return (PyObject *)v;
3390     p = PyUnicode_AS_UNICODE(v);
3391     end = s + size;
3392
3393     while (s < end) {
3394         memcpy(p, s, sizeof(Py_UNICODE));
3395         /* We have to sanity check the raw data, otherwise doom looms for
3396            some malformed UCS-4 data. */
3397         if (
3398 #ifdef Py_UNICODE_WIDE
3399             *p > unimax || *p < 0 ||
3400 #endif
3401             end-s < Py_UNICODE_SIZE
3402             )
3403         {
3404             startinpos = s - starts;
3405             if (end-s < Py_UNICODE_SIZE) {
3406                 endinpos = end-starts;
3407                 reason = "truncated input";
3408             }
3409             else {
3410                 endinpos = s - starts + Py_UNICODE_SIZE;
3411                 reason = "illegal code point (> 0x10FFFF)";
3412             }
3413             outpos = p - PyUnicode_AS_UNICODE(v);
3414             if (unicode_decode_call_errorhandler(
3415                     errors, &errorHandler,
3416                     "unicode_internal", reason,
3417                     starts, size, &startinpos, &endinpos, &exc, &s,
3418                     &v, &outpos, &p)) {
3419                 goto onError;
3420             }
3421         }
3422         else {
3423             p++;
3424             s += Py_UNICODE_SIZE;
3425         }
3426     }
3427
3428     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3429         goto onError;
3430     Py_XDECREF(errorHandler);
3431     Py_XDECREF(exc);
3432     return (PyObject *)v;
3433
3434   onError:
3435     Py_XDECREF(v);
3436     Py_XDECREF(errorHandler);
3437     Py_XDECREF(exc);
3438     return NULL;
3439 }
3440
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3442
3443 PyObject *PyUnicode_DecodeLatin1(const char *s,
3444                                  Py_ssize_t size,
3445                                  const char *errors)
3446 {
3447     PyUnicodeObject *v;
3448     Py_UNICODE *p;
3449
3450     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3451     if (size == 1) {
3452         Py_UNICODE r = *(unsigned char*)s;
3453         return PyUnicode_FromUnicode(&r, 1);
3454     }
3455
3456     v = _PyUnicode_New(size);
3457     if (v == NULL)
3458         goto onError;
3459     if (size == 0)
3460         return (PyObject *)v;
3461     p = PyUnicode_AS_UNICODE(v);
3462     while (size-- > 0)
3463         *p++ = (unsigned char)*s++;
3464     return (PyObject *)v;
3465
3466   onError:
3467     Py_XDECREF(v);
3468     return NULL;
3469 }
3470
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject **exceptionObject,
3473                                   const char *encoding,
3474                                   const Py_UNICODE *unicode, Py_ssize_t size,
3475                                   Py_ssize_t startpos, Py_ssize_t endpos,
3476                                   const char *reason)
3477 {
3478     if (*exceptionObject == NULL) {
3479         *exceptionObject = PyUnicodeEncodeError_Create(
3480             encoding, unicode, size, startpos, endpos, reason);
3481     }
3482     else {
3483         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3484             goto onError;
3485         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3486             goto onError;
3487         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3488             goto onError;
3489         return;
3490       onError:
3491         Py_DECREF(*exceptionObject);
3492         *exceptionObject = NULL;
3493     }
3494 }
3495
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject **exceptionObject,
3498                                    const char *encoding,
3499                                    const Py_UNICODE *unicode, Py_ssize_t size,
3500                                    Py_ssize_t startpos, Py_ssize_t endpos,
3501                                    const char *reason)
3502 {
3503     make_encode_exception(exceptionObject,
3504                           encoding, unicode, size, startpos, endpos, reason);
3505     if (*exceptionObject != NULL)
3506         PyCodec_StrictErrors(*exceptionObject);
3507 }
3508
3509 /* error handling callback helper:
3510    build arguments, call the callback and check the arguments,
3511    put the result into newpos and return the replacement string, which
3512    has to be freed by the caller */
3513 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3514                                                   PyObject **errorHandler,
3515                                                   const char *encoding, const char *reason,
3516                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3517                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3518                                                   Py_ssize_t *newpos)
3519 {
3520     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3521
3522     PyObject *restuple;
3523     PyObject *resunicode;
3524
3525     if (*errorHandler == NULL) {
3526         *errorHandler = PyCodec_LookupError(errors);
3527         if (*errorHandler == NULL)
3528             return NULL;
3529     }
3530
3531     make_encode_exception(exceptionObject,
3532                           encoding, unicode, size, startpos, endpos, reason);
3533     if (*exceptionObject == NULL)
3534         return NULL;
3535
3536     restuple = PyObject_CallFunctionObjArgs(
3537         *errorHandler, *exceptionObject, NULL);
3538     if (restuple == NULL)
3539         return NULL;
3540     if (!PyTuple_Check(restuple)) {
3541         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3542         Py_DECREF(restuple);
3543         return NULL;
3544     }
3545     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3546                           &resunicode, newpos)) {
3547         Py_DECREF(restuple);
3548         return NULL;
3549     }
3550     if (*newpos<0)
3551         *newpos = size+*newpos;
3552     if (*newpos<0 || *newpos>size) {
3553         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3554         Py_DECREF(restuple);
3555         return NULL;
3556     }
3557     Py_INCREF(resunicode);
3558     Py_DECREF(restuple);
3559     return resunicode;
3560 }
3561
3562 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3563                                      Py_ssize_t size,
3564                                      const char *errors,
3565                                      int limit)
3566 {
3567     /* output object */
3568     PyObject *res;
3569     /* pointers to the beginning and end+1 of input */
3570     const Py_UNICODE *startp = p;
3571     const Py_UNICODE *endp = p + size;
3572     /* pointer to the beginning of the unencodable characters */
3573     /* const Py_UNICODE *badp = NULL; */
3574     /* pointer into the output */
3575     char *str;
3576     /* current output position */
3577     Py_ssize_t respos = 0;
3578     Py_ssize_t ressize;
3579     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3580     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581     PyObject *errorHandler = NULL;
3582     PyObject *exc = NULL;
3583     /* the following variable is used for caching string comparisons
3584      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585     int known_errorHandler = -1;
3586
3587     /* allocate enough for a simple encoding without
3588        replacements, if we need more, we'll resize */
3589     res = PyString_FromStringAndSize(NULL, size);
3590     if (res == NULL)
3591         goto onError;
3592     if (size == 0)
3593         return res;
3594     str = PyString_AS_STRING(res);
3595     ressize = size;
3596
3597     while (p<endp) {
3598         Py_UNICODE c = *p;
3599
3600         /* can we encode this? */
3601         if (c<limit) {
3602             /* no overflow check, because we know that the space is enough */
3603             *str++ = (char)c;
3604             ++p;
3605         }
3606         else {
3607             Py_ssize_t unicodepos = p-startp;
3608             Py_ssize_t requiredsize;
3609             PyObject *repunicode;
3610             Py_ssize_t repsize;
3611             Py_ssize_t newpos;
3612             Py_ssize_t respos;
3613             Py_UNICODE *uni2;
3614             /* startpos for collecting unencodable chars */
3615             const Py_UNICODE *collstart = p;
3616             const Py_UNICODE *collend = p;
3617             /* find all unecodable characters */
3618             while ((collend < endp) && ((*collend)>=limit))
3619                 ++collend;
3620             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621             if (known_errorHandler==-1) {
3622                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3623                     known_errorHandler = 1;
3624                 else if (!strcmp(errors, "replace"))
3625                     known_errorHandler = 2;
3626                 else if (!strcmp(errors, "ignore"))
3627                     known_errorHandler = 3;
3628                 else if (!strcmp(errors, "xmlcharrefreplace"))
3629                     known_errorHandler = 4;
3630                 else
3631                     known_errorHandler = 0;
3632             }
3633             switch (known_errorHandler) {
3634             case 1: /* strict */
3635                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3636                 goto onError;
3637             case 2: /* replace */
3638                 while (collstart++<collend)
3639                     *str++ = '?'; /* fall through */
3640             case 3: /* ignore */
3641                 p = collend;
3642                 break;
3643             case 4: /* xmlcharrefreplace */
3644                 respos = str-PyString_AS_STRING(res);
3645                 /* determine replacement size (temporarily (mis)uses p) */
3646                 for (p = collstart, repsize = 0; p < collend; ++p) {
3647                     if (*p<10)
3648                         repsize += 2+1+1;
3649                     else if (*p<100)
3650                         repsize += 2+2+1;
3651                     else if (*p<1000)
3652                         repsize += 2+3+1;
3653                     else if (*p<10000)
3654                         repsize += 2+4+1;
3655 #ifndef Py_UNICODE_WIDE
3656                     else
3657                         repsize += 2+5+1;
3658 #else
3659                     else if (*p<100000)
3660                         repsize += 2+5+1;
3661                     else if (*p<1000000)
3662                         repsize += 2+6+1;
3663                     else
3664                         repsize += 2+7+1;
3665 #endif
3666                 }
3667                 requiredsize = respos+repsize+(endp-collend);
3668                 if (requiredsize > ressize) {
3669                     if (requiredsize<2*ressize)
3670                         requiredsize = 2*ressize;
3671                     if (_PyString_Resize(&res, requiredsize))
3672                         goto onError;
3673                     str = PyString_AS_STRING(res) + respos;
3674                     ressize = requiredsize;
3675                 }
3676                 /* generate replacement (temporarily (mis)uses p) */
3677                 for (p = collstart; p < collend; ++p) {
3678                     str += sprintf(str, "&#%d;", (int)*p);
3679                 }
3680                 p = collend;
3681                 break;
3682             default:
3683                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3684                                                               encoding, reason, startp, size, &exc,
3685                                                               collstart-startp, collend-startp, &newpos);
3686                 if (repunicode == NULL)
3687                     goto onError;
3688                 /* need more space? (at least enough for what we have+the
3689                    replacement+the rest of the string, so we won't have to
3690                    check space for encodable characters) */
3691                 respos = str-PyString_AS_STRING(res);
3692                 repsize = PyUnicode_GET_SIZE(repunicode);
3693                 requiredsize = respos+repsize+(endp-collend);
3694                 if (requiredsize > ressize) {
3695                     if (requiredsize<2*ressize)
3696                         requiredsize = 2*ressize;
3697                     if (_PyString_Resize(&res, requiredsize)) {
3698                         Py_DECREF(repunicode);
3699                         goto onError;
3700                     }
3701                     str = PyString_AS_STRING(res) + respos;
3702                     ressize = requiredsize;
3703                 }
3704                 /* check if there is anything unencodable in the replacement
3705                    and copy it to the output */
3706                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3707                     c = *uni2;
3708                     if (c >= limit) {
3709                         raise_encode_exception(&exc, encoding, startp, size,
3710                                                unicodepos, unicodepos+1, reason);
3711                         Py_DECREF(repunicode);
3712                         goto onError;
3713                     }
3714                     *str = (char)c;
3715                 }
3716                 p = startp + newpos;
3717                 Py_DECREF(repunicode);
3718             }
3719         }
3720     }
3721     /* Resize if we allocated to much */
3722     respos = str-PyString_AS_STRING(res);
3723     if (respos<ressize)
3724         /* If this falls res will be NULL */
3725         _PyString_Resize(&res, respos);
3726     Py_XDECREF(errorHandler);
3727     Py_XDECREF(exc);
3728     return res;
3729
3730   onError:
3731     Py_XDECREF(res);
3732     Py_XDECREF(errorHandler);
3733     Py_XDECREF(exc);
3734     return NULL;
3735 }
3736
3737 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3738                                  Py_ssize_t size,
3739                                  const char *errors)
3740 {
3741     return unicode_encode_ucs1(p, size, errors, 256);
3742 }
3743
3744 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3745 {
3746     if (!PyUnicode_Check(unicode)) {
3747         PyErr_BadArgument();
3748         return NULL;
3749     }
3750     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3751                                   PyUnicode_GET_SIZE(unicode),
3752                                   NULL);
3753 }
3754
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3756
3757 PyObject *PyUnicode_DecodeASCII(const char *s,
3758                                 Py_ssize_t size,
3759                                 const char *errors)
3760 {
3761     const char *starts = s;
3762     PyUnicodeObject *v;
3763     Py_UNICODE *p;
3764     Py_ssize_t startinpos;
3765     Py_ssize_t endinpos;
3766     Py_ssize_t outpos;
3767     const char *e;
3768     PyObject *errorHandler = NULL;
3769     PyObject *exc = NULL;
3770
3771     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772     if (size == 1 && *(unsigned char*)s < 128) {
3773         Py_UNICODE r = *(unsigned char*)s;
3774         return PyUnicode_FromUnicode(&r, 1);
3775     }
3776
3777     v = _PyUnicode_New(size);
3778     if (v == NULL)
3779         goto onError;
3780     if (size == 0)
3781         return (PyObject *)v;
3782     p = PyUnicode_AS_UNICODE(v);
3783     e = s + size;
3784     while (s < e) {
3785         register unsigned char c = (unsigned char)*s;
3786         if (c < 128) {
3787             *p++ = c;
3788             ++s;
3789         }
3790         else {
3791             startinpos = s-starts;
3792             endinpos = startinpos + 1;
3793             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3794             if (unicode_decode_call_errorhandler(
3795                     errors, &errorHandler,
3796                     "ascii", "ordinal not in range(128)",
3797                     starts, size, &startinpos, &endinpos, &exc, &s,
3798                     &v, &outpos, &p))
3799                 goto onError;
3800         }
3801     }
3802     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3803         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3804             goto onError;
3805     Py_XDECREF(errorHandler);
3806     Py_XDECREF(exc);
3807     return (PyObject *)v;
3808
3809   onError:
3810     Py_XDECREF(v);
3811     Py_XDECREF(errorHandler);
3812     Py_XDECREF(exc);
3813     return NULL;
3814 }
3815
3816 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3817                                 Py_ssize_t size,
3818                                 const char *errors)
3819 {
3820     return unicode_encode_ucs1(p, size, errors, 128);
3821 }
3822
3823 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3824 {
3825     if (!PyUnicode_Check(unicode)) {
3826         PyErr_BadArgument();
3827         return NULL;
3828     }
3829     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3830                                  PyUnicode_GET_SIZE(unicode),
3831                                  NULL);
3832 }
3833
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3835
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
3837
3838 #if SIZEOF_INT < SIZEOF_SIZE_T
3839 #define NEED_RETRY
3840 #endif
3841
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843    a) it assumes an incomplete character consists of a single byte, and
3844    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845    encodings, see IsDBCSLeadByteEx documentation. */
3846
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when CharPrev()
   cannot step back from it, when the byte CharPrev() points at is not
   itself a lead byte, or when the preceding character is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3857
3858 /*
3859  * Decode MBCS string into unicode object. If 'final' is set, converts
3860  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3861  */
3862 static int decode_mbcs(PyUnicodeObject **v,
3863                        const char *s, /* MBCS string */
3864                        int size, /* sizeof MBCS string */
3865                        int final)
3866 {
3867     Py_UNICODE *p;
3868     Py_ssize_t n = 0;
3869     int usize = 0;
3870
3871     assert(size >= 0);
3872
3873     /* Skip trailing lead-byte unless 'final' is set */
3874     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3875         --size;
3876
3877     /* First get the size of the result */
3878     if (size > 0) {
3879         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3880         if (usize == 0) {
3881             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882             return -1;
3883         }
3884     }
3885
3886     if (*v == NULL) {
3887         /* Create unicode object */
3888         *v = _PyUnicode_New(usize);
3889         if (*v == NULL)
3890             return -1;
3891     }
3892     else {
3893         /* Extend unicode object */
3894         n = PyUnicode_GET_SIZE(*v);
3895         if (_PyUnicode_Resize(v, n + usize) < 0)
3896             return -1;
3897     }
3898
3899     /* Do the conversion */
3900     if (size > 0) {
3901         p = PyUnicode_AS_UNICODE(*v) + n;
3902         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3903             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3904             return -1;
3905         }
3906     }
3907
3908     return size;
3909 }
3910
3911 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3912                                        Py_ssize_t size,
3913                                        const char *errors,
3914                                        Py_ssize_t *consumed)
3915 {
3916     PyUnicodeObject *v = NULL;
3917     int done;
3918
3919     if (consumed)
3920         *consumed = 0;
3921
3922 #ifdef NEED_RETRY
3923   retry:
3924     if (size > INT_MAX)
3925         done = decode_mbcs(&v, s, INT_MAX, 0);
3926     else
3927 #endif
3928         done = decode_mbcs(&v, s, (int)size, !consumed);
3929
3930     if (done < 0) {
3931         Py_XDECREF(v);
3932         return NULL;
3933     }
3934
3935     if (consumed)
3936         *consumed += done;
3937
3938 #ifdef NEED_RETRY
3939     if (size > INT_MAX) {
3940         s += done;
3941         size -= done;
3942         goto retry;
3943     }
3944 #endif
3945
3946     return (PyObject *)v;
3947 }
3948
3949 PyObject *PyUnicode_DecodeMBCS(const char *s,
3950                                Py_ssize_t size,
3951                                const char *errors)
3952 {
3953     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3954 }
3955
3956 /*
3957  * Convert unicode into string object (MBCS).
3958  * Returns 0 if succeed, -1 otherwise.
3959  */
3960 static int encode_mbcs(PyObject **repr,
3961                        const Py_UNICODE *p, /* unicode */
3962                        int size) /* size of unicode */
3963 {
3964     int mbcssize = 0;
3965     Py_ssize_t n = 0;
3966
3967     assert(size >= 0);
3968
3969     /* First get the size of the result */
3970     if (size > 0) {
3971         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3972         if (mbcssize == 0) {
3973             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3974             return -1;
3975         }
3976     }
3977
3978     if (*repr == NULL) {
3979         /* Create string object */
3980         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3981         if (*repr == NULL)
3982             return -1;
3983     }
3984     else {
3985         /* Extend string object */
3986         n = PyString_Size(*repr);
3987         if (_PyString_Resize(repr, n + mbcssize) < 0)
3988             return -1;
3989     }
3990
3991     /* Do the conversion */
3992     if (size > 0) {
3993         char *s = PyString_AS_STRING(*repr) + n;
3994         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3995             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3996             return -1;
3997         }
3998     }
3999
4000     return 0;
4001 }
4002
4003 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4004                                Py_ssize_t size,
4005                                const char *errors)
4006 {
4007     PyObject *repr = NULL;
4008     int ret;
4009
4010 #ifdef NEED_RETRY
4011   retry:
4012     if (size > INT_MAX)
4013         ret = encode_mbcs(&repr, p, INT_MAX);
4014     else
4015 #endif
4016         ret = encode_mbcs(&repr, p, (int)size);
4017
4018     if (ret < 0) {
4019         Py_XDECREF(repr);
4020         return NULL;
4021     }
4022
4023 #ifdef NEED_RETRY
4024     if (size > INT_MAX) {
4025         p += INT_MAX;
4026         size -= INT_MAX;
4027         goto retry;
4028     }
4029 #endif
4030
4031     return repr;
4032 }
4033
4034 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4035 {
4036     if (!PyUnicode_Check(unicode)) {
4037         PyErr_BadArgument();
4038         return NULL;
4039     }
4040     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4041                                 PyUnicode_GET_SIZE(unicode),
4042                                 NULL);
4043 }
4044
4045 #undef NEED_RETRY
4046
4047 #endif /* MS_WINDOWS */
4048
4049 /* --- Character Mapping Codec -------------------------------------------- */
4050
4051 PyObject *PyUnicode_DecodeCharmap(const char *s,
4052                                   Py_ssize_t size,
4053                                   PyObject *mapping,
4054                                   const char *errors)
4055 {
4056     const char *starts = s;
4057     Py_ssize_t startinpos;
4058     Py_ssize_t endinpos;
4059     Py_ssize_t outpos;
4060     const char *e;
4061     PyUnicodeObject *v;
4062     Py_UNICODE *p;
4063     Py_ssize_t extrachars = 0;
4064     PyObject *errorHandler = NULL;
4065     PyObject *exc = NULL;
4066     Py_UNICODE *mapstring = NULL;
4067     Py_ssize_t maplen = 0;
4068
4069     /* Default to Latin-1 */
4070     if (mapping == NULL)
4071         return PyUnicode_DecodeLatin1(s, size, errors);
4072
4073     v = _PyUnicode_New(size);
4074     if (v == NULL)
4075         goto onError;
4076     if (size == 0)
4077         return (PyObject *)v;
4078     p = PyUnicode_AS_UNICODE(v);
4079     e = s + size;
4080     if (PyUnicode_CheckExact(mapping)) {
4081         mapstring = PyUnicode_AS_UNICODE(mapping);
4082         maplen = PyUnicode_GET_SIZE(mapping);
4083         while (s < e) {
4084             unsigned char ch = *s;
4085             Py_UNICODE x = 0xfffe; /* illegal value */
4086
4087             if (ch < maplen)
4088                 x = mapstring[ch];
4089
4090             if (x == 0xfffe) {
4091                 /* undefined mapping */
4092                 outpos = p-PyUnicode_AS_UNICODE(v);
4093                 startinpos = s-starts;
4094                 endinpos = startinpos+1;
4095                 if (unicode_decode_call_errorhandler(
4096                         errors, &errorHandler,
4097                         "charmap", "character maps to <undefined>",
4098                         starts, size, &startinpos, &endinpos, &exc, &s,
4099                         &v, &outpos, &p)) {
4100                     goto onError;
4101                 }
4102                 continue;
4103             }
4104             *p++ = x;
4105             ++s;
4106         }
4107     }
4108     else {
4109         while (s < e) {
4110             unsigned char ch = *s;
4111             PyObject *w, *x;
4112
4113             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114             w = PyInt_FromLong((long)ch);
4115             if (w == NULL)
4116                 goto onError;
4117             x = PyObject_GetItem(mapping, w);
4118             Py_DECREF(w);
4119             if (x == NULL) {
4120                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4121                     /* No mapping found means: mapping is undefined. */
4122                     PyErr_Clear();
4123                     x = Py_None;
4124                     Py_INCREF(x);
4125                 } else
4126                     goto onError;
4127             }
4128
4129             /* Apply mapping */
4130             if (PyInt_Check(x)) {
4131                 long value = PyInt_AS_LONG(x);
4132                 if (value < 0 || value > 65535) {
4133                     PyErr_SetString(PyExc_TypeError,
4134                                     "character mapping must be in range(65536)");
4135                     Py_DECREF(x);
4136                     goto onError;
4137                 }
4138                 *p++ = (Py_UNICODE)value;
4139             }
4140             else if (x == Py_None) {
4141                 /* undefined mapping */
4142                 outpos = p-PyUnicode_AS_UNICODE(v);
4143                 startinpos = s-starts;
4144                 endinpos = startinpos+1;
4145                 if (unicode_decode_call_errorhandler(
4146                         errors, &errorHandler,
4147                         "charmap", "character maps to <undefined>",
4148                         starts, size, &startinpos, &endinpos, &exc, &s,
4149                         &v, &outpos, &p)) {
4150                     Py_DECREF(x);
4151                     goto onError;
4152                 }
4153                 Py_DECREF(x);
4154                 continue;
4155             }
4156             else if (PyUnicode_Check(x)) {
4157                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4158
4159                 if (targetsize == 1)
4160                     /* 1-1 mapping */
4161                     *p++ = *PyUnicode_AS_UNICODE(x);
4162
4163                 else if (targetsize > 1) {
4164                     /* 1-n mapping */
4165                     if (targetsize > extrachars) {
4166                         /* resize first */
4167                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4168                         Py_ssize_t needed = (targetsize - extrachars) + \
4169                             (targetsize << 2);
4170                         extrachars += needed;
4171                         /* XXX overflow detection missing */
4172                         if (_PyUnicode_Resize(&v,
4173                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4174                             Py_DECREF(x);
4175                             goto onError;
4176                         }
4177                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4178                     }
4179                     Py_UNICODE_COPY(p,
4180                                     PyUnicode_AS_UNICODE(x),
4181                                     targetsize);
4182                     p += targetsize;
4183                     extrachars -= targetsize;
4184                 }
4185                 /* 1-0 mapping: skip the character */
4186             }
4187             else {
4188                 /* wrong return value */
4189                 PyErr_SetString(PyExc_TypeError,
4190                                 "character mapping must return integer, None or unicode");
4191                 Py_DECREF(x);
4192                 goto onError;
4193             }
4194             Py_DECREF(x);
4195             ++s;
4196         }
4197     }
4198     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4199         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4200             goto onError;
4201     Py_XDECREF(errorHandler);
4202     Py_XDECREF(exc);
4203     return (PyObject *)v;
4204
4205   onError:
4206     Py_XDECREF(errorHandler);
4207     Py_XDECREF(exc);
4208     Py_XDECREF(v);
4209     return NULL;
4210 }
4211
4212 /* Charmap encoding: the lookup table */
4213
4214 struct encoding_map{
4215     PyObject_HEAD
4216     unsigned char level1[32];
4217     int count2, count3;
4218     unsigned char level23[1];
4219 };
4220
4221 static PyObject*
4222 encoding_map_size(PyObject *obj, PyObject* args)
4223 {
4224     struct encoding_map *map = (struct encoding_map*)obj;
4225     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4226                           128*map->count3);
4227 }
4228
4229 static PyMethodDef encoding_map_methods[] = {
4230     {"size", encoding_map_size, METH_NOARGS,
4231      PyDoc_STR("Return the size (in bytes) of this object") },
4232     { 0 }
4233 };
4234
4235 static void
4236 encoding_map_dealloc(PyObject* o)
4237 {
4238     PyObject_FREE(o);
4239 }
4240
4241 static PyTypeObject EncodingMapType = {
4242     PyVarObject_HEAD_INIT(NULL, 0)
4243     "EncodingMap",          /*tp_name*/
4244     sizeof(struct encoding_map),   /*tp_basicsize*/
4245     0,                      /*tp_itemsize*/
4246     /* methods */
4247     encoding_map_dealloc,   /*tp_dealloc*/
4248     0,                      /*tp_print*/
4249     0,                      /*tp_getattr*/
4250     0,                      /*tp_setattr*/
4251     0,                      /*tp_compare*/
4252     0,                      /*tp_repr*/
4253     0,                      /*tp_as_number*/
4254     0,                      /*tp_as_sequence*/
4255     0,                      /*tp_as_mapping*/
4256     0,                      /*tp_hash*/
4257     0,                      /*tp_call*/
4258     0,                      /*tp_str*/
4259     0,                      /*tp_getattro*/
4260     0,                      /*tp_setattro*/
4261     0,                      /*tp_as_buffer*/
4262     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4263     0,                      /*tp_doc*/
4264     0,                      /*tp_traverse*/
4265     0,                      /*tp_clear*/
4266     0,                      /*tp_richcompare*/
4267     0,                      /*tp_weaklistoffset*/
4268     0,                      /*tp_iter*/
4269     0,                      /*tp_iternext*/
4270     encoding_map_methods,   /*tp_methods*/
4271     0,                      /*tp_members*/
4272     0,                      /*tp_getset*/
4273     0,                      /*tp_base*/
4274     0,                      /*tp_dict*/
4275     0,                      /*tp_descr_get*/
4276     0,                      /*tp_descr_set*/
4277     0,                      /*tp_dictoffset*/
4278     0,                      /*tp_init*/
4279     0,                      /*tp_alloc*/
4280     0,                      /*tp_new*/
4281     0,                      /*tp_free*/
4282     0,                      /*tp_is_gc*/
4283 };
4284
4285 PyObject*
4286 PyUnicode_BuildEncodingMap(PyObject* string)
4287 {
4288     Py_UNICODE *decode;
4289     PyObject *result;
4290     struct encoding_map *mresult;
4291     int i;
4292     int need_dict = 0;
4293     unsigned char level1[32];
4294     unsigned char level2[512];
4295     unsigned char *mlevel1, *mlevel2, *mlevel3;
4296     int count2 = 0, count3 = 0;
4297
4298     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4299         PyErr_BadArgument();
4300         return NULL;
4301     }
4302     decode = PyUnicode_AS_UNICODE(string);
4303     memset(level1, 0xFF, sizeof level1);
4304     memset(level2, 0xFF, sizeof level2);
4305
4306     /* If there isn't a one-to-one mapping of NULL to \0,
4307        or if there are non-BMP characters, we need to use
4308        a mapping dictionary. */
4309     if (decode[0] != 0)
4310         need_dict = 1;
4311     for (i = 1; i < 256; i++) {
4312         int l1, l2;
4313         if (decode[i] == 0
4314 #ifdef Py_UNICODE_WIDE
4315             || decode[i] > 0xFFFF
4316 #endif
4317             ) {
4318             need_dict = 1;
4319             break;
4320         }
4321         if (decode[i] == 0xFFFE)
4322             /* unmapped character */
4323             continue;
4324         l1 = decode[i] >> 11;
4325         l2 = decode[i] >> 7;
4326         if (level1[l1] == 0xFF)
4327             level1[l1] = count2++;
4328         if (level2[l2] == 0xFF)
4329             level2[l2] = count3++;
4330     }
4331
4332     if (count2 >= 0xFF || count3 >= 0xFF)
4333         need_dict = 1;
4334
4335     if (need_dict) {
4336         PyObject *result = PyDict_New();
4337         PyObject *key, *value;
4338         if (!result)
4339             return NULL;
4340         for (i = 0; i < 256; i++) {
4341             key = value = NULL;
4342             key = PyInt_FromLong(decode[i]);
4343             value = PyInt_FromLong(i);
4344             if (!key || !value)
4345                 goto failed1;
4346             if (PyDict_SetItem(result, key, value) == -1)
4347                 goto failed1;
4348             Py_DECREF(key);
4349             Py_DECREF(value);
4350         }
4351         return result;
4352       failed1:
4353         Py_XDECREF(key);
4354         Py_XDECREF(value);
4355         Py_DECREF(result);
4356         return NULL;
4357     }
4358
4359     /* Create a three-level trie */
4360     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4361                              16*count2 + 128*count3 - 1);
4362     if (!result)
4363         return PyErr_NoMemory();
4364     PyObject_Init(result, &EncodingMapType);
4365     mresult = (struct encoding_map*)result;
4366     mresult->count2 = count2;
4367     mresult->count3 = count3;
4368     mlevel1 = mresult->level1;
4369     mlevel2 = mresult->level23;
4370     mlevel3 = mresult->level23 + 16*count2;
4371     memcpy(mlevel1, level1, 32);
4372     memset(mlevel2, 0xFF, 16*count2);
4373     memset(mlevel3, 0, 128*count3);
4374     count3 = 0;
4375     for (i = 1; i < 256; i++) {
4376         int o1, o2, o3, i2, i3;
4377         if (decode[i] == 0xFFFE)
4378             /* unmapped character */
4379             continue;
4380         o1 = decode[i]>>11;
4381         o2 = (decode[i]>>7) & 0xF;
4382         i2 = 16*mlevel1[o1] + o2;
4383         if (mlevel2[i2] == 0xFF)
4384             mlevel2[i2] = count3++;
4385         o3 = decode[i] & 0x7F;
4386         i3 = 128*mlevel2[i2] + o3;
4387         mlevel3[i3] = i;
4388     }
4389     return result;
4390 }
4391
4392 static int
4393 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4394 {
4395     struct encoding_map *map = (struct encoding_map*)mapping;
4396     int l1 = c>>11;
4397     int l2 = (c>>7) & 0xF;
4398     int l3 = c & 0x7F;
4399     int i;
4400
4401 #ifdef Py_UNICODE_WIDE
4402     if (c > 0xFFFF) {
4403         return -1;
4404     }
4405 #endif
4406     if (c == 0)
4407         return 0;
4408     /* level 1*/
4409     i = map->level1[l1];
4410     if (i == 0xFF) {
4411         return -1;
4412     }
4413     /* level 2*/
4414     i = map->level23[16*i+l2];
4415     if (i == 0xFF) {
4416         return -1;
4417     }
4418     /* level 3 */
4419     i = map->level23[16*map->count2 + 128*i + l3];
4420     if (i == 0) {
4421         return -1;
4422     }
4423     return i;
4424 }
4425
4426 /* Lookup the character ch in the mapping. If the character
4427    can't be found, Py_None is returned (or NULL, if another
4428    error occurred). */
4429 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4430 {
4431     PyObject *w = PyInt_FromLong((long)c);
4432     PyObject *x;
4433
4434     if (w == NULL)
4435         return NULL;
4436     x = PyObject_GetItem(mapping, w);
4437     Py_DECREF(w);
4438     if (x == NULL) {
4439         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4440             /* No mapping found means: mapping is undefined. */
4441             PyErr_Clear();
4442             x = Py_None;
4443             Py_INCREF(x);
4444             return x;
4445         } else
4446             return NULL;
4447     }
4448     else if (x == Py_None)
4449         return x;
4450     else if (PyInt_Check(x)) {
4451         long value = PyInt_AS_LONG(x);
4452         if (value < 0 || value > 255) {
4453             PyErr_SetString(PyExc_TypeError,
4454                             "character mapping must be in range(256)");
4455             Py_DECREF(x);
4456             return NULL;
4457         }
4458         return x;
4459     }
4460     else if (PyString_Check(x))
4461         return x;
4462     else {
4463         /* wrong return value */
4464         PyErr_SetString(PyExc_TypeError,
4465                         "character mapping must return integer, None or str");
4466         Py_DECREF(x);
4467         return NULL;
4468     }
4469 }
4470
4471 static int
4472 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4473 {
4474     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4475     /* exponentially overallocate to minimize reallocations */
4476     if (requiredsize < 2*outsize)
4477         requiredsize = 2*outsize;
4478     if (_PyString_Resize(outobj, requiredsize)) {
4479         return 0;
4480     }
4481     return 1;
4482 }
4483
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred; an exception is set */
} charmapencode_result;
/* Look up the character, put the result in the output string and adjust
   various state variables. Reallocate the output string if not enough
   space is available. Return enc_SUCCESS if the character was written,
   enc_FAILED if the mapping was undefined (in which case no character
   was written) or enc_EXCEPTION if the lookup or a reallocation
   failed. */
4493 static
4494 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4495                                           PyObject **outobj, Py_ssize_t *outpos)
4496 {
4497     PyObject *rep;
4498     char *outstart;
4499     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4500
4501     if (Py_TYPE(mapping) == &EncodingMapType) {
4502         int res = encoding_map_lookup(c, mapping);
4503         Py_ssize_t requiredsize = *outpos+1;
4504         if (res == -1)
4505             return enc_FAILED;
4506         if (outsize<requiredsize)
4507             if (!charmapencode_resize(outobj, outpos, requiredsize))
4508                 return enc_EXCEPTION;
4509         outstart = PyString_AS_STRING(*outobj);
4510         outstart[(*outpos)++] = (char)res;
4511         return enc_SUCCESS;
4512     }
4513
4514     rep = charmapencode_lookup(c, mapping);
4515     if (rep==NULL)
4516         return enc_EXCEPTION;
4517     else if (rep==Py_None) {
4518         Py_DECREF(rep);
4519         return enc_FAILED;
4520     } else {
4521         if (PyInt_Check(rep)) {
4522             Py_ssize_t requiredsize = *outpos+1;
4523             if (outsize<requiredsize)
4524                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4525                     Py_DECREF(rep);
4526                     return enc_EXCEPTION;
4527                 }
4528             outstart = PyString_AS_STRING(*outobj);
4529             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4530         }
4531         else {
4532             const char *repchars = PyString_AS_STRING(rep);
4533             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4534             Py_ssize_t requiredsize = *outpos+repsize;
4535             if (outsize<requiredsize)
4536                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4537                     Py_DECREF(rep);
4538                     return enc_EXCEPTION;
4539                 }
4540             outstart = PyString_AS_STRING(*outobj);
4541             memcpy(outstart + *outpos, repchars, repsize);
4542             *outpos += repsize;
4543         }
4544     }
4545     Py_DECREF(rep);
4546     return enc_SUCCESS;
4547 }
4548
4549 /* handle an error in PyUnicode_EncodeCharmap
4550    Return 0 on success, -1 on error */
4551 static
4552 int charmap_encoding_error(
4553     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4554     PyObject **exceptionObject,
4555     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4556     PyObject **res, Py_ssize_t *respos)
4557 {
4558     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4559     Py_ssize_t repsize;
4560     Py_ssize_t newpos;
4561     Py_UNICODE *uni2;
4562     /* startpos for collecting unencodable chars */
4563     Py_ssize_t collstartpos = *inpos;
4564     Py_ssize_t collendpos = *inpos+1;
4565     Py_ssize_t collpos;
4566     char *encoding = "charmap";
4567     char *reason = "character maps to <undefined>";
4568     charmapencode_result x;
4569
4570     /* find all unencodable characters */
4571     while (collendpos < size) {
4572         PyObject *rep;
4573         if (Py_TYPE(mapping) == &EncodingMapType) {
4574             int res = encoding_map_lookup(p[collendpos], mapping);
4575             if (res != -1)
4576                 break;
4577             ++collendpos;
4578             continue;
4579         }
4580
4581         rep = charmapencode_lookup(p[collendpos], mapping);
4582         if (rep==NULL)
4583             return -1;
4584         else if (rep!=Py_None) {
4585             Py_DECREF(rep);
4586             break;
4587         }
4588         Py_DECREF(rep);
4589         ++collendpos;
4590     }
4591     /* cache callback name lookup
4592      * (if not done yet, i.e. it's the first error) */
4593     if (*known_errorHandler==-1) {
4594         if ((errors==NULL) || (!strcmp(errors, "strict")))
4595             *known_errorHandler = 1;
4596         else if (!strcmp(errors, "replace"))
4597             *known_errorHandler = 2;
4598         else if (!strcmp(errors, "ignore"))
4599             *known_errorHandler = 3;
4600         else if (!strcmp(errors, "xmlcharrefreplace"))
4601             *known_errorHandler = 4;
4602         else
4603             *known_errorHandler = 0;
4604     }
4605     switch (*known_errorHandler) {
4606     case 1: /* strict */
4607         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4608         return -1;
4609     case 2: /* replace */
4610         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4611             x = charmapencode_output('?', mapping, res, respos);
4612             if (x==enc_EXCEPTION) {
4613                 return -1;
4614             }
4615             else if (x==enc_FAILED) {
4616                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617                 return -1;
4618             }
4619         }
4620         /* fall through */
4621     case 3: /* ignore */
4622         *inpos = collendpos;
4623         break;
4624     case 4: /* xmlcharrefreplace */
4625         /* generate replacement (temporarily (mis)uses p) */
4626         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4627             char buffer[2+29+1+1];
4628             char *cp;
4629             sprintf(buffer, "&#%d;", (int)p[collpos]);
4630             for (cp = buffer; *cp; ++cp) {
4631                 x = charmapencode_output(*cp, mapping, res, respos);
4632                 if (x==enc_EXCEPTION)
4633                     return -1;
4634                 else if (x==enc_FAILED) {
4635                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4636                     return -1;
4637                 }
4638             }
4639         }
4640         *inpos = collendpos;
4641         break;
4642     default:
4643         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4644                                                       encoding, reason, p, size, exceptionObject,
4645                                                       collstartpos, collendpos, &newpos);
4646         if (repunicode == NULL)
4647             return -1;
4648         /* generate replacement  */
4649         repsize = PyUnicode_GET_SIZE(repunicode);
4650         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4651             x = charmapencode_output(*uni2, mapping, res, respos);
4652             if (x==enc_EXCEPTION) {
4653                 return -1;
4654             }
4655             else if (x==enc_FAILED) {
4656                 Py_DECREF(repunicode);
4657                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658                 return -1;
4659             }
4660         }
4661         *inpos = newpos;
4662         Py_DECREF(repunicode);
4663     }
4664     return 0;
4665 }
4666
4667 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4668                                   Py_ssize_t size,
4669                                   PyObject *mapping,
4670                                   const char *errors)
4671 {
4672     /* output object */
4673     PyObject *res = NULL;
4674     /* current input position */
4675     Py_ssize_t inpos = 0;
4676     /* current output position */
4677     Py_ssize_t respos = 0;
4678     PyObject *errorHandler = NULL;
4679     PyObject *exc = NULL;
4680     /* the following variable is used for caching string comparisons
4681      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682      * 3=ignore, 4=xmlcharrefreplace */
4683     int known_errorHandler = -1;
4684
4685     /* Default to Latin-1 */
4686     if (mapping == NULL)
4687         return PyUnicode_EncodeLatin1(p, size, errors);
4688
4689     /* allocate enough for a simple encoding without
4690        replacements, if we need more, we'll resize */
4691     res = PyString_FromStringAndSize(NULL, size);
4692     if (res == NULL)
4693         goto onError;
4694     if (size == 0)
4695         return res;
4696
4697     while (inpos<size) {
4698         /* try to encode it */
4699         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4700         if (x==enc_EXCEPTION) /* error */
4701             goto onError;
4702         if (x==enc_FAILED) { /* unencodable character */
4703             if (charmap_encoding_error(p, size, &inpos, mapping,
4704                                        &exc,
4705                                        &known_errorHandler, &errorHandler, errors,
4706                                        &res, &respos)) {
4707                 goto onError;
4708             }
4709         }
4710         else
4711             /* done with this character => adjust input position */
4712             ++inpos;
4713     }
4714
4715     /* Resize if we allocated to much */
4716     if (respos<PyString_GET_SIZE(res)) {
4717         if (_PyString_Resize(&res, respos))
4718             goto onError;
4719     }
4720     Py_XDECREF(exc);
4721     Py_XDECREF(errorHandler);
4722     return res;
4723
4724   onError:
4725     Py_XDECREF(res);
4726     Py_XDECREF(exc);
4727     Py_XDECREF(errorHandler);
4728     return NULL;
4729 }
4730
4731 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4732                                     PyObject *mapping)
4733 {
4734     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4735         PyErr_BadArgument();
4736         return NULL;
4737     }
4738     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4739                                    PyUnicode_GET_SIZE(unicode),
4740                                    mapping,
4741                                    NULL);
4742 }
4743
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject **exceptionObject,
4746                                      const Py_UNICODE *unicode, Py_ssize_t size,
4747                                      Py_ssize_t startpos, Py_ssize_t endpos,
4748                                      const char *reason)
4749 {
4750     if (*exceptionObject == NULL) {
4751         *exceptionObject = PyUnicodeTranslateError_Create(
4752             unicode, size, startpos, endpos, reason);
4753     }
4754     else {
4755         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4756             goto onError;
4757         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4758             goto onError;
4759         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4760             goto onError;
4761         return;
4762       onError:
4763         Py_DECREF(*exceptionObject);
4764         *exceptionObject = NULL;
4765     }
4766 }
4767
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject **exceptionObject,
4770                                       const Py_UNICODE *unicode, Py_ssize_t size,
4771                                       Py_ssize_t startpos, Py_ssize_t endpos,
4772                                       const char *reason)
4773 {
4774     make_translate_exception(exceptionObject,
4775                              unicode, size, startpos, endpos, reason);
4776     if (*exceptionObject != NULL)
4777         PyCodec_StrictErrors(*exceptionObject);
4778 }
4779
4780 /* error handling callback helper:
4781    build arguments, call the callback and check the arguments,
4782    put the result into newpos and return the replacement string, which
4783    has to be freed by the caller */
4784 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4785                                                      PyObject **errorHandler,
4786                                                      const char *reason,
4787                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4788                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4789                                                      Py_ssize_t *newpos)
4790 {
4791     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4792
4793     Py_ssize_t i_newpos;
4794     PyObject *restuple;
4795     PyObject *resunicode;
4796
4797     if (*errorHandler == NULL) {
4798         *errorHandler = PyCodec_LookupError(errors);
4799         if (*errorHandler == NULL)
4800             return NULL;
4801     }
4802
4803     make_translate_exception(exceptionObject,
4804                              unicode, size, startpos, endpos, reason);
4805     if (*exceptionObject == NULL)
4806         return NULL;
4807
4808     restuple = PyObject_CallFunctionObjArgs(
4809         *errorHandler, *exceptionObject, NULL);
4810     if (restuple == NULL)
4811         return NULL;
4812     if (!PyTuple_Check(restuple)) {
4813         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4814         Py_DECREF(restuple);
4815         return NULL;
4816     }
4817     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4818                           &resunicode, &i_newpos)) {
4819         Py_DECREF(restuple);
4820         return NULL;
4821     }
4822     if (i_newpos<0)
4823         *newpos = size+i_newpos;
4824     else
4825         *newpos = i_newpos;
4826     if (*newpos<0 || *newpos>size) {
4827         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4828         Py_DECREF(restuple);
4829         return NULL;
4830     }
4831     Py_INCREF(resunicode);
4832     Py_DECREF(restuple);
4833     return resunicode;
4834 }
4835
4836 /* Lookup the character ch in the mapping and put the result in result,
4837    which must be decrefed by the caller.
4838    Return 0 on success, -1 on error */
4839 static
4840 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4841 {
4842     PyObject *w = PyInt_FromLong((long)c);
4843     PyObject *x;
4844
4845     if (w == NULL)
4846         return -1;
4847     x = PyObject_GetItem(mapping, w);
4848     Py_DECREF(w);
4849     if (x == NULL) {
4850         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4851             /* No mapping found means: use 1:1 mapping. */
4852             PyErr_Clear();
4853             *result = NULL;
4854             return 0;
4855         } else
4856             return -1;
4857     }
4858     else if (x == Py_None) {
4859         *result = x;
4860         return 0;
4861     }
4862     else if (PyInt_Check(x)) {
4863         long value = PyInt_AS_LONG(x);
4864         long max = PyUnicode_GetMax();
4865         if (value < 0 || value > max) {
4866             PyErr_Format(PyExc_TypeError,
4867                          "character mapping must be in range(0x%lx)", max+1);
4868             Py_DECREF(x);
4869             return -1;
4870         }
4871         *result = x;
4872         return 0;
4873     }
4874     else if (PyUnicode_Check(x)) {
4875         *result = x;
4876         return 0;
4877     }
4878     else {
4879         /* wrong return value */
4880         PyErr_SetString(PyExc_TypeError,
4881                         "character mapping must return integer, None or unicode");
4882         Py_DECREF(x);
4883         return -1;
4884     }
4885 }
4886 /* ensure that *outobj is at least requiredsize characters long,
4887    if not reallocate and adjust various state variables.
4888    Return 0 on success, -1 on error */
4889 static
4890 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4891                                Py_ssize_t requiredsize)
4892 {
4893     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4894     if (requiredsize > oldsize) {
4895         /* remember old output position */
4896         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4897         /* exponentially overallocate to minimize reallocations */
4898         if (requiredsize < 2 * oldsize)
4899             requiredsize = 2 * oldsize;
4900         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4901             return -1;
4902         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4903     }
4904     return 0;
4905 }
4906 /* lookup the character, put the result in the output string and adjust
4907    various state variables. Return a new reference to the object that
4908    was put in the output buffer in *result, or Py_None, if the mapping was
4909    undefined (in which case no character was written).
4910    The called must decref result.
4911    Return 0 on success, -1 on error. */
4912 static
4913 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4914                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4915                             PyObject **res)
4916 {
4917     if (charmaptranslate_lookup(*curinp, mapping, res))
4918         return -1;
4919     if (*res==NULL) {
4920         /* not found => default to 1:1 mapping */
4921         *(*outp)++ = *curinp;
4922     }
4923     else if (*res==Py_None)
4924         ;
4925     else if (PyInt_Check(*res)) {
4926         /* no overflow check, because we know that the space is enough */
4927         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4928     }
4929     else if (PyUnicode_Check(*res)) {
4930         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4931         if (repsize==1) {
4932             /* no overflow check, because we know that the space is enough */
4933             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4934         }
4935         else if (repsize!=0) {
4936             /* more than one character */
4937             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4938                 (insize - (curinp-startinp)) +
4939                 repsize - 1;
4940             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4941                 return -1;
4942             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4943             *outp += repsize;
4944         }
4945     }
4946     else
4947         return -1;
4948     return 0;
4949 }
4950
4951 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4952                                      Py_ssize_t size,
4953                                      PyObject *mapping,
4954                                      const char *errors)
4955 {
4956     /* output object */
4957     PyObject *res = NULL;
4958     /* pointers to the beginning and end+1 of input */
4959     const Py_UNICODE *startp = p;
4960     const Py_UNICODE *endp = p + size;
4961     /* pointer into the output */
4962     Py_UNICODE *str;
4963     /* current output position */
4964     Py_ssize_t respos = 0;
4965     char *reason = "character maps to <undefined>";
4966     PyObject *errorHandler = NULL;
4967     PyObject *exc = NULL;
4968     /* the following variable is used for caching string comparisons
4969      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970      * 3=ignore, 4=xmlcharrefreplace */
4971     int known_errorHandler = -1;
4972
4973     if (mapping == NULL) {
4974         PyErr_BadArgument();
4975         return NULL;
4976     }
4977
4978     /* allocate enough for a simple 1:1 translation without
4979        replacements, if we need more, we'll resize */
4980     res = PyUnicode_FromUnicode(NULL, size);
4981     if (res == NULL)
4982         goto onError;
4983     if (size == 0)
4984         return res;
4985     str = PyUnicode_AS_UNICODE(res);
4986
4987     while (p<endp) {
4988         /* try to encode it */
4989         PyObject *x = NULL;
4990         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4991             Py_XDECREF(x);
4992             goto onError;
4993         }
4994         Py_XDECREF(x);
4995         if (x!=Py_None) /* it worked => adjust input pointer */
4996             ++p;
4997         else { /* untranslatable character */
4998             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4999             Py_ssize_t repsize;
5000             Py_ssize_t newpos;
5001             Py_UNICODE *uni2;
5002             /* startpos for collecting untranslatable chars */
5003             const Py_UNICODE *collstart = p;
5004             const Py_UNICODE *collend = p+1;
5005             const Py_UNICODE *coll;
5006
5007             /* find all untranslatable characters */
5008             while (collend < endp) {
5009                 if (charmaptranslate_lookup(*collend, mapping, &x))
5010                     goto onError;
5011                 Py_XDECREF(x);
5012                 if (x!=Py_None)
5013                     break;
5014                 ++collend;
5015             }
5016             /* cache callback name lookup
5017              * (if not done yet, i.e. it's the first error) */
5018             if (known_errorHandler==-1) {
5019                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5020                     known_errorHandler = 1;
5021                 else if (!strcmp(errors, "replace"))
5022                     known_errorHandler = 2;
5023                 else if (!strcmp(errors, "ignore"))
5024                     known_errorHandler = 3;
5025                 else if (!strcmp(errors, "xmlcharrefreplace"))
5026                     known_errorHandler = 4;
5027                 else
5028                     known_errorHandler = 0;
5029             }
5030             switch (known_errorHandler) {
5031             case 1: /* strict */
5032                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5033                 goto onError;
5034             case 2: /* replace */
5035                 /* No need to check for space, this is a 1:1 replacement */
5036                 for (coll = collstart; coll<collend; ++coll)
5037                     *str++ = '?';
5038                 /* fall through */
5039             case 3: /* ignore */
5040                 p = collend;
5041                 break;
5042             case 4: /* xmlcharrefreplace */
5043                 /* generate replacement (temporarily (mis)uses p) */
5044                 for (p = collstart; p < collend; ++p) {
5045                     char buffer[2+29+1+1];
5046                     char *cp;
5047                     sprintf(buffer, "&#%d;", (int)*p);
5048                     if (charmaptranslate_makespace(&res, &str,
5049                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5050                         goto onError;
5051                     for (cp = buffer; *cp; ++cp)
5052                         *str++ = *cp;
5053                 }
5054                 p = collend;
5055                 break;
5056             default:
5057                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5058                                                                  reason, startp, size, &exc,
5059                                                                  collstart-startp, collend-startp, &newpos);
5060                 if (repunicode == NULL)
5061                     goto onError;
5062                 /* generate replacement  */
5063                 repsize = PyUnicode_GET_SIZE(repunicode);
5064                 if (charmaptranslate_makespace(&res, &str,
5065                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5066                     Py_DECREF(repunicode);
5067                     goto onError;
5068                 }
5069                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5070                     *str++ = *uni2;
5071                 p = startp + newpos;
5072                 Py_DECREF(repunicode);
5073             }
5074         }
5075     }
5076     /* Resize if we allocated to much */
5077     respos = str-PyUnicode_AS_UNICODE(res);
5078     if (respos<PyUnicode_GET_SIZE(res)) {
5079         if (PyUnicode_Resize(&res, respos) < 0)
5080             goto onError;
5081     }
5082     Py_XDECREF(exc);
5083     Py_XDECREF(errorHandler);
5084     return res;
5085
5086   onError:
5087     Py_XDECREF(res);
5088     Py_XDECREF(exc);
5089     Py_XDECREF(errorHandler);
5090     return NULL;
5091 }
5092
5093 PyObject *PyUnicode_Translate(PyObject *str,
5094                               PyObject *mapping,
5095                               const char *errors)
5096 {
5097     PyObject *result;
5098
5099     str = PyUnicode_FromObject(str);
5100     if (str == NULL)
5101         goto onError;
5102     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5103                                         PyUnicode_GET_SIZE(str),
5104                                         mapping,
5105                                         errors);
5106     Py_DECREF(str);
5107     return result;
5108
5109   onError:
5110     Py_XDECREF(str);
5111     return NULL;
5112 }
5113
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5115
5116 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5117                             Py_ssize_t length,
5118                             char *output,
5119                             const char *errors)
5120 {
5121     Py_UNICODE *p, *end;
5122     PyObject *errorHandler = NULL;
5123     PyObject *exc = NULL;
5124     const char *encoding = "decimal";
5125     const char *reason = "invalid decimal Unicode string";
5126     /* the following variable is used for caching string comparisons
5127      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128     int known_errorHandler = -1;
5129
5130     if (output == NULL) {
5131         PyErr_BadArgument();
5132         return -1;
5133     }
5134
5135     p = s;
5136     end = s + length;
5137     while (p < end) {
5138         register Py_UNICODE ch = *p;
5139         int decimal;
5140         PyObject *repunicode;
5141         Py_ssize_t repsize;
5142         Py_ssize_t newpos;
5143         Py_UNICODE *uni2;
5144         Py_UNICODE *collstart;
5145         Py_UNICODE *collend;
5146
5147         if (Py_UNICODE_ISSPACE(ch)) {
5148             *output++ = ' ';
5149             ++p;
5150             continue;
5151         }
5152         decimal = Py_UNICODE_TODECIMAL(ch);
5153         if (decimal >= 0) {
5154             *output++ = '0' + decimal;
5155             ++p;
5156             continue;
5157         }
5158         if (0 < ch && ch < 256) {
5159             *output++ = (char)ch;
5160             ++p;
5161             continue;
5162         }
5163         /* All other characters are considered unencodable */
5164         collstart = p;
5165         collend = p+1;
5166         while (collend < end) {
5167             if ((0 < *collend && *collend < 256) ||
5168                 !Py_UNICODE_ISSPACE(*collend) ||
5169                 Py_UNICODE_TODECIMAL(*collend))
5170                 break;
5171         }
5172         /* cache callback name lookup
5173          * (if not done yet, i.e. it's the first error) */
5174         if (known_errorHandler==-1) {
5175             if ((errors==NULL) || (!strcmp(errors, "strict")))
5176                 known_errorHandler = 1;
5177             else if (!strcmp(errors, "replace"))
5178                 known_errorHandler = 2;
5179             else if (!strcmp(errors, "ignore"))
5180                 known_errorHandler = 3;
5181             else if (!strcmp(errors, "xmlcharrefreplace"))
5182                 known_errorHandler = 4;
5183             else
5184                 known_errorHandler = 0;
5185         }
5186         switch (known_errorHandler) {
5187         case 1: /* strict */
5188             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5189             goto onError;
5190         case 2: /* replace */
5191             for (p = collstart; p < collend; ++p)
5192                 *output++ = '?';
5193             /* fall through */
5194         case 3: /* ignore */
5195             p = collend;
5196             break;
5197         case 4: /* xmlcharrefreplace */
5198             /* generate replacement (temporarily (mis)uses p) */
5199             for (p = collstart; p < collend; ++p)
5200                 output += sprintf(output, "&#%d;", (int)*p);
5201             p = collend;
5202             break;
5203         default:
5204             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5205                                                           encoding, reason, s, length, &exc,
5206                                                           collstart-s, collend-s, &newpos);
5207             if (repunicode == NULL)
5208                 goto onError;
5209             /* generate replacement  */
5210             repsize = PyUnicode_GET_SIZE(repunicode);
5211             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5212                 Py_UNICODE ch = *uni2;
5213                 if (Py_UNICODE_ISSPACE(ch))
5214                     *output++ = ' ';
5215                 else {
5216                     decimal = Py_UNICODE_TODECIMAL(ch);
5217                     if (decimal >= 0)
5218                         *output++ = '0' + decimal;
5219                     else if (0 < ch && ch < 256)
5220                         *output++ = (char)ch;
5221                     else {
5222                         Py_DECREF(repunicode);
5223                         raise_encode_exception(&exc, encoding,
5224                                                s, length, collstart-s, collend-s, reason);
5225                         goto onError;
5226                     }
5227                 }
5228             }
5229             p = s + newpos;
5230             Py_DECREF(repunicode);
5231         }
5232     }
5233     /* 0-terminate the output string */
5234     *output++ = '\0';
5235     Py_XDECREF(exc);
5236     Py_XDECREF(errorHandler);
5237     return 0;
5238
5239   onError:
5240     Py_XDECREF(exc);
5241     Py_XDECREF(errorHandler);
5242     return -1;
5243 }
5244
5245 /* --- Helpers ------------------------------------------------------------ */
5246
5247 #include "stringlib/unicodedefs.h"
5248
5249 #define FROM_UNICODE
5250
5251 #include "stringlib/fastsearch.h"
5252
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
5256
/* Helper macro to fix up start/end slice values.

   Operates on variables named `start` and `end` that must exist in the
   scope where the macro is used; obj must be a pointer to a structure
   with a `length` member.
     - a negative start is offset by the length, then clamped to 0;
     - an end beyond the length is clamped to the length;
     - a negative end is offset by the length, then clamped to 0.

   The expansion is a sequence of plain `if` statements with no
   do { } while (0) wrapper, so it must not be used as the unbraced body
   of an if/else or loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5269
5270 Py_ssize_t PyUnicode_Count(PyObject *str,
5271                            PyObject *substr,
5272                            Py_ssize_t start,
5273                            Py_ssize_t end)
5274 {
5275     Py_ssize_t result;
5276     PyUnicodeObject* str_obj;
5277     PyUnicodeObject* sub_obj;
5278
5279     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5280     if (!str_obj)
5281         return -1;
5282     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5283     if (!sub_obj) {
5284         Py_DECREF(str_obj);
5285         return -1;
5286     }
5287
5288     FIX_START_END(str_obj);
5289
5290     result = stringlib_count(
5291         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5292         );
5293
5294     Py_DECREF(sub_obj);
5295     Py_DECREF(str_obj);
5296
5297     return result;
5298 }
5299
5300 Py_ssize_t PyUnicode_Find(PyObject *str,
5301                           PyObject *sub,
5302                           Py_ssize_t start,
5303                           Py_ssize_t end,
5304                           int direction)
5305 {
5306     Py_ssize_t result;
5307
5308     str = PyUnicode_FromObject(str);
5309     if (!str)
5310         return -2;
5311     sub = PyUnicode_FromObject(sub);
5312     if (!sub) {
5313         Py_DECREF(str);
5314         return -2;
5315     }
5316
5317     if (direction > 0)
5318         result = stringlib_find_slice(
5319             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5320             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5321             start, end
5322             );
5323     else
5324         result = stringlib_rfind_slice(
5325             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5326             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5327             start, end
5328             );
5329
5330     Py_DECREF(str);
5331     Py_DECREF(sub);
5332
5333     return result;
5334 }
5335
5336 static
5337 int tailmatch(PyUnicodeObject *self,
5338               PyUnicodeObject *substring,
5339               Py_ssize_t start,
5340               Py_ssize_t end,
5341               int direction)
5342 {
5343     if (substring->length == 0)
5344         return 1;
5345
5346     FIX_START_END(self);
5347
5348     end -= substring->length;
5349     if (end < start)
5350         return 0;
5351
5352     if (direction > 0) {
5353         if (Py_UNICODE_MATCH(self, end, substring))
5354             return 1;
5355     } else {
5356         if (Py_UNICODE_MATCH(self, start, substring))
5357             return 1;
5358     }
5359
5360     return 0;
5361 }
5362
5363 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5364                                PyObject *substr,
5365                                Py_ssize_t start,
5366                                Py_ssize_t end,
5367                                int direction)
5368 {
5369     Py_ssize_t result;
5370
5371     str = PyUnicode_FromObject(str);
5372     if (str == NULL)
5373         return -1;
5374     substr = PyUnicode_FromObject(substr);
5375     if (substr == NULL) {
5376         Py_DECREF(str);
5377         return -1;
5378     }
5379
5380     result = tailmatch((PyUnicodeObject *)str,
5381                        (PyUnicodeObject *)substr,
5382                        start, end, direction);
5383     Py_DECREF(str);
5384     Py_DECREF(substr);
5385     return result;
5386 }
5387
5388 /* Apply fixfct filter to the Unicode object self and return a
5389    reference to the modified object */
5390
5391 static
5392 PyObject *fixup(PyUnicodeObject *self,
5393                 int (*fixfct)(PyUnicodeObject *s))
5394 {
5395
5396     PyUnicodeObject *u;
5397
5398     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5399     if (u == NULL)
5400         return NULL;
5401
5402     Py_UNICODE_COPY(u->str, self->str, self->length);
5403
5404     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5405         /* fixfct should return TRUE if it modified the buffer. If
5406            FALSE, return a reference to the original buffer instead
5407            (to save space, not time) */
5408         Py_INCREF(self);
5409         Py_DECREF(u);
5410         return (PyObject*) self;
5411     }
5412     return (PyObject*) u;
5413 }
5414
5415 static
5416 int fixupper(PyUnicodeObject *self)
5417 {
5418     Py_ssize_t len = self->length;
5419     Py_UNICODE *s = self->str;
5420     int status = 0;
5421
5422     while (len-- > 0) {
5423         register Py_UNICODE ch;
5424
5425         ch = Py_UNICODE_TOUPPER(*s);
5426         if (ch != *s) {
5427             status = 1;
5428             *s = ch;
5429         }
5430         s++;
5431     }
5432
5433     return status;
5434 }
5435
5436 static
5437 int fixlower(PyUnicodeObject *self)
5438 {
5439     Py_ssize_t len = self->length;
5440     Py_UNICODE *s = self->str;
5441     int status = 0;
5442
5443     while (len-- > 0) {
5444         register Py_UNICODE ch;
5445
5446         ch = Py_UNICODE_TOLOWER(*s);
5447         if (ch != *s) {
5448             status = 1;
5449             *s = ch;
5450         }
5451         s++;
5452     }
5453
5454     return status;
5455 }
5456
5457 static
5458 int fixswapcase(PyUnicodeObject *self)
5459 {
5460     Py_ssize_t len = self->length;
5461     Py_UNICODE *s = self->str;
5462     int status = 0;
5463
5464     while (len-- > 0) {
5465         if (Py_UNICODE_ISUPPER(*s)) {
5466             *s = Py_UNICODE_TOLOWER(*s);
5467             status = 1;
5468         } else if (Py_UNICODE_ISLOWER(*s)) {
5469             *s = Py_UNICODE_TOUPPER(*s);
5470             status = 1;
5471         }
5472         s++;
5473     }
5474
5475     return status;
5476 }
5477
5478 static
5479 int fixcapitalize(PyUnicodeObject *self)
5480 {
5481     Py_ssize_t len = self->length;
5482     Py_UNICODE *s = self->str;
5483     int status = 0;
5484
5485     if (len == 0)
5486         return 0;
5487     if (Py_UNICODE_ISLOWER(*s)) {
5488         *s = Py_UNICODE_TOUPPER(*s);
5489         status = 1;
5490     }
5491     s++;
5492     while (--len > 0) {
5493         if (Py_UNICODE_ISUPPER(*s)) {
5494             *s = Py_UNICODE_TOLOWER(*s);
5495             status = 1;
5496         }
5497         s++;
5498     }
5499     return status;
5500 }
5501
5502 static
5503 int fixtitle(PyUnicodeObject *self)
5504 {
5505     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506     register Py_UNICODE *e;
5507     int previous_is_cased;
5508
5509     /* Shortcut for single character strings */
5510     if (PyUnicode_GET_SIZE(self) == 1) {
5511         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512         if (*p != ch) {
5513             *p = ch;
5514             return 1;
5515         }
5516         else
5517             return 0;
5518     }
5519
5520     e = p + PyUnicode_GET_SIZE(self);
5521     previous_is_cased = 0;
5522     for (; p < e; p++) {
5523         register const Py_UNICODE ch = *p;
5524
5525         if (previous_is_cased)
5526             *p = Py_UNICODE_TOLOWER(ch);
5527         else
5528             *p = Py_UNICODE_TOTITLE(ch);
5529
5530         if (Py_UNICODE_ISLOWER(ch) ||
5531             Py_UNICODE_ISUPPER(ch) ||
5532             Py_UNICODE_ISTITLE(ch))
5533             previous_is_cased = 1;
5534         else
5535             previous_is_cased = 0;
5536     }
5537     return 1;
5538 }
5539
5540 PyObject *
5541 PyUnicode_Join(PyObject *separator, PyObject *seq)
5542 {
5543     PyObject *internal_separator = NULL;
5544     const Py_UNICODE blank = ' ';
5545     const Py_UNICODE *sep = &blank;
5546     Py_ssize_t seplen = 1;
5547     PyUnicodeObject *res = NULL; /* the result */
5548     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5549     Py_ssize_t res_used;         /* # used bytes */
5550     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5551     PyObject *fseq;          /* PySequence_Fast(seq) */
5552     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5553     PyObject *item;
5554     Py_ssize_t i;
5555
5556     fseq = PySequence_Fast(seq, "");
5557     if (fseq == NULL) {
5558         return NULL;
5559     }
5560
5561     /* Grrrr.  A codec may be invoked to convert str objects to
5562      * Unicode, and so it's possible to call back into Python code
5563      * during PyUnicode_FromObject(), and so it's possible for a sick
5564      * codec to change the size of fseq (if seq is a list).  Therefore
5565      * we have to keep refetching the size -- can't assume seqlen
5566      * is invariant.
5567      */
5568     seqlen = PySequence_Fast_GET_SIZE(fseq);
5569     /* If empty sequence, return u"". */
5570     if (seqlen == 0) {
5571         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5572         goto Done;
5573     }
5574     /* If singleton sequence with an exact Unicode, return that. */
5575     if (seqlen == 1) {
5576         item = PySequence_Fast_GET_ITEM(fseq, 0);
5577         if (PyUnicode_CheckExact(item)) {
5578             Py_INCREF(item);
5579             res = (PyUnicodeObject *)item;
5580             goto Done;
5581         }
5582     }
5583
5584     /* At least two items to join, or one that isn't exact Unicode. */
5585     if (seqlen > 1) {
5586         /* Set up sep and seplen -- they're needed. */
5587         if (separator == NULL) {
5588             sep = &blank;
5589             seplen = 1;
5590         }
5591         else {
5592             internal_separator = PyUnicode_FromObject(separator);
5593             if (internal_separator == NULL)
5594                 goto onError;
5595             sep = PyUnicode_AS_UNICODE(internal_separator);
5596             seplen = PyUnicode_GET_SIZE(internal_separator);
5597             /* In case PyUnicode_FromObject() mutated seq. */
5598             seqlen = PySequence_Fast_GET_SIZE(fseq);
5599         }
5600     }
5601
5602     /* Get space. */
5603     res = _PyUnicode_New(res_alloc);
5604     if (res == NULL)
5605         goto onError;
5606     res_p = PyUnicode_AS_UNICODE(res);
5607     res_used = 0;
5608
5609     for (i = 0; i < seqlen; ++i) {
5610         Py_ssize_t itemlen;
5611         Py_ssize_t new_res_used;
5612
5613         item = PySequence_Fast_GET_ITEM(fseq, i);
5614         /* Convert item to Unicode. */
5615         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5616             PyErr_Format(PyExc_TypeError,
5617                          "sequence item %zd: expected string or Unicode,"
5618                          " %.80s found",
5619                          i, Py_TYPE(item)->tp_name);
5620             goto onError;
5621         }
5622         item = PyUnicode_FromObject(item);
5623         if (item == NULL)
5624             goto onError;
5625         /* We own a reference to item from here on. */
5626
5627         /* In case PyUnicode_FromObject() mutated seq. */
5628         seqlen = PySequence_Fast_GET_SIZE(fseq);
5629
5630         /* Make sure we have enough space for the separator and the item. */
5631         itemlen = PyUnicode_GET_SIZE(item);
5632         new_res_used = res_used + itemlen;
5633         if (new_res_used < 0)
5634             goto Overflow;
5635         if (i < seqlen - 1) {
5636             new_res_used += seplen;
5637             if (new_res_used < 0)
5638                 goto Overflow;
5639         }
5640         if (new_res_used > res_alloc) {
5641             /* double allocated size until it's big enough */
5642             do {
5643                 res_alloc += res_alloc;
5644                 if (res_alloc <= 0)
5645                     goto Overflow;
5646             } while (new_res_used > res_alloc);
5647             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5648                 Py_DECREF(item);
5649                 goto onError;
5650             }
5651             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5652         }
5653
5654         /* Copy item, and maybe the separator. */
5655         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5656         res_p += itemlen;
5657         if (i < seqlen - 1) {
5658             Py_UNICODE_COPY(res_p, sep, seplen);
5659             res_p += seplen;
5660         }
5661         Py_DECREF(item);
5662         res_used = new_res_used;
5663     }
5664
5665     /* Shrink res to match the used area; this probably can't fail,
5666      * but it's cheap to check.
5667      */
5668     if (_PyUnicode_Resize(&res, res_used) < 0)
5669         goto onError;
5670
5671   Done:
5672     Py_XDECREF(internal_separator);
5673     Py_DECREF(fseq);
5674     return (PyObject *)res;
5675
5676   Overflow:
5677     PyErr_SetString(PyExc_OverflowError,
5678                     "join() result is too long for a Python string");
5679     Py_DECREF(item);
5680     /* fall through */
5681
5682   onError:
5683     Py_XDECREF(internal_separator);
5684     Py_DECREF(fseq);
5685     Py_XDECREF(res);
5686     return NULL;
5687 }
5688
5689 static
5690 PyUnicodeObject *pad(PyUnicodeObject *self,
5691                      Py_ssize_t left,
5692                      Py_ssize_t right,
5693                      Py_UNICODE fill)
5694 {
5695     PyUnicodeObject *u;
5696
5697     if (left < 0)
5698         left = 0;
5699     if (right < 0)
5700         right = 0;
5701
5702     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5703         Py_INCREF(self);
5704         return self;
5705     }
5706
5707     if (left > PY_SSIZE_T_MAX - self->length ||
5708         right > PY_SSIZE_T_MAX - (left + self->length)) {
5709         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710         return NULL;
5711     }
5712     u = _PyUnicode_New(left + self->length + right);
5713     if (u) {
5714         if (left)
5715             Py_UNICODE_FILL(u->str, fill, left);
5716         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717         if (right)
5718             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5719     }
5720
5721     return u;
5722 }
5723
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names supplied by the expanding function: a PyObject *
   variable `str` (used as scratch), a `list` variable, and an `onError`
   label that is jumped to when creating the string or appending it
   fails.  The temporary reference held in `str` is released on both the
   success and the failure path of the append.

   NOTE: expands to several statements that are not wrapped in
   do { ... } while (0); use it only as a complete statement inside a
   braced block. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
5734
5735 static
5736 PyObject *split_whitespace(PyUnicodeObject *self,
5737                            PyObject *list,
5738                            Py_ssize_t maxcount)
5739 {
5740     register Py_ssize_t i;
5741     register Py_ssize_t j;
5742     Py_ssize_t len = self->length;
5743     PyObject *str;
5744     register const Py_UNICODE *buf = self->str;
5745
5746     for (i = j = 0; i < len; ) {
5747         /* find a token */
5748         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5749             i++;
5750         j = i;
5751         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5752             i++;
5753         if (j < i) {
5754             if (maxcount-- <= 0)
5755                 break;
5756             SPLIT_APPEND(buf, j, i);
5757             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5758                 i++;
5759             j = i;
5760         }
5761     }
5762     if (j < len) {
5763         SPLIT_APPEND(buf, j, len);
5764     }
5765     return list;
5766
5767   onError:
5768     Py_DECREF(list);
5769     return NULL;
5770 }
5771
5772 PyObject *PyUnicode_Splitlines(PyObject *string,
5773                                int keepends)
5774 {
5775     register Py_ssize_t i;
5776     register Py_ssize_t j;
5777     Py_ssize_t len;
5778     PyObject *list;
5779     PyObject *str;
5780     Py_UNICODE *data;
5781
5782     string = PyUnicode_FromObject(string);
5783     if (string == NULL)
5784         return NULL;
5785     data = PyUnicode_AS_UNICODE(string);
5786     len = PyUnicode_GET_SIZE(string);
5787
5788     list = PyList_New(0);
5789     if (!list)
5790         goto onError;
5791
5792     for (i = j = 0; i < len; ) {
5793         Py_ssize_t eol;
5794
5795         /* Find a line and append it */
5796         while (i < len && !BLOOM_LINEBREAK(data[i]))
5797             i++;
5798
5799         /* Skip the line break reading CRLF as one line break */
5800         eol = i;
5801         if (i < len) {
5802             if (data[i] == '\r' && i + 1 < len &&
5803                 data[i+1] == '\n')
5804                 i += 2;
5805             else
5806                 i++;
5807             if (keepends)
5808                 eol = i;
5809         }
5810         SPLIT_APPEND(data, j, eol);
5811         j = i;
5812     }
5813     if (j < len) {
5814         SPLIT_APPEND(data, j, len);
5815     }
5816
5817     Py_DECREF(string);
5818     return list;
5819
5820   onError:
5821     Py_XDECREF(list);
5822     Py_DECREF(string);
5823     return NULL;
5824 }
5825
5826 static
5827 PyObject *split_char(PyUnicodeObject *self,
5828                      PyObject *list,
5829                      Py_UNICODE ch,
5830                      Py_ssize_t maxcount)
5831 {
5832     register Py_ssize_t i;
5833     register Py_ssize_t j;
5834     Py_ssize_t len = self->length;
5835     PyObject *str;
5836     register const Py_UNICODE *buf = self->str;
5837
5838     for (i = j = 0; i < len; ) {
5839         if (buf[i] == ch) {
5840             if (maxcount-- <= 0)
5841                 break;
5842             SPLIT_APPEND(buf, j, i);
5843             i = j = i + 1;
5844         } else
5845             i++;
5846     }
5847     if (j <= len) {
5848         SPLIT_APPEND(buf, j, len);
5849     }
5850     return list;
5851
5852   onError:
5853     Py_DECREF(list);
5854     return NULL;
5855 }
5856
5857 static
5858 PyObject *split_substring(PyUnicodeObject *self,
5859                           PyObject *list,
5860                           PyUnicodeObject *substring,
5861                           Py_ssize_t maxcount)
5862 {
5863     register Py_ssize_t i;
5864     register Py_ssize_t j;
5865     Py_ssize_t len = self->length;
5866     Py_ssize_t sublen = substring->length;
5867     PyObject *str;
5868
5869     for (i = j = 0; i <= len - sublen; ) {
5870         if (Py_UNICODE_MATCH(self, i, substring)) {
5871             if (maxcount-- <= 0)
5872                 break;
5873             SPLIT_APPEND(self->str, j, i);
5874             i = j = i + sublen;
5875         } else
5876             i++;
5877     }
5878     if (j <= len) {
5879         SPLIT_APPEND(self->str, j, len);
5880     }
5881     return list;
5882
5883   onError:
5884     Py_DECREF(list);
5885     return NULL;
5886 }
5887
5888 static
5889 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5890                             PyObject *list,
5891                             Py_ssize_t maxcount)
5892 {
5893     register Py_ssize_t i;
5894     register Py_ssize_t j;
5895     Py_ssize_t len = self->length;
5896     PyObject *str;
5897     register const Py_UNICODE *buf = self->str;
5898
5899     for (i = j = len - 1; i >= 0; ) {
5900         /* find a token */
5901         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5902             i--;
5903         j = i;
5904         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5905             i--;
5906         if (j > i) {
5907             if (maxcount-- <= 0)
5908                 break;
5909             SPLIT_APPEND(buf, i + 1, j + 1);
5910             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5911                 i--;
5912             j = i;
5913         }
5914     }
5915     if (j >= 0) {
5916         SPLIT_APPEND(buf, 0, j + 1);
5917     }
5918     if (PyList_Reverse(list) < 0)
5919         goto onError;
5920     return list;
5921
5922   onError:
5923     Py_DECREF(list);
5924     return NULL;
5925 }
5926
5927 static
5928 PyObject *rsplit_char(PyUnicodeObject *self,
5929                       PyObject *list,
5930                       Py_UNICODE ch,
5931                       Py_ssize_t maxcount)
5932 {
5933     register Py_ssize_t i;
5934     register Py_ssize_t j;
5935     Py_ssize_t len = self->length;
5936     PyObject *str;
5937     register const Py_UNICODE *buf = self->str;
5938
5939     for (i = j = len - 1; i >= 0; ) {
5940         if (buf[i] == ch) {
5941             if (maxcount-- <= 0)
5942                 break;
5943             SPLIT_APPEND(buf, i + 1, j + 1);
5944             j = i = i - 1;
5945         } else
5946             i--;
5947     }
5948     if (j >= -1) {
5949         SPLIT_APPEND(buf, 0, j + 1);
5950     }
5951     if (PyList_Reverse(list) < 0)
5952         goto onError;
5953     return list;
5954
5955   onError:
5956     Py_DECREF(list);
5957     return NULL;
5958 }
5959
5960 static
5961 PyObject *rsplit_substring(PyUnicodeObject *self,
5962                            PyObject *list,
5963                            PyUnicodeObject *substring,
5964                            Py_ssize_t maxcount)
5965 {
5966     register Py_ssize_t i;
5967     register Py_ssize_t j;
5968     Py_ssize_t len = self->length;
5969     Py_ssize_t sublen = substring->length;
5970     PyObject *str;
5971
5972     for (i = len - sublen, j = len; i >= 0; ) {
5973         if (Py_UNICODE_MATCH(self, i, substring)) {
5974             if (maxcount-- <= 0)
5975                 break;
5976             SPLIT_APPEND(self->str, i + sublen, j);
5977             j = i;
5978             i -= sublen;
5979         } else
5980             i--;
5981     }
5982     if (j >= 0) {
5983         SPLIT_APPEND(self->str, 0, j);
5984     }
5985     if (PyList_Reverse(list) < 0)
5986         goto onError;
5987     return list;
5988
5989   onError:
5990     Py_DECREF(list);
5991     return NULL;
5992 }
5993
5994 #undef SPLIT_APPEND
5995
5996 static
5997 PyObject *split(PyUnicodeObject *self,
5998                 PyUnicodeObject *substring,
5999                 Py_ssize_t maxcount)
6000 {
6001     PyObject *list;
6002
6003     if (maxcount < 0)
6004         maxcount = PY_SSIZE_T_MAX;
6005
6006     list = PyList_New(0);
6007     if (!list)
6008         return NULL;
6009
6010     if (substring == NULL)
6011         return split_whitespace(self,list,maxcount);
6012
6013     else if (substring->length == 1)
6014         return split_char(self,list,substring->str[0],maxcount);
6015
6016     else if (substring->length == 0) {
6017         Py_DECREF(list);
6018         PyErr_SetString(PyExc_ValueError, "empty separator");
6019         return NULL;
6020     }
6021     else
6022         return split_substring(self,list,substring,maxcount);
6023 }
6024
6025 static
6026 PyObject *rsplit(PyUnicodeObject *self,
6027                  PyUnicodeObject *substring,
6028                  Py_ssize_t maxcount)
6029 {
6030     PyObject *list;
6031
6032     if (maxcount < 0)
6033         maxcount = PY_SSIZE_T_MAX;
6034
6035     list = PyList_New(0);
6036     if (!list)
6037         return NULL;
6038
6039     if (substring == NULL)
6040         return rsplit_whitespace(self,list,maxcount);
6041
6042     else if (substring->length == 1)
6043         return rsplit_char(self,list,substring->str[0],maxcount);
6044
6045     else if (substring->length == 0) {
6046         Py_DECREF(list);
6047         PyErr_SetString(PyExc_ValueError, "empty separator");
6048         return NULL;
6049     }
6050     else
6051         return rsplit_substring(self,list,substring,maxcount);
6052 }
6053
6054 static
6055 PyObject *replace(PyUnicodeObject *self,
6056                   PyUnicodeObject *str1,
6057                   PyUnicodeObject *str2,
6058                   Py_ssize_t maxcount)
6059 {
6060     PyUnicodeObject *u;
6061
6062     if (maxcount < 0)
6063         maxcount = PY_SSIZE_T_MAX;
6064
6065     if (str1->length == str2->length) {
6066         /* same length */
6067         Py_ssize_t i;
6068         if (str1->length == 1) {
6069             /* replace characters */
6070             Py_UNICODE u1, u2;
6071             if (!findchar(self->str, self->length, str1->str[0]))
6072                 goto nothing;
6073             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6074             if (!u)
6075                 return NULL;
6076             Py_UNICODE_COPY(u->str, self->str, self->length);
6077             u1 = str1->str[0];
6078             u2 = str2->str[0];
6079             for (i = 0; i < u->length; i++)
6080                 if (u->str[i] == u1) {
6081                     if (--maxcount < 0)
6082                         break;
6083                     u->str[i] = u2;
6084                 }
6085         } else {
6086             i = fastsearch(
6087                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6088                 );
6089             if (i < 0)
6090                 goto nothing;
6091             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6092             if (!u)
6093                 return NULL;
6094             Py_UNICODE_COPY(u->str, self->str, self->length);
6095             while (i <= self->length - str1->length)
6096                 if (Py_UNICODE_MATCH(self, i, str1)) {
6097                     if (--maxcount < 0)
6098                         break;
6099                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6100                     i += str1->length;
6101                 } else
6102                     i++;
6103         }
6104     } else {
6105
6106         Py_ssize_t n, i, j, e;
6107         Py_ssize_t product, new_size, delta;
6108         Py_UNICODE *p;
6109
6110         /* replace strings */
6111         n = stringlib_count(self->str, self->length, str1->str, str1->length);
6112         if (n > maxcount)
6113             n = maxcount;
6114         if (n == 0)
6115             goto nothing;
6116         /* new_size = self->length + n * (str2->length - str1->length)); */
6117         delta = (str2->length - str1->length);
6118         if (delta == 0) {
6119             new_size = self->length;
6120         } else {
6121             product = n * (str2->length - str1->length);
6122             if ((product / (str2->length - str1->length)) != n) {
6123                 PyErr_SetString(PyExc_OverflowError,
6124                                 "replace string is too long");
6125                 return NULL;
6126             }
6127             new_size = self->length + product;
6128             if (new_size < 0) {
6129                 PyErr_SetString(PyExc_OverflowError,
6130                                 "replace string is too long");
6131                 return NULL;
6132             }
6133         }
6134         u = _PyUnicode_New(new_size);
6135         if (!u)
6136             return NULL;
6137         i = 0;
6138         p = u->str;
6139         e = self->length - str1->length;
6140         if (str1->length > 0) {
6141             while (n-- > 0) {
6142                 /* look for next match */
6143                 j = i;
6144                 while (j <= e) {
6145                     if (Py_UNICODE_MATCH(self, j, str1))
6146                         break;
6147                     j++;
6148                 }
6149                 if (j > i) {
6150                     if (j > e)
6151                         break;
6152                     /* copy unchanged part [i:j] */
6153                     Py_UNICODE_COPY(p, self->str+i, j-i);
6154                     p += j - i;
6155                 }
6156                 /* copy substitution string */
6157                 if (str2->length > 0) {
6158                     Py_UNICODE_COPY(p, str2->str, str2->length);
6159                     p += str2->length;
6160                 }
6161                 i = j + str1->length;
6162             }
6163             if (i < self->length)
6164                 /* copy tail [i:] */
6165                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6166         } else {
6167             /* interleave */
6168             while (n > 0) {
6169                 Py_UNICODE_COPY(p, str2->str, str2->length);
6170                 p += str2->length;
6171                 if (--n <= 0)
6172                     break;
6173                 *p++ = self->str[i++];
6174             }
6175             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6176         }
6177     }
6178     return (PyObject *) u;
6179
6180   nothing:
6181     /* nothing to replace; return original string (when possible) */
6182     if (PyUnicode_CheckExact(self)) {
6183         Py_INCREF(self);
6184         return (PyObject *) self;
6185     }
6186     return PyUnicode_FromUnicode(self->str, self->length);
6187 }
6188
6189 /* --- Unicode Object Methods --------------------------------------------- */
6190
6191 PyDoc_STRVAR(title__doc__,
6192              "S.title() -> unicode\n\
6193 \n\
6194 Return a titlecased version of S, i.e. words start with title case\n\
6195 characters, all remaining cased characters have lower case.");
6196
6197 static PyObject*
6198 unicode_title(PyUnicodeObject *self)
6199 {
6200     return fixup(self, fixtitle);
6201 }
6202
6203 PyDoc_STRVAR(capitalize__doc__,
6204              "S.capitalize() -> unicode\n\
6205 \n\
6206 Return a capitalized version of S, i.e. make the first character\n\
6207 have upper case.");
6208
6209 static PyObject*
6210 unicode_capitalize(PyUnicodeObject *self)
6211 {
6212     return fixup(self, fixcapitalize);
6213 }
6214
/* Disabled code: everything up to the matching #endif is compiled out. */
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word with
   fixup(..., fixcapitalize) and join the words again with a single
   blank.  Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* on failure item is NULL, which is what onError returns */
        if (item == NULL)
            goto onError;
        /* drop the list's old entry, then store the new reference */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    /* a NULL separator makes PyUnicode_Join() use a single blank */
    item = PyUnicode_Join(NULL, list);

    /* reached on success as well as on error */
  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6252
6253 /* Argument converter.  Coerces to a single unicode character */
6254
6255 static int
6256 convert_uc(PyObject *obj, void *addr)
6257 {
6258     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6259     PyObject *uniobj;
6260     Py_UNICODE *unistr;
6261
6262     uniobj = PyUnicode_FromObject(obj);
6263     if (uniobj == NULL) {
6264         PyErr_SetString(PyExc_TypeError,
6265                         "The fill character cannot be converted to Unicode");
6266         return 0;
6267     }
6268     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6269         PyErr_SetString(PyExc_TypeError,
6270                         "The fill character must be exactly one character long");
6271         Py_DECREF(uniobj);
6272         return 0;
6273     }
6274     unistr = PyUnicode_AS_UNICODE(uniobj);
6275     *fillcharloc = unistr[0];
6276     Py_DECREF(uniobj);
6277     return 1;
6278 }
6279
6280 PyDoc_STRVAR(center__doc__,
6281              "S.center(width[, fillchar]) -> unicode\n\
6282 \n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6285
6286 static PyObject *
6287 unicode_center(PyUnicodeObject *self, PyObject *args)
6288 {
6289     Py_ssize_t marg, left;
6290     Py_ssize_t width;
6291     Py_UNICODE fillchar = ' ';
6292
6293     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6294         return NULL;
6295
6296     if (self->length >= width && PyUnicode_CheckExact(self)) {
6297         Py_INCREF(self);
6298         return (PyObject*) self;
6299     }
6300
6301     marg = width - self->length;
6302     left = marg / 2 + (marg & width & 1);
6303
6304     return (PyObject*) pad(self, left, marg - left, fillchar);
6305 }
6306
6307 #if 0
6308
6309 /* This code should go into some future Unicode collation support
6310    module. The basic comparison should compare ordinals on a naive
6311    basis (this is what Java does and thus Jython too). */
6312
6313 /* speedy UTF-16 code point order comparison */
6314 /* gleaned from: */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6316
/* Adjustment added to a code unit c by the comparison routine that
   follows; the table is indexed by c >> 11. */
static short utf16Fixup[32] =
{
    0,      0,      0,      0,
    0,      0,      0,      0,
    0,      0,      0,      0,
    0,      0,      0,      0,
    0,      0,      0,      0,
    0,      0,      0,      0,
    0,      0,      0,      0x2000,
    -0x800, -0x800, -0x800, -0x800
};
6324
6325 static int
6326 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6327 {
6328     Py_ssize_t len1, len2;
6329
6330     Py_UNICODE *s1 = str1->str;
6331     Py_UNICODE *s2 = str2->str;
6332
6333     len1 = str1->length;
6334     len2 = str2->length;
6335
6336     while (len1 > 0 && len2 > 0) {
6337         Py_UNICODE c1, c2;
6338
6339         c1 = *s1++;
6340         c2 = *s2++;
6341
6342         if (c1 > (1<<11) * 26)
6343             c1 += utf16Fixup[c1>>11];
6344         if (c2 > (1<<11) * 26)
6345             c2 += utf16Fixup[c2>>11];
6346         /* now c1 and c2 are in UTF-32-compatible order */
6347
6348         if (c1 != c2)
6349             return (c1 < c2) ? -1 : 1;
6350
6351         len1--; len2--;
6352     }
6353
6354     return (len1 < len2) ? -1 : (len1 != len2);
6355 }
6356
6357 #else
6358
6359 static int
6360 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6361 {
6362     register Py_ssize_t len1, len2;
6363
6364     Py_UNICODE *s1 = str1->str;
6365     Py_UNICODE *s2 = str2->str;
6366
6367     len1 = str1->length;
6368     len2 = str2->length;
6369
6370     while (len1 > 0 && len2 > 0) {
6371         Py_UNICODE c1, c2;
6372
6373         c1 = *s1++;
6374         c2 = *s2++;
6375
6376         if (c1 != c2)
6377             return (c1 < c2) ? -1 : 1;
6378
6379         len1--; len2--;
6380     }
6381
6382     return (len1 < len2) ? -1 : (len1 != len2);
6383 }
6384
6385 #endif
6386
6387 int PyUnicode_Compare(PyObject *left,
6388                       PyObject *right)
6389 {
6390     PyUnicodeObject *u = NULL, *v = NULL;
6391     int result;
6392
6393     /* Coerce the two arguments */
6394     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6395     if (u == NULL)
6396         goto onError;
6397     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6398     if (v == NULL)
6399         goto onError;
6400
6401     /* Shortcut for empty or interned objects */
6402     if (v == u) {
6403         Py_DECREF(u);
6404         Py_DECREF(v);
6405         return 0;
6406     }
6407
6408     result = unicode_compare(u, v);
6409
6410     Py_DECREF(u);
6411     Py_DECREF(v);
6412     return result;
6413
6414   onError:
6415     Py_XDECREF(u);
6416     Py_XDECREF(v);
6417     return -1;
6418 }
6419
6420 PyObject *PyUnicode_RichCompare(PyObject *left,
6421                                 PyObject *right,
6422                                 int op)
6423 {
6424     int result;
6425
6426     result = PyUnicode_Compare(left, right);
6427     if (result == -1 && PyErr_Occurred())
6428         goto onError;
6429
6430     /* Convert the return value to a Boolean */
6431     switch (op) {
6432     case Py_EQ:
6433         result = (result == 0);
6434         break;
6435     case Py_NE:
6436         result = (result != 0);
6437         break;
6438     case Py_LE:
6439         result = (result <= 0);
6440         break;
6441     case Py_GE:
6442         result = (result >= 0);
6443         break;
6444     case Py_LT:
6445         result = (result == -1);
6446         break;
6447     case Py_GT:
6448         result = (result == 1);
6449         break;
6450     }
6451     return PyBool_FromLong(result);
6452
6453   onError:
6454
6455     /* Standard case
6456
6457        Type errors mean that PyUnicode_FromObject() could not convert
6458        one of the arguments (usually the right hand side) to Unicode,
6459        ie. we can't handle the comparison request. However, it is
6460        possible that the other object knows a comparison method, which
6461        is why we return Py_NotImplemented to give the other object a
6462        chance.
6463
6464     */
6465     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6466         PyErr_Clear();
6467         Py_INCREF(Py_NotImplemented);
6468         return Py_NotImplemented;
6469     }
6470     if (op != Py_EQ && op != Py_NE)
6471         return NULL;
6472
6473     /* Equality comparison.
6474
6475        This is a special case: we silence any PyExc_UnicodeDecodeError
6476        and instead turn it into a PyErr_UnicodeWarning.
6477
6478     */
6479     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6480         return NULL;
6481     PyErr_Clear();
6482     if (PyErr_Warn(PyExc_UnicodeWarning,
6483                    (op == Py_EQ) ?
6484                    "Unicode equal comparison "
6485                    "failed to convert both arguments to Unicode - "
6486                    "interpreting them as being unequal" :
6487                    "Unicode unequal comparison "
6488                    "failed to convert both arguments to Unicode - "
6489                    "interpreting them as being unequal"
6490             ) < 0)
6491         return NULL;
6492     result = (op == Py_NE);
6493     return PyBool_FromLong(result);
6494 }
6495
6496 int PyUnicode_Contains(PyObject *container,
6497                        PyObject *element)
6498 {
6499     PyObject *str, *sub;
6500     int result;
6501
6502     /* Coerce the two arguments */
6503     sub = PyUnicode_FromObject(element);
6504     if (!sub) {
6505         PyErr_SetString(PyExc_TypeError,
6506                         "'in <string>' requires string as left operand");
6507         return -1;
6508     }
6509
6510     str = PyUnicode_FromObject(container);
6511     if (!str) {
6512         Py_DECREF(sub);
6513         return -1;
6514     }
6515
6516     result = stringlib_contains_obj(str, sub);
6517
6518     Py_DECREF(str);
6519     Py_DECREF(sub);
6520
6521     return result;
6522 }
6523
6524 /* Concat to string or Unicode object giving a new Unicode object. */
6525
6526 PyObject *PyUnicode_Concat(PyObject *left,
6527                            PyObject *right)
6528 {
6529     PyUnicodeObject *u = NULL, *v = NULL, *w;
6530
6531     /* Coerce the two arguments */
6532     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6533     if (u == NULL)
6534         goto onError;
6535     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6536     if (v == NULL)
6537         goto onError;
6538
6539     /* Shortcuts */
6540     if (v == unicode_empty) {
6541         Py_DECREF(v);
6542         return (PyObject *)u;
6543     }
6544     if (u == unicode_empty) {
6545         Py_DECREF(u);
6546         return (PyObject *)v;
6547     }
6548
6549     /* Concat the two Unicode strings */
6550     w = _PyUnicode_New(u->length + v->length);
6551     if (w == NULL)
6552         goto onError;
6553     Py_UNICODE_COPY(w->str, u->str, u->length);
6554     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6555
6556     Py_DECREF(u);
6557     Py_DECREF(v);
6558     return (PyObject *)w;
6559
6560   onError:
6561     Py_XDECREF(u);
6562     Py_XDECREF(v);
6563     return NULL;
6564 }
6565
6566 PyDoc_STRVAR(count__doc__,
6567              "S.count(sub[, start[, end]]) -> int\n\
6568 \n\
6569 Return the number of non-overlapping occurrences of substring sub in\n\
6570 Unicode string S[start:end].  Optional arguments start and end are\n\
6571 interpreted as in slice notation.");
6572
6573 static PyObject *
6574 unicode_count(PyUnicodeObject *self, PyObject *args)
6575 {
6576     PyUnicodeObject *substring;
6577     Py_ssize_t start = 0;
6578     Py_ssize_t end = PY_SSIZE_T_MAX;
6579     PyObject *result;
6580
6581     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6582                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6583         return NULL;
6584
6585     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6586         (PyObject *)substring);
6587     if (substring == NULL)
6588         return NULL;
6589
6590     FIX_START_END(self);
6591
6592     result = PyInt_FromSsize_t(
6593         stringlib_count(self->str + start, end - start,
6594                         substring->str, substring->length)
6595         );
6596
6597     Py_DECREF(substring);
6598
6599     return result;
6600 }
6601
6602 PyDoc_STRVAR(encode__doc__,
6603              "S.encode([encoding[,errors]]) -> string or unicode\n\
6604 \n\
6605 Encodes S using the codec registered for encoding. encoding defaults\n\
6606 to the default encoding. errors may be given to set a different error\n\
6607 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6608 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6609 'xmlcharrefreplace' as well as any other name registered with\n\
6610 codecs.register_error that can handle UnicodeEncodeErrors.");
6611
6612 static PyObject *
6613 unicode_encode(PyUnicodeObject *self, PyObject *args)
6614 {
6615     char *encoding = NULL;
6616     char *errors = NULL;
6617     PyObject *v;
6618
6619     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6620         return NULL;
6621     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6622     if (v == NULL)
6623         goto onError;
6624     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6625         PyErr_Format(PyExc_TypeError,
6626                      "encoder did not return a string/unicode object "
6627                      "(type=%.400s)",
6628                      Py_TYPE(v)->tp_name);
6629         Py_DECREF(v);
6630         return NULL;
6631     }
6632     return v;
6633
6634   onError:
6635     return NULL;
6636 }
6637
6638 PyDoc_STRVAR(decode__doc__,
6639              "S.decode([encoding[,errors]]) -> string or unicode\n\
6640 \n\
6641 Decodes S using the codec registered for encoding. encoding defaults\n\
6642 to the default encoding. errors may be given to set a different error\n\
6643 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6644 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6645 as well as any other name registerd with codecs.register_error that is\n\
6646 able to handle UnicodeDecodeErrors.");
6647
6648 static PyObject *
6649 unicode_decode(PyUnicodeObject *self, PyObject *args)
6650 {
6651     char *encoding = NULL;
6652     char *errors = NULL;
6653     PyObject *v;
6654
6655     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6656         return NULL;
6657     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6658     if (v == NULL)
6659         goto onError;
6660     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6661         PyErr_Format(PyExc_TypeError,
6662                      "decoder did not return a string/unicode object "
6663                      "(type=%.400s)",
6664                      Py_TYPE(v)->tp_name);
6665         Py_DECREF(v);
6666         return NULL;
6667     }
6668     return v;
6669
6670   onError:
6671     return NULL;
6672 }
6673
6674 PyDoc_STRVAR(expandtabs__doc__,
6675              "S.expandtabs([tabsize]) -> unicode\n\
6676 \n\
6677 Return a copy of S where all tab characters are expanded using spaces.\n\
6678 If tabsize is not given, a tab size of 8 characters is assumed.");
6679
6680 static PyObject*
6681 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6682 {
6683     Py_UNICODE *e;
6684     Py_UNICODE *p;
6685     Py_UNICODE *q;
6686     Py_UNICODE *qe;
6687     Py_ssize_t i, j, incr;
6688     PyUnicodeObject *u;
6689     int tabsize = 8;
6690
6691     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6692         return NULL;
6693
6694     /* First pass: determine size of output string */
6695     i = 0; /* chars up to and including most recent \n or \r */
6696     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6697     e = self->str + self->length; /* end of input */
6698     for (p = self->str; p < e; p++)
6699         if (*p == '\t') {
6700             if (tabsize > 0) {
6701                 incr = tabsize - (j % tabsize); /* cannot overflow */
6702                 if (j > PY_SSIZE_T_MAX - incr)
6703                     goto overflow1;
6704                 j += incr;
6705             }
6706         }
6707         else {
6708             if (j > PY_SSIZE_T_MAX - 1)
6709                 goto overflow1;
6710             j++;
6711             if (*p == '\n' || *p == '\r') {
6712                 if (i > PY_SSIZE_T_MAX - j)
6713                     goto overflow1;
6714                 i += j;
6715                 j = 0;
6716             }
6717         }
6718
6719     if (i > PY_SSIZE_T_MAX - j)
6720         goto overflow1;
6721
6722     /* Second pass: create output string and fill it */
6723     u = _PyUnicode_New(i + j);
6724     if (!u)
6725         return NULL;
6726
6727     j = 0; /* same as in first pass */
6728     q = u->str; /* next output char */
6729     qe = u->str + u->length; /* end of output */
6730
6731     for (p = self->str; p < e; p++)
6732         if (*p == '\t') {
6733             if (tabsize > 0) {
6734                 i = tabsize - (j % tabsize);
6735                 j += i;
6736                 while (i--) {
6737                     if (q >= qe)
6738                         goto overflow2;
6739                     *q++ = ' ';
6740                 }
6741             }
6742         }
6743         else {
6744             if (q >= qe)
6745                 goto overflow2;
6746             *q++ = *p;
6747             j++;
6748             if (*p == '\n' || *p == '\r')
6749                 j = 0;
6750         }
6751
6752     return (PyObject*) u;
6753
6754   overflow2:
6755     Py_DECREF(u);
6756   overflow1:
6757     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6758     return NULL;
6759 }
6760
6761 PyDoc_STRVAR(find__doc__,
6762              "S.find(sub [,start [,end]]) -> int\n\
6763 \n\
6764 Return the lowest index in S where substring sub is found,\n\
6765 such that sub is contained within s[start:end].  Optional\n\
6766 arguments start and end are interpreted as in slice notation.\n\
6767 \n\
6768 Return -1 on failure.");
6769
6770 static PyObject *
6771 unicode_find(PyUnicodeObject *self, PyObject *args)
6772 {
6773     PyObject *substring;
6774     Py_ssize_t start;
6775     Py_ssize_t end;
6776     Py_ssize_t result;
6777
6778     if (!_ParseTupleFinds(args, &substring, &start, &end))
6779         return NULL;
6780
6781     result = stringlib_find_slice(
6782         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6783         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6784         start, end
6785         );
6786
6787     Py_DECREF(substring);
6788
6789     return PyInt_FromSsize_t(result);
6790 }
6791
6792 static PyObject *
6793 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6794 {
6795     if (index < 0 || index >= self->length) {
6796         PyErr_SetString(PyExc_IndexError, "string index out of range");
6797         return NULL;
6798     }
6799
6800     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6801 }
6802
6803 static long
6804 unicode_hash(PyUnicodeObject *self)
6805 {
6806     /* Since Unicode objects compare equal to their ASCII string
6807        counterparts, they should use the individual character values
6808        as basis for their hash value.  This is needed to assure that
6809        strings and Unicode objects behave in the same way as
6810        dictionary keys. */
6811
6812     register Py_ssize_t len;
6813     register Py_UNICODE *p;
6814     register long x;
6815
6816     if (self->hash != -1)
6817         return self->hash;
6818     len = PyUnicode_GET_SIZE(self);
6819     p = PyUnicode_AS_UNICODE(self);
6820     x = *p << 7;
6821     while (--len >= 0)
6822         x = (1000003*x) ^ *p++;
6823     x ^= PyUnicode_GET_SIZE(self);
6824     if (x == -1)
6825         x = -2;
6826     self->hash = x;
6827     return x;
6828 }
6829
6830 PyDoc_STRVAR(index__doc__,
6831              "S.index(sub [,start [,end]]) -> int\n\
6832 \n\
6833 Like S.find() but raise ValueError when the substring is not found.");
6834
6835 static PyObject *
6836 unicode_index(PyUnicodeObject *self, PyObject *args)
6837 {
6838     Py_ssize_t result;
6839     PyObject *substring;
6840     Py_ssize_t start;
6841     Py_ssize_t end;
6842
6843     if (!_ParseTupleFinds(args, &substring, &start, &end))
6844         return NULL;
6845
6846     result = stringlib_find_slice(
6847         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6848         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6849         start, end
6850         );
6851
6852     Py_DECREF(substring);
6853
6854     if (result < 0) {
6855         PyErr_SetString(PyExc_ValueError, "substring not found");
6856         return NULL;
6857     }
6858
6859     return PyInt_FromSsize_t(result);
6860 }
6861
6862 PyDoc_STRVAR(islower__doc__,
6863              "S.islower() -> bool\n\
6864 \n\
6865 Return True if all cased characters in S are lowercase and there is\n\
6866 at least one cased character in S, False otherwise.");
6867
6868 static PyObject*
6869 unicode_islower(PyUnicodeObject *self)
6870 {
6871     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872     register const Py_UNICODE *e;
6873     int cased;
6874
6875     /* Shortcut for single character strings */
6876     if (PyUnicode_GET_SIZE(self) == 1)
6877         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6878
6879     /* Special case for empty strings */
6880     if (PyUnicode_GET_SIZE(self) == 0)
6881         return PyBool_FromLong(0);
6882
6883     e = p + PyUnicode_GET_SIZE(self);
6884     cased = 0;
6885     for (; p < e; p++) {
6886         register const Py_UNICODE ch = *p;
6887
6888         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6889             return PyBool_FromLong(0);
6890         else if (!cased && Py_UNICODE_ISLOWER(ch))
6891             cased = 1;
6892     }
6893     return PyBool_FromLong(cased);
6894 }
6895
6896 PyDoc_STRVAR(isupper__doc__,
6897              "S.isupper() -> bool\n\
6898 \n\
6899 Return True if all cased characters in S are uppercase and there is\n\
6900 at least one cased character in S, False otherwise.");
6901
6902 static PyObject*
6903 unicode_isupper(PyUnicodeObject *self)
6904 {
6905     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6906     register const Py_UNICODE *e;
6907     int cased;
6908
6909     /* Shortcut for single character strings */
6910     if (PyUnicode_GET_SIZE(self) == 1)
6911         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6912
6913     /* Special case for empty strings */
6914     if (PyUnicode_GET_SIZE(self) == 0)
6915         return PyBool_FromLong(0);
6916
6917     e = p + PyUnicode_GET_SIZE(self);
6918     cased = 0;
6919     for (; p < e; p++) {
6920         register const Py_UNICODE ch = *p;
6921
6922         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6923             return PyBool_FromLong(0);
6924         else if (!cased && Py_UNICODE_ISUPPER(ch))
6925             cased = 1;
6926     }
6927     return PyBool_FromLong(cased);
6928 }
6929
6930 PyDoc_STRVAR(istitle__doc__,
6931              "S.istitle() -> bool\n\
6932 \n\
6933 Return True if S is a titlecased string and there is at least one\n\
6934 character in S, i.e. upper- and titlecase characters may only\n\
6935 follow uncased characters and lowercase characters only cased ones.\n\
6936 Return False otherwise.");
6937
6938 static PyObject*
6939 unicode_istitle(PyUnicodeObject *self)
6940 {
6941     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6942     register const Py_UNICODE *e;
6943     int cased, previous_is_cased;
6944
6945     /* Shortcut for single character strings */
6946     if (PyUnicode_GET_SIZE(self) == 1)
6947         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6948                                (Py_UNICODE_ISUPPER(*p) != 0));
6949
6950     /* Special case for empty strings */
6951     if (PyUnicode_GET_SIZE(self) == 0)
6952         return PyBool_FromLong(0);
6953
6954     e = p + PyUnicode_GET_SIZE(self);
6955     cased = 0;
6956     previous_is_cased = 0;
6957     for (; p < e; p++) {
6958         register const Py_UNICODE ch = *p;
6959
6960         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6961             if (previous_is_cased)
6962                 return PyBool_FromLong(0);
6963             previous_is_cased = 1;
6964             cased = 1;
6965         }
6966         else if (Py_UNICODE_ISLOWER(ch)) {
6967             if (!previous_is_cased)
6968                 return PyBool_FromLong(0);
6969             previous_is_cased = 1;
6970             cased = 1;
6971         }
6972         else
6973             previous_is_cased = 0;
6974     }
6975     return PyBool_FromLong(cased);
6976 }
6977
6978 PyDoc_STRVAR(isspace__doc__,
6979              "S.isspace() -> bool\n\
6980 \n\
6981 Return True if all characters in S are whitespace\n\
6982 and there is at least one character in S, False otherwise.");
6983
6984 static PyObject*
6985 unicode_isspace(PyUnicodeObject *self)
6986 {
6987     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6988     register const Py_UNICODE *e;
6989
6990     /* Shortcut for single character strings */
6991     if (PyUnicode_GET_SIZE(self) == 1 &&
6992         Py_UNICODE_ISSPACE(*p))
6993         return PyBool_FromLong(1);
6994
6995     /* Special case for empty strings */
6996     if (PyUnicode_GET_SIZE(self) == 0)
6997         return PyBool_FromLong(0);
6998
6999     e = p + PyUnicode_GET_SIZE(self);
7000     for (; p < e; p++) {
7001         if (!Py_UNICODE_ISSPACE(*p))
7002             return PyBool_FromLong(0);
7003     }
7004     return PyBool_FromLong(1);
7005 }
7006
7007 PyDoc_STRVAR(isalpha__doc__,
7008              "S.isalpha() -> bool\n\
7009 \n\
7010 Return True if all characters in S are alphabetic\n\
7011 and there is at least one character in S, False otherwise.");
7012
7013 static PyObject*
7014 unicode_isalpha(PyUnicodeObject *self)
7015 {
7016     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7017     register const Py_UNICODE *e;
7018
7019     /* Shortcut for single character strings */
7020     if (PyUnicode_GET_SIZE(self) == 1 &&
7021         Py_UNICODE_ISALPHA(*p))
7022         return PyBool_FromLong(1);
7023
7024     /* Special case for empty strings */
7025     if (PyUnicode_GET_SIZE(self) == 0)
7026         return PyBool_FromLong(0);
7027
7028     e = p + PyUnicode_GET_SIZE(self);
7029     for (; p < e; p++) {
7030         if (!Py_UNICODE_ISALPHA(*p))
7031             return PyBool_FromLong(0);
7032     }
7033     return PyBool_FromLong(1);
7034 }
7035
7036 PyDoc_STRVAR(isalnum__doc__,
7037              "S.isalnum() -> bool\n\
7038 \n\
7039 Return True if all characters in S are alphanumeric\n\
7040 and there is at least one character in S, False otherwise.");
7041
7042 static PyObject*
7043 unicode_isalnum(PyUnicodeObject *self)
7044 {
7045     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7046     register const Py_UNICODE *e;
7047
7048     /* Shortcut for single character strings */
7049     if (PyUnicode_GET_SIZE(self) == 1 &&
7050         Py_UNICODE_ISALNUM(*p))
7051         return PyBool_FromLong(1);
7052
7053     /* Special case for empty strings */
7054     if (PyUnicode_GET_SIZE(self) == 0)
7055         return PyBool_FromLong(0);
7056
7057     e = p + PyUnicode_GET_SIZE(self);
7058     for (; p < e; p++) {
7059         if (!Py_UNICODE_ISALNUM(*p))
7060             return PyBool_FromLong(0);
7061     }
7062     return PyBool_FromLong(1);
7063 }
7064
7065 PyDoc_STRVAR(isdecimal__doc__,
7066              "S.isdecimal() -> bool\n\
7067 \n\
7068 Return True if there are only decimal characters in S,\n\
7069 False otherwise.");
7070
7071 static PyObject*
7072 unicode_isdecimal(PyUnicodeObject *self)
7073 {
7074     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7075     register const Py_UNICODE *e;
7076
7077     /* Shortcut for single character strings */
7078     if (PyUnicode_GET_SIZE(self) == 1 &&
7079         Py_UNICODE_ISDECIMAL(*p))
7080         return PyBool_FromLong(1);
7081
7082     /* Special case for empty strings */
7083     if (PyUnicode_GET_SIZE(self) == 0)
7084         return PyBool_FromLong(0);
7085
7086     e = p + PyUnicode_GET_SIZE(self);
7087     for (; p < e; p++) {
7088         if (!Py_UNICODE_ISDECIMAL(*p))
7089             return PyBool_FromLong(0);
7090     }
7091     return PyBool_FromLong(1);
7092 }
7093
7094 PyDoc_STRVAR(isdigit__doc__,
7095              "S.isdigit() -> bool\n\
7096 \n\
7097 Return True if all characters in S are digits\n\
7098 and there is at least one character in S, False otherwise.");
7099
7100 static PyObject*
7101 unicode_isdigit(PyUnicodeObject *self)
7102 {
7103     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7104     register const Py_UNICODE *e;
7105
7106     /* Shortcut for single character strings */
7107     if (PyUnicode_GET_SIZE(self) == 1 &&
7108         Py_UNICODE_ISDIGIT(*p))
7109         return PyBool_FromLong(1);
7110
7111     /* Special case for empty strings */
7112     if (PyUnicode_GET_SIZE(self) == 0)
7113         return PyBool_FromLong(0);
7114
7115     e = p + PyUnicode_GET_SIZE(self);
7116     for (; p < e; p++) {
7117         if (!Py_UNICODE_ISDIGIT(*p))
7118             return PyBool_FromLong(0);
7119     }
7120     return PyBool_FromLong(1);
7121 }
7122
7123 PyDoc_STRVAR(isnumeric__doc__,
7124              "S.isnumeric() -> bool\n\
7125 \n\
7126 Return True if there are only numeric characters in S,\n\
7127 False otherwise.");
7128
7129 static PyObject*
7130 unicode_isnumeric(PyUnicodeObject *self)
7131 {
7132     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7133     register const Py_UNICODE *e;
7134
7135     /* Shortcut for single character strings */
7136     if (PyUnicode_GET_SIZE(self) == 1 &&
7137         Py_UNICODE_ISNUMERIC(*p))
7138         return PyBool_FromLong(1);
7139
7140     /* Special case for empty strings */
7141     if (PyUnicode_GET_SIZE(self) == 0)
7142         return PyBool_FromLong(0);
7143
7144     e = p + PyUnicode_GET_SIZE(self);
7145     for (; p < e; p++) {
7146         if (!Py_UNICODE_ISNUMERIC(*p))
7147             return PyBool_FromLong(0);
7148     }
7149     return PyBool_FromLong(1);
7150 }
7151
7152 PyDoc_STRVAR(join__doc__,
7153              "S.join(sequence) -> unicode\n\
7154 \n\
7155 Return a string which is the concatenation of the strings in the\n\
7156 sequence.  The separator between elements is S.");
7157
7158 static PyObject*
7159 unicode_join(PyObject *self, PyObject *data)
7160 {
7161     return PyUnicode_Join(self, data);
7162 }
7163
7164 static Py_ssize_t
7165 unicode_length(PyUnicodeObject *self)
7166 {
7167     return self->length;
7168 }
7169
7170 PyDoc_STRVAR(ljust__doc__,
7171              "S.ljust(width[, fillchar]) -> int\n\
7172 \n\
7173 Return S left-justified in a Unicode string of length width. Padding is\n\
7174 done using the specified fill character (default is a space).");
7175
7176 static PyObject *
7177 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7178 {
7179     Py_ssize_t width;
7180     Py_UNICODE fillchar = ' ';
7181
7182     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7183         return NULL;
7184
7185     if (self->length >= width && PyUnicode_CheckExact(self)) {
7186         Py_INCREF(self);
7187         return (PyObject*) self;
7188     }
7189
7190     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7191 }
7192
7193 PyDoc_STRVAR(lower__doc__,
7194              "S.lower() -> unicode\n\
7195 \n\
7196 Return a copy of the string S converted to lowercase.");
7197
7198 static PyObject*
7199 unicode_lower(PyUnicodeObject *self)
7200 {
7201     return fixup(self, fixlower);
7202 }
7203
/* Strip modes.  Each value is also an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7212
7213 /* externally visible for str.strip(unicode) */
7214 PyObject *
7215 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7216 {
7217     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7218     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7219     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7220     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7221     Py_ssize_t i, j;
7222
7223     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7224
7225     i = 0;
7226     if (striptype != RIGHTSTRIP) {
7227         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7228             i++;
7229         }
7230     }
7231
7232     j = len;
7233     if (striptype != LEFTSTRIP) {
7234         do {
7235             j--;
7236         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7237         j++;
7238     }
7239
7240     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7241         Py_INCREF(self);
7242         return (PyObject*)self;
7243     }
7244     else
7245         return PyUnicode_FromUnicode(s+i, j-i);
7246 }
7247
7248
7249 static PyObject *
7250 do_strip(PyUnicodeObject *self, int striptype)
7251 {
7252     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7253     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7254
7255     i = 0;
7256     if (striptype != RIGHTSTRIP) {
7257         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7258             i++;
7259         }
7260     }
7261
7262     j = len;
7263     if (striptype != LEFTSTRIP) {
7264         do {
7265             j--;
7266         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7267         j++;
7268     }
7269
7270     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7271         Py_INCREF(self);
7272         return (PyObject*)self;
7273     }
7274     else
7275         return PyUnicode_FromUnicode(s+i, j-i);
7276 }
7277
7278
7279 static PyObject *
7280 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7281 {
7282     PyObject *sep = NULL;
7283
7284     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7285         return NULL;
7286
7287     if (sep != NULL && sep != Py_None) {
7288         if (PyUnicode_Check(sep))
7289             return _PyUnicode_XStrip(self, striptype, sep);
7290         else if (PyString_Check(sep)) {
7291             PyObject *res;
7292             sep = PyUnicode_FromObject(sep);
7293             if (sep==NULL)
7294                 return NULL;
7295             res = _PyUnicode_XStrip(self, striptype, sep);
7296             Py_DECREF(sep);
7297             return res;
7298         }
7299         else {
7300             PyErr_Format(PyExc_TypeError,
7301                          "%s arg must be None, unicode or str",
7302                          STRIPNAME(striptype));
7303             return NULL;
7304         }
7305     }
7306
7307     return do_strip(self, striptype);
7308 }
7309
7310
7311 PyDoc_STRVAR(strip__doc__,
7312              "S.strip([chars]) -> unicode\n\
7313 \n\
7314 Return a copy of the string S with leading and trailing\n\
7315 whitespace removed.\n\
7316 If chars is given and not None, remove characters in chars instead.\n\
7317 If chars is a str, it will be converted to unicode before stripping");
7318
7319 static PyObject *
7320 unicode_strip(PyUnicodeObject *self, PyObject *args)
7321 {
7322     if (PyTuple_GET_SIZE(args) == 0)
7323         return do_strip(self, BOTHSTRIP); /* Common case */
7324     else
7325         return do_argstrip(self, BOTHSTRIP, args);
7326 }
7327
7328
7329 PyDoc_STRVAR(lstrip__doc__,
7330              "S.lstrip([chars]) -> unicode\n\
7331 \n\
7332 Return a copy of the string S with leading whitespace removed.\n\
7333 If chars is given and not None, remove characters in chars instead.\n\
7334 If chars is a str, it will be converted to unicode before stripping");
7335
7336 static PyObject *
7337 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7338 {
7339     if (PyTuple_GET_SIZE(args) == 0)
7340         return do_strip(self, LEFTSTRIP); /* Common case */
7341     else
7342         return do_argstrip(self, LEFTSTRIP, args);
7343 }
7344
7345
7346 PyDoc_STRVAR(rstrip__doc__,
7347              "S.rstrip([chars]) -> unicode\n\
7348 \n\
7349 Return a copy of the string S with trailing whitespace removed.\n\
7350 If chars is given and not None, remove characters in chars instead.\n\
7351 If chars is a str, it will be converted to unicode before stripping");
7352
7353 static PyObject *
7354 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7355 {
7356     if (PyTuple_GET_SIZE(args) == 0)
7357         return do_strip(self, RIGHTSTRIP); /* Common case */
7358     else
7359         return do_argstrip(self, RIGHTSTRIP, args);
7360 }
7361
7362
7363 static PyObject*
7364 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7365 {
7366     PyUnicodeObject *u;
7367     Py_UNICODE *p;
7368     Py_ssize_t nchars;
7369     size_t nbytes;
7370
7371     if (len < 0)
7372         len = 0;
7373
7374     if (len == 1 && PyUnicode_CheckExact(str)) {
7375         /* no repeat, return original string */
7376         Py_INCREF(str);
7377         return (PyObject*) str;
7378     }
7379
7380     /* ensure # of chars needed doesn't overflow int and # of bytes
7381      * needed doesn't overflow size_t
7382      */
7383     nchars = len * str->length;
7384     if (len && nchars / len != str->length) {
7385         PyErr_SetString(PyExc_OverflowError,
7386                         "repeated string is too long");
7387         return NULL;
7388     }
7389     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7390     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7391         PyErr_SetString(PyExc_OverflowError,
7392                         "repeated string is too long");
7393         return NULL;
7394     }
7395     u = _PyUnicode_New(nchars);
7396     if (!u)
7397         return NULL;
7398
7399     p = u->str;
7400
7401     if (str->length == 1 && len > 0) {
7402         Py_UNICODE_FILL(p, str->str[0], len);
7403     } else {
7404         Py_ssize_t done = 0; /* number of characters copied this far */
7405         if (done < nchars) {
7406             Py_UNICODE_COPY(p, str->str, str->length);
7407             done = str->length;
7408         }
7409         while (done < nchars) {
7410             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7411             Py_UNICODE_COPY(p+done, p, n);
7412             done += n;
7413         }
7414     }
7415
7416     return (PyObject*) u;
7417 }
7418
7419 PyObject *PyUnicode_Replace(PyObject *obj,
7420                             PyObject *subobj,
7421                             PyObject *replobj,
7422                             Py_ssize_t maxcount)
7423 {
7424     PyObject *self;
7425     PyObject *str1;
7426     PyObject *str2;
7427     PyObject *result;
7428
7429     self = PyUnicode_FromObject(obj);
7430     if (self == NULL)
7431         return NULL;
7432     str1 = PyUnicode_FromObject(subobj);
7433     if (str1 == NULL) {
7434         Py_DECREF(self);
7435         return NULL;
7436     }
7437     str2 = PyUnicode_FromObject(replobj);
7438     if (str2 == NULL) {
7439         Py_DECREF(self);
7440         Py_DECREF(str1);
7441         return NULL;
7442     }
7443     result = replace((PyUnicodeObject *)self,
7444                      (PyUnicodeObject *)str1,
7445                      (PyUnicodeObject *)str2,
7446                      maxcount);
7447     Py_DECREF(self);
7448     Py_DECREF(str1);
7449     Py_DECREF(str2);
7450     return result;
7451 }
7452
7453 PyDoc_STRVAR(replace__doc__,
7454              "S.replace (old, new[, count]) -> unicode\n\
7455 \n\
7456 Return a copy of S with all occurrences of substring\n\
7457 old replaced by new.  If the optional argument count is\n\
7458 given, only the first count occurrences are replaced.");
7459
7460 static PyObject*
7461 unicode_replace(PyUnicodeObject *self, PyObject *args)
7462 {
7463     PyUnicodeObject *str1;
7464     PyUnicodeObject *str2;
7465     Py_ssize_t maxcount = -1;
7466     PyObject *result;
7467
7468     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7469         return NULL;
7470     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7471     if (str1 == NULL)
7472         return NULL;
7473     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7474     if (str2 == NULL) {
7475         Py_DECREF(str1);
7476         return NULL;
7477     }
7478
7479     result = replace(self, str1, str2, maxcount);
7480
7481     Py_DECREF(str1);
7482     Py_DECREF(str2);
7483     return result;
7484 }
7485
7486 static
7487 PyObject *unicode_repr(PyObject *unicode)
7488 {
7489     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7490                                 PyUnicode_GET_SIZE(unicode),
7491                                 1);
7492 }
7493
7494 PyDoc_STRVAR(rfind__doc__,
7495              "S.rfind(sub [,start [,end]]) -> int\n\
7496 \n\
7497 Return the highest index in S where substring sub is found,\n\
7498 such that sub is contained within s[start:end].  Optional\n\
7499 arguments start and end are interpreted as in slice notation.\n\
7500 \n\
7501 Return -1 on failure.");
7502
7503 static PyObject *
7504 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7505 {
7506     PyObject *substring;
7507     Py_ssize_t start;
7508     Py_ssize_t end;
7509     Py_ssize_t result;
7510
7511     if (!_ParseTupleFinds(args, &substring, &start, &end))
7512         return NULL;
7513
7514     result = stringlib_rfind_slice(
7515         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7516         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7517         start, end
7518         );
7519
7520     Py_DECREF(substring);
7521
7522     return PyInt_FromSsize_t(result);
7523 }
7524
7525 PyDoc_STRVAR(rindex__doc__,
7526              "S.rindex(sub [,start [,end]]) -> int\n\
7527 \n\
7528 Like S.rfind() but raise ValueError when the substring is not found.");
7529
7530 static PyObject *
7531 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7532 {
7533     PyObject *substring;
7534     Py_ssize_t start;
7535     Py_ssize_t end;
7536     Py_ssize_t result;
7537
7538     if (!_ParseTupleFinds(args, &substring, &start, &end))
7539         return NULL;
7540
7541     result = stringlib_rfind_slice(
7542         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7543         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7544         start, end
7545         );
7546
7547     Py_DECREF(substring);
7548
7549     if (result < 0) {
7550         PyErr_SetString(PyExc_ValueError, "substring not found");
7551         return NULL;
7552     }
7553     return PyInt_FromSsize_t(result);
7554 }
7555
7556 PyDoc_STRVAR(rjust__doc__,
7557              "S.rjust(width[, fillchar]) -> unicode\n\
7558 \n\
7559 Return S right-justified in a Unicode string of length width. Padding is\n\
7560 done using the specified fill character (default is a space).");
7561
7562 static PyObject *
7563 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7564 {
7565     Py_ssize_t width;
7566     Py_UNICODE fillchar = ' ';
7567
7568     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7569         return NULL;
7570
7571     if (self->length >= width && PyUnicode_CheckExact(self)) {
7572         Py_INCREF(self);
7573         return (PyObject*) self;
7574     }
7575
7576     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7577 }
7578
7579 static PyObject*
7580 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7581 {
7582     /* standard clamping */
7583     if (start < 0)
7584         start = 0;
7585     if (end < 0)
7586         end = 0;
7587     if (end > self->length)
7588         end = self->length;
7589     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7590         /* full slice, return original string */
7591         Py_INCREF(self);
7592         return (PyObject*) self;
7593     }
7594     if (start > end)
7595         start = end;
7596     /* copy slice */
7597     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7598                                              end - start);
7599 }
7600
7601 PyObject *PyUnicode_Split(PyObject *s,
7602                           PyObject *sep,
7603                           Py_ssize_t maxsplit)
7604 {
7605     PyObject *result;
7606
7607     s = PyUnicode_FromObject(s);
7608     if (s == NULL)
7609         return NULL;
7610     if (sep != NULL) {
7611         sep = PyUnicode_FromObject(sep);
7612         if (sep == NULL) {
7613             Py_DECREF(s);
7614             return NULL;
7615         }
7616     }
7617
7618     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7619
7620     Py_DECREF(s);
7621     Py_XDECREF(sep);
7622     return result;
7623 }
7624
7625 PyDoc_STRVAR(split__doc__,
7626              "S.split([sep [,maxsplit]]) -> list of strings\n\
7627 \n\
7628 Return a list of the words in S, using sep as the\n\
7629 delimiter string.  If maxsplit is given, at most maxsplit\n\
7630 splits are done. If sep is not specified or is None, any\n\
7631 whitespace string is a separator and empty strings are\n\
7632 removed from the result.");
7633
7634 static PyObject*
7635 unicode_split(PyUnicodeObject *self, PyObject *args)
7636 {
7637     PyObject *substring = Py_None;
7638     Py_ssize_t maxcount = -1;
7639
7640     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7641         return NULL;
7642
7643     if (substring == Py_None)
7644         return split(self, NULL, maxcount);
7645     else if (PyUnicode_Check(substring))
7646         return split(self, (PyUnicodeObject *)substring, maxcount);
7647     else
7648         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7649 }
7650
7651 PyObject *
7652 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7653 {
7654     PyObject* str_obj;
7655     PyObject* sep_obj;
7656     PyObject* out;
7657
7658     str_obj = PyUnicode_FromObject(str_in);
7659     if (!str_obj)
7660         return NULL;
7661     sep_obj = PyUnicode_FromObject(sep_in);
7662     if (!sep_obj) {
7663         Py_DECREF(str_obj);
7664         return NULL;
7665     }
7666
7667     out = stringlib_partition(
7668         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7669         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7670         );
7671
7672     Py_DECREF(sep_obj);
7673     Py_DECREF(str_obj);
7674
7675     return out;
7676 }
7677
7678
7679 PyObject *
7680 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7681 {
7682     PyObject* str_obj;
7683     PyObject* sep_obj;
7684     PyObject* out;
7685
7686     str_obj = PyUnicode_FromObject(str_in);
7687     if (!str_obj)
7688         return NULL;
7689     sep_obj = PyUnicode_FromObject(sep_in);
7690     if (!sep_obj) {
7691         Py_DECREF(str_obj);
7692         return NULL;
7693     }
7694
7695     out = stringlib_rpartition(
7696         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7697         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7698         );
7699
7700     Py_DECREF(sep_obj);
7701     Py_DECREF(str_obj);
7702
7703     return out;
7704 }
7705
7706 PyDoc_STRVAR(partition__doc__,
7707              "S.partition(sep) -> (head, sep, tail)\n\
7708 \n\
7709 Search for the separator sep in S, and return the part before it,\n\
7710 the separator itself, and the part after it.  If the separator is not\n\
7711 found, return S and two empty strings.");
7712
7713 static PyObject*
7714 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7715 {
7716     return PyUnicode_Partition((PyObject *)self, separator);
7717 }
7718
7719 PyDoc_STRVAR(rpartition__doc__,
7720              "S.rpartition(sep) -> (tail, sep, head)\n\
7721 \n\
7722 Search for the separator sep in S, starting at the end of S, and return\n\
7723 the part before it, the separator itself, and the part after it.  If the\n\
7724 separator is not found, return two empty strings and S.");
7725
7726 static PyObject*
7727 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7728 {
7729     return PyUnicode_RPartition((PyObject *)self, separator);
7730 }
7731
7732 PyObject *PyUnicode_RSplit(PyObject *s,
7733                            PyObject *sep,
7734                            Py_ssize_t maxsplit)
7735 {
7736     PyObject *result;
7737
7738     s = PyUnicode_FromObject(s);
7739     if (s == NULL)
7740         return NULL;
7741     if (sep != NULL) {
7742         sep = PyUnicode_FromObject(sep);
7743         if (sep == NULL) {
7744             Py_DECREF(s);
7745             return NULL;
7746         }
7747     }
7748
7749     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7750
7751     Py_DECREF(s);
7752     Py_XDECREF(sep);
7753     return result;
7754 }
7755
7756 PyDoc_STRVAR(rsplit__doc__,
7757              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7758 \n\
7759 Return a list of the words in S, using sep as the\n\
7760 delimiter string, starting at the end of the string and\n\
7761 working to the front.  If maxsplit is given, at most maxsplit\n\
7762 splits are done. If sep is not specified, any whitespace string\n\
7763 is a separator.");
7764
7765 static PyObject*
7766 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7767 {
7768     PyObject *substring = Py_None;
7769     Py_ssize_t maxcount = -1;
7770
7771     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7772         return NULL;
7773
7774     if (substring == Py_None)
7775         return rsplit(self, NULL, maxcount);
7776     else if (PyUnicode_Check(substring))
7777         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7778     else
7779         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7780 }
7781
7782 PyDoc_STRVAR(splitlines__doc__,
7783              "S.splitlines([keepends]) -> list of strings\n\
7784 \n\
7785 Return a list of the lines in S, breaking at line boundaries.\n\
7786 Line breaks are not included in the resulting list unless keepends\n\
7787 is given and true.");
7788
7789 static PyObject*
7790 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7791 {
7792     int keepends = 0;
7793
7794     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7795         return NULL;
7796
7797     return PyUnicode_Splitlines((PyObject *)self, keepends);
7798 }
7799
7800 static
7801 PyObject *unicode_str(PyUnicodeObject *self)
7802 {
7803     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7804 }
7805
7806 PyDoc_STRVAR(swapcase__doc__,
7807              "S.swapcase() -> unicode\n\
7808 \n\
7809 Return a copy of S with uppercase characters converted to lowercase\n\
7810 and vice versa.");
7811
7812 static PyObject*
7813 unicode_swapcase(PyUnicodeObject *self)
7814 {
7815     return fixup(self, fixswapcase);
7816 }
7817
7818 PyDoc_STRVAR(translate__doc__,
7819              "S.translate(table) -> unicode\n\
7820 \n\
7821 Return a copy of the string S, where all characters have been mapped\n\
7822 through the given translation table, which must be a mapping of\n\
7823 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7824 Unmapped characters are left untouched. Characters mapped to None\n\
7825 are deleted.");
7826
7827 static PyObject*
7828 unicode_translate(PyUnicodeObject *self, PyObject *table)
7829 {
7830     return PyUnicode_TranslateCharmap(self->str,
7831                                       self->length,
7832                                       table,
7833                                       "ignore");
7834 }
7835
7836 PyDoc_STRVAR(upper__doc__,
7837              "S.upper() -> unicode\n\
7838 \n\
7839 Return a copy of S converted to uppercase.");
7840
7841 static PyObject*
7842 unicode_upper(PyUnicodeObject *self)
7843 {
7844     return fixup(self, fixupper);
7845 }
7846
7847 PyDoc_STRVAR(zfill__doc__,
7848              "S.zfill(width) -> unicode\n\
7849 \n\
7850 Pad a numeric string S with zeros on the left, to fill a field\n\
7851 of the specified width. The string S is never truncated.");
7852
7853 static PyObject *
7854 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7855 {
7856     Py_ssize_t fill;
7857     PyUnicodeObject *u;
7858
7859     Py_ssize_t width;
7860     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7861         return NULL;
7862
7863     if (self->length >= width) {
7864         if (PyUnicode_CheckExact(self)) {
7865             Py_INCREF(self);
7866             return (PyObject*) self;
7867         }
7868         else
7869             return PyUnicode_FromUnicode(
7870                 PyUnicode_AS_UNICODE(self),
7871                 PyUnicode_GET_SIZE(self)
7872                 );
7873     }
7874
7875     fill = width - self->length;
7876
7877     u = pad(self, fill, 0, '0');
7878
7879     if (u == NULL)
7880         return NULL;
7881
7882     if (u->str[fill] == '+' || u->str[fill] == '-') {
7883         /* move sign to beginning of string */
7884         u->str[0] = u->str[fill];
7885         u->str[fill] = '0';
7886     }
7887
7888     return (PyObject*) u;
7889 }
7890
#if 0
/* Compiled out.  Returns the current value of numfree as an int object. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7898
7899 PyDoc_STRVAR(startswith__doc__,
7900              "S.startswith(prefix[, start[, end]]) -> bool\n\
7901 \n\
7902 Return True if S starts with the specified prefix, False otherwise.\n\
7903 With optional start, test S beginning at that position.\n\
7904 With optional end, stop comparing S at that position.\n\
7905 prefix can also be a tuple of strings to try.");
7906
7907 static PyObject *
7908 unicode_startswith(PyUnicodeObject *self,
7909                    PyObject *args)
7910 {
7911     PyObject *subobj;
7912     PyUnicodeObject *substring;
7913     Py_ssize_t start = 0;
7914     Py_ssize_t end = PY_SSIZE_T_MAX;
7915     int result;
7916
7917     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7918                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7919         return NULL;
7920     if (PyTuple_Check(subobj)) {
7921         Py_ssize_t i;
7922         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7923             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7924                 PyTuple_GET_ITEM(subobj, i));
7925             if (substring == NULL)
7926                 return NULL;
7927             result = tailmatch(self, substring, start, end, -1);
7928             Py_DECREF(substring);
7929             if (result) {
7930                 Py_RETURN_TRUE;
7931             }
7932         }
7933         /* nothing matched */
7934         Py_RETURN_FALSE;
7935     }
7936     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7937     if (substring == NULL)
7938         return NULL;
7939     result = tailmatch(self, substring, start, end, -1);
7940     Py_DECREF(substring);
7941     return PyBool_FromLong(result);
7942 }
7943
7944
7945 PyDoc_STRVAR(endswith__doc__,
7946              "S.endswith(suffix[, start[, end]]) -> bool\n\
7947 \n\
7948 Return True if S ends with the specified suffix, False otherwise.\n\
7949 With optional start, test S beginning at that position.\n\
7950 With optional end, stop comparing S at that position.\n\
7951 suffix can also be a tuple of strings to try.");
7952
7953 static PyObject *
7954 unicode_endswith(PyUnicodeObject *self,
7955                  PyObject *args)
7956 {
7957     PyObject *subobj;
7958     PyUnicodeObject *substring;
7959     Py_ssize_t start = 0;
7960     Py_ssize_t end = PY_SSIZE_T_MAX;
7961     int result;
7962
7963     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7964                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7965         return NULL;
7966     if (PyTuple_Check(subobj)) {
7967         Py_ssize_t i;
7968         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7969             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7970                 PyTuple_GET_ITEM(subobj, i));
7971             if (substring == NULL)
7972                 return NULL;
7973             result = tailmatch(self, substring, start, end, +1);
7974             Py_DECREF(substring);
7975             if (result) {
7976                 Py_RETURN_TRUE;
7977             }
7978         }
7979         Py_RETURN_FALSE;
7980     }
7981     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7982     if (substring == NULL)
7983         return NULL;
7984
7985     result = tailmatch(self, substring, start, end, +1);
7986     Py_DECREF(substring);
7987     return PyBool_FromLong(result);
7988 }
7989
7990
7991 /* Implements do_string_format, which is unicode because of stringlib */
7992 #include "stringlib/string_format.h"
7993
7994 PyDoc_STRVAR(format__doc__,
7995              "S.format(*args, **kwargs) -> unicode\n\
7996 \n\
7997 ");
7998
7999 static PyObject *
8000 unicode__format__(PyObject *self, PyObject *args)
8001 {
8002     PyObject *format_spec;
8003     PyObject *result = NULL;
8004     PyObject *tmp = NULL;
8005
8006     /* If 2.x, convert format_spec to the same type as value */
8007     /* This is to allow things like u''.format('') */
8008     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
8009         goto done;
8010     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
8011         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
8012                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
8013         goto done;
8014     }
8015     tmp = PyObject_Unicode(format_spec);
8016     if (tmp == NULL)
8017         goto done;
8018     format_spec = tmp;
8019
8020     result = _PyUnicode_FormatAdvanced(self,
8021                                        PyUnicode_AS_UNICODE(format_spec),
8022                                        PyUnicode_GET_SIZE(format_spec));
8023   done:
8024     Py_XDECREF(tmp);
8025     return result;
8026 }
8027
8028 PyDoc_STRVAR(p_format__doc__,
8029              "S.__format__(format_spec) -> unicode\n\
8030 \n\
8031 ");
8032
8033 static PyObject *
8034 unicode__sizeof__(PyUnicodeObject *v)
8035 {
8036     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
8037                              sizeof(Py_UNICODE) * (v->length + 1));
8038 }
8039
8040 PyDoc_STRVAR(sizeof__doc__,
8041              "S.__sizeof__() -> size of S in memory, in bytes\n\
8042 \n\
8043 ");
8044
8045 static PyObject *
8046 unicode_getnewargs(PyUnicodeObject *v)
8047 {
8048     return Py_BuildValue("(u#)", v->str, v->length);
8049 }
8050
8051
8052 static PyMethodDef unicode_methods[] = {
8053
8054     /* Order is according to common usage: often used methods should
8055        appear first, since lookup is done sequentially. */
8056
8057     {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8058     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8059     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8060     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8061     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8062     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8063     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8064     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8065     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8066     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8067     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8068     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8069     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8070     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8071     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8072     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8073     {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
8074 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8075     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8076     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8077     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8078     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8079     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8080     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8081     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8082     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8083     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8084     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8085     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8086     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8087     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8088     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8089     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8090     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8091     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8092     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8093     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8094     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8095     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8096     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8097     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8098     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8099     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8100     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8101     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8102 #if 0
8103     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8104 #endif
8105
8106 #if 0
8107     /* This one is just used for debugging the implementation. */
8108     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8109 #endif
8110
8111     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
8112     {NULL, NULL}
8113 };
8114
8115 static PyObject *
8116 unicode_mod(PyObject *v, PyObject *w)
8117 {
8118     if (!PyUnicode_Check(v)) {
8119         Py_INCREF(Py_NotImplemented);
8120         return Py_NotImplemented;
8121     }
8122     return PyUnicode_Format(v, w);
8123 }
8124
8125 static PyNumberMethods unicode_as_number = {
8126     0,              /*nb_add*/
8127     0,              /*nb_subtract*/
8128     0,              /*nb_multiply*/
8129     0,              /*nb_divide*/
8130     unicode_mod,            /*nb_remainder*/
8131 };
8132
8133 static PySequenceMethods unicode_as_sequence = {
8134     (lenfunc) unicode_length,       /* sq_length */
8135     PyUnicode_Concat,           /* sq_concat */
8136     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
8137     (ssizeargfunc) unicode_getitem,     /* sq_item */
8138     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
8139     0,                  /* sq_ass_item */
8140     0,                  /* sq_ass_slice */
8141     PyUnicode_Contains,         /* sq_contains */
8142 };
8143
8144 static PyObject*
8145 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8146 {
8147     if (PyIndex_Check(item)) {
8148         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8149         if (i == -1 && PyErr_Occurred())
8150             return NULL;
8151         if (i < 0)
8152             i += PyUnicode_GET_SIZE(self);
8153         return unicode_getitem(self, i);
8154     } else if (PySlice_Check(item)) {
8155         Py_ssize_t start, stop, step, slicelength, cur, i;
8156         Py_UNICODE* source_buf;
8157         Py_UNICODE* result_buf;
8158         PyObject* result;
8159
8160         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8161                                  &start, &stop, &step, &slicelength) < 0) {
8162             return NULL;
8163         }
8164
8165         if (slicelength <= 0) {
8166             return PyUnicode_FromUnicode(NULL, 0);
8167         } else if (start == 0 && step == 1 && slicelength == self->length &&
8168                    PyUnicode_CheckExact(self)) {
8169             Py_INCREF(self);
8170             return (PyObject *)self;
8171         } else if (step == 1) {
8172             return PyUnicode_FromUnicode(self->str + start, slicelength);
8173         } else {
8174             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8175             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8176                                                        sizeof(Py_UNICODE));
8177
8178             if (result_buf == NULL)
8179                 return PyErr_NoMemory();
8180
8181             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8182                 result_buf[i] = source_buf[cur];
8183             }
8184
8185             result = PyUnicode_FromUnicode(result_buf, slicelength);
8186             PyObject_FREE(result_buf);
8187             return result;
8188         }
8189     } else {
8190         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8191         return NULL;
8192     }
8193 }
8194
8195 static PyMappingMethods unicode_as_mapping = {
8196     (lenfunc)unicode_length,        /* mp_length */
8197     (binaryfunc)unicode_subscript,  /* mp_subscript */
8198     (objobjargproc)0,           /* mp_ass_subscript */
8199 };
8200
8201 static Py_ssize_t
8202 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8203                           Py_ssize_t index,
8204                           const void **ptr)
8205 {
8206     if (index != 0) {
8207         PyErr_SetString(PyExc_SystemError,
8208                         "accessing non-existent unicode segment");
8209         return -1;
8210     }
8211     *ptr = (void *) self->str;
8212     return PyUnicode_GET_DATA_SIZE(self);
8213 }
8214
8215 static Py_ssize_t
8216 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8217                            const void **ptr)
8218 {
8219     PyErr_SetString(PyExc_TypeError,
8220                     "cannot use unicode as modifiable buffer");
8221     return -1;
8222 }
8223
8224 static int
8225 unicode_buffer_getsegcount(PyUnicodeObject *self,
8226                            Py_ssize_t *lenp)
8227 {
8228     if (lenp)
8229         *lenp = PyUnicode_GET_DATA_SIZE(self);
8230     return 1;
8231 }
8232
8233 static Py_ssize_t
8234 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8235                           Py_ssize_t index,
8236                           const void **ptr)
8237 {
8238     PyObject *str;
8239
8240     if (index != 0) {
8241         PyErr_SetString(PyExc_SystemError,
8242                         "accessing non-existent unicode segment");
8243         return -1;
8244     }
8245     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8246     if (str == NULL)
8247         return -1;
8248     *ptr = (void *) PyString_AS_STRING(str);
8249     return PyString_GET_SIZE(str);
8250 }
8251
8252 /* Helpers for PyUnicode_Format() */
8253
8254 static PyObject *
8255 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8256 {
8257     Py_ssize_t argidx = *p_argidx;
8258     if (argidx < arglen) {
8259         (*p_argidx)++;
8260         if (arglen < 0)
8261             return args;
8262         else
8263             return PyTuple_GetItem(args, argidx);
8264     }
8265     PyErr_SetString(PyExc_TypeError,
8266                     "not enough arguments for format string");
8267     return NULL;
8268 }
8269
/* Bit flags collected while parsing a conversion specifier in
   PyUnicode_Format(); each is set by one flag character. */
#define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8275
8276 static Py_ssize_t
8277 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8278 {
8279     register Py_ssize_t i;
8280     Py_ssize_t len = strlen(charbuffer);
8281     for (i = len - 1; i >= 0; i--)
8282         buffer[i] = (Py_UNICODE) charbuffer[i];
8283
8284     return len;
8285 }
8286
8287 static int
8288 doubletounicode(Py_UNICODE *buffer, size_t len, int format_code,
8289                 int precision, int flags, double x)
8290 {
8291     Py_ssize_t result;
8292
8293     _PyOS_double_to_string((char *)buffer, len, x, format_code, precision,
8294                            flags, NULL);
8295     result = strtounicode(buffer, (char *)buffer);
8296     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8297 }
8298
8299 static int
8300 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8301 {
8302     Py_ssize_t result;
8303
8304     PyOS_snprintf((char *)buffer, len, format, x);
8305     result = strtounicode(buffer, (char *)buffer);
8306     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8307 }
8308
8309 /* XXX To save some code duplication, formatfloat/long/int could have been
8310    shared with stringobject.c, converting from 8-bit to Unicode after the
8311    formatting is done. */
8312
8313 static int
8314 formatfloat(Py_UNICODE *buf,
8315             size_t buflen,
8316             int flags,
8317             int prec,
8318             int type,
8319             PyObject *v)
8320 {
8321     double x;
8322
8323     x = PyFloat_AsDouble(v);
8324     if (x == -1.0 && PyErr_Occurred())
8325         return -1;
8326     if (prec < 0)
8327         prec = 6;
8328 #if SIZEOF_INT > 4
8329     /* make sure that the decimal representation of precision really does
8330        need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8331     if (prec > 0x7fffffff) {
8332         PyErr_SetString(PyExc_OverflowError,
8333                         "outrageously large precision "
8334                         "for formatted float");
8335         return -1;
8336     }
8337 #endif
8338
8339     if (type == 'f' && fabs(x) >= 1e50)
8340         type = 'g';
8341     /* Worst case length calc to ensure no buffer overrun:
8342
8343        'g' formats:
8344        fmt = %#.<prec>g
8345        buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8346        for any double rep.)
8347        len = 1 + prec + 1 + 2 + 5 = 9 + prec
8348
8349        'f' formats:
8350        buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8351        len = 1 + 50 + 1 + prec = 52 + prec
8352
8353        If prec=0 the effective precision is 1 (the leading digit is
8354        always given), therefore increase the length by one.
8355
8356     */
8357     if (((type == 'g' || type == 'G') &&
8358          buflen <= (size_t)10 + (size_t)prec) ||
8359         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8360         PyErr_SetString(PyExc_OverflowError,
8361                         "formatted float is too long (precision too large?)");
8362         return -1;
8363     }
8364     return doubletounicode(buf, buflen, type, prec,
8365                            (flags&F_ALT)?Py_DTSF_ALT:0, x);
8366 }
8367
8368 static PyObject*
8369 formatlong(PyObject *val, int flags, int prec, int type)
8370 {
8371     char *buf;
8372     int i, len;
8373     PyObject *str; /* temporary string object. */
8374     PyUnicodeObject *result;
8375
8376     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8377     if (!str)
8378         return NULL;
8379     result = _PyUnicode_New(len);
8380     if (!result) {
8381         Py_DECREF(str);
8382         return NULL;
8383     }
8384     for (i = 0; i < len; i++)
8385         result->str[i] = buf[i];
8386     result->str[len] = 0;
8387     Py_DECREF(str);
8388     return (PyObject*)result;
8389 }
8390
8391 static int
8392 formatint(Py_UNICODE *buf,
8393           size_t buflen,
8394           int flags,
8395           int prec,
8396           int type,
8397           PyObject *v)
8398 {
8399     /* fmt = '%#.' + `prec` + 'l' + `type`
8400      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8401      *                     + 1 + 1
8402      *                   = 24
8403      */
8404     char fmt[64]; /* plenty big enough! */
8405     char *sign;
8406     long x;
8407
8408     x = PyInt_AsLong(v);
8409     if (x == -1 && PyErr_Occurred())
8410         return -1;
8411     if (x < 0 && type == 'u') {
8412         type = 'd';
8413     }
8414     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8415         sign = "-";
8416     else
8417         sign = "";
8418     if (prec < 0)
8419         prec = 1;
8420
8421     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8422      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8423      */
8424     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8425         PyErr_SetString(PyExc_OverflowError,
8426                         "formatted integer is too long (precision too large?)");
8427         return -1;
8428     }
8429
8430     if ((flags & F_ALT) &&
8431         (type == 'x' || type == 'X')) {
8432         /* When converting under %#x or %#X, there are a number
8433          * of issues that cause pain:
8434          * - when 0 is being converted, the C standard leaves off
8435          *   the '0x' or '0X', which is inconsistent with other
8436          *   %#x/%#X conversions and inconsistent with Python's
8437          *   hex() function
8438          * - there are platforms that violate the standard and
8439          *   convert 0 with the '0x' or '0X'
8440          *   (Metrowerks, Compaq Tru64)
8441          * - there are platforms that give '0x' when converting
8442          *   under %#X, but convert 0 in accordance with the
8443          *   standard (OS/2 EMX)
8444          *
8445          * We can achieve the desired consistency by inserting our
8446          * own '0x' or '0X' prefix, and substituting %x/%X in place
8447          * of %#x/%#X.
8448          *
8449          * Note that this is the same approach as used in
8450          * formatint() in stringobject.c
8451          */
8452         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8453                       sign, type, prec, type);
8454     }
8455     else {
8456         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8457                       sign, (flags&F_ALT) ? "#" : "",
8458                       prec, type);
8459     }
8460     if (sign[0])
8461         return longtounicode(buf, buflen, fmt, -x);
8462     else
8463         return longtounicode(buf, buflen, fmt, x);
8464 }
8465
8466 static int
8467 formatchar(Py_UNICODE *buf,
8468            size_t buflen,
8469            PyObject *v)
8470 {
8471     /* presume that the buffer is at least 2 characters long */
8472     if (PyUnicode_Check(v)) {
8473         if (PyUnicode_GET_SIZE(v) != 1)
8474             goto onError;
8475         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8476     }
8477
8478     else if (PyString_Check(v)) {
8479         if (PyString_GET_SIZE(v) != 1)
8480             goto onError;
8481         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8482     }
8483
8484     else {
8485         /* Integer input truncated to a character */
8486         long x;
8487         x = PyInt_AsLong(v);
8488         if (x == -1 && PyErr_Occurred())
8489             goto onError;
8490 #ifdef Py_UNICODE_WIDE
8491         if (x < 0 || x > 0x10ffff) {
8492             PyErr_SetString(PyExc_OverflowError,
8493                             "%c arg not in range(0x110000) "
8494                             "(wide Python build)");
8495             return -1;
8496         }
8497 #else
8498         if (x < 0 || x > 0xffff) {
8499             PyErr_SetString(PyExc_OverflowError,
8500                             "%c arg not in range(0x10000) "
8501                             "(narrow Python build)");
8502             return -1;
8503         }
8504 #endif
8505         buf[0] = (Py_UNICODE) x;
8506     }
8507     buf[1] = '\0';
8508     return 1;
8509
8510   onError:
8511     PyErr_SetString(PyExc_TypeError,
8512                     "%c requires int or char");
8513     return -1;
8514 }
8515
8516 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8517
8518    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8519    chars are formatted. XXX This is a magic number. Each formatting
8520    routine does bounds checking to ensure no overflow, but a better
8521    solution may be to malloc a buffer of appropriate size for each
8522    format. For now, the current solution is sufficient.
8523 */
8524 #define FORMATBUFLEN (size_t)120
8525
8526 PyObject *PyUnicode_Format(PyObject *format,
8527                            PyObject *args)
8528 {
8529     Py_UNICODE *fmt, *res;
8530     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8531     int args_owned = 0;
8532     PyUnicodeObject *result = NULL;
8533     PyObject *dict = NULL;
8534     PyObject *uformat;
8535
8536     if (format == NULL || args == NULL) {
8537         PyErr_BadInternalCall();
8538         return NULL;
8539     }
8540     uformat = PyUnicode_FromObject(format);
8541     if (uformat == NULL)
8542         return NULL;
8543     fmt = PyUnicode_AS_UNICODE(uformat);
8544     fmtcnt = PyUnicode_GET_SIZE(uformat);
8545
8546     reslen = rescnt = fmtcnt + 100;
8547     result = _PyUnicode_New(reslen);
8548     if (result == NULL)
8549         goto onError;
8550     res = PyUnicode_AS_UNICODE(result);
8551
8552     if (PyTuple_Check(args)) {
8553         arglen = PyTuple_Size(args);
8554         argidx = 0;
8555     }
8556     else {
8557         arglen = -1;
8558         argidx = -2;
8559     }
8560     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8561         !PyObject_TypeCheck(args, &PyBaseString_Type))
8562         dict = args;
8563
8564     while (--fmtcnt >= 0) {
8565         if (*fmt != '%') {
8566             if (--rescnt < 0) {
8567                 rescnt = fmtcnt + 100;
8568                 reslen += rescnt;
8569                 if (_PyUnicode_Resize(&result, reslen) < 0)
8570                     goto onError;
8571                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8572                 --rescnt;
8573             }
8574             *res++ = *fmt++;
8575         }
8576         else {
8577             /* Got a format specifier */
8578             int flags = 0;
8579             Py_ssize_t width = -1;
8580             int prec = -1;
8581             Py_UNICODE c = '\0';
8582             Py_UNICODE fill;
8583             int isnumok;
8584             PyObject *v = NULL;
8585             PyObject *temp = NULL;
8586             Py_UNICODE *pbuf;
8587             Py_UNICODE sign;
8588             Py_ssize_t len;
8589             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8590
8591             fmt++;
8592             if (*fmt == '(') {
8593                 Py_UNICODE *keystart;
8594                 Py_ssize_t keylen;
8595                 PyObject *key;
8596                 int pcount = 1;
8597
8598                 if (dict == NULL) {
8599                     PyErr_SetString(PyExc_TypeError,
8600                                     "format requires a mapping");
8601                     goto onError;
8602                 }
8603                 ++fmt;
8604                 --fmtcnt;
8605                 keystart = fmt;
8606                 /* Skip over balanced parentheses */
8607                 while (pcount > 0 && --fmtcnt >= 0) {
8608                     if (*fmt == ')')
8609                         --pcount;
8610                     else if (*fmt == '(')
8611                         ++pcount;
8612                     fmt++;
8613                 }
8614                 keylen = fmt - keystart - 1;
8615                 if (fmtcnt < 0 || pcount > 0) {
8616                     PyErr_SetString(PyExc_ValueError,
8617                                     "incomplete format key");
8618                     goto onError;
8619                 }
8620 #if 0
8621                 /* keys are converted to strings using UTF-8 and
8622                    then looked up since Python uses strings to hold
8623                    variables names etc. in its namespaces and we
8624                    wouldn't want to break common idioms. */
8625                 key = PyUnicode_EncodeUTF8(keystart,
8626                                            keylen,
8627                                            NULL);
8628 #else
8629                 key = PyUnicode_FromUnicode(keystart, keylen);
8630 #endif
8631                 if (key == NULL)
8632                     goto onError;
8633                 if (args_owned) {
8634                     Py_DECREF(args);
8635                     args_owned = 0;
8636                 }
8637                 args = PyObject_GetItem(dict, key);
8638                 Py_DECREF(key);
8639                 if (args == NULL) {
8640                     goto onError;
8641                 }
8642                 args_owned = 1;
8643                 arglen = -1;
8644                 argidx = -2;
8645             }
8646             while (--fmtcnt >= 0) {
8647                 switch (c = *fmt++) {
8648                 case '-': flags |= F_LJUST; continue;
8649                 case '+': flags |= F_SIGN; continue;
8650                 case ' ': flags |= F_BLANK; continue;
8651                 case '#': flags |= F_ALT; continue;
8652                 case '0': flags |= F_ZERO; continue;
8653                 }
8654                 break;
8655             }
8656             if (c == '*') {
8657                 v = getnextarg(args, arglen, &argidx);
8658                 if (v == NULL)
8659                     goto onError;
8660                 if (!PyInt_Check(v)) {
8661                     PyErr_SetString(PyExc_TypeError,
8662                                     "* wants int");
8663                     goto onError;
8664                 }
8665                 width = PyInt_AsLong(v);
8666                 if (width < 0) {
8667                     flags |= F_LJUST;
8668                     width = -width;
8669                 }
8670                 if (--fmtcnt >= 0)
8671                     c = *fmt++;
8672             }
8673             else if (c >= '0' && c <= '9') {
8674                 width = c - '0';
8675                 while (--fmtcnt >= 0) {
8676                     c = *fmt++;
8677                     if (c < '0' || c > '9')
8678                         break;
8679                     if ((width*10) / 10 != width) {
8680                         PyErr_SetString(PyExc_ValueError,
8681                                         "width too big");
8682                         goto onError;
8683                     }
8684                     width = width*10 + (c - '0');
8685                 }
8686             }
8687             if (c == '.') {
8688                 prec = 0;
8689                 if (--fmtcnt >= 0)
8690                     c = *fmt++;
8691                 if (c == '*') {
8692                     v = getnextarg(args, arglen, &argidx);
8693                     if (v == NULL)
8694                         goto onError;
8695                     if (!PyInt_Check(v)) {
8696                         PyErr_SetString(PyExc_TypeError,
8697                                         "* wants int");
8698                         goto onError;
8699                     }
8700                     prec = PyInt_AsLong(v);
8701                     if (prec < 0)
8702                         prec = 0;
8703                     if (--fmtcnt >= 0)
8704                         c = *fmt++;
8705                 }
8706                 else if (c >= '0' && c <= '9') {
8707                     prec = c - '0';
8708                     while (--fmtcnt >= 0) {
8709                         c = Py_CHARMASK(*fmt++);
8710                         if (c < '0' || c > '9')
8711                             break;
8712                         if ((prec*10) / 10 != prec) {
8713                             PyErr_SetString(PyExc_ValueError,
8714                                             "prec too big");
8715                             goto onError;
8716                         }
8717                         prec = prec*10 + (c - '0');
8718                     }
8719                 }
8720             } /* prec */
8721             if (fmtcnt >= 0) {
8722                 if (c == 'h' || c == 'l' || c == 'L') {
8723                     if (--fmtcnt >= 0)
8724                         c = *fmt++;
8725                 }
8726             }
8727             if (fmtcnt < 0) {
8728                 PyErr_SetString(PyExc_ValueError,
8729                                 "incomplete format");
8730                 goto onError;
8731             }
8732             if (c != '%') {
8733                 v = getnextarg(args, arglen, &argidx);
8734                 if (v == NULL)
8735                     goto onError;
8736             }
8737             sign = 0;
8738             fill = ' ';
8739             switch (c) {
8740
8741             case '%':
8742                 pbuf = formatbuf;
8743                 /* presume that buffer length is at least 1 */
8744                 pbuf[0] = '%';
8745                 len = 1;
8746                 break;
8747
8748             case 's':
8749             case 'r':
8750                 if (PyUnicode_Check(v) && c == 's') {
8751                     temp = v;
8752                     Py_INCREF(temp);
8753                 }
8754                 else {
8755                     PyObject *unicode;
8756                     if (c == 's')
8757                         temp = PyObject_Unicode(v);
8758                     else
8759                         temp = PyObject_Repr(v);
8760                     if (temp == NULL)
8761                         goto onError;
8762                     if (PyUnicode_Check(temp))
8763                         /* nothing to do */;
8764                     else if (PyString_Check(temp)) {
8765                         /* convert to string to Unicode */
8766                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8767                                                    PyString_GET_SIZE(temp),
8768                                                    NULL,
8769                                                    "strict");
8770                         Py_DECREF(temp);
8771                         temp = unicode;
8772                         if (temp == NULL)
8773                             goto onError;
8774                     }
8775                     else {
8776                         Py_DECREF(temp);
8777                         PyErr_SetString(PyExc_TypeError,
8778                                         "%s argument has non-string str()");
8779                         goto onError;
8780                     }
8781                 }
8782                 pbuf = PyUnicode_AS_UNICODE(temp);
8783                 len = PyUnicode_GET_SIZE(temp);
8784                 if (prec >= 0 && len > prec)
8785                     len = prec;
8786                 break;
8787
8788             case 'i':
8789             case 'd':
8790             case 'u':
8791             case 'o':
8792             case 'x':
8793             case 'X':
8794                 if (c == 'i')
8795                     c = 'd';
8796                 isnumok = 0;
8797                 if (PyNumber_Check(v)) {
8798                     PyObject *iobj=NULL;
8799
8800                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8801                         iobj = v;
8802                         Py_INCREF(iobj);
8803                     }
8804                     else {
8805                         iobj = PyNumber_Int(v);
8806                         if (iobj==NULL) iobj = PyNumber_Long(v);
8807                     }
8808                     if (iobj!=NULL) {
8809                         if (PyInt_Check(iobj)) {
8810                             isnumok = 1;
8811                             pbuf = formatbuf;
8812                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8813                                             flags, prec, c, iobj);
8814                             Py_DECREF(iobj);
8815                             if (len < 0)
8816                                 goto onError;
8817                             sign = 1;
8818                         }
8819                         else if (PyLong_Check(iobj)) {
8820                             isnumok = 1;
8821                             temp = formatlong(iobj, flags, prec, c);
8822                             Py_DECREF(iobj);
8823                             if (!temp)
8824                                 goto onError;
8825                             pbuf = PyUnicode_AS_UNICODE(temp);
8826                             len = PyUnicode_GET_SIZE(temp);
8827                             sign = 1;
8828                         }
8829                         else {
8830                             Py_DECREF(iobj);
8831                         }
8832                     }
8833                 }
8834                 if (!isnumok) {
8835                     PyErr_Format(PyExc_TypeError,
8836                                  "%%%c format: a number is required, "
8837                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8838                     goto onError;
8839                 }
8840                 if (flags & F_ZERO)
8841                     fill = '0';
8842                 break;
8843
8844             case 'e':
8845             case 'E':
8846             case 'f':
8847             case 'F':
8848             case 'g':
8849             case 'G':
8850                 if (c == 'F')
8851                     c = 'f';
8852                 pbuf = formatbuf;
8853                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8854                                   flags, prec, c, v);
8855                 if (len < 0)
8856                     goto onError;
8857                 sign = 1;
8858                 if (flags & F_ZERO)
8859                     fill = '0';
8860                 break;
8861
8862             case 'c':
8863                 pbuf = formatbuf;
8864                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8865                 if (len < 0)
8866                     goto onError;
8867                 break;
8868
8869             default:
8870                 PyErr_Format(PyExc_ValueError,
8871                              "unsupported format character '%c' (0x%x) "
8872                              "at index %zd",
8873                              (31<=c && c<=126) ? (char)c : '?',
8874                              (int)c,
8875                              (Py_ssize_t)(fmt - 1 -
8876                                           PyUnicode_AS_UNICODE(uformat)));
8877                 goto onError;
8878             }
8879             if (sign) {
8880                 if (*pbuf == '-' || *pbuf == '+') {
8881                     sign = *pbuf++;
8882                     len--;
8883                 }
8884                 else if (flags & F_SIGN)
8885                     sign = '+';
8886                 else if (flags & F_BLANK)
8887                     sign = ' ';
8888                 else
8889                     sign = 0;
8890             }
8891             if (width < len)
8892                 width = len;
8893             if (rescnt - (sign != 0) < width) {
8894                 reslen -= rescnt;
8895                 rescnt = width + fmtcnt + 100;
8896                 reslen += rescnt;
8897                 if (reslen < 0) {
8898                     Py_XDECREF(temp);
8899                     PyErr_NoMemory();
8900                     goto onError;
8901                 }
8902                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8903                     Py_XDECREF(temp);
8904                     goto onError;
8905                 }
8906                 res = PyUnicode_AS_UNICODE(result)
8907                     + reslen - rescnt;
8908             }
8909             if (sign) {
8910                 if (fill != ' ')
8911                     *res++ = sign;
8912                 rescnt--;
8913                 if (width > len)
8914                     width--;
8915             }
8916             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8917                 assert(pbuf[0] == '0');
8918                 assert(pbuf[1] == c);
8919                 if (fill != ' ') {
8920                     *res++ = *pbuf++;
8921                     *res++ = *pbuf++;
8922                 }
8923                 rescnt -= 2;
8924                 width -= 2;
8925                 if (width < 0)
8926                     width = 0;
8927                 len -= 2;
8928             }
8929             if (width > len && !(flags & F_LJUST)) {
8930                 do {
8931                     --rescnt;
8932                     *res++ = fill;
8933                 } while (--width > len);
8934             }
8935             if (fill == ' ') {
8936                 if (sign)
8937                     *res++ = sign;
8938                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8939                     assert(pbuf[0] == '0');
8940                     assert(pbuf[1] == c);
8941                     *res++ = *pbuf++;
8942                     *res++ = *pbuf++;
8943                 }
8944             }
8945             Py_UNICODE_COPY(res, pbuf, len);
8946             res += len;
8947             rescnt -= len;
8948             while (--width >= len) {
8949                 --rescnt;
8950                 *res++ = ' ';
8951             }
8952             if (dict && (argidx < arglen) && c != '%') {
8953                 PyErr_SetString(PyExc_TypeError,
8954                                 "not all arguments converted during string formatting");
8955                 Py_XDECREF(temp);
8956                 goto onError;
8957             }
8958             Py_XDECREF(temp);
8959         } /* '%' */
8960     } /* until end */
8961     if (argidx < arglen && !dict) {
8962         PyErr_SetString(PyExc_TypeError,
8963                         "not all arguments converted during string formatting");
8964         goto onError;
8965     }
8966
8967     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8968         goto onError;
8969     if (args_owned) {
8970         Py_DECREF(args);
8971     }
8972     Py_DECREF(uformat);
8973     return (PyObject *)result;
8974
8975   onError:
8976     Py_XDECREF(result);
8977     Py_DECREF(uformat);
8978     if (args_owned) {
8979         Py_DECREF(args);
8980     }
8981     return NULL;
8982 }
8983
/* Buffer-procedure table for unicode objects, referenced from
   PyUnicode_Type.  Entries are listed in the order read buffer, write
   buffer, segment count, char buffer (as indicated by the casts). */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* raw Py_UNICODE data */
    (writebufferproc) unicode_buffer_getwritebuf,   /* always raises TypeError */
    (segcountproc) unicode_buffer_getsegcount,      /* always one segment */
    (charbufferproc) unicode_buffer_getcharbuf,     /* default-encoded bytes */
};
8990
8991 static PyObject *
8992 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8993
8994 static PyObject *
8995 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8996 {
8997     PyObject *x = NULL;
8998     static char *kwlist[] = {"string", "encoding", "errors", 0};
8999     char *encoding = NULL;
9000     char *errors = NULL;
9001
9002     if (type != &PyUnicode_Type)
9003         return unicode_subtype_new(type, args, kwds);
9004     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
9005                                      kwlist, &x, &encoding, &errors))
9006         return NULL;
9007     if (x == NULL)
9008         return (PyObject *)_PyUnicode_New(0);
9009     if (encoding == NULL && errors == NULL)
9010         return PyObject_Unicode(x);
9011     else
9012         return PyUnicode_FromEncodedObject(x, encoding, errors);
9013 }
9014
9015 static PyObject *
9016 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9017 {
9018     PyUnicodeObject *tmp, *pnew;
9019     Py_ssize_t n;
9020
9021     assert(PyType_IsSubtype(type, &PyUnicode_Type));
9022     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9023     if (tmp == NULL)
9024         return NULL;
9025     assert(PyUnicode_Check(tmp));
9026     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9027     if (pnew == NULL) {
9028         Py_DECREF(tmp);
9029         return NULL;
9030     }
9031     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9032     if (pnew->str == NULL) {
9033         _Py_ForgetReference((PyObject *)pnew);
9034         PyObject_Del(pnew);
9035         Py_DECREF(tmp);
9036         return PyErr_NoMemory();
9037     }
9038     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9039     pnew->length = n;
9040     pnew->hash = tmp->hash;
9041     Py_DECREF(tmp);
9042     return (PyObject *)pnew;
9043 }
9044
9045 PyDoc_STRVAR(unicode_doc,
9046              "unicode(string [, encoding[, errors]]) -> object\n\
9047 \n\
9048 Create a new Unicode object from the given encoded string.\n\
9049 encoding defaults to the current default string encoding.\n\
9050 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9051
/* Type object for unicode.  Slots holding 0 are not provided here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
9095
9096 /* Initialize the Unicode implementation */
9097
9098 void _PyUnicode_Init(void)
9099 {
9100     int i;
9101
9102     /* XXX - move this array to unicodectype.c ? */
9103     Py_UNICODE linebreak[] = {
9104         0x000A, /* LINE FEED */
9105         0x000D, /* CARRIAGE RETURN */
9106         0x001C, /* FILE SEPARATOR */
9107         0x001D, /* GROUP SEPARATOR */
9108         0x001E, /* RECORD SEPARATOR */
9109         0x0085, /* NEXT LINE */
9110         0x2028, /* LINE SEPARATOR */
9111         0x2029, /* PARAGRAPH SEPARATOR */
9112     };
9113
9114     /* Init the implementation */
9115     free_list = NULL;
9116     numfree = 0;
9117     unicode_empty = _PyUnicode_New(0);
9118     if (!unicode_empty)
9119         return;
9120
9121     strcpy(unicode_default_encoding, "ascii");
9122     for (i = 0; i < 256; i++)
9123         unicode_latin1[i] = NULL;
9124     if (PyType_Ready(&PyUnicode_Type) < 0)
9125         Py_FatalError("Can't initialize 'unicode'");
9126
9127     /* initialize the linebreak bloom filter */
9128     bloom_linebreak = make_bloom_mask(
9129         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9130         );
9131
9132     PyType_Ready(&EncodingMapType);
9133 }
9134
9135 /* Finalize the Unicode implementation */
9136
9137 int
9138 PyUnicode_ClearFreeList(void)
9139 {
9140     int freelist_size = numfree;
9141     PyUnicodeObject *u;
9142
9143     for (u = free_list; u != NULL;) {
9144         PyUnicodeObject *v = u;
9145         u = *(PyUnicodeObject **)u;
9146         if (v->str)
9147             PyObject_DEL(v->str);
9148         Py_XDECREF(v->defenc);
9149         PyObject_Del(v);
9150         numfree--;
9151     }
9152     free_list = NULL;
9153     assert(numfree == 0);
9154     return freelist_size;
9155 }
9156
9157 void
9158 _PyUnicode_Fini(void)
9159 {
9160     int i;
9161
9162     Py_XDECREF(unicode_empty);
9163     unicode_empty = NULL;
9164
9165     for (i = 0; i < 256; i++) {
9166         if (unicode_latin1[i]) {
9167             Py_DECREF(unicode_latin1[i]);
9168             unicode_latin1[i] = NULL;
9169         }
9170     }
9171     (void)PyUnicode_ClearFreeList();
9172 }
9173
9174 #ifdef __cplusplus
9175 }
9176 #endif
9177
9178
9179 /*
9180   Local variables:
9181   c-basic-offset: 4
9182   indent-tabs-mode: nil
9183   End:
9184 */