Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
   62    At worst this will result in PyUnicode_MAXFREELIST *
   63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * HORIZONTAL TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * VERTICAL TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000D, * CARRIAGE RETURN */
 151     0, 0, 1, 0, 0, 1, 0, 0,
 152     0, 0, 0, 0, 0, 0, 0, 0,
 153 /*         0x001C, * FILE SEPARATOR */
 154 /*         0x001D, * GROUP SEPARATOR */
 155 /*         0x001E, * RECORD SEPARATOR */
 156     0, 0, 0, 0, 1, 1, 1, 0,
 157     0, 0, 0, 0, 0, 0, 0, 0,
 158     0, 0, 0, 0, 0, 0, 0, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163     0, 0, 0, 0, 0, 0, 0, 0,
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177     return 0x10FFFF;
 178 #else
 179     /* This is actually an illegal character, so it should
 180        not be passed to unichr. */
 181     return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
 187 /* stuff to implement simple "bloom filters" for Unicode characters.
 188    to keep things simple, we use a single bitmask, using the least 5
 189    bits from each unicode characters as the bit index. */
 190
 191 /* the linebreak mask is set up by Unicode_Init below */
 192
 193 #define BLOOM_MASK unsigned long
 194
 195 static BLOOM_MASK bloom_linebreak;
 196
 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199 #define BLOOM_LINEBREAK(ch)                                             \
 200     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 201      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204 {
 205     /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207     long mask;
 208     Py_ssize_t i;
 209
 210     mask = 0;
 211     for (i = 0; i < len; i++)
 212         mask |= (1 << (ptr[i] & 0x1F));
 213
 214     return mask;
 215 }
 216
 217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218 {
 219     Py_ssize_t i;
 220
 221     for (i = 0; i < setlen; i++)
 222         if (set[i] == chr)
 223             return 1;
 224
 225     return 0;
 226 }
 227
 228 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231 /* --- Unicode Object ----------------------------------------------------- */
 232
 233 static
 234 int unicode_resize(register PyUnicodeObject *unicode,
 235                    Py_ssize_t length)
 236 {
 237     void *oldstr;
 238
 239     /* Shortcut if there's nothing much to do. */
 240     if (unicode->length == length)
 241         goto reset;
 242
 243     /* Resizing shared object (unicode_empty or single character
 244        objects) in-place is not allowed. Use PyUnicode_Resize()
 245        instead ! */
 246
 247     if (unicode == unicode_empty ||
 248         (unicode->length == 1 &&
 249          unicode->str[0] < 256U &&
 250          unicode_latin1[unicode->str[0]] == unicode)) {
 251         PyErr_SetString(PyExc_SystemError,
 252                         "can't resize shared unicode objects");
 253         return -1;
 254     }
 255
 256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257        The overallocation is also used by fastsearch, which assumes that it's
 258        safe to look at str[length] (without making any assumptions about what
 259        it contains). */
 260
 261     oldstr = unicode->str;
 262     unicode->str = PyObject_REALLOC(unicode->str,
 263                                     sizeof(Py_UNICODE) * (length + 1));
 264     if (!unicode->str) {
 265         unicode->str = (Py_UNICODE *)oldstr;
 266         PyErr_NoMemory();
 267         return -1;
 268     }
 269     unicode->str[length] = 0;
 270     unicode->length = length;
 271
 272   reset:
 273     /* Reset the object caches */
 274     if (unicode->defenc) {
 275         Py_DECREF(unicode->defenc);
 276         unicode->defenc = NULL;
 277     }
 278     unicode->hash = -1;
 279
 280     return 0;
 281 }
 282
 283 /* We allocate one more byte to make sure the string is
 284    Ux0000 terminated -- XXX is this needed ?
 285
 286    XXX This allocator could further be enhanced by assuring that the
 287    free list never reduces its size below 1.
 288
 289 */
 290
 291 static
 292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293 {
 294     register PyUnicodeObject *unicode;
 295
 296     /* Optimization for empty strings */
 297     if (length == 0 && unicode_empty != NULL) {
 298         Py_INCREF(unicode_empty);
 299         return unicode_empty;
 300     }
 301
 302     /* Ensure we won't overflow the size. */
 303     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 304         return (PyUnicodeObject *)PyErr_NoMemory();
 305     }
 306
 307     /* Unicode freelist & memory allocation */
 308     if (free_list) {
 309         unicode = free_list;
 310         free_list = *(PyUnicodeObject **)unicode;
 311         numfree--;
 312         if (unicode->str) {
 313             /* Keep-Alive optimization: we only upsize the buffer,
 314                never downsize it. */
 315             if ((unicode->length < length) &&
 316                 unicode_resize(unicode, length) < 0) {
 317                 PyObject_DEL(unicode->str);
 318                 unicode->str = NULL;
 319             }
 320         }
 321         else {
 322             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 323             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 324         }
 325         PyObject_INIT(unicode, &PyUnicode_Type);
 326     }
 327     else {
 328         size_t new_size;
 329         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 330         if (unicode == NULL)
 331             return NULL;
 332         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 333         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 334     }
 335
 336     if (!unicode->str) {
 337         PyErr_NoMemory();
 338         goto onError;
 339     }
 340     /* Initialize the first element to guard against cases where
 341      * the caller fails before initializing str -- unicode_resize()
 342      * reads str[0], and the Keep-Alive optimization can keep memory
 343      * allocated for str alive across a call to unicode_dealloc(unicode).
 344      * We don't want unicode_resize to read uninitialized memory in
 345      * that case.
 346      */
 347     unicode->str[0] = 0;
 348     unicode->str[length] = 0;
 349     unicode->length = length;
 350     unicode->hash = -1;
 351     unicode->defenc = NULL;
 352     return unicode;
 353
 354   onError:
 355     /* XXX UNREF/NEWREF interface should be more symmetrical */
 356     _Py_DEC_REFTOTAL;
 357     _Py_ForgetReference((PyObject *)unicode);
 358     PyObject_Del(unicode);
 359     return NULL;
 360 }
 361
 362 static
 363 void unicode_dealloc(register PyUnicodeObject *unicode)
 364 {
 365     if (PyUnicode_CheckExact(unicode) &&
 366         numfree < PyUnicode_MAXFREELIST) {
 367         /* Keep-Alive optimization */
 368         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 369             PyObject_DEL(unicode->str);
 370             unicode->str = NULL;
 371             unicode->length = 0;
 372         }
 373         if (unicode->defenc) {
 374             Py_DECREF(unicode->defenc);
 375             unicode->defenc = NULL;
 376         }
 377         /* Add to free list */
 378         *(PyUnicodeObject **)unicode = free_list;
 379         free_list = unicode;
 380         numfree++;
 381     }
 382     else {
 383         PyObject_DEL(unicode->str);
 384         Py_XDECREF(unicode->defenc);
 385         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 386     }
 387 }
 388
 389 static
 390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 391 {
 392     register PyUnicodeObject *v;
 393
 394     /* Argument checks */
 395     if (unicode == NULL) {
 396         PyErr_BadInternalCall();
 397         return -1;
 398     }
 399     v = *unicode;
 400     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 401         PyErr_BadInternalCall();
 402         return -1;
 403     }
 404
 405     /* Resizing unicode_empty and single character objects is not
 406        possible since these are being shared. We simply return a fresh
 407        copy with the same Unicode content. */
 408     if (v->length != length &&
 409         (v == unicode_empty || v->length == 1)) {
 410         PyUnicodeObject *w = _PyUnicode_New(length);
 411         if (w == NULL)
 412             return -1;
 413         Py_UNICODE_COPY(w->str, v->str,
 414                         length < v->length ? length : v->length);
 415         Py_DECREF(*unicode);
 416         *unicode = w;
 417         return 0;
 418     }
 419
 420     /* Note that we don't have to modify *unicode for unshared Unicode
 421        objects, since we can modify them in-place. */
 422     return unicode_resize(v, length);
 423 }
 424
 425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 426 {
 427     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 428 }
 429
 430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 431                                 Py_ssize_t size)
 432 {
 433     PyUnicodeObject *unicode;
 434
 435     /* If the Unicode data is known at construction time, we can apply
 436        some optimizations which share commonly used objects. */
 437     if (u != NULL) {
 438
 439         /* Optimization for empty strings */
 440         if (size == 0 && unicode_empty != NULL) {
 441             Py_INCREF(unicode_empty);
 442             return (PyObject *)unicode_empty;
 443         }
 444
 445         /* Single character Unicode objects in the Latin-1 range are
 446            shared when using this constructor */
 447         if (size == 1 && *u < 256) {
 448             unicode = unicode_latin1[*u];
 449             if (!unicode) {
 450                 unicode = _PyUnicode_New(1);
 451                 if (!unicode)
 452                     return NULL;
 453                 unicode->str[0] = *u;
 454                 unicode_latin1[*u] = unicode;
 455             }
 456             Py_INCREF(unicode);
 457             return (PyObject *)unicode;
 458         }
 459     }
 460
 461     unicode = _PyUnicode_New(size);
 462     if (!unicode)
 463         return NULL;
 464
 465     /* Copy the Unicode data into the new object */
 466     if (u != NULL)
 467         Py_UNICODE_COPY(unicode->str, u, size);
 468
 469     return (PyObject *)unicode;
 470 }
 471
 472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 473 {
 474     PyUnicodeObject *unicode;
 475
 476     if (size < 0) {
 477         PyErr_SetString(PyExc_SystemError,
 478                         "Negative size passed to PyUnicode_FromStringAndSize");
 479         return NULL;
 480     }
 481
 482     /* If the Unicode data is known at construction time, we can apply
 483        some optimizations which share commonly used objects.
 484        Also, this means the input must be UTF-8, so fall back to the
 485        UTF-8 decoder at the end. */
 486     if (u != NULL) {
 487
 488         /* Optimization for empty strings */
 489         if (size == 0 && unicode_empty != NULL) {
 490             Py_INCREF(unicode_empty);
 491             return (PyObject *)unicode_empty;
 492         }
 493
 494         /* Single characters are shared when using this constructor.
 495            Restrict to ASCII, since the input must be UTF-8. */
 496         if (size == 1 && Py_CHARMASK(*u) < 128) {
 497             unicode = unicode_latin1[Py_CHARMASK(*u)];
 498             if (!unicode) {
 499                 unicode = _PyUnicode_New(1);
 500                 if (!unicode)
 501                     return NULL;
 502                 unicode->str[0] = Py_CHARMASK(*u);
 503                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 504             }
 505             Py_INCREF(unicode);
 506             return (PyObject *)unicode;
 507         }
 508
 509         return PyUnicode_DecodeUTF8(u, size, NULL);
 510     }
 511
 512     unicode = _PyUnicode_New(size);
 513     if (!unicode)
 514         return NULL;
 515
 516     return (PyObject *)unicode;
 517 }
 518
 519 PyObject *PyUnicode_FromString(const char *u)
 520 {
 521     size_t size = strlen(u);
 522     if (size > PY_SSIZE_T_MAX) {
 523         PyErr_SetString(PyExc_OverflowError, "input too long");
 524         return NULL;
 525     }
 526
 527     return PyUnicode_FromStringAndSize(u, size);
 528 }
 529
 530 #ifdef HAVE_WCHAR_H
 531
 532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 533 # define CONVERT_WCHAR_TO_SURROGATES
 534 #endif
 535
 536 #ifdef CONVERT_WCHAR_TO_SURROGATES
 537
 538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 539    to convert from UTF32 to UTF16. */
 540
 541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 542                                  Py_ssize_t size)
 543 {
 544     PyUnicodeObject *unicode;
 545     register Py_ssize_t i;
 546     Py_ssize_t alloc;
 547     const wchar_t *orig_w;
 548
 549     if (w == NULL) {
 550         PyErr_BadInternalCall();
 551         return NULL;
 552     }
 553
 554     alloc = size;
 555     orig_w = w;
 556     for (i = size; i > 0; i--) {
 557         if (*w > 0xFFFF)
 558             alloc++;
 559         w++;
 560     }
 561     w = orig_w;
 562     unicode = _PyUnicode_New(alloc);
 563     if (!unicode)
 564         return NULL;
 565
 566     /* Copy the wchar_t data into the new object */
 567     {
 568         register Py_UNICODE *u;
 569         u = PyUnicode_AS_UNICODE(unicode);
 570         for (i = size; i > 0; i--) {
 571             if (*w > 0xFFFF) {
 572                 wchar_t ordinal = *w++;
 573                 ordinal -= 0x10000;
 574                 *u++ = 0xD800 | (ordinal >> 10);
 575                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 576             }
 577             else
 578                 *u++ = *w++;
 579         }
 580     }
 581     return (PyObject *)unicode;
 582 }
 583
 584 #else
 585
 586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 587                                  Py_ssize_t size)
 588 {
 589     PyUnicodeObject *unicode;
 590
 591     if (w == NULL) {
 592         PyErr_BadInternalCall();
 593         return NULL;
 594     }
 595
 596     unicode = _PyUnicode_New(size);
 597     if (!unicode)
 598         return NULL;
 599
 600     /* Copy the wchar_t data into the new object */
 601 #ifdef HAVE_USABLE_WCHAR_T
 602     memcpy(unicode->str, w, size * sizeof(wchar_t));
 603 #else
 604     {
 605         register Py_UNICODE *u;
 606         register Py_ssize_t i;
 607         u = PyUnicode_AS_UNICODE(unicode);
 608         for (i = size; i > 0; i--)
 609             *u++ = *w++;
 610     }
 611 #endif
 612
 613     return (PyObject *)unicode;
 614 }
 615
 616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 617
 618 #undef CONVERT_WCHAR_TO_SURROGATES
 619
 620 static void
 621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 622 {
 623     *fmt++ = '%';
 624     if (width) {
 625         if (zeropad)
 626             *fmt++ = '0';
 627         fmt += sprintf(fmt, "%d", width);
 628     }
 629     if (precision)
 630         fmt += sprintf(fmt, ".%d", precision);
 631     if (longflag)
 632         *fmt++ = 'l';
 633     else if (size_tflag) {
 634         char *f = PY_FORMAT_SIZE_T;
 635         while (*f)
 636             *fmt++ = *f++;
 637     }
 638     *fmt++ = c;
 639     *fmt = '\0';
 640 }
 641
 642 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 643
 644 PyObject *
 645 PyUnicode_FromFormatV(const char *format, va_list vargs)
 646 {
 647     va_list count;
 648     Py_ssize_t callcount = 0;
 649     PyObject **callresults = NULL;
 650     PyObject **callresult = NULL;
 651     Py_ssize_t n = 0;
 652     int width = 0;
 653     int precision = 0;
 654     int zeropad;
 655     const char* f;
 656     Py_UNICODE *s;
 657     PyObject *string;
 658     /* used by sprintf */
 659     char buffer[21];
 660     /* use abuffer instead of buffer, if we need more space
 661      * (which can happen if there's a format specifier with width). */
 662     char *abuffer = NULL;
 663     char *realbuffer;
 664     Py_ssize_t abuffersize = 0;
 665     char fmt[60]; /* should be enough for %0width.precisionld */
 666     const char *copy;
 667
 668 #ifdef VA_LIST_IS_ARRAY
 669     Py_MEMCPY(count, vargs, sizeof(va_list));
 670 #else
 671 #ifdef  __va_copy
 672     __va_copy(count, vargs);
 673 #else
 674     count = vargs;
 675 #endif
 676 #endif
 677      /* step 1: count the number of %S/%R/%s format specifications
 678       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 679       * objects once during step 3 and put the result in an array) */
 680     for (f = format; *f; f++) {
 681          if (*f == '%') {
 682              if (*(f+1)=='%')
 683                  continue;
 684              if (*(f+1)=='S' || *(f+1)=='R')
 685                  ++callcount;
 686              while (isdigit((unsigned)*f))
 687                  width = (width*10) + *f++ - '0';
 688              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 689                  ;
 690              if (*f == 's')
 691                  ++callcount;
 692          }
 693     }
 694     /* step 2: allocate memory for the results of
 695      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 696     if (callcount) {
 697         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 698         if (!callresults) {
 699             PyErr_NoMemory();
 700             return NULL;
 701         }
 702         callresult = callresults;
 703     }
 704     /* step 3: figure out how large a buffer we need */
 705     for (f = format; *f; f++) {
 706         if (*f == '%') {
 707             const char* p = f;
 708             width = 0;
 709             while (isdigit((unsigned)*f))
 710                 width = (width*10) + *f++ - '0';
 711             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 712                 ;
 713
 714             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 715              * they don't affect the amount of space we reserve.
 716              */
 717             if ((*f == 'l' || *f == 'z') &&
 718                 (f[1] == 'd' || f[1] == 'u'))
 719                 ++f;
 720
 721             switch (*f) {
 722             case 'c':
 723                 (void)va_arg(count, int);
 724                 /* fall through... */
 725             case '%':
 726                 n++;
 727                 break;
 728             case 'd': case 'u': case 'i': case 'x':
 729                 (void) va_arg(count, int);
 730                 /* 20 bytes is enough to hold a 64-bit
 731                    integer.  Decimal takes the most space.
 732                    This isn't enough for octal.
 733                    If a width is specified we need more
 734                    (which we allocate later). */
 735                 if (width < 20)
 736                     width = 20;
 737                 n += width;
 738                 if (abuffersize < width)
 739                     abuffersize = width;
 740                 break;
 741             case 's':
 742             {
 743                 /* UTF-8 */
 744                 const char *s = va_arg(count, const char*);
 745                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 746                 if (!str)
 747                     goto fail;
 748                 n += PyUnicode_GET_SIZE(str);
 749                 /* Remember the str and switch to the next slot */
 750                 *callresult++ = str;
 751                 break;
 752             }
 753             case 'U':
 754             {
 755                 PyObject *obj = va_arg(count, PyObject *);
 756                 assert(obj && PyUnicode_Check(obj));
 757                 n += PyUnicode_GET_SIZE(obj);
 758                 break;
 759             }
 760             case 'V':
 761             {
 762                 PyObject *obj = va_arg(count, PyObject *);
 763                 const char *str = va_arg(count, const char *);
 764                 assert(obj || str);
 765                 assert(!obj || PyUnicode_Check(obj));
 766                 if (obj)
 767                     n += PyUnicode_GET_SIZE(obj);
 768                 else
 769                     n += strlen(str);
 770                 break;
 771             }
 772             case 'S':
 773             {
 774                 PyObject *obj = va_arg(count, PyObject *);
 775                 PyObject *str;
 776                 assert(obj);
 777                 str = PyObject_Str(obj);
 778                 if (!str)
 779                     goto fail;
 780                 n += PyUnicode_GET_SIZE(str);
 781                 /* Remember the str and switch to the next slot */
 782                 *callresult++ = str;
 783                 break;
 784             }
 785             case 'R':
 786             {
 787                 PyObject *obj = va_arg(count, PyObject *);
 788                 PyObject *repr;
 789                 assert(obj);
 790                 repr = PyObject_Repr(obj);
 791                 if (!repr)
 792                     goto fail;
 793                 n += PyUnicode_GET_SIZE(repr);
 794                 /* Remember the repr and switch to the next slot */
 795                 *callresult++ = repr;
 796                 break;
 797             }
 798             case 'p':
 799                 (void) va_arg(count, int);
 800                 /* maximum 64-bit pointer representation:
 801                  * 0xffffffffffffffff
 802                  * so 19 characters is enough.
 803                  * XXX I count 18 -- what's the extra for?
 804                  */
 805                 n += 19;
 806                 break;
 807             default:
 808                 /* if we stumble upon an unknown
 809                    formatting code, copy the rest of
 810                    the format string to the output
 811                    string. (we cannot just skip the
 812                    code, since there's no way to know
 813                    what's in the argument list) */
 814                 n += strlen(p);
 815                 goto expand;
 816             }
 817         } else
 818             n++;
 819     }
 820   expand:
 821     if (abuffersize > 20) {
 822         abuffer = PyObject_Malloc(abuffersize);
 823         if (!abuffer) {
 824             PyErr_NoMemory();
 825             goto fail;
 826         }
 827         realbuffer = abuffer;
 828     }
 829     else
 830         realbuffer = buffer;
 831     /* step 4: fill the buffer */
 832     /* Since we've analyzed how much space we need for the worst case,
 833        we don't have to resize the string.
 834        There can be no errors beyond this point. */
 835     string = PyUnicode_FromUnicode(NULL, n);
 836     if (!string)
 837         goto fail;
 838
 839     s = PyUnicode_AS_UNICODE(string);
 840     callresult = callresults;
 841
 842     for (f = format; *f; f++) {
 843         if (*f == '%') {
 844             const char* p = f++;
 845             int longflag = 0;
 846             int size_tflag = 0;
 847             zeropad = (*f == '0');
 848             /* parse the width.precision part */
 849             width = 0;
 850             while (isdigit((unsigned)*f))
 851                 width = (width*10) + *f++ - '0';
 852             precision = 0;
 853             if (*f == '.') {
 854                 f++;
 855                 while (isdigit((unsigned)*f))
 856                     precision = (precision*10) + *f++ - '0';
 857             }
 858             /* handle the long flag, but only for %ld and %lu.
 859                others can be added when necessary. */
 860             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 861                 longflag = 1;
 862                 ++f;
 863             }
 864             /* handle the size_t flag. */
 865             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 866                 size_tflag = 1;
 867                 ++f;
 868             }
 869
 870             switch (*f) {
 871             case 'c':
 872                 *s++ = va_arg(vargs, int);
 873                 break;
 874             case 'd':
 875                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 876                 if (longflag)
 877                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 878                 else if (size_tflag)
 879                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 880                 else
 881                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 882                 appendstring(realbuffer);
 883                 break;
 884             case 'u':
 885                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 886                 if (longflag)
 887                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 888                 else if (size_tflag)
 889                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 890                 else
 891                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 892                 appendstring(realbuffer);
 893                 break;
 894             case 'i':
 895                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 896                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 897                 appendstring(realbuffer);
 898                 break;
 899             case 'x':
 900                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 901                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 902                 appendstring(realbuffer);
 903                 break;
 904             case 's':
 905             {
 906                 /* unused, since we already have the result */
 907                 (void) va_arg(vargs, char *);
 908                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 909                                 PyUnicode_GET_SIZE(*callresult));
 910                 s += PyUnicode_GET_SIZE(*callresult);
 911                 /* We're done with the unicode()/repr() => forget it */
 912                 Py_DECREF(*callresult);
 913                 /* switch to next unicode()/repr() result */
 914                 ++callresult;
 915                 break;
 916             }
 917             case 'U':
 918             {
 919                 PyObject *obj = va_arg(vargs, PyObject *);
 920                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 921                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 922                 s += size;
 923                 break;
 924             }
 925             case 'V':
 926             {
 927                 PyObject *obj = va_arg(vargs, PyObject *);
 928                 const char *str = va_arg(vargs, const char *);
 929                 if (obj) {
 930                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 931                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 932                     s += size;
 933                 } else {
 934                     appendstring(str);
 935                 }
 936                 break;
 937             }
 938             case 'S':
 939             case 'R':
 940             {
 941                 Py_UNICODE *ucopy;
 942                 Py_ssize_t usize;
 943                 Py_ssize_t upos;
 944                 /* unused, since we already have the result */
 945                 (void) va_arg(vargs, PyObject *);
 946                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 947                 usize = PyUnicode_GET_SIZE(*callresult);
 948                 for (upos = 0; upos<usize;)
 949                     *s++ = ucopy[upos++];
 950                 /* We're done with the unicode()/repr() => forget it */
 951                 Py_DECREF(*callresult);
 952                 /* switch to next unicode()/repr() result */
 953                 ++callresult;
 954                 break;
 955             }
 956             case 'p':
 957                 sprintf(buffer, "%p", va_arg(vargs, void*));
 958                 /* %p is ill-defined:  ensure leading 0x. */
 959                 if (buffer[1] == 'X')
 960                     buffer[1] = 'x';
 961                 else if (buffer[1] != 'x') {
 962                     memmove(buffer+2, buffer, strlen(buffer)+1);
 963                     buffer[0] = '0';
 964                     buffer[1] = 'x';
 965                 }
 966                 appendstring(buffer);
 967                 break;
 968             case '%':
 969                 *s++ = '%';
 970                 break;
 971             default:
 972                 appendstring(p);
 973                 goto end;
 974             }
 975         } else
 976             *s++ = *f;
 977     }
 978
 979   end:
 980     if (callresults)
 981         PyObject_Free(callresults);
 982     if (abuffer)
 983         PyObject_Free(abuffer);
 984     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 985     return string;
 986   fail:
 987     if (callresults) {
 988         PyObject **callresult2 = callresults;
 989         while (callresult2 < callresult) {
 990             Py_DECREF(*callresult2);
 991             ++callresult2;
 992         }
 993         PyObject_Free(callresults);
 994     }
 995     if (abuffer)
 996         PyObject_Free(abuffer);
 997     return NULL;
 998 }
 999
1000 #undef appendstring
1001
1002 PyObject *
1003 PyUnicode_FromFormat(const char *format, ...)
1004 {
1005     PyObject* ret;
1006     va_list vargs;
1007
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009     va_start(vargs, format);
1010 #else
1011     va_start(vargs);
1012 #endif
1013     ret = PyUnicode_FromFormatV(format, vargs);
1014     va_end(vargs);
1015     return ret;
1016 }
1017
1018 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1019                                 wchar_t *w,
1020                                 Py_ssize_t size)
1021 {
1022     if (unicode == NULL) {
1023         PyErr_BadInternalCall();
1024         return -1;
1025     }
1026
1027     /* If possible, try to copy the 0-termination as well */
1028     if (size > PyUnicode_GET_SIZE(unicode))
1029         size = PyUnicode_GET_SIZE(unicode) + 1;
1030
1031 #ifdef HAVE_USABLE_WCHAR_T
1032     memcpy(w, unicode->str, size * sizeof(wchar_t));
1033 #else
1034     {
1035         register Py_UNICODE *u;
1036         register Py_ssize_t i;
1037         u = PyUnicode_AS_UNICODE(unicode);
1038         for (i = size; i > 0; i--)
1039             *w++ = *u++;
1040     }
1041 #endif
1042
1043     if (size > PyUnicode_GET_SIZE(unicode))
1044         return PyUnicode_GET_SIZE(unicode);
1045     else
1046         return size;
1047 }
1048
1049 #endif
1050
1051 PyObject *PyUnicode_FromOrdinal(int ordinal)
1052 {
1053     Py_UNICODE s[1];
1054
1055 #ifdef Py_UNICODE_WIDE
1056     if (ordinal < 0 || ordinal > 0x10ffff) {
1057         PyErr_SetString(PyExc_ValueError,
1058                         "unichr() arg not in range(0x110000) "
1059                         "(wide Python build)");
1060         return NULL;
1061     }
1062 #else
1063     if (ordinal < 0 || ordinal > 0xffff) {
1064         PyErr_SetString(PyExc_ValueError,
1065                         "unichr() arg not in range(0x10000) "
1066                         "(narrow Python build)");
1067         return NULL;
1068     }
1069 #endif
1070
1071     s[0] = (Py_UNICODE)ordinal;
1072     return PyUnicode_FromUnicode(s, 1);
1073 }
1074
1075 PyObject *PyUnicode_FromObject(register PyObject *obj)
1076 {
1077     /* XXX Perhaps we should make this API an alias of
1078        PyObject_Unicode() instead ?! */
1079     if (PyUnicode_CheckExact(obj)) {
1080         Py_INCREF(obj);
1081         return obj;
1082     }
1083     if (PyUnicode_Check(obj)) {
1084         /* For a Unicode subtype that's not a Unicode object,
1085            return a true Unicode object with the same data. */
1086         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087                                      PyUnicode_GET_SIZE(obj));
1088     }
1089     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1090 }
1091
1092 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1093                                       const char *encoding,
1094                                       const char *errors)
1095 {
1096     const char *s = NULL;
1097     Py_ssize_t len;
1098     PyObject *v;
1099
1100     if (obj == NULL) {
1101         PyErr_BadInternalCall();
1102         return NULL;
1103     }
1104
1105 #if 0
1106     /* For b/w compatibility we also accept Unicode objects provided
1107        that no encodings is given and then redirect to
1108        PyObject_Unicode() which then applies the additional logic for
1109        Unicode subclasses.
1110
1111        NOTE: This API should really only be used for object which
1112        represent *encoded* Unicode !
1113
1114     */
1115     if (PyUnicode_Check(obj)) {
1116         if (encoding) {
1117             PyErr_SetString(PyExc_TypeError,
1118                             "decoding Unicode is not supported");
1119             return NULL;
1120         }
1121         return PyObject_Unicode(obj);
1122     }
1123 #else
1124     if (PyUnicode_Check(obj)) {
1125         PyErr_SetString(PyExc_TypeError,
1126                         "decoding Unicode is not supported");
1127         return NULL;
1128     }
1129 #endif
1130
1131     /* Coerce object */
1132     if (PyString_Check(obj)) {
1133         s = PyString_AS_STRING(obj);
1134         len = PyString_GET_SIZE(obj);
1135     }
1136     else if (PyByteArray_Check(obj)) {
1137         /* Python 2.x specific */
1138         PyErr_Format(PyExc_TypeError,
1139                      "decoding bytearray is not supported");
1140         return NULL;
1141     }
1142     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1143         /* Overwrite the error message with something more useful in
1144            case of a TypeError. */
1145         if (PyErr_ExceptionMatches(PyExc_TypeError))
1146             PyErr_Format(PyExc_TypeError,
1147                          "coercing to Unicode: need string or buffer, "
1148                          "%.80s found",
1149                          Py_TYPE(obj)->tp_name);
1150         goto onError;
1151     }
1152
1153     /* Convert to Unicode */
1154     if (len == 0) {
1155         Py_INCREF(unicode_empty);
1156         v = (PyObject *)unicode_empty;
1157     }
1158     else
1159         v = PyUnicode_Decode(s, len, encoding, errors);
1160
1161     return v;
1162
1163   onError:
1164     return NULL;
1165 }
1166
1167 PyObject *PyUnicode_Decode(const char *s,
1168                            Py_ssize_t size,
1169                            const char *encoding,
1170                            const char *errors)
1171 {
1172     PyObject *buffer = NULL, *unicode;
1173
1174     if (encoding == NULL)
1175         encoding = PyUnicode_GetDefaultEncoding();
1176
1177     /* Shortcuts for common default encodings */
1178     if (strcmp(encoding, "utf-8") == 0)
1179         return PyUnicode_DecodeUTF8(s, size, errors);
1180     else if (strcmp(encoding, "latin-1") == 0)
1181         return PyUnicode_DecodeLatin1(s, size, errors);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183     else if (strcmp(encoding, "mbcs") == 0)
1184         return PyUnicode_DecodeMBCS(s, size, errors);
1185 #endif
1186     else if (strcmp(encoding, "ascii") == 0)
1187         return PyUnicode_DecodeASCII(s, size, errors);
1188
1189     /* Decode via the codec registry */
1190     buffer = PyBuffer_FromMemory((void *)s, size);
1191     if (buffer == NULL)
1192         goto onError;
1193     unicode = PyCodec_Decode(buffer, encoding, errors);
1194     if (unicode == NULL)
1195         goto onError;
1196     if (!PyUnicode_Check(unicode)) {
1197         PyErr_Format(PyExc_TypeError,
1198                      "decoder did not return an unicode object (type=%.400s)",
1199                      Py_TYPE(unicode)->tp_name);
1200         Py_DECREF(unicode);
1201         goto onError;
1202     }
1203     Py_DECREF(buffer);
1204     return unicode;
1205
1206   onError:
1207     Py_XDECREF(buffer);
1208     return NULL;
1209 }
1210
1211 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212                                     const char *encoding,
1213                                     const char *errors)
1214 {
1215     PyObject *v;
1216
1217     if (!PyUnicode_Check(unicode)) {
1218         PyErr_BadArgument();
1219         goto onError;
1220     }
1221
1222     if (encoding == NULL)
1223         encoding = PyUnicode_GetDefaultEncoding();
1224
1225     /* Decode via the codec registry */
1226     v = PyCodec_Decode(unicode, encoding, errors);
1227     if (v == NULL)
1228         goto onError;
1229     return v;
1230
1231   onError:
1232     return NULL;
1233 }
1234
1235 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1236                            Py_ssize_t size,
1237                            const char *encoding,
1238                            const char *errors)
1239 {
1240     PyObject *v, *unicode;
1241
1242     unicode = PyUnicode_FromUnicode(s, size);
1243     if (unicode == NULL)
1244         return NULL;
1245     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246     Py_DECREF(unicode);
1247     return v;
1248 }
1249
1250 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251                                     const char *encoding,
1252                                     const char *errors)
1253 {
1254     PyObject *v;
1255
1256     if (!PyUnicode_Check(unicode)) {
1257         PyErr_BadArgument();
1258         goto onError;
1259     }
1260
1261     if (encoding == NULL)
1262         encoding = PyUnicode_GetDefaultEncoding();
1263
1264     /* Encode via the codec registry */
1265     v = PyCodec_Encode(unicode, encoding, errors);
1266     if (v == NULL)
1267         goto onError;
1268     return v;
1269
1270   onError:
1271     return NULL;
1272 }
1273
1274 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275                                     const char *encoding,
1276                                     const char *errors)
1277 {
1278     PyObject *v;
1279
1280     if (!PyUnicode_Check(unicode)) {
1281         PyErr_BadArgument();
1282         goto onError;
1283     }
1284
1285     if (encoding == NULL)
1286         encoding = PyUnicode_GetDefaultEncoding();
1287
1288     /* Shortcuts for common default encodings */
1289     if (errors == NULL) {
1290         if (strcmp(encoding, "utf-8") == 0)
1291             return PyUnicode_AsUTF8String(unicode);
1292         else if (strcmp(encoding, "latin-1") == 0)
1293             return PyUnicode_AsLatin1String(unicode);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295         else if (strcmp(encoding, "mbcs") == 0)
1296             return PyUnicode_AsMBCSString(unicode);
1297 #endif
1298         else if (strcmp(encoding, "ascii") == 0)
1299             return PyUnicode_AsASCIIString(unicode);
1300     }
1301
1302     /* Encode via the codec registry */
1303     v = PyCodec_Encode(unicode, encoding, errors);
1304     if (v == NULL)
1305         goto onError;
1306     if (!PyString_Check(v)) {
1307         PyErr_Format(PyExc_TypeError,
1308                      "encoder did not return a string object (type=%.400s)",
1309                      Py_TYPE(v)->tp_name);
1310         Py_DECREF(v);
1311         goto onError;
1312     }
1313     return v;
1314
1315   onError:
1316     return NULL;
1317 }
1318
1319 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1320                                             const char *errors)
1321 {
1322     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1323
1324     if (v)
1325         return v;
1326     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327     if (v && errors == NULL)
1328         ((PyUnicodeObject *)unicode)->defenc = v;
1329     return v;
1330 }
1331
1332 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1333 {
1334     if (!PyUnicode_Check(unicode)) {
1335         PyErr_BadArgument();
1336         goto onError;
1337     }
1338     return PyUnicode_AS_UNICODE(unicode);
1339
1340   onError:
1341     return NULL;
1342 }
1343
1344 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1345 {
1346     if (!PyUnicode_Check(unicode)) {
1347         PyErr_BadArgument();
1348         goto onError;
1349     }
1350     return PyUnicode_GET_SIZE(unicode);
1351
1352   onError:
1353     return -1;
1354 }
1355
1356 const char *PyUnicode_GetDefaultEncoding(void)
1357 {
1358     return unicode_default_encoding;
1359 }
1360
1361 int PyUnicode_SetDefaultEncoding(const char *encoding)
1362 {
1363     PyObject *v;
1364
1365     /* Make sure the encoding is valid. As side effect, this also
1366        loads the encoding into the codec registry cache. */
1367     v = _PyCodec_Lookup(encoding);
1368     if (v == NULL)
1369         goto onError;
1370     Py_DECREF(v);
1371     strncpy(unicode_default_encoding,
1372             encoding,
1373             sizeof(unicode_default_encoding));
1374     return 0;
1375
1376   onError:
1377     return -1;
1378 }
1379
1380 /* error handling callback helper:
1381    build arguments, call the callback and check the arguments,
1382    if no exception occurred, copy the replacement to the output
1383    and adjust various state variables.
1384    return 0 on success, -1 on error
1385 */
1386
1387 static
1388 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1389                                      const char *encoding, const char *reason,
1390                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1393 {
1394     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1395
1396     PyObject *restuple = NULL;
1397     PyObject *repunicode = NULL;
1398     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399     Py_ssize_t requiredsize;
1400     Py_ssize_t newpos;
1401     Py_UNICODE *repptr;
1402     Py_ssize_t repsize;
1403     int res = -1;
1404
1405     if (*errorHandler == NULL) {
1406         *errorHandler = PyCodec_LookupError(errors);
1407         if (*errorHandler == NULL)
1408             goto onError;
1409     }
1410
1411     if (*exceptionObject == NULL) {
1412         *exceptionObject = PyUnicodeDecodeError_Create(
1413             encoding, input, insize, *startinpos, *endinpos, reason);
1414         if (*exceptionObject == NULL)
1415             goto onError;
1416     }
1417     else {
1418         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419             goto onError;
1420         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421             goto onError;
1422         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423             goto onError;
1424     }
1425
1426     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427     if (restuple == NULL)
1428         goto onError;
1429     if (!PyTuple_Check(restuple)) {
1430         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1431         goto onError;
1432     }
1433     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1434         goto onError;
1435     if (newpos<0)
1436         newpos = insize+newpos;
1437     if (newpos<0 || newpos>insize) {
1438         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439         goto onError;
1440     }
1441
1442     /* need more space? (at least enough for what we
1443        have+the replacement+the rest of the string (starting
1444        at the new input position), so we won't have to check space
1445        when there are no errors in the rest of the string) */
1446     repptr = PyUnicode_AS_UNICODE(repunicode);
1447     repsize = PyUnicode_GET_SIZE(repunicode);
1448     requiredsize = *outpos + repsize + insize-newpos;
1449     if (requiredsize > outsize) {
1450         if (requiredsize<2*outsize)
1451             requiredsize = 2*outsize;
1452         if (_PyUnicode_Resize(output, requiredsize) < 0)
1453             goto onError;
1454         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1455     }
1456     *endinpos = newpos;
1457     *inptr = input + newpos;
1458     Py_UNICODE_COPY(*outptr, repptr, repsize);
1459     *outptr += repsize;
1460     *outpos += repsize;
1461     /* we made it! */
1462     res = 0;
1463
1464   onError:
1465     Py_XDECREF(restuple);
1466     return res;
1467 }
1468
1469 /* --- UTF-7 Codec -------------------------------------------------------- */
1470
1471 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1472
1473 /* Three simple macros defining base-64. */
1474
/* Is c a base-64 character?
 *
 * Spelled out as explicit ASCII range tests instead of isalnum():
 * isalnum() is locale-dependent (it may accept bytes in 128..255) and
 * has undefined behaviour for arguments outside unsigned char/EOF,
 * whereas the base-64 alphabet is fixed: A-Z, a-z, 0-9, '+' and '/'.
 * Note that c is evaluated more than once. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1479
/* Base-64 value of the base-64 character c:
 * 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62.
 * Everything else (in particular '/') yields 63, so the caller must
 * already know that c is a base-64 character. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
1487
/* Base-64 character for the bottom 6 bits of n (higher bits are
 * masked off before the alphabet lookup). */

#define TO_BASE64(n)                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])
1492
/* DECODE_DIRECT: true when a byte found in UTF-7 input (outside a
 * base64 section) stands for itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base64
 * section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
1500
1501 /* The UTF-7 encoder treats ASCII characters differently according to
1502  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1503  * the above).  See RFC2152.  This array identifies these different
1504  * sets:
1505  * 0 : "Set D"
1506  *     alphanumeric and '(),-./:?
1507  * 1 : "Set O"
1508  *     !"#$%&*;<=>@[]^_`{|}
1509  * 2 : "whitespace"
1510  *     ht nl cr sp
1511  * 3 : special (must be base64 encoded)
1512  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1513  */
1514
/* Lookup table indexed by ASCII code (0..127) giving the UTF-7 encoder
   category of each character: 0 = Set D, 1 = Set O, 2 = whitespace,
   3 = special.  The table is only ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1534
/* ENCODE_DIRECT: true when the encoder may emit c as itself.
 *
 * Only characters 1..127 are candidates (the range test comes first so
 * utf7_category is never indexed out of bounds).  Set D characters are
 * always direct; Set O characters are direct only when directO is
 * true, whitespace only when directWS is true.  RFC2152 leaves both
 * choices to the application, hence the two flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
1546
1547 PyObject *PyUnicode_DecodeUTF7(const char *s,
1548                                Py_ssize_t size,
1549                                const char *errors)
1550 {
1551     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1552 }
1553
1554 /* The decoder.  The only state we preserve is our read position,
1555  * i.e. how many characters we have consumed.  So if we end in the
1556  * middle of a shift sequence we have to back off the read position
1557  * and the output to the beginning of the sequence, otherwise we lose
1558  * all the shift state (seen bits, number of bits seen, high
1559  * surrogate). */
1560
1561 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1562                                        Py_ssize_t size,
1563                                        const char *errors,
1564                                        Py_ssize_t *consumed)
1565 {
1566     const char *starts = s;
1567     Py_ssize_t startinpos;
1568     Py_ssize_t endinpos;
1569     Py_ssize_t outpos;
1570     const char *e;
1571     PyUnicodeObject *unicode;
1572     Py_UNICODE *p;
1573     const char *errmsg = "";
1574     int inShift = 0;
1575     Py_UNICODE *shiftOutStart;
1576     unsigned int base64bits = 0;
1577     unsigned long base64buffer = 0;
1578     Py_UNICODE surrogate = 0;
1579     PyObject *errorHandler = NULL;
1580     PyObject *exc = NULL;
1581
1582     unicode = _PyUnicode_New(size);
1583     if (!unicode)
1584         return NULL;
1585     if (size == 0) {
1586         if (consumed)
1587             *consumed = 0;
1588         return (PyObject *)unicode;
1589     }
1590
1591     p = unicode->str;
1592     shiftOutStart = p;
1593     e = s + size;
1594
1595     while (s < e) {
1596         Py_UNICODE ch = (unsigned char) *s;
1597
1598         if (inShift) { /* in a base-64 section */
1599             if (IS_BASE64(ch)) { /* consume a base-64 character */
1600                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1601                 base64bits += 6;
1602                 s++;
1603                 if (base64bits >= 16) {
1604                     /* we have enough bits for a UTF-16 value */
1605                     Py_UNICODE outCh = (Py_UNICODE)
1606                                        (base64buffer >> (base64bits-16));
1607                     base64bits -= 16;
1608                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1609                     if (surrogate) {
1610                         /* expecting a second surrogate */
1611                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1612 #ifdef Py_UNICODE_WIDE
1613                             *p++ = (((surrogate & 0x3FF)<<10)
1614                                     | (outCh & 0x3FF)) + 0x10000;
1615 #else
1616                             *p++ = surrogate;
1617                             *p++ = outCh;
1618 #endif
1619                             surrogate = 0;
1620                         }
1621                         else {
1622                             surrogate = 0;
1623                             errmsg = "second surrogate missing";
1624                             goto utf7Error;
1625                         }
1626                     }
1627                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1628                         /* first surrogate */
1629                         surrogate = outCh;
1630                     }
1631                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1632                         errmsg = "unexpected second surrogate";
1633                         goto utf7Error;
1634                     }
1635                     else {
1636                         *p++ = outCh;
1637                     }
1638                 }
1639             }
1640             else { /* now leaving a base-64 section */
1641                 inShift = 0;
1642                 s++;
1643                 if (surrogate) {
1644                     errmsg = "second surrogate missing at end of shift sequence";
1645                     goto utf7Error;
1646                 }
1647                 if (base64bits > 0) { /* left-over bits */
1648                     if (base64bits >= 6) {
1649                         /* We've seen at least one base-64 character */
1650                         errmsg = "partial character in shift sequence";
1651                         goto utf7Error;
1652                     }
1653                     else {
1654                         /* Some bits remain; they should be zero */
1655                         if (base64buffer != 0) {
1656                             errmsg = "non-zero padding bits in shift sequence";
1657                             goto utf7Error;
1658                         }
1659                     }
1660                 }
1661                 if (ch != '-') {
1662                     /* '-' is absorbed; other terminating
1663                        characters are preserved */
1664                     *p++ = ch;
1665                 }
1666             }
1667         }
1668         else if ( ch == '+' ) {
1669             startinpos = s-starts;
1670             s++; /* consume '+' */
1671             if (s < e && *s == '-') { /* '+-' encodes '+' */
1672                 s++;
1673                 *p++ = '+';
1674             }
1675             else { /* begin base64-encoded section */
1676                 inShift = 1;
1677                 shiftOutStart = p;
1678                 base64bits = 0;
1679             }
1680         }
1681         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1682             *p++ = ch;
1683             s++;
1684         }
1685         else {
1686             startinpos = s-starts;
1687             s++;
1688             errmsg = "unexpected special character";
1689             goto utf7Error;
1690         }
1691         continue;
1692 utf7Error:
1693         outpos = p-PyUnicode_AS_UNICODE(unicode);
1694         endinpos = s-starts;
1695         if (unicode_decode_call_errorhandler(
1696                 errors, &errorHandler,
1697                 "utf7", errmsg,
1698                 starts, size, &startinpos, &endinpos, &exc, &s,
1699                 &unicode, &outpos, &p))
1700             goto onError;
1701     }
1702
1703     /* end of string */
1704
1705     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1706         /* if we're in an inconsistent state, that's an error */
1707         if (surrogate ||
1708                 (base64bits >= 6) ||
1709                 (base64bits > 0 && base64buffer != 0)) {
1710             outpos = p-PyUnicode_AS_UNICODE(unicode);
1711             endinpos = size;
1712             if (unicode_decode_call_errorhandler(
1713                     errors, &errorHandler,
1714                     "utf7", "unterminated shift sequence",
1715                     starts, size, &startinpos, &endinpos, &exc, &s,
1716                     &unicode, &outpos, &p))
1717                 goto onError;
1718         }
1719     }
1720
1721     /* return state */
1722     if (consumed) {
1723         if (inShift) {
1724             p = shiftOutStart; /* back off output */
1725             *consumed = startinpos;
1726         }
1727         else {
1728             *consumed = s-starts;
1729         }
1730     }
1731
1732     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1733         goto onError;
1734
1735     Py_XDECREF(errorHandler);
1736     Py_XDECREF(exc);
1737     return (PyObject *)unicode;
1738
1739   onError:
1740     Py_XDECREF(errorHandler);
1741     Py_XDECREF(exc);
1742     Py_DECREF(unicode);
1743     return NULL;
1744 }
1745
1746
1747 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1748                                Py_ssize_t size,
1749                                int base64SetO,
1750                                int base64WhiteSpace,
1751                                const char *errors)
1752 {
1753     PyObject *v;
1754     /* It might be possible to tighten this worst case */
1755     Py_ssize_t allocated = 8 * size;
1756     int inShift = 0;
1757     Py_ssize_t i = 0;
1758     unsigned int base64bits = 0;
1759     unsigned long base64buffer = 0;
1760     char * out;
1761     char * start;
1762
1763     if (allocated / 8 != size)
1764         return PyErr_NoMemory();
1765
1766     if (size == 0)
1767         return PyString_FromStringAndSize(NULL, 0);
1768
1769     v = PyString_FromStringAndSize(NULL, allocated);
1770     if (v == NULL)
1771         return NULL;
1772
1773     start = out = PyString_AS_STRING(v);
1774     for (;i < size; ++i) {
1775         Py_UNICODE ch = s[i];
1776
1777         if (inShift) {
1778             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1779                 /* shifting out */
1780                 if (base64bits) { /* output remaining bits */
1781                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1782                     base64buffer = 0;
1783                     base64bits = 0;
1784                 }
1785                 inShift = 0;
1786                 /* Characters not in the BASE64 set implicitly unshift the sequence
1787                    so no '-' is required, except if the character is itself a '-' */
1788                 if (IS_BASE64(ch) || ch == '-') {
1789                     *out++ = '-';
1790                 }
1791                 *out++ = (char) ch;
1792             }
1793             else {
1794                 goto encode_char;
1795             }
1796         }
1797         else { /* not in a shift sequence */
1798             if (ch == '+') {
1799                 *out++ = '+';
1800                         *out++ = '-';
1801             }
1802             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1803                 *out++ = (char) ch;
1804             }
1805             else {
1806                 *out++ = '+';
1807                 inShift = 1;
1808                 goto encode_char;
1809             }
1810         }
1811         continue;
1812 encode_char:
1813 #ifdef Py_UNICODE_WIDE
1814         if (ch >= 0x10000) {
1815             /* code first surrogate */
1816             base64bits += 16;
1817             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1818             while (base64bits >= 6) {
1819                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1820                 base64bits -= 6;
1821             }
1822             /* prepare second surrogate */
1823             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1824         }
1825 #endif
1826         base64bits += 16;
1827         base64buffer = (base64buffer << 16) | ch;
1828         while (base64bits >= 6) {
1829             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1830             base64bits -= 6;
1831         }
1832     }
1833     if (base64bits)
1834         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1835     if (inShift)
1836         *out++ = '-';
1837
1838     _PyString_Resize(&v, out - start);
1839     return v;
1840 }
1841
1842 #undef IS_BASE64
1843 #undef FROM_BASE64
1844 #undef TO_BASE64
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1847
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
1849
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed
   by that byte; 0 marks a byte that cannot start a sequence.  See RFC 2279
   for details.  Lengths 5 and 6 are listed so the decoder can report them
   with their full length. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1871
1872 PyObject *PyUnicode_DecodeUTF8(const char *s,
1873                                Py_ssize_t size,
1874                                const char *errors)
1875 {
1876     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1877 }
1878
1879 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1880                                        Py_ssize_t size,
1881                                        const char *errors,
1882                                        Py_ssize_t *consumed)
1883 {
1884     const char *starts = s;
1885     int n;
1886     Py_ssize_t startinpos;
1887     Py_ssize_t endinpos;
1888     Py_ssize_t outpos;
1889     const char *e;
1890     PyUnicodeObject *unicode;
1891     Py_UNICODE *p;
1892     const char *errmsg = "";
1893     PyObject *errorHandler = NULL;
1894     PyObject *exc = NULL;
1895
1896     /* Note: size will always be longer than the resulting Unicode
1897        character count */
1898     unicode = _PyUnicode_New(size);
1899     if (!unicode)
1900         return NULL;
1901     if (size == 0) {
1902         if (consumed)
1903             *consumed = 0;
1904         return (PyObject *)unicode;
1905     }
1906
1907     /* Unpack UTF-8 encoded data */
1908     p = unicode->str;
1909     e = s + size;
1910
1911     while (s < e) {
1912         Py_UCS4 ch = (unsigned char)*s;
1913
1914         if (ch < 0x80) {
1915             *p++ = (Py_UNICODE)ch;
1916             s++;
1917             continue;
1918         }
1919
1920         n = utf8_code_length[ch];
1921
1922         if (s + n > e) {
1923             if (consumed)
1924                 break;
1925             else {
1926                 errmsg = "unexpected end of data";
1927                 startinpos = s-starts;
1928                 endinpos = size;
1929                 goto utf8Error;
1930             }
1931         }
1932
1933         switch (n) {
1934
1935         case 0:
1936             errmsg = "unexpected code byte";
1937             startinpos = s-starts;
1938             endinpos = startinpos+1;
1939             goto utf8Error;
1940
1941         case 1:
1942             errmsg = "internal error";
1943             startinpos = s-starts;
1944             endinpos = startinpos+1;
1945             goto utf8Error;
1946
1947         case 2:
1948             if ((s[1] & 0xc0) != 0x80) {
1949                 errmsg = "invalid data";
1950                 startinpos = s-starts;
1951                 endinpos = startinpos+2;
1952                 goto utf8Error;
1953             }
1954             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1955             if (ch < 0x80) {
1956                 startinpos = s-starts;
1957                 endinpos = startinpos+2;
1958                 errmsg = "illegal encoding";
1959                 goto utf8Error;
1960             }
1961             else
1962                 *p++ = (Py_UNICODE)ch;
1963             break;
1964
1965         case 3:
1966             if ((s[1] & 0xc0) != 0x80 ||
1967                 (s[2] & 0xc0) != 0x80) {
1968                 errmsg = "invalid data";
1969                 startinpos = s-starts;
1970                 endinpos = startinpos+3;
1971                 goto utf8Error;
1972             }
1973             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1974             if (ch < 0x0800) {
1975                 /* Note: UTF-8 encodings of surrogates are considered
1976                    legal UTF-8 sequences;
1977
1978                    XXX For wide builds (UCS-4) we should probably try
1979                    to recombine the surrogates into a single code
1980                    unit.
1981                 */
1982                 errmsg = "illegal encoding";
1983                 startinpos = s-starts;
1984                 endinpos = startinpos+3;
1985                 goto utf8Error;
1986             }
1987             else
1988                 *p++ = (Py_UNICODE)ch;
1989             break;
1990
1991         case 4:
1992             if ((s[1] & 0xc0) != 0x80 ||
1993                 (s[2] & 0xc0) != 0x80 ||
1994                 (s[3] & 0xc0) != 0x80) {
1995                 errmsg = "invalid data";
1996                 startinpos = s-starts;
1997                 endinpos = startinpos+4;
1998                 goto utf8Error;
1999             }
2000             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2001                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2002             /* validate and convert to UTF-16 */
2003             if ((ch < 0x10000)        /* minimum value allowed for 4
2004                                          byte encoding */
2005                 || (ch > 0x10ffff))   /* maximum value allowed for
2006                                          UTF-16 */
2007             {
2008                 errmsg = "illegal encoding";
2009                 startinpos = s-starts;
2010                 endinpos = startinpos+4;
2011                 goto utf8Error;
2012             }
2013 #ifdef Py_UNICODE_WIDE
2014             *p++ = (Py_UNICODE)ch;
2015 #else
2016             /*  compute and append the two surrogates: */
2017
2018             /*  translate from 10000..10FFFF to 0..FFFF */
2019             ch -= 0x10000;
2020
2021             /*  high surrogate = top 10 bits added to D800 */
2022             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2023
2024             /*  low surrogate = bottom 10 bits added to DC00 */
2025             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2026 #endif
2027             break;
2028
2029         default:
2030             /* Other sizes are only needed for UCS-4 */
2031             errmsg = "unsupported Unicode code range";
2032             startinpos = s-starts;
2033             endinpos = startinpos+n;
2034             goto utf8Error;
2035         }
2036         s += n;
2037         continue;
2038
2039       utf8Error:
2040         outpos = p-PyUnicode_AS_UNICODE(unicode);
2041         if (unicode_decode_call_errorhandler(
2042                 errors, &errorHandler,
2043                 "utf8", errmsg,
2044                 starts, size, &startinpos, &endinpos, &exc, &s,
2045                 &unicode, &outpos, &p))
2046             goto onError;
2047     }
2048     if (consumed)
2049         *consumed = s-starts;
2050
2051     /* Adjust length */
2052     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2053         goto onError;
2054
2055     Py_XDECREF(errorHandler);
2056     Py_XDECREF(exc);
2057     return (PyObject *)unicode;
2058
2059   onError:
2060     Py_XDECREF(errorHandler);
2061     Py_XDECREF(exc);
2062     Py_DECREF(unicode);
2063     return NULL;
2064 }
2065
2066 /* Allocation strategy:  if the string is short, convert into a stack buffer
2067    and allocate exactly as much space needed at the end.  Else allocate the
2068    maximum possible needed (4 result bytes per Unicode character), and return
2069    the excess memory at the end.
2070 */
2071 PyObject *
2072 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2073                      Py_ssize_t size,
2074                      const char *errors)
2075 {
2076 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2077
2078     Py_ssize_t i;           /* index into s of next input byte */
2079     PyObject *v;        /* result string object */
2080     char *p;            /* next free byte in output buffer */
2081     Py_ssize_t nallocated;  /* number of result bytes allocated */
2082     Py_ssize_t nneeded;        /* number of result bytes needed */
2083     char stackbuf[MAX_SHORT_UNICHARS * 4];
2084
2085     assert(s != NULL);
2086     assert(size >= 0);
2087
2088     if (size <= MAX_SHORT_UNICHARS) {
2089         /* Write into the stack buffer; nallocated can't overflow.
2090          * At the end, we'll allocate exactly as much heap space as it
2091          * turns out we need.
2092          */
2093         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2094         v = NULL;   /* will allocate after we're done */
2095         p = stackbuf;
2096     }
2097     else {
2098         /* Overallocate on the heap, and give the excess back at the end. */
2099         nallocated = size * 4;
2100         if (nallocated / 4 != size)  /* overflow! */
2101             return PyErr_NoMemory();
2102         v = PyString_FromStringAndSize(NULL, nallocated);
2103         if (v == NULL)
2104             return NULL;
2105         p = PyString_AS_STRING(v);
2106     }
2107
2108     for (i = 0; i < size;) {
2109         Py_UCS4 ch = s[i++];
2110
2111         if (ch < 0x80)
2112             /* Encode ASCII */
2113             *p++ = (char) ch;
2114
2115         else if (ch < 0x0800) {
2116             /* Encode Latin-1 */
2117             *p++ = (char)(0xc0 | (ch >> 6));
2118             *p++ = (char)(0x80 | (ch & 0x3f));
2119         }
2120         else {
2121             /* Encode UCS2 Unicode ordinals */
2122             if (ch < 0x10000) {
2123                 /* Special case: check for high surrogate */
2124                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2125                     Py_UCS4 ch2 = s[i];
2126                     /* Check for low surrogate and combine the two to
2127                        form a UCS4 value */
2128                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2129                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2130                         i++;
2131                         goto encodeUCS4;
2132                     }
2133                     /* Fall through: handles isolated high surrogates */
2134                 }
2135                 *p++ = (char)(0xe0 | (ch >> 12));
2136                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2137                 *p++ = (char)(0x80 | (ch & 0x3f));
2138                 continue;
2139             }
2140           encodeUCS4:
2141             /* Encode UCS4 Unicode ordinals */
2142             *p++ = (char)(0xf0 | (ch >> 18));
2143             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2144             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2145             *p++ = (char)(0x80 | (ch & 0x3f));
2146         }
2147     }
2148
2149     if (v == NULL) {
2150         /* This was stack allocated. */
2151         nneeded = p - stackbuf;
2152         assert(nneeded <= nallocated);
2153         v = PyString_FromStringAndSize(stackbuf, nneeded);
2154     }
2155     else {
2156         /* Cut back to size actually needed. */
2157         nneeded = p - PyString_AS_STRING(v);
2158         assert(nneeded <= nallocated);
2159         _PyString_Resize(&v, nneeded);
2160     }
2161     return v;
2162
2163 #undef MAX_SHORT_UNICHARS
2164 }
2165
2166 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2167 {
2168     if (!PyUnicode_Check(unicode)) {
2169         PyErr_BadArgument();
2170         return NULL;
2171     }
2172     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2173                                 PyUnicode_GET_SIZE(unicode),
2174                                 NULL);
2175 }
2176
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
2178
2179 PyObject *
2180 PyUnicode_DecodeUTF32(const char *s,
2181                       Py_ssize_t size,
2182                       const char *errors,
2183                       int *byteorder)
2184 {
2185     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2186 }
2187
2188 PyObject *
2189 PyUnicode_DecodeUTF32Stateful(const char *s,
2190                               Py_ssize_t size,
2191                               const char *errors,
2192                               int *byteorder,
2193                               Py_ssize_t *consumed)
2194 {
2195     const char *starts = s;
2196     Py_ssize_t startinpos;
2197     Py_ssize_t endinpos;
2198     Py_ssize_t outpos;
2199     PyUnicodeObject *unicode;
2200     Py_UNICODE *p;
2201 #ifndef Py_UNICODE_WIDE
2202     int i, pairs;
2203 #else
2204     const int pairs = 0;
2205 #endif
2206     const unsigned char *q, *e;
2207     int bo = 0;       /* assume native ordering by default */
2208     const char *errmsg = "";
2209     /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211     int iorder[] = {0, 1, 2, 3};
2212 #else
2213     int iorder[] = {3, 2, 1, 0};
2214 #endif
2215     PyObject *errorHandler = NULL;
2216     PyObject *exc = NULL;
2217     /* On narrow builds we split characters outside the BMP into two
2218        codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220     for (i = pairs = 0; i < size/4; i++)
2221         if (((Py_UCS4 *)s)[i] >= 0x10000)
2222             pairs++;
2223 #endif
2224
2225     /* This might be one to much, because of a BOM */
2226     unicode = _PyUnicode_New((size+3)/4+pairs);
2227     if (!unicode)
2228         return NULL;
2229     if (size == 0)
2230         return (PyObject *)unicode;
2231
2232     /* Unpack UTF-32 encoded data */
2233     p = unicode->str;
2234     q = (unsigned char *)s;
2235     e = q + size;
2236
2237     if (byteorder)
2238         bo = *byteorder;
2239
2240     /* Check for BOM marks (U+FEFF) in the input and adjust current
2241        byte order setting accordingly. In native mode, the leading BOM
2242        mark is skipped, in all other modes, it is copied to the output
2243        stream as-is (giving a ZWNBSP character). */
2244     if (bo == 0) {
2245         if (size >= 4) {
2246             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2247                 (q[iorder[1]] << 8) | q[iorder[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249             if (bom == 0x0000FEFF) {
2250                 q += 4;
2251                 bo = -1;
2252             }
2253             else if (bom == 0xFFFE0000) {
2254                 q += 4;
2255                 bo = 1;
2256             }
2257 #else
2258             if (bom == 0x0000FEFF) {
2259                 q += 4;
2260                 bo = 1;
2261             }
2262             else if (bom == 0xFFFE0000) {
2263                 q += 4;
2264                 bo = -1;
2265             }
2266 #endif
2267         }
2268     }
2269
2270     if (bo == -1) {
2271         /* force LE */
2272         iorder[0] = 0;
2273         iorder[1] = 1;
2274         iorder[2] = 2;
2275         iorder[3] = 3;
2276     }
2277     else if (bo == 1) {
2278         /* force BE */
2279         iorder[0] = 3;
2280         iorder[1] = 2;
2281         iorder[2] = 1;
2282         iorder[3] = 0;
2283     }
2284
2285     while (q < e) {
2286         Py_UCS4 ch;
2287         /* remaining bytes at the end? (size should be divisible by 4) */
2288         if (e-q<4) {
2289             if (consumed)
2290                 break;
2291             errmsg = "truncated data";
2292             startinpos = ((const char *)q)-starts;
2293             endinpos = ((const char *)e)-starts;
2294             goto utf32Error;
2295             /* The remaining input chars are ignored if the callback
2296                chooses to skip the input */
2297         }
2298         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2299             (q[iorder[1]] << 8) | q[iorder[0]];
2300
2301         if (ch >= 0x110000)
2302         {
2303             errmsg = "codepoint not in range(0x110000)";
2304             startinpos = ((const char *)q)-starts;
2305             endinpos = startinpos+4;
2306             goto utf32Error;
2307         }
2308 #ifndef Py_UNICODE_WIDE
2309         if (ch >= 0x10000)
2310         {
2311             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2312             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2313         }
2314         else
2315 #endif
2316             *p++ = ch;
2317         q += 4;
2318         continue;
2319       utf32Error:
2320         outpos = p-PyUnicode_AS_UNICODE(unicode);
2321         if (unicode_decode_call_errorhandler(
2322                 errors, &errorHandler,
2323                 "utf32", errmsg,
2324                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2325                 &unicode, &outpos, &p))
2326             goto onError;
2327     }
2328
2329     if (byteorder)
2330         *byteorder = bo;
2331
2332     if (consumed)
2333         *consumed = (const char *)q-starts;
2334
2335     /* Adjust length */
2336     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2337         goto onError;
2338
2339     Py_XDECREF(errorHandler);
2340     Py_XDECREF(exc);
2341     return (PyObject *)unicode;
2342
2343   onError:
2344     Py_DECREF(unicode);
2345     Py_XDECREF(errorHandler);
2346     Py_XDECREF(exc);
2347     return NULL;
2348 }
2349
2350 PyObject *
2351 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2352                       Py_ssize_t size,
2353                       const char *errors,
2354                       int byteorder)
2355 {
2356     PyObject *v;
2357     unsigned char *p;
2358     Py_ssize_t nsize, bytesize;
2359 #ifndef Py_UNICODE_WIDE
2360     Py_ssize_t i, pairs;
2361 #else
2362     const int pairs = 0;
2363 #endif
2364     /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366     int iorder[] = {0, 1, 2, 3};
2367 #else
2368     int iorder[] = {3, 2, 1, 0};
2369 #endif
2370
2371 #define STORECHAR(CH)                           \
2372     do {                                        \
2373         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2374         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2375         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2376         p[iorder[0]] = (CH) & 0xff;             \
2377         p += 4;                                 \
2378     } while(0)
2379
2380     /* In narrow builds we can output surrogate pairs as one codepoint,
2381        so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383     for (i = pairs = 0; i < size-1; i++)
2384         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386             pairs++;
2387 #endif
2388     nsize = (size - pairs + (byteorder == 0));
2389     bytesize = nsize * 4;
2390     if (bytesize / 4 != nsize)
2391         return PyErr_NoMemory();
2392     v = PyString_FromStringAndSize(NULL, bytesize);
2393     if (v == NULL)
2394         return NULL;
2395
2396     p = (unsigned char *)PyString_AS_STRING(v);
2397     if (byteorder == 0)
2398         STORECHAR(0xFEFF);
2399     if (size == 0)
2400         return v;
2401
2402     if (byteorder == -1) {
2403         /* force LE */
2404         iorder[0] = 0;
2405         iorder[1] = 1;
2406         iorder[2] = 2;
2407         iorder[3] = 3;
2408     }
2409     else if (byteorder == 1) {
2410         /* force BE */
2411         iorder[0] = 3;
2412         iorder[1] = 2;
2413         iorder[2] = 1;
2414         iorder[3] = 0;
2415     }
2416
2417     while (size-- > 0) {
2418         Py_UCS4 ch = *s++;
2419 #ifndef Py_UNICODE_WIDE
2420         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2421             Py_UCS4 ch2 = *s;
2422             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2423                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2424                 s++;
2425                 size--;
2426             }
2427         }
2428 #endif
2429         STORECHAR(ch);
2430     }
2431     return v;
2432 #undef STORECHAR
2433 }
2434
2435 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2436 {
2437     if (!PyUnicode_Check(unicode)) {
2438         PyErr_BadArgument();
2439         return NULL;
2440     }
2441     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2442                                  PyUnicode_GET_SIZE(unicode),
2443                                  NULL,
2444                                  0);
2445 }
2446
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
2448
2449 PyObject *
2450 PyUnicode_DecodeUTF16(const char *s,
2451                       Py_ssize_t size,
2452                       const char *errors,
2453                       int *byteorder)
2454 {
2455     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2456 }
2457
2458 PyObject *
2459 PyUnicode_DecodeUTF16Stateful(const char *s,
2460                               Py_ssize_t size,
2461                               const char *errors,
2462                               int *byteorder,
2463                               Py_ssize_t *consumed)
2464 {
2465     const char *starts = s;
2466     Py_ssize_t startinpos;
2467     Py_ssize_t endinpos;
2468     Py_ssize_t outpos;
2469     PyUnicodeObject *unicode;
2470     Py_UNICODE *p;
2471     const unsigned char *q, *e;
2472     int bo = 0;       /* assume native ordering by default */
2473     const char *errmsg = "";
2474     /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476     int ihi = 1, ilo = 0;
2477 #else
2478     int ihi = 0, ilo = 1;
2479 #endif
2480     PyObject *errorHandler = NULL;
2481     PyObject *exc = NULL;
2482
2483     /* Note: size will always be longer than the resulting Unicode
2484        character count */
2485     unicode = _PyUnicode_New(size);
2486     if (!unicode)
2487         return NULL;
2488     if (size == 0)
2489         return (PyObject *)unicode;
2490
2491     /* Unpack UTF-16 encoded data */
2492     p = unicode->str;
2493     q = (unsigned char *)s;
2494     e = q + size;
2495
2496     if (byteorder)
2497         bo = *byteorder;
2498
2499     /* Check for BOM marks (U+FEFF) in the input and adjust current
2500        byte order setting accordingly. In native mode, the leading BOM
2501        mark is skipped, in all other modes, it is copied to the output
2502        stream as-is (giving a ZWNBSP character). */
2503     if (bo == 0) {
2504         if (size >= 2) {
2505             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507             if (bom == 0xFEFF) {
2508                 q += 2;
2509                 bo = -1;
2510             }
2511             else if (bom == 0xFFFE) {
2512                 q += 2;
2513                 bo = 1;
2514             }
2515 #else
2516             if (bom == 0xFEFF) {
2517                 q += 2;
2518                 bo = 1;
2519             }
2520             else if (bom == 0xFFFE) {
2521                 q += 2;
2522                 bo = -1;
2523             }
2524 #endif
2525         }
2526     }
2527
2528     if (bo == -1) {
2529         /* force LE */
2530         ihi = 1;
2531         ilo = 0;
2532     }
2533     else if (bo == 1) {
2534         /* force BE */
2535         ihi = 0;
2536         ilo = 1;
2537     }
2538
2539     while (q < e) {
2540         Py_UNICODE ch;
2541         /* remaining bytes at the end? (size should be even) */
2542         if (e-q<2) {
2543             if (consumed)
2544                 break;
2545             errmsg = "truncated data";
2546             startinpos = ((const char *)q)-starts;
2547             endinpos = ((const char *)e)-starts;
2548             goto utf16Error;
2549             /* The remaining input chars are ignored if the callback
2550                chooses to skip the input */
2551         }
2552         ch = (q[ihi] << 8) | q[ilo];
2553
2554         q += 2;
2555
2556         if (ch < 0xD800 || ch > 0xDFFF) {
2557             *p++ = ch;
2558             continue;
2559         }
2560
2561         /* UTF-16 code pair: */
2562         if (q >= e) {
2563             errmsg = "unexpected end of data";
2564             startinpos = (((const char *)q)-2)-starts;
2565             endinpos = ((const char *)e)-starts;
2566             goto utf16Error;
2567         }
2568         if (0xD800 <= ch && ch <= 0xDBFF) {
2569             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2570             q += 2;
2571             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2573                 *p++ = ch;
2574                 *p++ = ch2;
2575 #else
2576                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2577 #endif
2578                 continue;
2579             }
2580             else {
2581                 errmsg = "illegal UTF-16 surrogate";
2582                 startinpos = (((const char *)q)-4)-starts;
2583                 endinpos = startinpos+2;
2584                 goto utf16Error;
2585             }
2586
2587         }
2588         errmsg = "illegal encoding";
2589         startinpos = (((const char *)q)-2)-starts;
2590         endinpos = startinpos+2;
2591         /* Fall through to report the error */
2592
2593       utf16Error:
2594         outpos = p-PyUnicode_AS_UNICODE(unicode);
2595         if (unicode_decode_call_errorhandler(
2596                 errors, &errorHandler,
2597                 "utf16", errmsg,
2598                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2599                 &unicode, &outpos, &p))
2600             goto onError;
2601     }
2602
2603     if (byteorder)
2604         *byteorder = bo;
2605
2606     if (consumed)
2607         *consumed = (const char *)q-starts;
2608
2609     /* Adjust length */
2610     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2611         goto onError;
2612
2613     Py_XDECREF(errorHandler);
2614     Py_XDECREF(exc);
2615     return (PyObject *)unicode;
2616
2617   onError:
2618     Py_DECREF(unicode);
2619     Py_XDECREF(errorHandler);
2620     Py_XDECREF(exc);
2621     return NULL;
2622 }
2623
2624 PyObject *
2625 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2626                       Py_ssize_t size,
2627                       const char *errors,
2628                       int byteorder)
2629 {
2630     PyObject *v;
2631     unsigned char *p;
2632     Py_ssize_t nsize, bytesize;
2633 #ifdef Py_UNICODE_WIDE
2634     Py_ssize_t i, pairs;
2635 #else
2636     const int pairs = 0;
2637 #endif
2638     /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640     int ihi = 1, ilo = 0;
2641 #else
2642     int ihi = 0, ilo = 1;
2643 #endif
2644
2645 #define STORECHAR(CH)                           \
2646     do {                                        \
2647         p[ihi] = ((CH) >> 8) & 0xff;            \
2648         p[ilo] = (CH) & 0xff;                   \
2649         p += 2;                                 \
2650     } while(0)
2651
2652 #ifdef Py_UNICODE_WIDE
2653     for (i = pairs = 0; i < size; i++)
2654         if (s[i] >= 0x10000)
2655             pairs++;
2656 #endif
2657     /* 2 * (size + pairs + (byteorder == 0)) */
2658     if (size > PY_SSIZE_T_MAX ||
2659         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2660         return PyErr_NoMemory();
2661     nsize = size + pairs + (byteorder == 0);
2662     bytesize = nsize * 2;
2663     if (bytesize / 2 != nsize)
2664         return PyErr_NoMemory();
2665     v = PyString_FromStringAndSize(NULL, bytesize);
2666     if (v == NULL)
2667         return NULL;
2668
2669     p = (unsigned char *)PyString_AS_STRING(v);
2670     if (byteorder == 0)
2671         STORECHAR(0xFEFF);
2672     if (size == 0)
2673         return v;
2674
2675     if (byteorder == -1) {
2676         /* force LE */
2677         ihi = 1;
2678         ilo = 0;
2679     }
2680     else if (byteorder == 1) {
2681         /* force BE */
2682         ihi = 0;
2683         ilo = 1;
2684     }
2685
2686     while (size-- > 0) {
2687         Py_UNICODE ch = *s++;
2688         Py_UNICODE ch2 = 0;
2689 #ifdef Py_UNICODE_WIDE
2690         if (ch >= 0x10000) {
2691             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692             ch  = 0xD800 | ((ch-0x10000) >> 10);
2693         }
2694 #endif
2695         STORECHAR(ch);
2696         if (ch2)
2697             STORECHAR(ch2);
2698     }
2699     return v;
2700 #undef STORECHAR
2701 }
2702
2703 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2704 {
2705     if (!PyUnicode_Check(unicode)) {
2706         PyErr_BadArgument();
2707         return NULL;
2708     }
2709     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2710                                  PyUnicode_GET_SIZE(unicode),
2711                                  NULL,
2712                                  0);
2713 }
2714
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
2716
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the character-name
   escape handling in the decoder below -- confirm against its users. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2718
2719 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2720                                         Py_ssize_t size,
2721                                         const char *errors)
2722 {
2723     const char *starts = s;
2724     Py_ssize_t startinpos;
2725     Py_ssize_t endinpos;
2726     Py_ssize_t outpos;
2727     int i;
2728     PyUnicodeObject *v;
2729     Py_UNICODE *p;
2730     const char *end;
2731     char* message;
2732     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2733     PyObject *errorHandler = NULL;
2734     PyObject *exc = NULL;
2735
2736     /* Escaped strings will always be longer than the resulting
2737        Unicode string, so we start with size here and then reduce the
2738        length after conversion to the true value.
2739        (but if the error callback returns a long replacement string
2740        we'll have to allocate more space) */
2741     v = _PyUnicode_New(size);
2742     if (v == NULL)
2743         goto onError;
2744     if (size == 0)
2745         return (PyObject *)v;
2746
2747     p = PyUnicode_AS_UNICODE(v);
2748     end = s + size;
2749
2750     while (s < end) {
2751         unsigned char c;
2752         Py_UNICODE x;
2753         int digits;
2754
2755         /* Non-escape characters are interpreted as Unicode ordinals */
2756         if (*s != '\\') {
2757             *p++ = (unsigned char) *s++;
2758             continue;
2759         }
2760
2761         startinpos = s-starts;
2762         /* \ - Escapes */
2763         s++;
2764         c = *s++;
2765         if (s > end)
2766             c = '\0'; /* Invalid after \ */
2767         switch (c) {
2768
2769             /* \x escapes */
2770         case '\n': break;
2771         case '\\': *p++ = '\\'; break;
2772         case '\'': *p++ = '\''; break;
2773         case '\"': *p++ = '\"'; break;
2774         case 'b': *p++ = '\b'; break;
2775         case 'f': *p++ = '\014'; break; /* FF */
2776         case 't': *p++ = '\t'; break;
2777         case 'n': *p++ = '\n'; break;
2778         case 'r': *p++ = '\r'; break;
2779         case 'v': *p++ = '\013'; break; /* VT */
2780         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2781
2782             /* \OOO (octal) escapes */
2783         case '0': case '1': case '2': case '3':
2784         case '4': case '5': case '6': case '7':
2785             x = s[-1] - '0';
2786             if (s < end && '0' <= *s && *s <= '7') {
2787                 x = (x<<3) + *s++ - '0';
2788                 if (s < end && '0' <= *s && *s <= '7')
2789                     x = (x<<3) + *s++ - '0';
2790             }
2791             *p++ = x;
2792             break;
2793
2794             /* hex escapes */
2795             /* \xXX */
2796         case 'x':
2797             digits = 2;
2798             message = "truncated \\xXX escape";
2799             goto hexescape;
2800
2801             /* \uXXXX */
2802         case 'u':
2803             digits = 4;
2804             message = "truncated \\uXXXX escape";
2805             goto hexescape;
2806
2807             /* \UXXXXXXXX */
2808         case 'U':
2809             digits = 8;
2810             message = "truncated \\UXXXXXXXX escape";
2811         hexescape:
2812             chr = 0;
2813             outpos = p-PyUnicode_AS_UNICODE(v);
2814             if (s+digits>end) {
2815                 endinpos = size;
2816                 if (unicode_decode_call_errorhandler(
2817                         errors, &errorHandler,
2818                         "unicodeescape", "end of string in escape sequence",
2819                         starts, size, &startinpos, &endinpos, &exc, &s,
2820                         &v, &outpos, &p))
2821                     goto onError;
2822                 goto nextByte;
2823             }
2824             for (i = 0; i < digits; ++i) {
2825                 c = (unsigned char) s[i];
2826                 if (!isxdigit(c)) {
2827                     endinpos = (s+i+1)-starts;
2828                     if (unicode_decode_call_errorhandler(
2829                             errors, &errorHandler,
2830                             "unicodeescape", message,
2831                             starts, size, &startinpos, &endinpos, &exc, &s,
2832                             &v, &outpos, &p))
2833                         goto onError;
2834                     goto nextByte;
2835                 }
2836                 chr = (chr<<4) & ~0xF;
2837                 if (c >= '0' && c <= '9')
2838                     chr += c - '0';
2839                 else if (c >= 'a' && c <= 'f')
2840                     chr += 10 + c - 'a';
2841                 else
2842                     chr += 10 + c - 'A';
2843             }
2844             s += i;
2845             if (chr == 0xffffffff && PyErr_Occurred())
2846                 /* _decoding_error will have already written into the
2847                    target buffer. */
2848                 break;
2849         store:
2850             /* when we get here, chr is a 32-bit unicode character */
2851             if (chr <= 0xffff)
2852                 /* UCS-2 character */
2853                 *p++ = (Py_UNICODE) chr;
2854             else if (chr <= 0x10ffff) {
2855                 /* UCS-4 character. Either store directly, or as
2856                    surrogate pair. */
2857 #ifdef Py_UNICODE_WIDE
2858                 *p++ = chr;
2859 #else
2860                 chr -= 0x10000L;
2861                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2862                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2863 #endif
2864             } else {
2865                 endinpos = s-starts;
2866                 outpos = p-PyUnicode_AS_UNICODE(v);
2867                 if (unicode_decode_call_errorhandler(
2868                         errors, &errorHandler,
2869                         "unicodeescape", "illegal Unicode character",
2870                         starts, size, &startinpos, &endinpos, &exc, &s,
2871                         &v, &outpos, &p))
2872                     goto onError;
2873             }
2874             break;
2875
2876             /* \N{name} */
2877         case 'N':
2878             message = "malformed \\N character escape";
2879             if (ucnhash_CAPI == NULL) {
2880                 /* load the unicode data module */
2881                 PyObject *m, *api;
2882                 m = PyImport_ImportModuleNoBlock("unicodedata");
2883                 if (m == NULL)
2884                     goto ucnhashError;
2885                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2886                 Py_DECREF(m);
2887                 if (api == NULL)
2888                     goto ucnhashError;
2889                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2890                 Py_DECREF(api);
2891                 if (ucnhash_CAPI == NULL)
2892                     goto ucnhashError;
2893             }
2894             if (*s == '{') {
2895                 const char *start = s+1;
2896                 /* look for the closing brace */
2897                 while (*s != '}' && s < end)
2898                     s++;
2899                 if (s > start && s < end && *s == '}') {
2900                     /* found a name.  look it up in the unicode database */
2901                     message = "unknown Unicode character name";
2902                     s++;
2903                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904                         goto store;
2905                 }
2906             }
2907             endinpos = s-starts;
2908             outpos = p-PyUnicode_AS_UNICODE(v);
2909             if (unicode_decode_call_errorhandler(
2910                     errors, &errorHandler,
2911                     "unicodeescape", message,
2912                     starts, size, &startinpos, &endinpos, &exc, &s,
2913                     &v, &outpos, &p))
2914                 goto onError;
2915             break;
2916
2917         default:
2918             if (s > end) {
2919                 message = "\\ at end of string";
2920                 s--;
2921                 endinpos = s-starts;
2922                 outpos = p-PyUnicode_AS_UNICODE(v);
2923                 if (unicode_decode_call_errorhandler(
2924                         errors, &errorHandler,
2925                         "unicodeescape", message,
2926                         starts, size, &startinpos, &endinpos, &exc, &s,
2927                         &v, &outpos, &p))
2928                     goto onError;
2929             }
2930             else {
2931                 *p++ = '\\';
2932                 *p++ = (unsigned char)s[-1];
2933             }
2934             break;
2935         }
2936       nextByte:
2937         ;
2938     }
2939     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940         goto onError;
2941     Py_XDECREF(errorHandler);
2942     Py_XDECREF(exc);
2943     return (PyObject *)v;
2944
2945   ucnhashError:
2946     PyErr_SetString(
2947         PyExc_UnicodeError,
2948         "\\N escapes not supported (can't load unicodedata module)"
2949         );
2950     Py_XDECREF(v);
2951     Py_XDECREF(errorHandler);
2952     Py_XDECREF(exc);
2953     return NULL;
2954
2955   onError:
2956     Py_XDECREF(v);
2957     Py_XDECREF(errorHandler);
2958     Py_XDECREF(exc);
2959     return NULL;
2960 }
2961
/* unicodeescape_string() (defined below, after its helper findchar())
   returns a Unicode-Escape string version of the given Unicode data.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2968
2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970                                              Py_ssize_t size,
2971                                              Py_UNICODE ch)
2972 {
2973     /* like wcschr, but doesn't stop at NULL characters */
2974
2975     while (size-- > 0) {
2976         if (*s == ch)
2977             return s;
2978         s++;
2979     }
2980
2981     return NULL;
2982 }
2983
2984 static
2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986                                Py_ssize_t size,
2987                                int quotes)
2988 {
2989     PyObject *repr;
2990     char *p;
2991
2992     static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994     const Py_ssize_t expandsize = 10;
2995 #else
2996     const Py_ssize_t expandsize = 6;
2997 #endif
2998
2999     /* XXX(nnorwitz): rather than over-allocating, it would be
3000        better to choose a different scheme.  Perhaps scan the
3001        first N-chars of the string and allocate based on that size.
3002     */
3003     /* Initial allocation is based on the longest-possible unichr
3004        escape.
3005
3006        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007        unichr, so in this case it's the longest unichr escape. In
3008        narrow (UTF-16) builds this is five chars per source unichr
3009        since there are two unichrs in the surrogate pair, so in narrow
3010        (UTF-16) builds it's not the longest unichr escape.
3011
3012        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013        so in the narrow (UTF-16) build case it's the longest unichr
3014        escape.
3015     */
3016
3017     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018         return PyErr_NoMemory();
3019
3020     repr = PyString_FromStringAndSize(NULL,
3021                                       2
3022                                       + expandsize*size
3023                                       + 1);
3024     if (repr == NULL)
3025         return NULL;
3026
3027     p = PyString_AS_STRING(repr);
3028
3029     if (quotes) {
3030         *p++ = 'u';
3031         *p++ = (findchar(s, size, '\'') &&
3032                 !findchar(s, size, '"')) ? '"' : '\'';
3033     }
3034     while (size-- > 0) {
3035         Py_UNICODE ch = *s++;
3036
3037         /* Escape quotes and backslashes */
3038         if ((quotes &&
3039              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040             *p++ = '\\';
3041             *p++ = (char) ch;
3042             continue;
3043         }
3044
3045 #ifdef Py_UNICODE_WIDE
3046         /* Map 21-bit characters to '\U00xxxxxx' */
3047         else if (ch >= 0x10000) {
3048             *p++ = '\\';
3049             *p++ = 'U';
3050             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057             *p++ = hexdigit[ch & 0x0000000F];
3058             continue;
3059         }
3060 #else
3061         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062         else if (ch >= 0xD800 && ch < 0xDC00) {
3063             Py_UNICODE ch2;
3064             Py_UCS4 ucs;
3065
3066             ch2 = *s++;
3067             size--;
3068             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070                 *p++ = '\\';
3071                 *p++ = 'U';
3072                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079                 *p++ = hexdigit[ucs & 0x0000000F];
3080                 continue;
3081             }
3082             /* Fall through: isolated surrogates are copied as-is */
3083             s--;
3084             size++;
3085         }
3086 #endif
3087
3088         /* Map 16-bit characters to '\uxxxx' */
3089         if (ch >= 256) {
3090             *p++ = '\\';
3091             *p++ = 'u';
3092             *p++ = hexdigit[(ch >> 12) & 0x000F];
3093             *p++ = hexdigit[(ch >> 8) & 0x000F];
3094             *p++ = hexdigit[(ch >> 4) & 0x000F];
3095             *p++ = hexdigit[ch & 0x000F];
3096         }
3097
3098         /* Map special whitespace to '\t', \n', '\r' */
3099         else if (ch == '\t') {
3100             *p++ = '\\';
3101             *p++ = 't';
3102         }
3103         else if (ch == '\n') {
3104             *p++ = '\\';
3105             *p++ = 'n';
3106         }
3107         else if (ch == '\r') {
3108             *p++ = '\\';
3109             *p++ = 'r';
3110         }
3111
3112         /* Map non-printable US ASCII to '\xhh' */
3113         else if (ch < ' ' || ch >= 0x7F) {
3114             *p++ = '\\';
3115             *p++ = 'x';
3116             *p++ = hexdigit[(ch >> 4) & 0x000F];
3117             *p++ = hexdigit[ch & 0x000F];
3118         }
3119
3120         /* Copy everything else as-is */
3121         else
3122             *p++ = (char) ch;
3123     }
3124     if (quotes)
3125         *p++ = PyString_AS_STRING(repr)[1];
3126
3127     *p = '\0';
3128     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3129     return repr;
3130 }
3131
3132 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3133                                         Py_ssize_t size)
3134 {
3135     return unicodeescape_string(s, size, 0);
3136 }
3137
3138 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3139 {
3140     if (!PyUnicode_Check(unicode)) {
3141         PyErr_BadArgument();
3142         return NULL;
3143     }
3144     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3145                                          PyUnicode_GET_SIZE(unicode));
3146 }
3147
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3149
3150 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3151                                            Py_ssize_t size,
3152                                            const char *errors)
3153 {
3154     const char *starts = s;
3155     Py_ssize_t startinpos;
3156     Py_ssize_t endinpos;
3157     Py_ssize_t outpos;
3158     PyUnicodeObject *v;
3159     Py_UNICODE *p;
3160     const char *end;
3161     const char *bs;
3162     PyObject *errorHandler = NULL;
3163     PyObject *exc = NULL;
3164
3165     /* Escaped strings will always be longer than the resulting
3166        Unicode string, so we start with size here and then reduce the
3167        length after conversion to the true value. (But decoding error
3168        handler might have to resize the string) */
3169     v = _PyUnicode_New(size);
3170     if (v == NULL)
3171         goto onError;
3172     if (size == 0)
3173         return (PyObject *)v;
3174     p = PyUnicode_AS_UNICODE(v);
3175     end = s + size;
3176     while (s < end) {
3177         unsigned char c;
3178         Py_UCS4 x;
3179         int i;
3180         int count;
3181
3182         /* Non-escape characters are interpreted as Unicode ordinals */
3183         if (*s != '\\') {
3184             *p++ = (unsigned char)*s++;
3185             continue;
3186         }
3187         startinpos = s-starts;
3188
3189         /* \u-escapes are only interpreted iff the number of leading
3190            backslashes if odd */
3191         bs = s;
3192         for (;s < end;) {
3193             if (*s != '\\')
3194                 break;
3195             *p++ = (unsigned char)*s++;
3196         }
3197         if (((s - bs) & 1) == 0 ||
3198             s >= end ||
3199             (*s != 'u' && *s != 'U')) {
3200             continue;
3201         }
3202         p--;
3203         count = *s=='u' ? 4 : 8;
3204         s++;
3205
3206         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207         outpos = p-PyUnicode_AS_UNICODE(v);
3208         for (x = 0, i = 0; i < count; ++i, ++s) {
3209             c = (unsigned char)*s;
3210             if (!isxdigit(c)) {
3211                 endinpos = s-starts;
3212                 if (unicode_decode_call_errorhandler(
3213                         errors, &errorHandler,
3214                         "rawunicodeescape", "truncated \\uXXXX",
3215                         starts, size, &startinpos, &endinpos, &exc, &s,
3216                         &v, &outpos, &p))
3217                     goto onError;
3218                 goto nextByte;
3219             }
3220             x = (x<<4) & ~0xF;
3221             if (c >= '0' && c <= '9')
3222                 x += c - '0';
3223             else if (c >= 'a' && c <= 'f')
3224                 x += 10 + c - 'a';
3225             else
3226                 x += 10 + c - 'A';
3227         }
3228         if (x <= 0xffff)
3229             /* UCS-2 character */
3230             *p++ = (Py_UNICODE) x;
3231         else if (x <= 0x10ffff) {
3232             /* UCS-4 character. Either store directly, or as
3233                surrogate pair. */
3234 #ifdef Py_UNICODE_WIDE
3235             *p++ = (Py_UNICODE) x;
3236 #else
3237             x -= 0x10000L;
3238             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3240 #endif
3241         } else {
3242             endinpos = s-starts;
3243             outpos = p-PyUnicode_AS_UNICODE(v);
3244             if (unicode_decode_call_errorhandler(
3245                     errors, &errorHandler,
3246                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247                     starts, size, &startinpos, &endinpos, &exc, &s,
3248                     &v, &outpos, &p))
3249                 goto onError;
3250         }
3251       nextByte:
3252         ;
3253     }
3254     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3255         goto onError;
3256     Py_XDECREF(errorHandler);
3257     Py_XDECREF(exc);
3258     return (PyObject *)v;
3259
3260   onError:
3261     Py_XDECREF(v);
3262     Py_XDECREF(errorHandler);
3263     Py_XDECREF(exc);
3264     return NULL;
3265 }
3266
3267 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3268                                            Py_ssize_t size)
3269 {
3270     PyObject *repr;
3271     char *p;
3272     char *q;
3273
3274     static const char *hexdigit = "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276     const Py_ssize_t expandsize = 10;
3277 #else
3278     const Py_ssize_t expandsize = 6;
3279 #endif
3280
3281     if (size > PY_SSIZE_T_MAX / expandsize)
3282         return PyErr_NoMemory();
3283
3284     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3285     if (repr == NULL)
3286         return NULL;
3287     if (size == 0)
3288         return repr;
3289
3290     p = q = PyString_AS_STRING(repr);
3291     while (size-- > 0) {
3292         Py_UNICODE ch = *s++;
3293 #ifdef Py_UNICODE_WIDE
3294         /* Map 32-bit characters to '\Uxxxxxxxx' */
3295         if (ch >= 0x10000) {
3296             *p++ = '\\';
3297             *p++ = 'U';
3298             *p++ = hexdigit[(ch >> 28) & 0xf];
3299             *p++ = hexdigit[(ch >> 24) & 0xf];
3300             *p++ = hexdigit[(ch >> 20) & 0xf];
3301             *p++ = hexdigit[(ch >> 16) & 0xf];
3302             *p++ = hexdigit[(ch >> 12) & 0xf];
3303             *p++ = hexdigit[(ch >> 8) & 0xf];
3304             *p++ = hexdigit[(ch >> 4) & 0xf];
3305             *p++ = hexdigit[ch & 15];
3306         }
3307         else
3308 #else
3309             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310             if (ch >= 0xD800 && ch < 0xDC00) {
3311                 Py_UNICODE ch2;
3312                 Py_UCS4 ucs;
3313
3314                 ch2 = *s++;
3315                 size--;
3316                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318                     *p++ = '\\';
3319                     *p++ = 'U';
3320                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3321                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3322                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3323                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3324                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3325                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3326                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3327                     *p++ = hexdigit[ucs & 0xf];
3328                     continue;
3329                 }
3330                 /* Fall through: isolated surrogates are copied as-is */
3331                 s--;
3332                 size++;
3333             }
3334 #endif
3335         /* Map 16-bit characters to '\uxxxx' */
3336         if (ch >= 256) {
3337             *p++ = '\\';
3338             *p++ = 'u';
3339             *p++ = hexdigit[(ch >> 12) & 0xf];
3340             *p++ = hexdigit[(ch >> 8) & 0xf];
3341             *p++ = hexdigit[(ch >> 4) & 0xf];
3342             *p++ = hexdigit[ch & 15];
3343         }
3344         /* Copy everything else as-is */
3345         else
3346             *p++ = (char) ch;
3347     }
3348     *p = '\0';
3349     _PyString_Resize(&repr, p - q);
3350     return repr;
3351 }
3352
3353 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3354 {
3355     if (!PyUnicode_Check(unicode)) {
3356         PyErr_BadArgument();
3357         return NULL;
3358     }
3359     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3360                                             PyUnicode_GET_SIZE(unicode));
3361 }
3362
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3364
3365 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3366                                            Py_ssize_t size,
3367                                            const char *errors)
3368 {
3369     const char *starts = s;
3370     Py_ssize_t startinpos;
3371     Py_ssize_t endinpos;
3372     Py_ssize_t outpos;
3373     PyUnicodeObject *v;
3374     Py_UNICODE *p;
3375     const char *end;
3376     const char *reason;
3377     PyObject *errorHandler = NULL;
3378     PyObject *exc = NULL;
3379
3380 #ifdef Py_UNICODE_WIDE
3381     Py_UNICODE unimax = PyUnicode_GetMax();
3382 #endif
3383
3384     /* XXX overflow detection missing */
3385     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3386     if (v == NULL)
3387         goto onError;
3388     if (PyUnicode_GetSize((PyObject *)v) == 0)
3389         return (PyObject *)v;
3390     p = PyUnicode_AS_UNICODE(v);
3391     end = s + size;
3392
3393     while (s < end) {
3394         memcpy(p, s, sizeof(Py_UNICODE));
3395         /* We have to sanity check the raw data, otherwise doom looms for
3396            some malformed UCS-4 data. */
3397         if (
3398 #ifdef Py_UNICODE_WIDE
3399             *p > unimax || *p < 0 ||
3400 #endif
3401             end-s < Py_UNICODE_SIZE
3402             )
3403         {
3404             startinpos = s - starts;
3405             if (end-s < Py_UNICODE_SIZE) {
3406                 endinpos = end-starts;
3407                 reason = "truncated input";
3408             }
3409             else {
3410                 endinpos = s - starts + Py_UNICODE_SIZE;
3411                 reason = "illegal code point (> 0x10FFFF)";
3412             }
3413             outpos = p - PyUnicode_AS_UNICODE(v);
3414             if (unicode_decode_call_errorhandler(
3415                     errors, &errorHandler,
3416                     "unicode_internal", reason,
3417                     starts, size, &startinpos, &endinpos, &exc, &s,
3418                     &v, &outpos, &p)) {
3419                 goto onError;
3420             }
3421         }
3422         else {
3423             p++;
3424             s += Py_UNICODE_SIZE;
3425         }
3426     }
3427
3428     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3429         goto onError;
3430     Py_XDECREF(errorHandler);
3431     Py_XDECREF(exc);
3432     return (PyObject *)v;
3433
3434   onError:
3435     Py_XDECREF(v);
3436     Py_XDECREF(errorHandler);
3437     Py_XDECREF(exc);
3438     return NULL;
3439 }
3440
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3442
3443 PyObject *PyUnicode_DecodeLatin1(const char *s,
3444                                  Py_ssize_t size,
3445                                  const char *errors)
3446 {
3447     PyUnicodeObject *v;
3448     Py_UNICODE *p;
3449
3450     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3451     if (size == 1) {
3452         Py_UNICODE r = *(unsigned char*)s;
3453         return PyUnicode_FromUnicode(&r, 1);
3454     }
3455
3456     v = _PyUnicode_New(size);
3457     if (v == NULL)
3458         goto onError;
3459     if (size == 0)
3460         return (PyObject *)v;
3461     p = PyUnicode_AS_UNICODE(v);
3462     while (size-- > 0)
3463         *p++ = (unsigned char)*s++;
3464     return (PyObject *)v;
3465
3466   onError:
3467     Py_XDECREF(v);
3468     return NULL;
3469 }
3470
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject **exceptionObject,
3473                                   const char *encoding,
3474                                   const Py_UNICODE *unicode, Py_ssize_t size,
3475                                   Py_ssize_t startpos, Py_ssize_t endpos,
3476                                   const char *reason)
3477 {
3478     if (*exceptionObject == NULL) {
3479         *exceptionObject = PyUnicodeEncodeError_Create(
3480             encoding, unicode, size, startpos, endpos, reason);
3481     }
3482     else {
3483         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3484             goto onError;
3485         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3486             goto onError;
3487         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3488             goto onError;
3489         return;
3490       onError:
3491         Py_DECREF(*exceptionObject);
3492         *exceptionObject = NULL;
3493     }
3494 }
3495
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject **exceptionObject,
3498                                    const char *encoding,
3499                                    const Py_UNICODE *unicode, Py_ssize_t size,
3500                                    Py_ssize_t startpos, Py_ssize_t endpos,
3501                                    const char *reason)
3502 {
3503     make_encode_exception(exceptionObject,
3504                           encoding, unicode, size, startpos, endpos, reason);
3505     if (*exceptionObject != NULL)
3506         PyCodec_StrictErrors(*exceptionObject);
3507 }
3508
3509 /* error handling callback helper:
3510    build arguments, call the callback and check the arguments,
3511    put the result into newpos and return the replacement string, which
3512    has to be freed by the caller */
3513 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3514                                                   PyObject **errorHandler,
3515                                                   const char *encoding, const char *reason,
3516                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3517                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3518                                                   Py_ssize_t *newpos)
3519 {
3520     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3521
3522     PyObject *restuple;
3523     PyObject *resunicode;
3524
3525     if (*errorHandler == NULL) {
3526         *errorHandler = PyCodec_LookupError(errors);
3527         if (*errorHandler == NULL)
3528             return NULL;
3529     }
3530
3531     make_encode_exception(exceptionObject,
3532                           encoding, unicode, size, startpos, endpos, reason);
3533     if (*exceptionObject == NULL)
3534         return NULL;
3535
3536     restuple = PyObject_CallFunctionObjArgs(
3537         *errorHandler, *exceptionObject, NULL);
3538     if (restuple == NULL)
3539         return NULL;
3540     if (!PyTuple_Check(restuple)) {
3541         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3542         Py_DECREF(restuple);
3543         return NULL;
3544     }
3545     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3546                           &resunicode, newpos)) {
3547         Py_DECREF(restuple);
3548         return NULL;
3549     }
3550     if (*newpos<0)
3551         *newpos = size+*newpos;
3552     if (*newpos<0 || *newpos>size) {
3553         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3554         Py_DECREF(restuple);
3555         return NULL;
3556     }
3557     Py_INCREF(resunicode);
3558     Py_DECREF(restuple);
3559     return resunicode;
3560 }
3561
3562 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3563                                      Py_ssize_t size,
3564                                      const char *errors,
3565                                      int limit)
3566 {
3567     /* output object */
3568     PyObject *res;
3569     /* pointers to the beginning and end+1 of input */
3570     const Py_UNICODE *startp = p;
3571     const Py_UNICODE *endp = p + size;
3572     /* pointer to the beginning of the unencodable characters */
3573     /* const Py_UNICODE *badp = NULL; */
3574     /* pointer into the output */
3575     char *str;
3576     /* current output position */
3577     Py_ssize_t respos = 0;
3578     Py_ssize_t ressize;
3579     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3580     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581     PyObject *errorHandler = NULL;
3582     PyObject *exc = NULL;
3583     /* the following variable is used for caching string comparisons
3584      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585     int known_errorHandler = -1;
3586
3587     /* allocate enough for a simple encoding without
3588        replacements, if we need more, we'll resize */
3589     res = PyString_FromStringAndSize(NULL, size);
3590     if (res == NULL)
3591         goto onError;
3592     if (size == 0)
3593         return res;
3594     str = PyString_AS_STRING(res);
3595     ressize = size;
3596
3597     while (p<endp) {
3598         Py_UNICODE c = *p;
3599
3600         /* can we encode this? */
3601         if (c<limit) {
3602             /* no overflow check, because we know that the space is enough */
3603             *str++ = (char)c;
3604             ++p;
3605         }
3606         else {
3607             Py_ssize_t unicodepos = p-startp;
3608             Py_ssize_t requiredsize;
3609             PyObject *repunicode;
3610             Py_ssize_t repsize;
3611             Py_ssize_t newpos;
3612             Py_ssize_t respos;
3613             Py_UNICODE *uni2;
3614             /* startpos for collecting unencodable chars */
3615             const Py_UNICODE *collstart = p;
3616             const Py_UNICODE *collend = p;
3617             /* find all unecodable characters */
3618             while ((collend < endp) && ((*collend)>=limit))
3619                 ++collend;
3620             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621             if (known_errorHandler==-1) {
3622                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3623                     known_errorHandler = 1;
3624                 else if (!strcmp(errors, "replace"))
3625                     known_errorHandler = 2;
3626                 else if (!strcmp(errors, "ignore"))
3627                     known_errorHandler = 3;
3628                 else if (!strcmp(errors, "xmlcharrefreplace"))
3629                     known_errorHandler = 4;
3630                 else
3631                     known_errorHandler = 0;
3632             }
3633             switch (known_errorHandler) {
3634             case 1: /* strict */
3635                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3636                 goto onError;
3637             case 2: /* replace */
3638                 while (collstart++<collend)
3639                     *str++ = '?'; /* fall through */
3640             case 3: /* ignore */
3641                 p = collend;
3642                 break;
3643             case 4: /* xmlcharrefreplace */
3644                 respos = str-PyString_AS_STRING(res);
3645                 /* determine replacement size (temporarily (mis)uses p) */
3646                 for (p = collstart, repsize = 0; p < collend; ++p) {
3647                     if (*p<10)
3648                         repsize += 2+1+1;
3649                     else if (*p<100)
3650                         repsize += 2+2+1;
3651                     else if (*p<1000)
3652                         repsize += 2+3+1;
3653                     else if (*p<10000)
3654                         repsize += 2+4+1;
3655 #ifndef Py_UNICODE_WIDE
3656                     else
3657                         repsize += 2+5+1;
3658 #else
3659                     else if (*p<100000)
3660                         repsize += 2+5+1;
3661                     else if (*p<1000000)
3662                         repsize += 2+6+1;
3663                     else
3664                         repsize += 2+7+1;
3665 #endif
3666                 }
3667                 requiredsize = respos+repsize+(endp-collend);
3668                 if (requiredsize > ressize) {
3669                     if (requiredsize<2*ressize)
3670                         requiredsize = 2*ressize;
3671                     if (_PyString_Resize(&res, requiredsize))
3672                         goto onError;
3673                     str = PyString_AS_STRING(res) + respos;
3674                     ressize = requiredsize;
3675                 }
3676                 /* generate replacement (temporarily (mis)uses p) */
3677                 for (p = collstart; p < collend; ++p) {
3678                     str += sprintf(str, "&#%d;", (int)*p);
3679                 }
3680                 p = collend;
3681                 break;
3682             default:
3683                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3684                                                               encoding, reason, startp, size, &exc,
3685                                                               collstart-startp, collend-startp, &newpos);
3686                 if (repunicode == NULL)
3687                     goto onError;
3688                 /* need more space? (at least enough for what we have+the
3689                    replacement+the rest of the string, so we won't have to
3690                    check space for encodable characters) */
3691                 respos = str-PyString_AS_STRING(res);
3692                 repsize = PyUnicode_GET_SIZE(repunicode);
3693                 requiredsize = respos+repsize+(endp-collend);
3694                 if (requiredsize > ressize) {
3695                     if (requiredsize<2*ressize)
3696                         requiredsize = 2*ressize;
3697                     if (_PyString_Resize(&res, requiredsize)) {
3698                         Py_DECREF(repunicode);
3699                         goto onError;
3700                     }
3701                     str = PyString_AS_STRING(res) + respos;
3702                     ressize = requiredsize;
3703                 }
3704                 /* check if there is anything unencodable in the replacement
3705                    and copy it to the output */
3706                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3707                     c = *uni2;
3708                     if (c >= limit) {
3709                         raise_encode_exception(&exc, encoding, startp, size,
3710                                                unicodepos, unicodepos+1, reason);
3711                         Py_DECREF(repunicode);
3712                         goto onError;
3713                     }
3714                     *str = (char)c;
3715                 }
3716                 p = startp + newpos;
3717                 Py_DECREF(repunicode);
3718             }
3719         }
3720     }
3721     /* Resize if we allocated to much */
3722     respos = str-PyString_AS_STRING(res);
3723     if (respos<ressize)
3724         /* If this falls res will be NULL */
3725         _PyString_Resize(&res, respos);
3726     Py_XDECREF(errorHandler);
3727     Py_XDECREF(exc);
3728     return res;
3729
3730   onError:
3731     Py_XDECREF(res);
3732     Py_XDECREF(errorHandler);
3733     Py_XDECREF(exc);
3734     return NULL;
3735 }
3736
3737 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3738                                  Py_ssize_t size,
3739                                  const char *errors)
3740 {
3741     return unicode_encode_ucs1(p, size, errors, 256);
3742 }
3743
3744 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3745 {
3746     if (!PyUnicode_Check(unicode)) {
3747         PyErr_BadArgument();
3748         return NULL;
3749     }
3750     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3751                                   PyUnicode_GET_SIZE(unicode),
3752                                   NULL);
3753 }
3754
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3756
3757 PyObject *PyUnicode_DecodeASCII(const char *s,
3758                                 Py_ssize_t size,
3759                                 const char *errors)
3760 {
3761     const char *starts = s;
3762     PyUnicodeObject *v;
3763     Py_UNICODE *p;
3764     Py_ssize_t startinpos;
3765     Py_ssize_t endinpos;
3766     Py_ssize_t outpos;
3767     const char *e;
3768     PyObject *errorHandler = NULL;
3769     PyObject *exc = NULL;
3770
3771     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772     if (size == 1 && *(unsigned char*)s < 128) {
3773         Py_UNICODE r = *(unsigned char*)s;
3774         return PyUnicode_FromUnicode(&r, 1);
3775     }
3776
3777     v = _PyUnicode_New(size);
3778     if (v == NULL)
3779         goto onError;
3780     if (size == 0)
3781         return (PyObject *)v;
3782     p = PyUnicode_AS_UNICODE(v);
3783     e = s + size;
3784     while (s < e) {
3785         register unsigned char c = (unsigned char)*s;
3786         if (c < 128) {
3787             *p++ = c;
3788             ++s;
3789         }
3790         else {
3791             startinpos = s-starts;
3792             endinpos = startinpos + 1;
3793             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3794             if (unicode_decode_call_errorhandler(
3795                     errors, &errorHandler,
3796                     "ascii", "ordinal not in range(128)",
3797                     starts, size, &startinpos, &endinpos, &exc, &s,
3798                     &v, &outpos, &p))
3799                 goto onError;
3800         }
3801     }
3802     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3803         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3804             goto onError;
3805     Py_XDECREF(errorHandler);
3806     Py_XDECREF(exc);
3807     return (PyObject *)v;
3808
3809   onError:
3810     Py_XDECREF(v);
3811     Py_XDECREF(errorHandler);
3812     Py_XDECREF(exc);
3813     return NULL;
3814 }
3815
3816 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3817                                 Py_ssize_t size,
3818                                 const char *errors)
3819 {
3820     return unicode_encode_ucs1(p, size, errors, 128);
3821 }
3822
3823 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3824 {
3825     if (!PyUnicode_Check(unicode)) {
3826         PyErr_BadArgument();
3827         return NULL;
3828     }
3829     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3830                                  PyUnicode_GET_SIZE(unicode),
3831                                  NULL);
3832 }
3833
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3835
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
3837
3838 #if SIZEOF_INT < SIZEOF_SIZE_T
3839 #define NEED_RETRY
3840 #endif
3841
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843    a) it assumes an incomplete character consists of a single byte, and
3844    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845    encodings, see IsDBCSLeadByteEx documentation. */
3846
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character, 0 otherwise.  A byte that merely looks like a lead byte but
   is the trail byte of the preceding character is not counted.
   NOTE(review): relies on CharPrev() returning its second argument when
   there is no previous character -- confirm against the Win32 docs. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);
    if (before == cur)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (cur - before == 2);
}
3857
3858 /*
3859  * Decode MBCS string into unicode object. If 'final' is set, converts
3860  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3861  */
3862 static int decode_mbcs(PyUnicodeObject **v,
3863                        const char *s, /* MBCS string */
3864                        int size, /* sizeof MBCS string */
3865                        int final)
3866 {
3867     Py_UNICODE *p;
3868     Py_ssize_t n = 0;
3869     int usize = 0;
3870
3871     assert(size >= 0);
3872
3873     /* Skip trailing lead-byte unless 'final' is set */
3874     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3875         --size;
3876
3877     /* First get the size of the result */
3878     if (size > 0) {
3879         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3880         if (usize == 0) {
3881             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882             return -1;
3883         }
3884     }
3885
3886     if (*v == NULL) {
3887         /* Create unicode object */
3888         *v = _PyUnicode_New(usize);
3889         if (*v == NULL)
3890             return -1;
3891     }
3892     else {
3893         /* Extend unicode object */
3894         n = PyUnicode_GET_SIZE(*v);
3895         if (_PyUnicode_Resize(v, n + usize) < 0)
3896             return -1;
3897     }
3898
3899     /* Do the conversion */
3900     if (size > 0) {
3901         p = PyUnicode_AS_UNICODE(*v) + n;
3902         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3903             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3904             return -1;
3905         }
3906     }
3907
3908     return size;
3909 }
3910
/* Decode the MBCS byte string s[0..size-1] into a new unicode object.

   If `consumed` is NULL the input is treated as final.  Otherwise a
   trailing lead byte is left undecoded and *consumed receives the number
   of bytes actually decoded.  The `errors` argument is not used by this
   function.

   Returns a new unicode object, or NULL on failure. */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

    /* decode_mbcs() takes an int size.  When Py_ssize_t is wider than int
       (NEED_RETRY) the input is fed in chunks of at most INT_MAX bytes;
       every chunk but the last is decoded as non-final so that a split
       character is carried over to the next chunk. */
#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        /* advance by what was really consumed, which may be less than
           INT_MAX if a trailing lead byte was held back */
        s += done;
        size -= done;
        goto retry;
    }
#endif

    return (PyObject *)v;
}
3948
3949 PyObject *PyUnicode_DecodeMBCS(const char *s,
3950                                Py_ssize_t size,
3951                                const char *errors)
3952 {
3953     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3954 }
3955
3956 /*
3957  * Convert unicode into string object (MBCS).
3958  * Returns 0 if succeed, -1 otherwise.
3959  */
3960 static int encode_mbcs(PyObject **repr,
3961                        const Py_UNICODE *p, /* unicode */
3962                        int size) /* size of unicode */
3963 {
3964     int mbcssize = 0;
3965     Py_ssize_t n = 0;
3966
3967     assert(size >= 0);
3968
3969     /* First get the size of the result */
3970     if (size > 0) {
3971         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3972         if (mbcssize == 0) {
3973             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3974             return -1;
3975         }
3976     }
3977
3978     if (*repr == NULL) {
3979         /* Create string object */
3980         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3981         if (*repr == NULL)
3982             return -1;
3983     }
3984     else {
3985         /* Extend string object */
3986         n = PyString_Size(*repr);
3987         if (_PyString_Resize(repr, n + mbcssize) < 0)
3988             return -1;
3989     }
3990
3991     /* Do the conversion */
3992     if (size > 0) {
3993         char *s = PyString_AS_STRING(*repr) + n;
3994         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3995             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3996             return -1;
3997         }
3998     }
3999
4000     return 0;
4001 }
4002
/* Encode the unicode buffer p[0..size-1] as an MBCS byte string.
   The `errors` argument is not used by this function.
   Returns a new string object, or NULL on failure. */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                               Py_ssize_t size,
                               const char *errors)
{
    PyObject *repr = NULL;
    int ret;

    /* encode_mbcs() takes an int size.  When Py_ssize_t is wider than int
       (NEED_RETRY) the input is encoded in chunks of INT_MAX characters,
       each chunk being appended to repr.
       NOTE(review): chunks are cut at a fixed offset, so a chunk boundary
       could fall inside a surrogate pair -- confirm whether that matters. */
#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}
4033
4034 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4035 {
4036     if (!PyUnicode_Check(unicode)) {
4037         PyErr_BadArgument();
4038         return NULL;
4039     }
4040     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4041                                 PyUnicode_GET_SIZE(unicode),
4042                                 NULL);
4043 }
4044
4045 #undef NEED_RETRY
4046
4047 #endif /* MS_WINDOWS */
4048
4049 /* --- Character Mapping Codec -------------------------------------------- */
4050
/* Decode the `size` bytes at `s` through a character mapping.

   mapping == NULL        decode as Latin-1.
   exact unicode object   used as a lookup table indexed by byte value;
                          bytes beyond its length, or mapped to 0xfffe,
                          are undefined.
   any other object       looked up with mapping[byte]; the item may be an
                          integer in range(65536), a unicode string of any
                          length (empty means "drop the byte"), or None
                          (undefined).  A LookupError counts as None.

   Undefined bytes are passed to unicode_decode_call_errorhandler() with
   the `errors` policy.  Returns a new unicode object, or NULL on failure. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots already allocated for 1-n mappings */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a plain decoding table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                /* s and p were passed by address to the handler call;
                   do not advance them again here */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s and p were passed by address to the handler call;
                   skip the ++s at the bottom of the loop */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        /* remember the offset: the resize may move the buffer */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* over-allocate by four times the replacement
                           length on top of the current shortfall */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim the result to the number of characters actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4211
4212 /* Charmap encoding: the lookup table */
4213
4214 struct encoding_map{
4215     PyObject_HEAD
4216     unsigned char level1[32];
4217     int count2, count3;
4218     unsigned char level23[1];
4219 };
4220
4221 static PyObject*
4222 encoding_map_size(PyObject *obj, PyObject* args)
4223 {
4224     struct encoding_map *map = (struct encoding_map*)obj;
4225     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4226                           128*map->count3);
4227 }
4228
4229 static PyMethodDef encoding_map_methods[] = {
4230     {"size", encoding_map_size, METH_NOARGS,
4231      PyDoc_STR("Return the size (in bytes) of this object") },
4232     { 0 }
4233 };
4234
4235 static void
4236 encoding_map_dealloc(PyObject* o)
4237 {
4238     PyObject_FREE(o);
4239 }
4240
/* Type object for the trie built by PyUnicode_BuildEncodingMap().  The
   only slots filled in are the deallocator and the method table (which
   provides "size"); tp_new is left empty. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4284
4285 PyObject*
4286 PyUnicode_BuildEncodingMap(PyObject* string)
4287 {
4288     Py_UNICODE *decode;
4289     PyObject *result;
4290     struct encoding_map *mresult;
4291     int i;
4292     int need_dict = 0;
4293     unsigned char level1[32];
4294     unsigned char level2[512];
4295     unsigned char *mlevel1, *mlevel2, *mlevel3;
4296     int count2 = 0, count3 = 0;
4297
4298     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4299         PyErr_BadArgument();
4300         return NULL;
4301     }
4302     decode = PyUnicode_AS_UNICODE(string);
4303     memset(level1, 0xFF, sizeof level1);
4304     memset(level2, 0xFF, sizeof level2);
4305
4306     /* If there isn't a one-to-one mapping of NULL to \0,
4307        or if there are non-BMP characters, we need to use
4308        a mapping dictionary. */
4309     if (decode[0] != 0)
4310         need_dict = 1;
4311     for (i = 1; i < 256; i++) {
4312         int l1, l2;
4313         if (decode[i] == 0
4314 #ifdef Py_UNICODE_WIDE
4315             || decode[i] > 0xFFFF
4316 #endif
4317             ) {
4318             need_dict = 1;
4319             break;
4320         }
4321         if (decode[i] == 0xFFFE)
4322             /* unmapped character */
4323             continue;
4324         l1 = decode[i] >> 11;
4325         l2 = decode[i] >> 7;
4326         if (level1[l1] == 0xFF)
4327             level1[l1] = count2++;
4328         if (level2[l2] == 0xFF)
4329             level2[l2] = count3++;
4330     }
4331
4332     if (count2 >= 0xFF || count3 >= 0xFF)
4333         need_dict = 1;
4334
4335     if (need_dict) {
4336         PyObject *result = PyDict_New();
4337         PyObject *key, *value;
4338         if (!result)
4339             return NULL;
4340         for (i = 0; i < 256; i++) {
4341             key = value = NULL;
4342             key = PyInt_FromLong(decode[i]);
4343             value = PyInt_FromLong(i);
4344             if (!key || !value)
4345                 goto failed1;
4346             if (PyDict_SetItem(result, key, value) == -1)
4347                 goto failed1;
4348             Py_DECREF(key);
4349             Py_DECREF(value);
4350         }
4351         return result;
4352       failed1:
4353         Py_XDECREF(key);
4354         Py_XDECREF(value);
4355         Py_DECREF(result);
4356         return NULL;
4357     }
4358
4359     /* Create a three-level trie */
4360     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4361                              16*count2 + 128*count3 - 1);
4362     if (!result)
4363         return PyErr_NoMemory();
4364     PyObject_Init(result, &EncodingMapType);
4365     mresult = (struct encoding_map*)result;
4366     mresult->count2 = count2;
4367     mresult->count3 = count3;
4368     mlevel1 = mresult->level1;
4369     mlevel2 = mresult->level23;
4370     mlevel3 = mresult->level23 + 16*count2;
4371     memcpy(mlevel1, level1, 32);
4372     memset(mlevel2, 0xFF, 16*count2);
4373     memset(mlevel3, 0, 128*count3);
4374     count3 = 0;
4375     for (i = 1; i < 256; i++) {
4376         int o1, o2, o3, i2, i3;
4377         if (decode[i] == 0xFFFE)
4378             /* unmapped character */
4379             continue;
4380         o1 = decode[i]>>11;
4381         o2 = (decode[i]>>7) & 0xF;
4382         i2 = 16*mlevel1[o1] + o2;
4383         if (mlevel2[i2] == 0xFF)
4384             mlevel2[i2] = count3++;
4385         o3 = decode[i] & 0x7F;
4386         i3 = 128*mlevel2[i2] + o3;
4387         mlevel3[i3] = i;
4388     }
4389     return result;
4390 }
4391
4392 static int
4393 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4394 {
4395     struct encoding_map *map = (struct encoding_map*)mapping;
4396     int l1 = c>>11;
4397     int l2 = (c>>7) & 0xF;
4398     int l3 = c & 0x7F;
4399     int i;
4400
4401 #ifdef Py_UNICODE_WIDE
4402     if (c > 0xFFFF) {
4403         return -1;
4404     }
4405 #endif
4406     if (c == 0)
4407         return 0;
4408     /* level 1*/
4409     i = map->level1[l1];
4410     if (i == 0xFF) {
4411         return -1;
4412     }
4413     /* level 2*/
4414     i = map->level23[16*i+l2];
4415     if (i == 0xFF) {
4416         return -1;
4417     }
4418     /* level 3 */
4419     i = map->level23[16*map->count2 + 128*i + l3];
4420     if (i == 0) {
4421         return -1;
4422     }
4423     return i;
4424 }
4425
4426 /* Lookup the character ch in the mapping. If the character
4427    can't be found, Py_None is returned (or NULL, if another
4428    error occurred). */
4429 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4430 {
4431     PyObject *w = PyInt_FromLong((long)c);
4432     PyObject *x;
4433
4434     if (w == NULL)
4435         return NULL;
4436     x = PyObject_GetItem(mapping, w);
4437     Py_DECREF(w);
4438     if (x == NULL) {
4439         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4440             /* No mapping found means: mapping is undefined. */
4441             PyErr_Clear();
4442             x = Py_None;
4443             Py_INCREF(x);
4444             return x;
4445         } else
4446             return NULL;
4447     }
4448     else if (x == Py_None)
4449         return x;
4450     else if (PyInt_Check(x)) {
4451         long value = PyInt_AS_LONG(x);
4452         if (value < 0 || value > 255) {
4453             PyErr_SetString(PyExc_TypeError,
4454                             "character mapping must be in range(256)");
4455             Py_DECREF(x);
4456             return NULL;
4457         }
4458         return x;
4459     }
4460     else if (PyString_Check(x))
4461         return x;
4462     else {
4463         /* wrong return value */
4464         PyErr_SetString(PyExc_TypeError,
4465                         "character mapping must return integer, None or str");
4466         Py_DECREF(x);
4467         return NULL;
4468     }
4469 }
4470
4471 static int
4472 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4473 {
4474     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4475     /* exponentially overallocate to minimize reallocations */
4476     if (requiredsize < 2*outsize)
4477         requiredsize = 2*outsize;
4478     if (_PyString_Resize(outobj, requiredsize)) {
4479         return 0;
4480     }
4481     return 1;
4482 }
4483
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
/* Look up the character, put the result in the output string and adjust
   various state variables. Reallocate the output string if not enough
   space is available. Return enc_SUCCESS if the character was written,
   enc_FAILED if the mapping was undefined (in which case no character
   was written), or enc_EXCEPTION if the lookup or a reallocation
   raised an exception. */
4493 static
4494 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4495                                           PyObject **outobj, Py_ssize_t *outpos)
4496 {
4497     PyObject *rep;
4498     char *outstart;
4499     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4500
4501     if (Py_TYPE(mapping) == &EncodingMapType) {
4502         int res = encoding_map_lookup(c, mapping);
4503         Py_ssize_t requiredsize = *outpos+1;
4504         if (res == -1)
4505             return enc_FAILED;
4506         if (outsize<requiredsize)
4507             if (!charmapencode_resize(outobj, outpos, requiredsize))
4508                 return enc_EXCEPTION;
4509         outstart = PyString_AS_STRING(*outobj);
4510         outstart[(*outpos)++] = (char)res;
4511         return enc_SUCCESS;
4512     }
4513
4514     rep = charmapencode_lookup(c, mapping);
4515     if (rep==NULL)
4516         return enc_EXCEPTION;
4517     else if (rep==Py_None) {
4518         Py_DECREF(rep);
4519         return enc_FAILED;
4520     } else {
4521         if (PyInt_Check(rep)) {
4522             Py_ssize_t requiredsize = *outpos+1;
4523             if (outsize<requiredsize)
4524                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4525                     Py_DECREF(rep);
4526                     return enc_EXCEPTION;
4527                 }
4528             outstart = PyString_AS_STRING(*outobj);
4529             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4530         }
4531         else {
4532             const char *repchars = PyString_AS_STRING(rep);
4533             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4534             Py_ssize_t requiredsize = *outpos+repsize;
4535             if (outsize<requiredsize)
4536                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4537                     Py_DECREF(rep);
4538                     return enc_EXCEPTION;
4539                 }
4540             outstart = PyString_AS_STRING(*outobj);
4541             memcpy(outstart + *outpos, repchars, repsize);
4542             *outpos += repsize;
4543         }
4544     }
4545     Py_DECREF(rep);
4546     return enc_SUCCESS;
4547 }
4548
4549 /* handle an error in PyUnicode_EncodeCharmap
4550    Return 0 on success, -1 on error */
4551 static
4552 int charmap_encoding_error(
4553     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4554     PyObject **exceptionObject,
4555     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4556     PyObject **res, Py_ssize_t *respos)
4557 {
4558     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4559     Py_ssize_t repsize;
4560     Py_ssize_t newpos;
4561     Py_UNICODE *uni2;
4562     /* startpos for collecting unencodable chars */
4563     Py_ssize_t collstartpos = *inpos;
4564     Py_ssize_t collendpos = *inpos+1;
4565     Py_ssize_t collpos;
4566     char *encoding = "charmap";
4567     char *reason = "character maps to <undefined>";
4568     charmapencode_result x;
4569
4570     /* find all unencodable characters */
4571     while (collendpos < size) {
4572         PyObject *rep;
4573         if (Py_TYPE(mapping) == &EncodingMapType) {
4574             int res = encoding_map_lookup(p[collendpos], mapping);
4575             if (res != -1)
4576                 break;
4577             ++collendpos;
4578             continue;
4579         }
4580
4581         rep = charmapencode_lookup(p[collendpos], mapping);
4582         if (rep==NULL)
4583             return -1;
4584         else if (rep!=Py_None) {
4585             Py_DECREF(rep);
4586             break;
4587         }
4588         Py_DECREF(rep);
4589         ++collendpos;
4590     }
4591     /* cache callback name lookup
4592      * (if not done yet, i.e. it's the first error) */
4593     if (*known_errorHandler==-1) {
4594         if ((errors==NULL) || (!strcmp(errors, "strict")))
4595             *known_errorHandler = 1;
4596         else if (!strcmp(errors, "replace"))
4597             *known_errorHandler = 2;
4598         else if (!strcmp(errors, "ignore"))
4599             *known_errorHandler = 3;
4600         else if (!strcmp(errors, "xmlcharrefreplace"))
4601             *known_errorHandler = 4;
4602         else
4603             *known_errorHandler = 0;
4604     }
4605     switch (*known_errorHandler) {
4606     case 1: /* strict */
4607         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4608         return -1;
4609     case 2: /* replace */
4610         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4611             x = charmapencode_output('?', mapping, res, respos);
4612             if (x==enc_EXCEPTION) {
4613                 return -1;
4614             }
4615             else if (x==enc_FAILED) {
4616                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617                 return -1;
4618             }
4619         }
4620         /* fall through */
4621     case 3: /* ignore */
4622         *inpos = collendpos;
4623         break;
4624     case 4: /* xmlcharrefreplace */
4625         /* generate replacement (temporarily (mis)uses p) */
4626         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4627             char buffer[2+29+1+1];
4628             char *cp;
4629             sprintf(buffer, "&#%d;", (int)p[collpos]);
4630             for (cp = buffer; *cp; ++cp) {
4631                 x = charmapencode_output(*cp, mapping, res, respos);
4632                 if (x==enc_EXCEPTION)
4633                     return -1;
4634                 else if (x==enc_FAILED) {
4635                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4636                     return -1;
4637                 }
4638             }
4639         }
4640         *inpos = collendpos;
4641         break;
4642     default:
4643         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4644                                                       encoding, reason, p, size, exceptionObject,
4645                                                       collstartpos, collendpos, &newpos);
4646         if (repunicode == NULL)
4647             return -1;
4648         /* generate replacement  */
4649         repsize = PyUnicode_GET_SIZE(repunicode);
4650         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4651             x = charmapencode_output(*uni2, mapping, res, respos);
4652             if (x==enc_EXCEPTION) {
4653                 return -1;
4654             }
4655             else if (x==enc_FAILED) {
4656                 Py_DECREF(repunicode);
4657                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658                 return -1;
4659             }
4660         }
4661         *inpos = newpos;
4662         Py_DECREF(repunicode);
4663     }
4664     return 0;
4665 }
4666
4667 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4668                                   Py_ssize_t size,
4669                                   PyObject *mapping,
4670                                   const char *errors)
4671 {
4672     /* output object */
4673     PyObject *res = NULL;
4674     /* current input position */
4675     Py_ssize_t inpos = 0;
4676     /* current output position */
4677     Py_ssize_t respos = 0;
4678     PyObject *errorHandler = NULL;
4679     PyObject *exc = NULL;
4680     /* the following variable is used for caching string comparisons
4681      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682      * 3=ignore, 4=xmlcharrefreplace */
4683     int known_errorHandler = -1;
4684
4685     /* Default to Latin-1 */
4686     if (mapping == NULL)
4687         return PyUnicode_EncodeLatin1(p, size, errors);
4688
4689     /* allocate enough for a simple encoding without
4690        replacements, if we need more, we'll resize */
4691     res = PyString_FromStringAndSize(NULL, size);
4692     if (res == NULL)
4693         goto onError;
4694     if (size == 0)
4695         return res;
4696
4697     while (inpos<size) {
4698         /* try to encode it */
4699         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4700         if (x==enc_EXCEPTION) /* error */
4701             goto onError;
4702         if (x==enc_FAILED) { /* unencodable character */
4703             if (charmap_encoding_error(p, size, &inpos, mapping,
4704                                        &exc,
4705                                        &known_errorHandler, &errorHandler, errors,
4706                                        &res, &respos)) {
4707                 goto onError;
4708             }
4709         }
4710         else
4711             /* done with this character => adjust input position */
4712             ++inpos;
4713     }
4714
4715     /* Resize if we allocated to much */
4716     if (respos<PyString_GET_SIZE(res)) {
4717         if (_PyString_Resize(&res, respos))
4718             goto onError;
4719     }
4720     Py_XDECREF(exc);
4721     Py_XDECREF(errorHandler);
4722     return res;
4723
4724   onError:
4725     Py_XDECREF(res);
4726     Py_XDECREF(exc);
4727     Py_XDECREF(errorHandler);
4728     return NULL;
4729 }
4730
4731 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4732                                     PyObject *mapping)
4733 {
4734     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4735         PyErr_BadArgument();
4736         return NULL;
4737     }
4738     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4739                                    PyUnicode_GET_SIZE(unicode),
4740                                    mapping,
4741                                    NULL);
4742 }
4743
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject **exceptionObject,
4746                                      const Py_UNICODE *unicode, Py_ssize_t size,
4747                                      Py_ssize_t startpos, Py_ssize_t endpos,
4748                                      const char *reason)
4749 {
4750     if (*exceptionObject == NULL) {
4751         *exceptionObject = PyUnicodeTranslateError_Create(
4752             unicode, size, startpos, endpos, reason);
4753     }
4754     else {
4755         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4756             goto onError;
4757         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4758             goto onError;
4759         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4760             goto onError;
4761         return;
4762       onError:
4763         Py_DECREF(*exceptionObject);
4764         *exceptionObject = NULL;
4765     }
4766 }
4767
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject **exceptionObject,
4770                                       const Py_UNICODE *unicode, Py_ssize_t size,
4771                                       Py_ssize_t startpos, Py_ssize_t endpos,
4772                                       const char *reason)
4773 {
4774     make_translate_exception(exceptionObject,
4775                              unicode, size, startpos, endpos, reason);
4776     if (*exceptionObject != NULL)
4777         PyCodec_StrictErrors(*exceptionObject);
4778 }
4779
4780 /* error handling callback helper:
4781    build arguments, call the callback and check the arguments,
4782    put the result into newpos and return the replacement string, which
4783    has to be freed by the caller */
4784 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4785                                                      PyObject **errorHandler,
4786                                                      const char *reason,
4787                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4788                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4789                                                      Py_ssize_t *newpos)
4790 {
4791     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4792
4793     Py_ssize_t i_newpos;
4794     PyObject *restuple;
4795     PyObject *resunicode;
4796
4797     if (*errorHandler == NULL) {
4798         *errorHandler = PyCodec_LookupError(errors);
4799         if (*errorHandler == NULL)
4800             return NULL;
4801     }
4802
4803     make_translate_exception(exceptionObject,
4804                              unicode, size, startpos, endpos, reason);
4805     if (*exceptionObject == NULL)
4806         return NULL;
4807
4808     restuple = PyObject_CallFunctionObjArgs(
4809         *errorHandler, *exceptionObject, NULL);
4810     if (restuple == NULL)
4811         return NULL;
4812     if (!PyTuple_Check(restuple)) {
4813         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4814         Py_DECREF(restuple);
4815         return NULL;
4816     }
4817     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4818                           &resunicode, &i_newpos)) {
4819         Py_DECREF(restuple);
4820         return NULL;
4821     }
4822     if (i_newpos<0)
4823         *newpos = size+i_newpos;
4824     else
4825         *newpos = i_newpos;
4826     if (*newpos<0 || *newpos>size) {
4827         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4828         Py_DECREF(restuple);
4829         return NULL;
4830     }
4831     Py_INCREF(resunicode);
4832     Py_DECREF(restuple);
4833     return resunicode;
4834 }
4835
4836 /* Lookup the character ch in the mapping and put the result in result,
4837    which must be decrefed by the caller.
4838    Return 0 on success, -1 on error */
4839 static
4840 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4841 {
4842     PyObject *w = PyInt_FromLong((long)c);
4843     PyObject *x;
4844
4845     if (w == NULL)
4846         return -1;
4847     x = PyObject_GetItem(mapping, w);
4848     Py_DECREF(w);
4849     if (x == NULL) {
4850         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4851             /* No mapping found means: use 1:1 mapping. */
4852             PyErr_Clear();
4853             *result = NULL;
4854             return 0;
4855         } else
4856             return -1;
4857     }
4858     else if (x == Py_None) {
4859         *result = x;
4860         return 0;
4861     }
4862     else if (PyInt_Check(x)) {
4863         long value = PyInt_AS_LONG(x);
4864         long max = PyUnicode_GetMax();
4865         if (value < 0 || value > max) {
4866             PyErr_Format(PyExc_TypeError,
4867                          "character mapping must be in range(0x%lx)", max+1);
4868             Py_DECREF(x);
4869             return -1;
4870         }
4871         *result = x;
4872         return 0;
4873     }
4874     else if (PyUnicode_Check(x)) {
4875         *result = x;
4876         return 0;
4877     }
4878     else {
4879         /* wrong return value */
4880         PyErr_SetString(PyExc_TypeError,
4881                         "character mapping must return integer, None or unicode");
4882         Py_DECREF(x);
4883         return -1;
4884     }
4885 }
4886 /* ensure that *outobj is at least requiredsize characters long,
4887    if not reallocate and adjust various state variables.
4888    Return 0 on success, -1 on error */
4889 static
4890 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4891                                Py_ssize_t requiredsize)
4892 {
4893     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4894     if (requiredsize > oldsize) {
4895         /* remember old output position */
4896         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4897         /* exponentially overallocate to minimize reallocations */
4898         if (requiredsize < 2 * oldsize)
4899             requiredsize = 2 * oldsize;
4900         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4901             return -1;
4902         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4903     }
4904     return 0;
4905 }
4906 /* lookup the character, put the result in the output string and adjust
4907    various state variables. Return a new reference to the object that
4908    was put in the output buffer in *result, or Py_None, if the mapping was
4909    undefined (in which case no character was written).
4910    The called must decref result.
4911    Return 0 on success, -1 on error. */
4912 static
4913 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4914                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4915                             PyObject **res)
4916 {
4917     if (charmaptranslate_lookup(*curinp, mapping, res))
4918         return -1;
4919     if (*res==NULL) {
4920         /* not found => default to 1:1 mapping */
4921         *(*outp)++ = *curinp;
4922     }
4923     else if (*res==Py_None)
4924         ;
4925     else if (PyInt_Check(*res)) {
4926         /* no overflow check, because we know that the space is enough */
4927         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4928     }
4929     else if (PyUnicode_Check(*res)) {
4930         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4931         if (repsize==1) {
4932             /* no overflow check, because we know that the space is enough */
4933             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4934         }
4935         else if (repsize!=0) {
4936             /* more than one character */
4937             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4938                 (insize - (curinp-startinp)) +
4939                 repsize - 1;
4940             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4941                 return -1;
4942             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4943             *outp += repsize;
4944         }
4945     }
4946     else
4947         return -1;
4948     return 0;
4949 }
4950
4951 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4952                                      Py_ssize_t size,
4953                                      PyObject *mapping,
4954                                      const char *errors)
4955 {
4956     /* output object */
4957     PyObject *res = NULL;
4958     /* pointers to the beginning and end+1 of input */
4959     const Py_UNICODE *startp = p;
4960     const Py_UNICODE *endp = p + size;
4961     /* pointer into the output */
4962     Py_UNICODE *str;
4963     /* current output position */
4964     Py_ssize_t respos = 0;
4965     char *reason = "character maps to <undefined>";
4966     PyObject *errorHandler = NULL;
4967     PyObject *exc = NULL;
4968     /* the following variable is used for caching string comparisons
4969      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970      * 3=ignore, 4=xmlcharrefreplace */
4971     int known_errorHandler = -1;
4972
4973     if (mapping == NULL) {
4974         PyErr_BadArgument();
4975         return NULL;
4976     }
4977
4978     /* allocate enough for a simple 1:1 translation without
4979        replacements, if we need more, we'll resize */
4980     res = PyUnicode_FromUnicode(NULL, size);
4981     if (res == NULL)
4982         goto onError;
4983     if (size == 0)
4984         return res;
4985     str = PyUnicode_AS_UNICODE(res);
4986
4987     while (p<endp) {
4988         /* try to encode it */
4989         PyObject *x = NULL;
4990         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4991             Py_XDECREF(x);
4992             goto onError;
4993         }
4994         Py_XDECREF(x);
4995         if (x!=Py_None) /* it worked => adjust input pointer */
4996             ++p;
4997         else { /* untranslatable character */
4998             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4999             Py_ssize_t repsize;
5000             Py_ssize_t newpos;
5001             Py_UNICODE *uni2;
5002             /* startpos for collecting untranslatable chars */
5003             const Py_UNICODE *collstart = p;
5004             const Py_UNICODE *collend = p+1;
5005             const Py_UNICODE *coll;
5006
5007             /* find all untranslatable characters */
5008             while (collend < endp) {
5009                 if (charmaptranslate_lookup(*collend, mapping, &x))
5010                     goto onError;
5011                 Py_XDECREF(x);
5012                 if (x!=Py_None)
5013                     break;
5014                 ++collend;
5015             }
5016             /* cache callback name lookup
5017              * (if not done yet, i.e. it's the first error) */
5018             if (known_errorHandler==-1) {
5019                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5020                     known_errorHandler = 1;
5021                 else if (!strcmp(errors, "replace"))
5022                     known_errorHandler = 2;
5023                 else if (!strcmp(errors, "ignore"))
5024                     known_errorHandler = 3;
5025                 else if (!strcmp(errors, "xmlcharrefreplace"))
5026                     known_errorHandler = 4;
5027                 else
5028                     known_errorHandler = 0;
5029             }
5030             switch (known_errorHandler) {
5031             case 1: /* strict */
5032                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5033                 goto onError;
5034             case 2: /* replace */
5035                 /* No need to check for space, this is a 1:1 replacement */
5036                 for (coll = collstart; coll<collend; ++coll)
5037                     *str++ = '?';
5038                 /* fall through */
5039             case 3: /* ignore */
5040                 p = collend;
5041                 break;
5042             case 4: /* xmlcharrefreplace */
5043                 /* generate replacement (temporarily (mis)uses p) */
5044                 for (p = collstart; p < collend; ++p) {
5045                     char buffer[2+29+1+1];
5046                     char *cp;
5047                     sprintf(buffer, "&#%d;", (int)*p);
5048                     if (charmaptranslate_makespace(&res, &str,
5049                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5050                         goto onError;
5051                     for (cp = buffer; *cp; ++cp)
5052                         *str++ = *cp;
5053                 }
5054                 p = collend;
5055                 break;
5056             default:
5057                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5058                                                                  reason, startp, size, &exc,
5059                                                                  collstart-startp, collend-startp, &newpos);
5060                 if (repunicode == NULL)
5061                     goto onError;
5062                 /* generate replacement  */
5063                 repsize = PyUnicode_GET_SIZE(repunicode);
5064                 if (charmaptranslate_makespace(&res, &str,
5065                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5066                     Py_DECREF(repunicode);
5067                     goto onError;
5068                 }
5069                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5070                     *str++ = *uni2;
5071                 p = startp + newpos;
5072                 Py_DECREF(repunicode);
5073             }
5074         }
5075     }
5076     /* Resize if we allocated to much */
5077     respos = str-PyUnicode_AS_UNICODE(res);
5078     if (respos<PyUnicode_GET_SIZE(res)) {
5079         if (PyUnicode_Resize(&res, respos) < 0)
5080             goto onError;
5081     }
5082     Py_XDECREF(exc);
5083     Py_XDECREF(errorHandler);
5084     return res;
5085
5086   onError:
5087     Py_XDECREF(res);
5088     Py_XDECREF(exc);
5089     Py_XDECREF(errorHandler);
5090     return NULL;
5091 }
5092
5093 PyObject *PyUnicode_Translate(PyObject *str,
5094                               PyObject *mapping,
5095                               const char *errors)
5096 {
5097     PyObject *result;
5098
5099     str = PyUnicode_FromObject(str);
5100     if (str == NULL)
5101         goto onError;
5102     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5103                                         PyUnicode_GET_SIZE(str),
5104                                         mapping,
5105                                         errors);
5106     Py_DECREF(str);
5107     return result;
5108
5109   onError:
5110     Py_XDECREF(str);
5111     return NULL;
5112 }
5113
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5115
5116 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5117                             Py_ssize_t length,
5118                             char *output,
5119                             const char *errors)
5120 {
5121     Py_UNICODE *p, *end;
5122     PyObject *errorHandler = NULL;
5123     PyObject *exc = NULL;
5124     const char *encoding = "decimal";
5125     const char *reason = "invalid decimal Unicode string";
5126     /* the following variable is used for caching string comparisons
5127      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128     int known_errorHandler = -1;
5129
5130     if (output == NULL) {
5131         PyErr_BadArgument();
5132         return -1;
5133     }
5134
5135     p = s;
5136     end = s + length;
5137     while (p < end) {
5138         register Py_UNICODE ch = *p;
5139         int decimal;
5140         PyObject *repunicode;
5141         Py_ssize_t repsize;
5142         Py_ssize_t newpos;
5143         Py_UNICODE *uni2;
5144         Py_UNICODE *collstart;
5145         Py_UNICODE *collend;
5146
5147         if (Py_UNICODE_ISSPACE(ch)) {
5148             *output++ = ' ';
5149             ++p;
5150             continue;
5151         }
5152         decimal = Py_UNICODE_TODECIMAL(ch);
5153         if (decimal >= 0) {
5154             *output++ = '0' + decimal;
5155             ++p;
5156             continue;
5157         }
5158         if (0 < ch && ch < 256) {
5159             *output++ = (char)ch;
5160             ++p;
5161             continue;
5162         }
5163         /* All other characters are considered unencodable */
5164         collstart = p;
5165         collend = p+1;
5166         while (collend < end) {
5167             if ((0 < *collend && *collend < 256) ||
5168                 !Py_UNICODE_ISSPACE(*collend) ||
5169                 Py_UNICODE_TODECIMAL(*collend))
5170                 break;
5171         }
5172         /* cache callback name lookup
5173          * (if not done yet, i.e. it's the first error) */
5174         if (known_errorHandler==-1) {
5175             if ((errors==NULL) || (!strcmp(errors, "strict")))
5176                 known_errorHandler = 1;
5177             else if (!strcmp(errors, "replace"))
5178                 known_errorHandler = 2;
5179             else if (!strcmp(errors, "ignore"))
5180                 known_errorHandler = 3;
5181             else if (!strcmp(errors, "xmlcharrefreplace"))
5182                 known_errorHandler = 4;
5183             else
5184                 known_errorHandler = 0;
5185         }
5186         switch (known_errorHandler) {
5187         case 1: /* strict */
5188             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5189             goto onError;
5190         case 2: /* replace */
5191             for (p = collstart; p < collend; ++p)
5192                 *output++ = '?';
5193             /* fall through */
5194         case 3: /* ignore */
5195             p = collend;
5196             break;
5197         case 4: /* xmlcharrefreplace */
5198             /* generate replacement (temporarily (mis)uses p) */
5199             for (p = collstart; p < collend; ++p)
5200                 output += sprintf(output, "&#%d;", (int)*p);
5201             p = collend;
5202             break;
5203         default:
5204             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5205                                                           encoding, reason, s, length, &exc,
5206                                                           collstart-s, collend-s, &newpos);
5207             if (repunicode == NULL)
5208                 goto onError;
5209             /* generate replacement  */
5210             repsize = PyUnicode_GET_SIZE(repunicode);
5211             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5212                 Py_UNICODE ch = *uni2;
5213                 if (Py_UNICODE_ISSPACE(ch))
5214                     *output++ = ' ';
5215                 else {
5216                     decimal = Py_UNICODE_TODECIMAL(ch);
5217                     if (decimal >= 0)
5218                         *output++ = '0' + decimal;
5219                     else if (0 < ch && ch < 256)
5220                         *output++ = (char)ch;
5221                     else {
5222                         Py_DECREF(repunicode);
5223                         raise_encode_exception(&exc, encoding,
5224                                                s, length, collstart-s, collend-s, reason);
5225                         goto onError;
5226                     }
5227                 }
5228             }
5229             p = s + newpos;
5230             Py_DECREF(repunicode);
5231         }
5232     }
5233     /* 0-terminate the output string */
5234     *output++ = '\0';
5235     Py_XDECREF(exc);
5236     Py_XDECREF(errorHandler);
5237     return 0;
5238
5239   onError:
5240     Py_XDECREF(exc);
5241     Py_XDECREF(errorHandler);
5242     return -1;
5243 }
5244
5245 /* --- Helpers ------------------------------------------------------------ */
5246
5247 #include "stringlib/unicodedefs.h"
5248
5249 #define FROM_UNICODE
5250
5251 #include "stringlib/fastsearch.h"
5252
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
5256
/* Helper macro to fix up slice bounds. It reads and assigns the caller's
   local variables `start` and `end` against (obj)->length: a negative
   bound is taken relative to the end and then floored at 0, and `end` is
   capped at the length. `start` is not capped at the length.
   Expands to a sequence of bare statements. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5269
5270 Py_ssize_t PyUnicode_Count(PyObject *str,
5271                            PyObject *substr,
5272                            Py_ssize_t start,
5273                            Py_ssize_t end)
5274 {
5275     Py_ssize_t result;
5276     PyUnicodeObject* str_obj;
5277     PyUnicodeObject* sub_obj;
5278
5279     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5280     if (!str_obj)
5281         return -1;
5282     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5283     if (!sub_obj) {
5284         Py_DECREF(str_obj);
5285         return -1;
5286     }
5287
5288     FIX_START_END(str_obj);
5289
5290     result = stringlib_count(
5291         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5292         );
5293
5294     Py_DECREF(sub_obj);
5295     Py_DECREF(str_obj);
5296
5297     return result;
5298 }
5299
5300 Py_ssize_t PyUnicode_Find(PyObject *str,
5301                           PyObject *sub,
5302                           Py_ssize_t start,
5303                           Py_ssize_t end,
5304                           int direction)
5305 {
5306     Py_ssize_t result;
5307
5308     str = PyUnicode_FromObject(str);
5309     if (!str)
5310         return -2;
5311     sub = PyUnicode_FromObject(sub);
5312     if (!sub) {
5313         Py_DECREF(str);
5314         return -2;
5315     }
5316
5317     if (direction > 0)
5318         result = stringlib_find_slice(
5319             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5320             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5321             start, end
5322             );
5323     else
5324         result = stringlib_rfind_slice(
5325             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5326             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5327             start, end
5328             );
5329
5330     Py_DECREF(str);
5331     Py_DECREF(sub);
5332
5333     return result;
5334 }
5335
5336 static
5337 int tailmatch(PyUnicodeObject *self,
5338               PyUnicodeObject *substring,
5339               Py_ssize_t start,
5340               Py_ssize_t end,
5341               int direction)
5342 {
5343     if (substring->length == 0)
5344         return 1;
5345
5346     FIX_START_END(self);
5347
5348     end -= substring->length;
5349     if (end < start)
5350         return 0;
5351
5352     if (direction > 0) {
5353         if (Py_UNICODE_MATCH(self, end, substring))
5354             return 1;
5355     } else {
5356         if (Py_UNICODE_MATCH(self, start, substring))
5357             return 1;
5358     }
5359
5360     return 0;
5361 }
5362
5363 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5364                                PyObject *substr,
5365                                Py_ssize_t start,
5366                                Py_ssize_t end,
5367                                int direction)
5368 {
5369     Py_ssize_t result;
5370
5371     str = PyUnicode_FromObject(str);
5372     if (str == NULL)
5373         return -1;
5374     substr = PyUnicode_FromObject(substr);
5375     if (substr == NULL) {
5376         Py_DECREF(str);
5377         return -1;
5378     }
5379
5380     result = tailmatch((PyUnicodeObject *)str,
5381                        (PyUnicodeObject *)substr,
5382                        start, end, direction);
5383     Py_DECREF(str);
5384     Py_DECREF(substr);
5385     return result;
5386 }
5387
5388 /* Apply fixfct filter to the Unicode object self and return a
5389    reference to the modified object */
5390
5391 static
5392 PyObject *fixup(PyUnicodeObject *self,
5393                 int (*fixfct)(PyUnicodeObject *s))
5394 {
5395
5396     PyUnicodeObject *u;
5397
5398     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5399     if (u == NULL)
5400         return NULL;
5401
5402     Py_UNICODE_COPY(u->str, self->str, self->length);
5403
5404     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5405         /* fixfct should return TRUE if it modified the buffer. If
5406            FALSE, return a reference to the original buffer instead
5407            (to save space, not time) */
5408         Py_INCREF(self);
5409         Py_DECREF(u);
5410         return (PyObject*) self;
5411     }
5412     return (PyObject*) u;
5413 }
5414
5415 static
5416 int fixupper(PyUnicodeObject *self)
5417 {
5418     Py_ssize_t len = self->length;
5419     Py_UNICODE *s = self->str;
5420     int status = 0;
5421
5422     while (len-- > 0) {
5423         register Py_UNICODE ch;
5424
5425         ch = Py_UNICODE_TOUPPER(*s);
5426         if (ch != *s) {
5427             status = 1;
5428             *s = ch;
5429         }
5430         s++;
5431     }
5432
5433     return status;
5434 }
5435
5436 static
5437 int fixlower(PyUnicodeObject *self)
5438 {
5439     Py_ssize_t len = self->length;
5440     Py_UNICODE *s = self->str;
5441     int status = 0;
5442
5443     while (len-- > 0) {
5444         register Py_UNICODE ch;
5445
5446         ch = Py_UNICODE_TOLOWER(*s);
5447         if (ch != *s) {
5448             status = 1;
5449             *s = ch;
5450         }
5451         s++;
5452     }
5453
5454     return status;
5455 }
5456
5457 static
5458 int fixswapcase(PyUnicodeObject *self)
5459 {
5460     Py_ssize_t len = self->length;
5461     Py_UNICODE *s = self->str;
5462     int status = 0;
5463
5464     while (len-- > 0) {
5465         if (Py_UNICODE_ISUPPER(*s)) {
5466             *s = Py_UNICODE_TOLOWER(*s);
5467             status = 1;
5468         } else if (Py_UNICODE_ISLOWER(*s)) {
5469             *s = Py_UNICODE_TOUPPER(*s);
5470             status = 1;
5471         }
5472         s++;
5473     }
5474
5475     return status;
5476 }
5477
5478 static
5479 int fixcapitalize(PyUnicodeObject *self)
5480 {
5481     Py_ssize_t len = self->length;
5482     Py_UNICODE *s = self->str;
5483     int status = 0;
5484
5485     if (len == 0)
5486         return 0;
5487     if (Py_UNICODE_ISLOWER(*s)) {
5488         *s = Py_UNICODE_TOUPPER(*s);
5489         status = 1;
5490     }
5491     s++;
5492     while (--len > 0) {
5493         if (Py_UNICODE_ISUPPER(*s)) {
5494             *s = Py_UNICODE_TOLOWER(*s);
5495             status = 1;
5496         }
5497         s++;
5498     }
5499     return status;
5500 }
5501
5502 static
5503 int fixtitle(PyUnicodeObject *self)
5504 {
5505     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506     register Py_UNICODE *e;
5507     int previous_is_cased;
5508
5509     /* Shortcut for single character strings */
5510     if (PyUnicode_GET_SIZE(self) == 1) {
5511         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512         if (*p != ch) {
5513             *p = ch;
5514             return 1;
5515         }
5516         else
5517             return 0;
5518     }
5519
5520     e = p + PyUnicode_GET_SIZE(self);
5521     previous_is_cased = 0;
5522     for (; p < e; p++) {
5523         register const Py_UNICODE ch = *p;
5524
5525         if (previous_is_cased)
5526             *p = Py_UNICODE_TOLOWER(ch);
5527         else
5528             *p = Py_UNICODE_TOTITLE(ch);
5529
5530         if (Py_UNICODE_ISLOWER(ch) ||
5531             Py_UNICODE_ISUPPER(ch) ||
5532             Py_UNICODE_ISTITLE(ch))
5533             previous_is_cased = 1;
5534         else
5535             previous_is_cased = 0;
5536     }
5537     return 1;
5538 }
5539
/* Join the items of seq into a single Unicode object, placing separator
 * between consecutive items.
 *
 * separator: object coerced with PyUnicode_FromObject(); NULL selects a
 *            single blank (' ').
 * seq:       anything PySequence_Fast() accepts; every item must be a
 *            unicode or str object.
 *
 * Returns a new reference, or NULL with an exception set:
 *   - TypeError when an item is neither unicode nor str;
 *   - OverflowError when the running length or the doubled allocation
 *     wraps to a non-positive value (the checks rely on signed
 *     wrap-around being observable);
 *   - whatever PySequence_Fast / PyUnicode_FromObject /
 *     _PyUnicode_Resize raise.
 *
 * Shortcuts: an empty sequence yields _PyUnicode_New(0); a one-item
 * sequence whose item is an exact unicode object yields that item
 * itself (with a new reference).
 *
 * NOTE(review): the separator is only converted when seqlen > 1 at
 * that point.  If the sequence starts with one item and grows while
 * that item is being converted, the default blank would be used as
 * separator -- confirm whether this corner case matters.
 */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *internal_separator = NULL;
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_ssize_t res_alloc = 100;  /* length requested for res; compared against
                                    res_used, so counted in Py_UNICODE units */
    Py_ssize_t res_used;         /* # Py_UNICODE units of res already filled */
    Py_UNICODE *res_p;       /* pointer to the next free slot in res's string area */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
    PyObject *item;
    Py_ssize_t i;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* Grrrr.  A codec may be invoked to convert str objects to
     * Unicode, and so it's possible to call back into Python code
     * during PyUnicode_FromObject(), and so it's possible for a sick
     * codec to change the size of fseq (if seq is a list).  Therefore
     * we have to keep refetching the size -- can't assume seqlen
     * is invariant.
     */
    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
        goto Done;
    }
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(fseq, 0);
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }

    /* At least two items to join, or one that isn't exact Unicode. */
    if (seqlen > 1) {
        /* Set up sep and seplen -- they're needed. */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            internal_separator = PyUnicode_FromObject(separator);
            if (internal_separator == NULL)
                goto onError;
            sep = PyUnicode_AS_UNICODE(internal_separator);
            seplen = PyUnicode_GET_SIZE(internal_separator);
            /* In case PyUnicode_FromObject() mutated seq. */
            seqlen = PySequence_Fast_GET_SIZE(fseq);
        }
    }

    /* Get space. */
    res = _PyUnicode_New(res_alloc);
    if (res == NULL)
        goto onError;
    res_p = PyUnicode_AS_UNICODE(res);
    res_used = 0;

    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        Py_ssize_t new_res_used;

        /* No reference to item is held at this point, so the type
           error path below must not DECREF it. */
        item = PySequence_Fast_GET_ITEM(fseq, i);
        /* Convert item to Unicode. */
        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected string or Unicode,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        item = PyUnicode_FromObject(item);
        if (item == NULL)
            goto onError;
        /* We own a reference to item from here on. */

        /* In case PyUnicode_FromObject() mutated seq. */
        seqlen = PySequence_Fast_GET_SIZE(fseq);

        /* Make sure we have enough space for the separator and the item. */
        itemlen = PyUnicode_GET_SIZE(item);
        new_res_used = res_used + itemlen;
        if (new_res_used < 0)
            goto Overflow;
        /* A separator follows every item except the last one. */
        if (i < seqlen - 1) {
            new_res_used += seplen;
            if (new_res_used < 0)
                goto Overflow;
        }
        if (new_res_used > res_alloc) {
            /* double allocated size until it's big enough */
            do {
                res_alloc += res_alloc;
                if (res_alloc <= 0)
                    goto Overflow;
            } while (new_res_used > res_alloc);
            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
                Py_DECREF(item);
                goto onError;
            }
            /* res may have been updated by the resize; recompute the
               write position from the fill level. */
            res_p = PyUnicode_AS_UNICODE(res) + res_used;
        }

        /* Copy item, and maybe the separator. */
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
        if (i < seqlen - 1) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_DECREF(item);
        res_used = new_res_used;
    }

    /* Shrink res to match the used area; this probably can't fail,
     * but it's cheap to check.
     */
    if (_PyUnicode_Resize(&res, res_used) < 0)
        goto onError;

  Done:
    /* Success exit (also used by the shortcuts above; res may be NULL
       here if _PyUnicode_New(0) failed). */
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    return (PyObject *)res;

  Overflow:
    /* Only reached while a converted item is owned, hence the DECREF. */
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    Py_DECREF(item);
    /* fall through */

  onError:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}
5688
5689 static
5690 PyUnicodeObject *pad(PyUnicodeObject *self,
5691                      Py_ssize_t left,
5692                      Py_ssize_t right,
5693                      Py_UNICODE fill)
5694 {
5695     PyUnicodeObject *u;
5696
5697     if (left < 0)
5698         left = 0;
5699     if (right < 0)
5700         right = 0;
5701
5702     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5703         Py_INCREF(self);
5704         return self;
5705     }
5706
5707     if (left > PY_SSIZE_T_MAX - self->length ||
5708         right > PY_SSIZE_T_MAX - (left + self->length)) {
5709         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710         return NULL;
5711     }
5712     u = _PyUnicode_New(left + self->length + right);
5713     if (u) {
5714         if (left)
5715             Py_UNICODE_FILL(u->str, fill, left);
5716         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717         if (right)
5718             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5719     }
5720
5721     return u;
5722 }
5723
/* Helper for the split functions below: build a unicode object from
   data[left:right] and append it to the result list.

   The macro relies on names provided by the invoking function:
     - `str`     a PyObject * scratch variable,
     - `list`    the list being filled,
     - `onError` a label that is jumped to when creating the substring
                 or appending it fails.
   The temporary reference in `str` is released on both the success
   and the failure path of PyList_Append().

   The expansion is several statements ending in an if/else, so use it
   only as a statement of its own inside a braced block. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
5734
5735 static
5736 PyObject *split_whitespace(PyUnicodeObject *self,
5737                            PyObject *list,
5738                            Py_ssize_t maxcount)
5739 {
5740     register Py_ssize_t i;
5741     register Py_ssize_t j;
5742     Py_ssize_t len = self->length;
5743     PyObject *str;
5744     register const Py_UNICODE *buf = self->str;
5745
5746     for (i = j = 0; i < len; ) {
5747         /* find a token */
5748         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5749             i++;
5750         j = i;
5751         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5752             i++;
5753         if (j < i) {
5754             if (maxcount-- <= 0)
5755                 break;
5756             SPLIT_APPEND(buf, j, i);
5757             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5758                 i++;
5759             j = i;
5760         }
5761     }
5762     if (j < len) {
5763         SPLIT_APPEND(buf, j, len);
5764     }
5765     return list;
5766
5767   onError:
5768     Py_DECREF(list);
5769     return NULL;
5770 }
5771
5772 PyObject *PyUnicode_Splitlines(PyObject *string,
5773                                int keepends)
5774 {
5775     register Py_ssize_t i;
5776     register Py_ssize_t j;
5777     Py_ssize_t len;
5778     PyObject *list;
5779     PyObject *str;
5780     Py_UNICODE *data;
5781
5782     string = PyUnicode_FromObject(string);
5783     if (string == NULL)
5784         return NULL;
5785     data = PyUnicode_AS_UNICODE(string);
5786     len = PyUnicode_GET_SIZE(string);
5787
5788     list = PyList_New(0);
5789     if (!list)
5790         goto onError;
5791
5792     for (i = j = 0; i < len; ) {
5793         Py_ssize_t eol;
5794
5795         /* Find a line and append it */
5796         while (i < len && !BLOOM_LINEBREAK(data[i]))
5797             i++;
5798
5799         /* Skip the line break reading CRLF as one line break */
5800         eol = i;
5801         if (i < len) {
5802             if (data[i] == '\r' && i + 1 < len &&
5803                 data[i+1] == '\n')
5804                 i += 2;
5805             else
5806                 i++;
5807             if (keepends)
5808                 eol = i;
5809         }
5810         SPLIT_APPEND(data, j, eol);
5811         j = i;
5812     }
5813     if (j < len) {
5814         SPLIT_APPEND(data, j, len);
5815     }
5816
5817     Py_DECREF(string);
5818     return list;
5819
5820   onError:
5821     Py_XDECREF(list);
5822     Py_DECREF(string);
5823     return NULL;
5824 }
5825
5826 static
5827 PyObject *split_char(PyUnicodeObject *self,
5828                      PyObject *list,
5829                      Py_UNICODE ch,
5830                      Py_ssize_t maxcount)
5831 {
5832     register Py_ssize_t i;
5833     register Py_ssize_t j;
5834     Py_ssize_t len = self->length;
5835     PyObject *str;
5836     register const Py_UNICODE *buf = self->str;
5837
5838     for (i = j = 0; i < len; ) {
5839         if (buf[i] == ch) {
5840             if (maxcount-- <= 0)
5841                 break;
5842             SPLIT_APPEND(buf, j, i);
5843             i = j = i + 1;
5844         } else
5845             i++;
5846     }
5847     if (j <= len) {
5848         SPLIT_APPEND(buf, j, len);
5849     }
5850     return list;
5851
5852   onError:
5853     Py_DECREF(list);
5854     return NULL;
5855 }
5856
5857 static
5858 PyObject *split_substring(PyUnicodeObject *self,
5859                           PyObject *list,
5860                           PyUnicodeObject *substring,
5861                           Py_ssize_t maxcount)
5862 {
5863     register Py_ssize_t i;
5864     register Py_ssize_t j;
5865     Py_ssize_t len = self->length;
5866     Py_ssize_t sublen = substring->length;
5867     PyObject *str;
5868
5869     for (i = j = 0; i <= len - sublen; ) {
5870         if (Py_UNICODE_MATCH(self, i, substring)) {
5871             if (maxcount-- <= 0)
5872                 break;
5873             SPLIT_APPEND(self->str, j, i);
5874             i = j = i + sublen;
5875         } else
5876             i++;
5877     }
5878     if (j <= len) {
5879         SPLIT_APPEND(self->str, j, len);
5880     }
5881     return list;
5882
5883   onError:
5884     Py_DECREF(list);
5885     return NULL;
5886 }
5887
5888 static
5889 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5890                             PyObject *list,
5891                             Py_ssize_t maxcount)
5892 {
5893     register Py_ssize_t i;
5894     register Py_ssize_t j;
5895     Py_ssize_t len = self->length;
5896     PyObject *str;
5897     register const Py_UNICODE *buf = self->str;
5898
5899     for (i = j = len - 1; i >= 0; ) {
5900         /* find a token */
5901         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5902             i--;
5903         j = i;
5904         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5905             i--;
5906         if (j > i) {
5907             if (maxcount-- <= 0)
5908                 break;
5909             SPLIT_APPEND(buf, i + 1, j + 1);
5910             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5911                 i--;
5912             j = i;
5913         }
5914     }
5915     if (j >= 0) {
5916         SPLIT_APPEND(buf, 0, j + 1);
5917     }
5918     if (PyList_Reverse(list) < 0)
5919         goto onError;
5920     return list;
5921
5922   onError:
5923     Py_DECREF(list);
5924     return NULL;
5925 }
5926
5927 static
5928 PyObject *rsplit_char(PyUnicodeObject *self,
5929                       PyObject *list,
5930                       Py_UNICODE ch,
5931                       Py_ssize_t maxcount)
5932 {
5933     register Py_ssize_t i;
5934     register Py_ssize_t j;
5935     Py_ssize_t len = self->length;
5936     PyObject *str;
5937     register const Py_UNICODE *buf = self->str;
5938
5939     for (i = j = len - 1; i >= 0; ) {
5940         if (buf[i] == ch) {
5941             if (maxcount-- <= 0)
5942                 break;
5943             SPLIT_APPEND(buf, i + 1, j + 1);
5944             j = i = i - 1;
5945         } else
5946             i--;
5947     }
5948     if (j >= -1) {
5949         SPLIT_APPEND(buf, 0, j + 1);
5950     }
5951     if (PyList_Reverse(list) < 0)
5952         goto onError;
5953     return list;
5954
5955   onError:
5956     Py_DECREF(list);
5957     return NULL;
5958 }
5959
5960 static
5961 PyObject *rsplit_substring(PyUnicodeObject *self,
5962                            PyObject *list,
5963                            PyUnicodeObject *substring,
5964                            Py_ssize_t maxcount)
5965 {
5966     register Py_ssize_t i;
5967     register Py_ssize_t j;
5968     Py_ssize_t len = self->length;
5969     Py_ssize_t sublen = substring->length;
5970     PyObject *str;
5971
5972     for (i = len - sublen, j = len; i >= 0; ) {
5973         if (Py_UNICODE_MATCH(self, i, substring)) {
5974             if (maxcount-- <= 0)
5975                 break;
5976             SPLIT_APPEND(self->str, i + sublen, j);
5977             j = i;
5978             i -= sublen;
5979         } else
5980             i--;
5981     }
5982     if (j >= 0) {
5983         SPLIT_APPEND(self->str, 0, j);
5984     }
5985     if (PyList_Reverse(list) < 0)
5986         goto onError;
5987     return list;
5988
5989   onError:
5990     Py_DECREF(list);
5991     return NULL;
5992 }
5993
5994 #undef SPLIT_APPEND
5995
5996 static
5997 PyObject *split(PyUnicodeObject *self,
5998                 PyUnicodeObject *substring,
5999                 Py_ssize_t maxcount)
6000 {
6001     PyObject *list;
6002
6003     if (maxcount < 0)
6004         maxcount = PY_SSIZE_T_MAX;
6005
6006     list = PyList_New(0);
6007     if (!list)
6008         return NULL;
6009
6010     if (substring == NULL)
6011         return split_whitespace(self,list,maxcount);
6012
6013     else if (substring->length == 1)
6014         return split_char(self,list,substring->str[0],maxcount);
6015
6016     else if (substring->length == 0) {
6017         Py_DECREF(list);
6018         PyErr_SetString(PyExc_ValueError, "empty separator");
6019         return NULL;
6020     }
6021     else
6022         return split_substring(self,list,substring,maxcount);
6023 }
6024
6025 static
6026 PyObject *rsplit(PyUnicodeObject *self,
6027                  PyUnicodeObject *substring,
6028                  Py_ssize_t maxcount)
6029 {
6030     PyObject *list;
6031
6032     if (maxcount < 0)
6033         maxcount = PY_SSIZE_T_MAX;
6034
6035     list = PyList_New(0);
6036     if (!list)
6037         return NULL;
6038
6039     if (substring == NULL)
6040         return rsplit_whitespace(self,list,maxcount);
6041
6042     else if (substring->length == 1)
6043         return rsplit_char(self,list,substring->str[0],maxcount);
6044
6045     else if (substring->length == 0) {
6046         Py_DECREF(list);
6047         PyErr_SetString(PyExc_ValueError, "empty separator");
6048         return NULL;
6049     }
6050     else
6051         return rsplit_substring(self,list,substring,maxcount);
6052 }
6053
6054 static
6055 PyObject *replace(PyUnicodeObject *self,
6056                   PyUnicodeObject *str1,
6057                   PyUnicodeObject *str2,
6058                   Py_ssize_t maxcount)
6059 {
6060     PyUnicodeObject *u;
6061
6062     if (maxcount < 0)
6063         maxcount = PY_SSIZE_T_MAX;
6064
6065     if (str1->length == str2->length) {
6066         /* same length */
6067         Py_ssize_t i;
6068         if (str1->length == 1) {
6069             /* replace characters */
6070             Py_UNICODE u1, u2;
6071             if (!findchar(self->str, self->length, str1->str[0]))
6072                 goto nothing;
6073             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6074             if (!u)
6075                 return NULL;
6076             Py_UNICODE_COPY(u->str, self->str, self->length);
6077             u1 = str1->str[0];
6078             u2 = str2->str[0];
6079             for (i = 0; i < u->length; i++)
6080                 if (u->str[i] == u1) {
6081                     if (--maxcount < 0)
6082                         break;
6083                     u->str[i] = u2;
6084                 }
6085         } else {
6086             i = fastsearch(
6087                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6088                 );
6089             if (i < 0)
6090                 goto nothing;
6091             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6092             if (!u)
6093                 return NULL;
6094             Py_UNICODE_COPY(u->str, self->str, self->length);
6095             while (i <= self->length - str1->length)
6096                 if (Py_UNICODE_MATCH(self, i, str1)) {
6097                     if (--maxcount < 0)
6098                         break;
6099                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6100                     i += str1->length;
6101                 } else
6102                     i++;
6103         }
6104     } else {
6105
6106         Py_ssize_t n, i, j, e;
6107         Py_ssize_t product, new_size, delta;
6108         Py_UNICODE *p;
6109
6110         /* replace strings */
6111         n = stringlib_count(self->str, self->length, str1->str, str1->length);
6112         if (n > maxcount)
6113             n = maxcount;
6114         if (n == 0)
6115             goto nothing;
6116         /* new_size = self->length + n * (str2->length - str1->length)); */
6117         delta = (str2->length - str1->length);
6118         if (delta == 0) {
6119             new_size = self->length;
6120         } else {
6121             product = n * (str2->length - str1->length);
6122             if ((product / (str2->length - str1->length)) != n) {
6123                 PyErr_SetString(PyExc_OverflowError,
6124                                 "replace string is too long");
6125                 return NULL;
6126             }
6127             new_size = self->length + product;
6128             if (new_size < 0) {
6129                 PyErr_SetString(PyExc_OverflowError,
6130                                 "replace string is too long");
6131                 return NULL;
6132             }
6133         }
6134         u = _PyUnicode_New(new_size);
6135         if (!u)
6136             return NULL;
6137         i = 0;
6138         p = u->str;
6139         e = self->length - str1->length;
6140         if (str1->length > 0) {
6141             while (n-- > 0) {
6142                 /* look for next match */
6143                 j = i;
6144                 while (j <= e) {
6145                     if (Py_UNICODE_MATCH(self, j, str1))
6146                         break;
6147                     j++;
6148                 }
6149                 if (j > i) {
6150                     if (j > e)
6151                         break;
6152                     /* copy unchanged part [i:j] */
6153                     Py_UNICODE_COPY(p, self->str+i, j-i);
6154                     p += j - i;
6155                 }
6156                 /* copy substitution string */
6157                 if (str2->length > 0) {
6158                     Py_UNICODE_COPY(p, str2->str, str2->length);
6159                     p += str2->length;
6160                 }
6161                 i = j + str1->length;
6162             }
6163             if (i < self->length)
6164                 /* copy tail [i:] */
6165                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6166         } else {
6167             /* interleave */
6168             while (n > 0) {
6169                 Py_UNICODE_COPY(p, str2->str, str2->length);
6170                 p += str2->length;
6171                 if (--n <= 0)
6172                     break;
6173                 *p++ = self->str[i++];
6174             }
6175             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6176         }
6177     }
6178     return (PyObject *) u;
6179
6180   nothing:
6181     /* nothing to replace; return original string (when possible) */
6182     if (PyUnicode_CheckExact(self)) {
6183         Py_INCREF(self);
6184         return (PyObject *) self;
6185     }
6186     return PyUnicode_FromUnicode(self->str, self->length);
6187 }
6188
6189 /* --- Unicode Object Methods --------------------------------------------- */
6190
PyDoc_STRVAR(title__doc__,
             "S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implementation of S.title(): run the fixtitle filter over a copy of
   self via fixup().  Returns a new reference, or NULL if fixup() could
   not allocate the copy. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
6202
PyDoc_STRVAR(capitalize__doc__,
             "S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.");

/* Implementation of S.capitalize(): run the fixcapitalize filter over a
   copy of self via fixup().  Note that fixcapitalize also lower-cases
   upper-case characters after the first one.  Returns a new reference,
   or NULL if fixup() could not allocate the copy. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
6214
#if 0
/* Disabled: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word with
   fixup(..., fixcapitalize) and join the words again with the default
   separator of PyUnicode_Join() (a single blank).  Returns a new
   reference, or NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Drop the list's reference to the old word before storing the
           new one in its slot. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* Shared exit: item is NULL here when fixup() failed above,
       otherwise it holds the result of PyUnicode_Join(). */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6252
6253 /* Argument converter.  Coerces to a single unicode character */
6254
6255 static int
6256 convert_uc(PyObject *obj, void *addr)
6257 {
6258     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6259     PyObject *uniobj;
6260     Py_UNICODE *unistr;
6261
6262     uniobj = PyUnicode_FromObject(obj);
6263     if (uniobj == NULL) {
6264         PyErr_SetString(PyExc_TypeError,
6265                         "The fill character cannot be converted to Unicode");
6266         return 0;
6267     }
6268     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6269         PyErr_SetString(PyExc_TypeError,
6270                         "The fill character must be exactly one character long");
6271         Py_DECREF(uniobj);
6272         return 0;
6273     }
6274     unistr = PyUnicode_AS_UNICODE(uniobj);
6275     *fillcharloc = unistr[0];
6276     Py_DECREF(uniobj);
6277     return 1;
6278 }
6279
6280 PyDoc_STRVAR(center__doc__,
6281              "S.center(width[, fillchar]) -> unicode\n\
6282 \n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6285
6286 static PyObject *
6287 unicode_center(PyUnicodeObject *self, PyObject *args)
6288 {
6289     Py_ssize_t marg, left;
6290     Py_ssize_t width;
6291     Py_UNICODE fillchar = ' ';
6292
6293     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6294         return NULL;
6295
6296     if (self->length >= width && PyUnicode_CheckExact(self)) {
6297         Py_INCREF(self);
6298         return (PyObject*) self;
6299     }
6300
6301     marg = width - self->length;
6302     left = marg / 2 + (marg & width & 1);
6303
6304     return (PyObject*) pad(self, left, marg - left, fillchar);
6305 }
6306
6307 #if 0
6308
6309 /* This code should go into some future Unicode collation support
6310    module. The basic comparison should compare ordinals on a naive
6311    basis (this is what Java does and thus Jython too). */
6312
6313 /* speedy UTF-16 code point order comparison */
6314 /* gleaned from: */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6316
/* Adjustment table looked up with (c >> 11), i.e. one entry per block of
   0x800 code units.  Only the top five blocks carry a non-zero value. */
static short utf16Fixup[32] =
{
    /* blocks  0.. 7 (0x0000-0x3FFF) */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks  8..15 (0x4000-0x7FFF) */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks 16..23 (0x8000-0xBFFF) */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks 24..26 (0xC000-0xD7FF) */
    0, 0, 0,
    /* block  27     (0xD800-0xDFFF) */
    0x2000,
    /* blocks 28..31 (0xE000-0xFFFF) */
    -0x800, -0x800, -0x800, -0x800
};
6324
6325 static int
6326 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6327 {
6328     Py_ssize_t len1, len2;
6329
6330     Py_UNICODE *s1 = str1->str;
6331     Py_UNICODE *s2 = str2->str;
6332
6333     len1 = str1->length;
6334     len2 = str2->length;
6335
6336     while (len1 > 0 && len2 > 0) {
6337         Py_UNICODE c1, c2;
6338
6339         c1 = *s1++;
6340         c2 = *s2++;
6341
6342         if (c1 > (1<<11) * 26)
6343             c1 += utf16Fixup[c1>>11];
6344         if (c2 > (1<<11) * 26)
6345             c2 += utf16Fixup[c2>>11];
6346         /* now c1 and c2 are in UTF-32-compatible order */
6347
6348         if (c1 != c2)
6349             return (c1 < c2) ? -1 : 1;
6350
6351         len1--; len2--;
6352     }
6353
6354     return (len1 < len2) ? -1 : (len1 != len2);
6355 }
6356
6357 #else
6358
6359 static int
6360 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6361 {
6362     register Py_ssize_t len1, len2;
6363
6364     Py_UNICODE *s1 = str1->str;
6365     Py_UNICODE *s2 = str2->str;
6366
6367     len1 = str1->length;
6368     len2 = str2->length;
6369
6370     while (len1 > 0 && len2 > 0) {
6371         Py_UNICODE c1, c2;
6372
6373         c1 = *s1++;
6374         c2 = *s2++;
6375
6376         if (c1 != c2)
6377             return (c1 < c2) ? -1 : 1;
6378
6379         len1--; len2--;
6380     }
6381
6382     return (len1 < len2) ? -1 : (len1 != len2);
6383 }
6384
6385 #endif
6386
6387 int PyUnicode_Compare(PyObject *left,
6388                       PyObject *right)
6389 {
6390     PyUnicodeObject *u = NULL, *v = NULL;
6391     int result;
6392
6393     /* Coerce the two arguments */
6394     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6395     if (u == NULL)
6396         goto onError;
6397     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6398     if (v == NULL)
6399         goto onError;
6400
6401     /* Shortcut for empty or interned objects */
6402     if (v == u) {
6403         Py_DECREF(u);
6404         Py_DECREF(v);
6405         return 0;
6406     }
6407
6408     result = unicode_compare(u, v);
6409
6410     Py_DECREF(u);
6411     Py_DECREF(v);
6412     return result;
6413
6414   onError:
6415     Py_XDECREF(u);
6416     Py_XDECREF(v);
6417     return -1;
6418 }
6419
6420 PyObject *PyUnicode_RichCompare(PyObject *left,
6421                                 PyObject *right,
6422                                 int op)
6423 {
6424     int result;
6425
6426     result = PyUnicode_Compare(left, right);
6427     if (result == -1 && PyErr_Occurred())
6428         goto onError;
6429
6430     /* Convert the return value to a Boolean */
6431     switch (op) {
6432     case Py_EQ:
6433         result = (result == 0);
6434         break;
6435     case Py_NE:
6436         result = (result != 0);
6437         break;
6438     case Py_LE:
6439         result = (result <= 0);
6440         break;
6441     case Py_GE:
6442         result = (result >= 0);
6443         break;
6444     case Py_LT:
6445         result = (result == -1);
6446         break;
6447     case Py_GT:
6448         result = (result == 1);
6449         break;
6450     }
6451     return PyBool_FromLong(result);
6452
6453   onError:
6454
6455     /* Standard case
6456
6457        Type errors mean that PyUnicode_FromObject() could not convert
6458        one of the arguments (usually the right hand side) to Unicode,
6459        ie. we can't handle the comparison request. However, it is
6460        possible that the other object knows a comparison method, which
6461        is why we return Py_NotImplemented to give the other object a
6462        chance.
6463
6464     */
6465     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6466         PyErr_Clear();
6467         Py_INCREF(Py_NotImplemented);
6468         return Py_NotImplemented;
6469     }
6470     if (op != Py_EQ && op != Py_NE)
6471         return NULL;
6472
6473     /* Equality comparison.
6474
6475        This is a special case: we silence any PyExc_UnicodeDecodeError
6476        and instead turn it into a PyErr_UnicodeWarning.
6477
6478     */
6479     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6480         return NULL;
6481     PyErr_Clear();
6482     if (PyErr_Warn(PyExc_UnicodeWarning,
6483                    (op == Py_EQ) ?
6484                    "Unicode equal comparison "
6485                    "failed to convert both arguments to Unicode - "
6486                    "interpreting them as being unequal" :
6487                    "Unicode unequal comparison "
6488                    "failed to convert both arguments to Unicode - "
6489                    "interpreting them as being unequal"
6490             ) < 0)
6491         return NULL;
6492     result = (op == Py_NE);
6493     return PyBool_FromLong(result);
6494 }
6495
6496 int PyUnicode_Contains(PyObject *container,
6497                        PyObject *element)
6498 {
6499     PyObject *str, *sub;
6500     int result;
6501
6502     /* Coerce the two arguments */
6503     sub = PyUnicode_FromObject(element);
6504     if (!sub) {
6505         return -1;
6506     }
6507
6508     str = PyUnicode_FromObject(container);
6509     if (!str) {
6510         Py_DECREF(sub);
6511         return -1;
6512     }
6513
6514     result = stringlib_contains_obj(str, sub);
6515
6516     Py_DECREF(str);
6517     Py_DECREF(sub);
6518
6519     return result;
6520 }
6521
6522 /* Concat to string or Unicode object giving a new Unicode object. */
6523
6524 PyObject *PyUnicode_Concat(PyObject *left,
6525                            PyObject *right)
6526 {
6527     PyUnicodeObject *u = NULL, *v = NULL, *w;
6528
6529     /* Coerce the two arguments */
6530     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6531     if (u == NULL)
6532         goto onError;
6533     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6534     if (v == NULL)
6535         goto onError;
6536
6537     /* Shortcuts */
6538     if (v == unicode_empty) {
6539         Py_DECREF(v);
6540         return (PyObject *)u;
6541     }
6542     if (u == unicode_empty) {
6543         Py_DECREF(u);
6544         return (PyObject *)v;
6545     }
6546
6547     /* Concat the two Unicode strings */
6548     w = _PyUnicode_New(u->length + v->length);
6549     if (w == NULL)
6550         goto onError;
6551     Py_UNICODE_COPY(w->str, u->str, u->length);
6552     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6553
6554     Py_DECREF(u);
6555     Py_DECREF(v);
6556     return (PyObject *)w;
6557
6558   onError:
6559     Py_XDECREF(u);
6560     Py_XDECREF(v);
6561     return NULL;
6562 }
6563
6564 PyDoc_STRVAR(count__doc__,
6565              "S.count(sub[, start[, end]]) -> int\n\
6566 \n\
6567 Return the number of non-overlapping occurrences of substring sub in\n\
6568 Unicode string S[start:end].  Optional arguments start and end are\n\
6569 interpreted as in slice notation.");
6570
6571 static PyObject *
6572 unicode_count(PyUnicodeObject *self, PyObject *args)
6573 {
6574     PyUnicodeObject *substring;
6575     Py_ssize_t start = 0;
6576     Py_ssize_t end = PY_SSIZE_T_MAX;
6577     PyObject *result;
6578
6579     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6580                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6581         return NULL;
6582
6583     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6584         (PyObject *)substring);
6585     if (substring == NULL)
6586         return NULL;
6587
6588     FIX_START_END(self);
6589
6590     result = PyInt_FromSsize_t(
6591         stringlib_count(self->str + start, end - start,
6592                         substring->str, substring->length)
6593         );
6594
6595     Py_DECREF(substring);
6596
6597     return result;
6598 }
6599
6600 PyDoc_STRVAR(encode__doc__,
6601              "S.encode([encoding[,errors]]) -> string or unicode\n\
6602 \n\
6603 Encodes S using the codec registered for encoding. encoding defaults\n\
6604 to the default encoding. errors may be given to set a different error\n\
6605 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6606 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6607 'xmlcharrefreplace' as well as any other name registered with\n\
6608 codecs.register_error that can handle UnicodeEncodeErrors.");
6609
6610 static PyObject *
6611 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6612 {
6613     static char *kwlist[] = {"encoding", "errors", 0};
6614     char *encoding = NULL;
6615     char *errors = NULL;
6616     PyObject *v;
6617
6618     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6619                                      kwlist, &encoding, &errors))
6620         return NULL;
6621     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6622     if (v == NULL)
6623         goto onError;
6624     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6625         PyErr_Format(PyExc_TypeError,
6626                      "encoder did not return a string/unicode object "
6627                      "(type=%.400s)",
6628                      Py_TYPE(v)->tp_name);
6629         Py_DECREF(v);
6630         return NULL;
6631     }
6632     return v;
6633
6634   onError:
6635     return NULL;
6636 }
6637
6638 PyDoc_STRVAR(decode__doc__,
6639              "S.decode([encoding[,errors]]) -> string or unicode\n\
6640 \n\
6641 Decodes S using the codec registered for encoding. encoding defaults\n\
6642 to the default encoding. errors may be given to set a different error\n\
6643 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6644 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6645 as well as any other name registerd with codecs.register_error that is\n\
6646 able to handle UnicodeDecodeErrors.");
6647
6648 static PyObject *
6649 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6650 {
6651     static char *kwlist[] = {"encoding", "errors", 0};
6652     char *encoding = NULL;
6653     char *errors = NULL;
6654     PyObject *v;
6655
6656     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6657                                      kwlist, &encoding, &errors))
6658         return NULL;
6659     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6660     if (v == NULL)
6661         goto onError;
6662     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6663         PyErr_Format(PyExc_TypeError,
6664                      "decoder did not return a string/unicode object "
6665                      "(type=%.400s)",
6666                      Py_TYPE(v)->tp_name);
6667         Py_DECREF(v);
6668         return NULL;
6669     }
6670     return v;
6671
6672   onError:
6673     return NULL;
6674 }
6675
6676 PyDoc_STRVAR(expandtabs__doc__,
6677              "S.expandtabs([tabsize]) -> unicode\n\
6678 \n\
6679 Return a copy of S where all tab characters are expanded using spaces.\n\
6680 If tabsize is not given, a tab size of 8 characters is assumed.");
6681
6682 static PyObject*
6683 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6684 {
6685     Py_UNICODE *e;
6686     Py_UNICODE *p;
6687     Py_UNICODE *q;
6688     Py_UNICODE *qe;
6689     Py_ssize_t i, j, incr;
6690     PyUnicodeObject *u;
6691     int tabsize = 8;
6692
6693     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6694         return NULL;
6695
6696     /* First pass: determine size of output string */
6697     i = 0; /* chars up to and including most recent \n or \r */
6698     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6699     e = self->str + self->length; /* end of input */
6700     for (p = self->str; p < e; p++)
6701         if (*p == '\t') {
6702             if (tabsize > 0) {
6703                 incr = tabsize - (j % tabsize); /* cannot overflow */
6704                 if (j > PY_SSIZE_T_MAX - incr)
6705                     goto overflow1;
6706                 j += incr;
6707             }
6708         }
6709         else {
6710             if (j > PY_SSIZE_T_MAX - 1)
6711                 goto overflow1;
6712             j++;
6713             if (*p == '\n' || *p == '\r') {
6714                 if (i > PY_SSIZE_T_MAX - j)
6715                     goto overflow1;
6716                 i += j;
6717                 j = 0;
6718             }
6719         }
6720
6721     if (i > PY_SSIZE_T_MAX - j)
6722         goto overflow1;
6723
6724     /* Second pass: create output string and fill it */
6725     u = _PyUnicode_New(i + j);
6726     if (!u)
6727         return NULL;
6728
6729     j = 0; /* same as in first pass */
6730     q = u->str; /* next output char */
6731     qe = u->str + u->length; /* end of output */
6732
6733     for (p = self->str; p < e; p++)
6734         if (*p == '\t') {
6735             if (tabsize > 0) {
6736                 i = tabsize - (j % tabsize);
6737                 j += i;
6738                 while (i--) {
6739                     if (q >= qe)
6740                         goto overflow2;
6741                     *q++ = ' ';
6742                 }
6743             }
6744         }
6745         else {
6746             if (q >= qe)
6747                 goto overflow2;
6748             *q++ = *p;
6749             j++;
6750             if (*p == '\n' || *p == '\r')
6751                 j = 0;
6752         }
6753
6754     return (PyObject*) u;
6755
6756   overflow2:
6757     Py_DECREF(u);
6758   overflow1:
6759     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6760     return NULL;
6761 }
6762
6763 PyDoc_STRVAR(find__doc__,
6764              "S.find(sub [,start [,end]]) -> int\n\
6765 \n\
6766 Return the lowest index in S where substring sub is found,\n\
6767 such that sub is contained within s[start:end].  Optional\n\
6768 arguments start and end are interpreted as in slice notation.\n\
6769 \n\
6770 Return -1 on failure.");
6771
6772 static PyObject *
6773 unicode_find(PyUnicodeObject *self, PyObject *args)
6774 {
6775     PyObject *substring;
6776     Py_ssize_t start;
6777     Py_ssize_t end;
6778     Py_ssize_t result;
6779
6780     if (!_ParseTupleFinds(args, &substring, &start, &end))
6781         return NULL;
6782
6783     result = stringlib_find_slice(
6784         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6785         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6786         start, end
6787         );
6788
6789     Py_DECREF(substring);
6790
6791     return PyInt_FromSsize_t(result);
6792 }
6793
6794 static PyObject *
6795 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6796 {
6797     if (index < 0 || index >= self->length) {
6798         PyErr_SetString(PyExc_IndexError, "string index out of range");
6799         return NULL;
6800     }
6801
6802     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6803 }
6804
6805 static long
6806 unicode_hash(PyUnicodeObject *self)
6807 {
6808     /* Since Unicode objects compare equal to their ASCII string
6809        counterparts, they should use the individual character values
6810        as basis for their hash value.  This is needed to assure that
6811        strings and Unicode objects behave in the same way as
6812        dictionary keys. */
6813
6814     register Py_ssize_t len;
6815     register Py_UNICODE *p;
6816     register long x;
6817
6818     if (self->hash != -1)
6819         return self->hash;
6820     len = PyUnicode_GET_SIZE(self);
6821     p = PyUnicode_AS_UNICODE(self);
6822     x = *p << 7;
6823     while (--len >= 0)
6824         x = (1000003*x) ^ *p++;
6825     x ^= PyUnicode_GET_SIZE(self);
6826     if (x == -1)
6827         x = -2;
6828     self->hash = x;
6829     return x;
6830 }
6831
6832 PyDoc_STRVAR(index__doc__,
6833              "S.index(sub [,start [,end]]) -> int\n\
6834 \n\
6835 Like S.find() but raise ValueError when the substring is not found.");
6836
6837 static PyObject *
6838 unicode_index(PyUnicodeObject *self, PyObject *args)
6839 {
6840     Py_ssize_t result;
6841     PyObject *substring;
6842     Py_ssize_t start;
6843     Py_ssize_t end;
6844
6845     if (!_ParseTupleFinds(args, &substring, &start, &end))
6846         return NULL;
6847
6848     result = stringlib_find_slice(
6849         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6850         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6851         start, end
6852         );
6853
6854     Py_DECREF(substring);
6855
6856     if (result < 0) {
6857         PyErr_SetString(PyExc_ValueError, "substring not found");
6858         return NULL;
6859     }
6860
6861     return PyInt_FromSsize_t(result);
6862 }
6863
6864 PyDoc_STRVAR(islower__doc__,
6865              "S.islower() -> bool\n\
6866 \n\
6867 Return True if all cased characters in S are lowercase and there is\n\
6868 at least one cased character in S, False otherwise.");
6869
6870 static PyObject*
6871 unicode_islower(PyUnicodeObject *self)
6872 {
6873     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6874     register const Py_UNICODE *e;
6875     int cased;
6876
6877     /* Shortcut for single character strings */
6878     if (PyUnicode_GET_SIZE(self) == 1)
6879         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6880
6881     /* Special case for empty strings */
6882     if (PyUnicode_GET_SIZE(self) == 0)
6883         return PyBool_FromLong(0);
6884
6885     e = p + PyUnicode_GET_SIZE(self);
6886     cased = 0;
6887     for (; p < e; p++) {
6888         register const Py_UNICODE ch = *p;
6889
6890         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6891             return PyBool_FromLong(0);
6892         else if (!cased && Py_UNICODE_ISLOWER(ch))
6893             cased = 1;
6894     }
6895     return PyBool_FromLong(cased);
6896 }
6897
6898 PyDoc_STRVAR(isupper__doc__,
6899              "S.isupper() -> bool\n\
6900 \n\
6901 Return True if all cased characters in S are uppercase and there is\n\
6902 at least one cased character in S, False otherwise.");
6903
6904 static PyObject*
6905 unicode_isupper(PyUnicodeObject *self)
6906 {
6907     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6908     register const Py_UNICODE *e;
6909     int cased;
6910
6911     /* Shortcut for single character strings */
6912     if (PyUnicode_GET_SIZE(self) == 1)
6913         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6914
6915     /* Special case for empty strings */
6916     if (PyUnicode_GET_SIZE(self) == 0)
6917         return PyBool_FromLong(0);
6918
6919     e = p + PyUnicode_GET_SIZE(self);
6920     cased = 0;
6921     for (; p < e; p++) {
6922         register const Py_UNICODE ch = *p;
6923
6924         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6925             return PyBool_FromLong(0);
6926         else if (!cased && Py_UNICODE_ISUPPER(ch))
6927             cased = 1;
6928     }
6929     return PyBool_FromLong(cased);
6930 }
6931
6932 PyDoc_STRVAR(istitle__doc__,
6933              "S.istitle() -> bool\n\
6934 \n\
6935 Return True if S is a titlecased string and there is at least one\n\
6936 character in S, i.e. upper- and titlecase characters may only\n\
6937 follow uncased characters and lowercase characters only cased ones.\n\
6938 Return False otherwise.");
6939
6940 static PyObject*
6941 unicode_istitle(PyUnicodeObject *self)
6942 {
6943     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6944     register const Py_UNICODE *e;
6945     int cased, previous_is_cased;
6946
6947     /* Shortcut for single character strings */
6948     if (PyUnicode_GET_SIZE(self) == 1)
6949         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6950                                (Py_UNICODE_ISUPPER(*p) != 0));
6951
6952     /* Special case for empty strings */
6953     if (PyUnicode_GET_SIZE(self) == 0)
6954         return PyBool_FromLong(0);
6955
6956     e = p + PyUnicode_GET_SIZE(self);
6957     cased = 0;
6958     previous_is_cased = 0;
6959     for (; p < e; p++) {
6960         register const Py_UNICODE ch = *p;
6961
6962         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6963             if (previous_is_cased)
6964                 return PyBool_FromLong(0);
6965             previous_is_cased = 1;
6966             cased = 1;
6967         }
6968         else if (Py_UNICODE_ISLOWER(ch)) {
6969             if (!previous_is_cased)
6970                 return PyBool_FromLong(0);
6971             previous_is_cased = 1;
6972             cased = 1;
6973         }
6974         else
6975             previous_is_cased = 0;
6976     }
6977     return PyBool_FromLong(cased);
6978 }
6979
6980 PyDoc_STRVAR(isspace__doc__,
6981              "S.isspace() -> bool\n\
6982 \n\
6983 Return True if all characters in S are whitespace\n\
6984 and there is at least one character in S, False otherwise.");
6985
6986 static PyObject*
6987 unicode_isspace(PyUnicodeObject *self)
6988 {
6989     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6990     register const Py_UNICODE *e;
6991
6992     /* Shortcut for single character strings */
6993     if (PyUnicode_GET_SIZE(self) == 1 &&
6994         Py_UNICODE_ISSPACE(*p))
6995         return PyBool_FromLong(1);
6996
6997     /* Special case for empty strings */
6998     if (PyUnicode_GET_SIZE(self) == 0)
6999         return PyBool_FromLong(0);
7000
7001     e = p + PyUnicode_GET_SIZE(self);
7002     for (; p < e; p++) {
7003         if (!Py_UNICODE_ISSPACE(*p))
7004             return PyBool_FromLong(0);
7005     }
7006     return PyBool_FromLong(1);
7007 }
7008
7009 PyDoc_STRVAR(isalpha__doc__,
7010              "S.isalpha() -> bool\n\
7011 \n\
7012 Return True if all characters in S are alphabetic\n\
7013 and there is at least one character in S, False otherwise.");
7014
7015 static PyObject*
7016 unicode_isalpha(PyUnicodeObject *self)
7017 {
7018     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7019     register const Py_UNICODE *e;
7020
7021     /* Shortcut for single character strings */
7022     if (PyUnicode_GET_SIZE(self) == 1 &&
7023         Py_UNICODE_ISALPHA(*p))
7024         return PyBool_FromLong(1);
7025
7026     /* Special case for empty strings */
7027     if (PyUnicode_GET_SIZE(self) == 0)
7028         return PyBool_FromLong(0);
7029
7030     e = p + PyUnicode_GET_SIZE(self);
7031     for (; p < e; p++) {
7032         if (!Py_UNICODE_ISALPHA(*p))
7033             return PyBool_FromLong(0);
7034     }
7035     return PyBool_FromLong(1);
7036 }
7037
7038 PyDoc_STRVAR(isalnum__doc__,
7039              "S.isalnum() -> bool\n\
7040 \n\
7041 Return True if all characters in S are alphanumeric\n\
7042 and there is at least one character in S, False otherwise.");
7043
7044 static PyObject*
7045 unicode_isalnum(PyUnicodeObject *self)
7046 {
7047     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7048     register const Py_UNICODE *e;
7049
7050     /* Shortcut for single character strings */
7051     if (PyUnicode_GET_SIZE(self) == 1 &&
7052         Py_UNICODE_ISALNUM(*p))
7053         return PyBool_FromLong(1);
7054
7055     /* Special case for empty strings */
7056     if (PyUnicode_GET_SIZE(self) == 0)
7057         return PyBool_FromLong(0);
7058
7059     e = p + PyUnicode_GET_SIZE(self);
7060     for (; p < e; p++) {
7061         if (!Py_UNICODE_ISALNUM(*p))
7062             return PyBool_FromLong(0);
7063     }
7064     return PyBool_FromLong(1);
7065 }
7066
7067 PyDoc_STRVAR(isdecimal__doc__,
7068              "S.isdecimal() -> bool\n\
7069 \n\
7070 Return True if there are only decimal characters in S,\n\
7071 False otherwise.");
7072
7073 static PyObject*
7074 unicode_isdecimal(PyUnicodeObject *self)
7075 {
7076     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7077     register const Py_UNICODE *e;
7078
7079     /* Shortcut for single character strings */
7080     if (PyUnicode_GET_SIZE(self) == 1 &&
7081         Py_UNICODE_ISDECIMAL(*p))
7082         return PyBool_FromLong(1);
7083
7084     /* Special case for empty strings */
7085     if (PyUnicode_GET_SIZE(self) == 0)
7086         return PyBool_FromLong(0);
7087
7088     e = p + PyUnicode_GET_SIZE(self);
7089     for (; p < e; p++) {
7090         if (!Py_UNICODE_ISDECIMAL(*p))
7091             return PyBool_FromLong(0);
7092     }
7093     return PyBool_FromLong(1);
7094 }
7095
7096 PyDoc_STRVAR(isdigit__doc__,
7097              "S.isdigit() -> bool\n\
7098 \n\
7099 Return True if all characters in S are digits\n\
7100 and there is at least one character in S, False otherwise.");
7101
7102 static PyObject*
7103 unicode_isdigit(PyUnicodeObject *self)
7104 {
7105     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7106     register const Py_UNICODE *e;
7107
7108     /* Shortcut for single character strings */
7109     if (PyUnicode_GET_SIZE(self) == 1 &&
7110         Py_UNICODE_ISDIGIT(*p))
7111         return PyBool_FromLong(1);
7112
7113     /* Special case for empty strings */
7114     if (PyUnicode_GET_SIZE(self) == 0)
7115         return PyBool_FromLong(0);
7116
7117     e = p + PyUnicode_GET_SIZE(self);
7118     for (; p < e; p++) {
7119         if (!Py_UNICODE_ISDIGIT(*p))
7120             return PyBool_FromLong(0);
7121     }
7122     return PyBool_FromLong(1);
7123 }
7124
7125 PyDoc_STRVAR(isnumeric__doc__,
7126              "S.isnumeric() -> bool\n\
7127 \n\
7128 Return True if there are only numeric characters in S,\n\
7129 False otherwise.");
7130
7131 static PyObject*
7132 unicode_isnumeric(PyUnicodeObject *self)
7133 {
7134     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7135     register const Py_UNICODE *e;
7136
7137     /* Shortcut for single character strings */
7138     if (PyUnicode_GET_SIZE(self) == 1 &&
7139         Py_UNICODE_ISNUMERIC(*p))
7140         return PyBool_FromLong(1);
7141
7142     /* Special case for empty strings */
7143     if (PyUnicode_GET_SIZE(self) == 0)
7144         return PyBool_FromLong(0);
7145
7146     e = p + PyUnicode_GET_SIZE(self);
7147     for (; p < e; p++) {
7148         if (!Py_UNICODE_ISNUMERIC(*p))
7149             return PyBool_FromLong(0);
7150     }
7151     return PyBool_FromLong(1);
7152 }
7153
7154 PyDoc_STRVAR(join__doc__,
7155              "S.join(iterable) -> unicode\n\
7156 \n\
7157 Return a string which is the concatenation of the strings in the\n\
7158 iterable.  The separator between elements is S.");
7159
7160 static PyObject*
7161 unicode_join(PyObject *self, PyObject *data)
7162 {
7163     return PyUnicode_Join(self, data);
7164 }
7165
7166 static Py_ssize_t
7167 unicode_length(PyUnicodeObject *self)
7168 {
7169     return self->length;
7170 }
7171
7172 PyDoc_STRVAR(ljust__doc__,
7173              "S.ljust(width[, fillchar]) -> int\n\
7174 \n\
7175 Return S left-justified in a Unicode string of length width. Padding is\n\
7176 done using the specified fill character (default is a space).");
7177
7178 static PyObject *
7179 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7180 {
7181     Py_ssize_t width;
7182     Py_UNICODE fillchar = ' ';
7183
7184     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7185         return NULL;
7186
7187     if (self->length >= width && PyUnicode_CheckExact(self)) {
7188         Py_INCREF(self);
7189         return (PyObject*) self;
7190     }
7191
7192     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7193 }
7194
7195 PyDoc_STRVAR(lower__doc__,
7196              "S.lower() -> unicode\n\
7197 \n\
7198 Return a copy of the string S converted to lowercase.");
7199
7200 static PyObject*
7201 unicode_lower(PyUnicodeObject *self)
7202 {
7203     return fixup(self, fixlower);
7204 }
7205
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7214
7215 /* externally visible for str.strip(unicode) */
7216 PyObject *
7217 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7218 {
7219     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7220     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7221     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7222     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7223     Py_ssize_t i, j;
7224
7225     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7226
7227     i = 0;
7228     if (striptype != RIGHTSTRIP) {
7229         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7230             i++;
7231         }
7232     }
7233
7234     j = len;
7235     if (striptype != LEFTSTRIP) {
7236         do {
7237             j--;
7238         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7239         j++;
7240     }
7241
7242     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7243         Py_INCREF(self);
7244         return (PyObject*)self;
7245     }
7246     else
7247         return PyUnicode_FromUnicode(s+i, j-i);
7248 }
7249
7250
7251 static PyObject *
7252 do_strip(PyUnicodeObject *self, int striptype)
7253 {
7254     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7255     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7256
7257     i = 0;
7258     if (striptype != RIGHTSTRIP) {
7259         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7260             i++;
7261         }
7262     }
7263
7264     j = len;
7265     if (striptype != LEFTSTRIP) {
7266         do {
7267             j--;
7268         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7269         j++;
7270     }
7271
7272     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7273         Py_INCREF(self);
7274         return (PyObject*)self;
7275     }
7276     else
7277         return PyUnicode_FromUnicode(s+i, j-i);
7278 }
7279
7280
7281 static PyObject *
7282 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7283 {
7284     PyObject *sep = NULL;
7285
7286     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7287         return NULL;
7288
7289     if (sep != NULL && sep != Py_None) {
7290         if (PyUnicode_Check(sep))
7291             return _PyUnicode_XStrip(self, striptype, sep);
7292         else if (PyString_Check(sep)) {
7293             PyObject *res;
7294             sep = PyUnicode_FromObject(sep);
7295             if (sep==NULL)
7296                 return NULL;
7297             res = _PyUnicode_XStrip(self, striptype, sep);
7298             Py_DECREF(sep);
7299             return res;
7300         }
7301         else {
7302             PyErr_Format(PyExc_TypeError,
7303                          "%s arg must be None, unicode or str",
7304                          STRIPNAME(striptype));
7305             return NULL;
7306         }
7307     }
7308
7309     return do_strip(self, striptype);
7310 }
7311
7312
7313 PyDoc_STRVAR(strip__doc__,
7314              "S.strip([chars]) -> unicode\n\
7315 \n\
7316 Return a copy of the string S with leading and trailing\n\
7317 whitespace removed.\n\
7318 If chars is given and not None, remove characters in chars instead.\n\
7319 If chars is a str, it will be converted to unicode before stripping");
7320
7321 static PyObject *
7322 unicode_strip(PyUnicodeObject *self, PyObject *args)
7323 {
7324     if (PyTuple_GET_SIZE(args) == 0)
7325         return do_strip(self, BOTHSTRIP); /* Common case */
7326     else
7327         return do_argstrip(self, BOTHSTRIP, args);
7328 }
7329
7330
7331 PyDoc_STRVAR(lstrip__doc__,
7332              "S.lstrip([chars]) -> unicode\n\
7333 \n\
7334 Return a copy of the string S with leading whitespace removed.\n\
7335 If chars is given and not None, remove characters in chars instead.\n\
7336 If chars is a str, it will be converted to unicode before stripping");
7337
7338 static PyObject *
7339 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7340 {
7341     if (PyTuple_GET_SIZE(args) == 0)
7342         return do_strip(self, LEFTSTRIP); /* Common case */
7343     else
7344         return do_argstrip(self, LEFTSTRIP, args);
7345 }
7346
7347
7348 PyDoc_STRVAR(rstrip__doc__,
7349              "S.rstrip([chars]) -> unicode\n\
7350 \n\
7351 Return a copy of the string S with trailing whitespace removed.\n\
7352 If chars is given and not None, remove characters in chars instead.\n\
7353 If chars is a str, it will be converted to unicode before stripping");
7354
7355 static PyObject *
7356 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7357 {
7358     if (PyTuple_GET_SIZE(args) == 0)
7359         return do_strip(self, RIGHTSTRIP); /* Common case */
7360     else
7361         return do_argstrip(self, RIGHTSTRIP, args);
7362 }
7363
7364
7365 static PyObject*
7366 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7367 {
7368     PyUnicodeObject *u;
7369     Py_UNICODE *p;
7370     Py_ssize_t nchars;
7371     size_t nbytes;
7372
7373     if (len < 0)
7374         len = 0;
7375
7376     if (len == 1 && PyUnicode_CheckExact(str)) {
7377         /* no repeat, return original string */
7378         Py_INCREF(str);
7379         return (PyObject*) str;
7380     }
7381
7382     /* ensure # of chars needed doesn't overflow int and # of bytes
7383      * needed doesn't overflow size_t
7384      */
7385     nchars = len * str->length;
7386     if (len && nchars / len != str->length) {
7387         PyErr_SetString(PyExc_OverflowError,
7388                         "repeated string is too long");
7389         return NULL;
7390     }
7391     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7392     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7393         PyErr_SetString(PyExc_OverflowError,
7394                         "repeated string is too long");
7395         return NULL;
7396     }
7397     u = _PyUnicode_New(nchars);
7398     if (!u)
7399         return NULL;
7400
7401     p = u->str;
7402
7403     if (str->length == 1 && len > 0) {
7404         Py_UNICODE_FILL(p, str->str[0], len);
7405     } else {
7406         Py_ssize_t done = 0; /* number of characters copied this far */
7407         if (done < nchars) {
7408             Py_UNICODE_COPY(p, str->str, str->length);
7409             done = str->length;
7410         }
7411         while (done < nchars) {
7412             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7413             Py_UNICODE_COPY(p+done, p, n);
7414             done += n;
7415         }
7416     }
7417
7418     return (PyObject*) u;
7419 }
7420
7421 PyObject *PyUnicode_Replace(PyObject *obj,
7422                             PyObject *subobj,
7423                             PyObject *replobj,
7424                             Py_ssize_t maxcount)
7425 {
7426     PyObject *self;
7427     PyObject *str1;
7428     PyObject *str2;
7429     PyObject *result;
7430
7431     self = PyUnicode_FromObject(obj);
7432     if (self == NULL)
7433         return NULL;
7434     str1 = PyUnicode_FromObject(subobj);
7435     if (str1 == NULL) {
7436         Py_DECREF(self);
7437         return NULL;
7438     }
7439     str2 = PyUnicode_FromObject(replobj);
7440     if (str2 == NULL) {
7441         Py_DECREF(self);
7442         Py_DECREF(str1);
7443         return NULL;
7444     }
7445     result = replace((PyUnicodeObject *)self,
7446                      (PyUnicodeObject *)str1,
7447                      (PyUnicodeObject *)str2,
7448                      maxcount);
7449     Py_DECREF(self);
7450     Py_DECREF(str1);
7451     Py_DECREF(str2);
7452     return result;
7453 }
7454
7455 PyDoc_STRVAR(replace__doc__,
7456              "S.replace (old, new[, count]) -> unicode\n\
7457 \n\
7458 Return a copy of S with all occurrences of substring\n\
7459 old replaced by new.  If the optional argument count is\n\
7460 given, only the first count occurrences are replaced.");
7461
7462 static PyObject*
7463 unicode_replace(PyUnicodeObject *self, PyObject *args)
7464 {
7465     PyUnicodeObject *str1;
7466     PyUnicodeObject *str2;
7467     Py_ssize_t maxcount = -1;
7468     PyObject *result;
7469
7470     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7471         return NULL;
7472     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7473     if (str1 == NULL)
7474         return NULL;
7475     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7476     if (str2 == NULL) {
7477         Py_DECREF(str1);
7478         return NULL;
7479     }
7480
7481     result = replace(self, str1, str2, maxcount);
7482
7483     Py_DECREF(str1);
7484     Py_DECREF(str2);
7485     return result;
7486 }
7487
7488 static
7489 PyObject *unicode_repr(PyObject *unicode)
7490 {
7491     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7492                                 PyUnicode_GET_SIZE(unicode),
7493                                 1);
7494 }
7495
7496 PyDoc_STRVAR(rfind__doc__,
7497              "S.rfind(sub [,start [,end]]) -> int\n\
7498 \n\
7499 Return the highest index in S where substring sub is found,\n\
7500 such that sub is contained within s[start:end].  Optional\n\
7501 arguments start and end are interpreted as in slice notation.\n\
7502 \n\
7503 Return -1 on failure.");
7504
7505 static PyObject *
7506 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7507 {
7508     PyObject *substring;
7509     Py_ssize_t start;
7510     Py_ssize_t end;
7511     Py_ssize_t result;
7512
7513     if (!_ParseTupleFinds(args, &substring, &start, &end))
7514         return NULL;
7515
7516     result = stringlib_rfind_slice(
7517         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7518         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7519         start, end
7520         );
7521
7522     Py_DECREF(substring);
7523
7524     return PyInt_FromSsize_t(result);
7525 }
7526
7527 PyDoc_STRVAR(rindex__doc__,
7528              "S.rindex(sub [,start [,end]]) -> int\n\
7529 \n\
7530 Like S.rfind() but raise ValueError when the substring is not found.");
7531
7532 static PyObject *
7533 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7534 {
7535     PyObject *substring;
7536     Py_ssize_t start;
7537     Py_ssize_t end;
7538     Py_ssize_t result;
7539
7540     if (!_ParseTupleFinds(args, &substring, &start, &end))
7541         return NULL;
7542
7543     result = stringlib_rfind_slice(
7544         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7545         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7546         start, end
7547         );
7548
7549     Py_DECREF(substring);
7550
7551     if (result < 0) {
7552         PyErr_SetString(PyExc_ValueError, "substring not found");
7553         return NULL;
7554     }
7555     return PyInt_FromSsize_t(result);
7556 }
7557
7558 PyDoc_STRVAR(rjust__doc__,
7559              "S.rjust(width[, fillchar]) -> unicode\n\
7560 \n\
7561 Return S right-justified in a Unicode string of length width. Padding is\n\
7562 done using the specified fill character (default is a space).");
7563
7564 static PyObject *
7565 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7566 {
7567     Py_ssize_t width;
7568     Py_UNICODE fillchar = ' ';
7569
7570     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7571         return NULL;
7572
7573     if (self->length >= width && PyUnicode_CheckExact(self)) {
7574         Py_INCREF(self);
7575         return (PyObject*) self;
7576     }
7577
7578     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7579 }
7580
7581 static PyObject*
7582 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7583 {
7584     /* standard clamping */
7585     if (start < 0)
7586         start = 0;
7587     if (end < 0)
7588         end = 0;
7589     if (end > self->length)
7590         end = self->length;
7591     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7592         /* full slice, return original string */
7593         Py_INCREF(self);
7594         return (PyObject*) self;
7595     }
7596     if (start > end)
7597         start = end;
7598     /* copy slice */
7599     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7600                                              end - start);
7601 }
7602
7603 PyObject *PyUnicode_Split(PyObject *s,
7604                           PyObject *sep,
7605                           Py_ssize_t maxsplit)
7606 {
7607     PyObject *result;
7608
7609     s = PyUnicode_FromObject(s);
7610     if (s == NULL)
7611         return NULL;
7612     if (sep != NULL) {
7613         sep = PyUnicode_FromObject(sep);
7614         if (sep == NULL) {
7615             Py_DECREF(s);
7616             return NULL;
7617         }
7618     }
7619
7620     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7621
7622     Py_DECREF(s);
7623     Py_XDECREF(sep);
7624     return result;
7625 }
7626
7627 PyDoc_STRVAR(split__doc__,
7628              "S.split([sep [,maxsplit]]) -> list of strings\n\
7629 \n\
7630 Return a list of the words in S, using sep as the\n\
7631 delimiter string.  If maxsplit is given, at most maxsplit\n\
7632 splits are done. If sep is not specified or is None, any\n\
7633 whitespace string is a separator and empty strings are\n\
7634 removed from the result.");
7635
7636 static PyObject*
7637 unicode_split(PyUnicodeObject *self, PyObject *args)
7638 {
7639     PyObject *substring = Py_None;
7640     Py_ssize_t maxcount = -1;
7641
7642     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7643         return NULL;
7644
7645     if (substring == Py_None)
7646         return split(self, NULL, maxcount);
7647     else if (PyUnicode_Check(substring))
7648         return split(self, (PyUnicodeObject *)substring, maxcount);
7649     else
7650         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7651 }
7652
7653 PyObject *
7654 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7655 {
7656     PyObject* str_obj;
7657     PyObject* sep_obj;
7658     PyObject* out;
7659
7660     str_obj = PyUnicode_FromObject(str_in);
7661     if (!str_obj)
7662         return NULL;
7663     sep_obj = PyUnicode_FromObject(sep_in);
7664     if (!sep_obj) {
7665         Py_DECREF(str_obj);
7666         return NULL;
7667     }
7668
7669     out = stringlib_partition(
7670         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7671         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7672         );
7673
7674     Py_DECREF(sep_obj);
7675     Py_DECREF(str_obj);
7676
7677     return out;
7678 }
7679
7680
7681 PyObject *
7682 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7683 {
7684     PyObject* str_obj;
7685     PyObject* sep_obj;
7686     PyObject* out;
7687
7688     str_obj = PyUnicode_FromObject(str_in);
7689     if (!str_obj)
7690         return NULL;
7691     sep_obj = PyUnicode_FromObject(sep_in);
7692     if (!sep_obj) {
7693         Py_DECREF(str_obj);
7694         return NULL;
7695     }
7696
7697     out = stringlib_rpartition(
7698         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7699         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7700         );
7701
7702     Py_DECREF(sep_obj);
7703     Py_DECREF(str_obj);
7704
7705     return out;
7706 }
7707
7708 PyDoc_STRVAR(partition__doc__,
7709              "S.partition(sep) -> (head, sep, tail)\n\
7710 \n\
7711 Search for the separator sep in S, and return the part before it,\n\
7712 the separator itself, and the part after it.  If the separator is not\n\
7713 found, return S and two empty strings.");
7714
7715 static PyObject*
7716 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7717 {
7718     return PyUnicode_Partition((PyObject *)self, separator);
7719 }
7720
7721 PyDoc_STRVAR(rpartition__doc__,
7722              "S.rpartition(sep) -> (tail, sep, head)\n\
7723 \n\
7724 Search for the separator sep in S, starting at the end of S, and return\n\
7725 the part before it, the separator itself, and the part after it.  If the\n\
7726 separator is not found, return two empty strings and S.");
7727
7728 static PyObject*
7729 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7730 {
7731     return PyUnicode_RPartition((PyObject *)self, separator);
7732 }
7733
7734 PyObject *PyUnicode_RSplit(PyObject *s,
7735                            PyObject *sep,
7736                            Py_ssize_t maxsplit)
7737 {
7738     PyObject *result;
7739
7740     s = PyUnicode_FromObject(s);
7741     if (s == NULL)
7742         return NULL;
7743     if (sep != NULL) {
7744         sep = PyUnicode_FromObject(sep);
7745         if (sep == NULL) {
7746             Py_DECREF(s);
7747             return NULL;
7748         }
7749     }
7750
7751     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7752
7753     Py_DECREF(s);
7754     Py_XDECREF(sep);
7755     return result;
7756 }
7757
7758 PyDoc_STRVAR(rsplit__doc__,
7759              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7760 \n\
7761 Return a list of the words in S, using sep as the\n\
7762 delimiter string, starting at the end of the string and\n\
7763 working to the front.  If maxsplit is given, at most maxsplit\n\
7764 splits are done. If sep is not specified, any whitespace string\n\
7765 is a separator.");
7766
7767 static PyObject*
7768 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7769 {
7770     PyObject *substring = Py_None;
7771     Py_ssize_t maxcount = -1;
7772
7773     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7774         return NULL;
7775
7776     if (substring == Py_None)
7777         return rsplit(self, NULL, maxcount);
7778     else if (PyUnicode_Check(substring))
7779         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7780     else
7781         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7782 }
7783
7784 PyDoc_STRVAR(splitlines__doc__,
7785              "S.splitlines([keepends]) -> list of strings\n\
7786 \n\
7787 Return a list of the lines in S, breaking at line boundaries.\n\
7788 Line breaks are not included in the resulting list unless keepends\n\
7789 is given and true.");
7790
7791 static PyObject*
7792 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7793 {
7794     int keepends = 0;
7795
7796     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7797         return NULL;
7798
7799     return PyUnicode_Splitlines((PyObject *)self, keepends);
7800 }
7801
7802 static
7803 PyObject *unicode_str(PyUnicodeObject *self)
7804 {
7805     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7806 }
7807
7808 PyDoc_STRVAR(swapcase__doc__,
7809              "S.swapcase() -> unicode\n\
7810 \n\
7811 Return a copy of S with uppercase characters converted to lowercase\n\
7812 and vice versa.");
7813
7814 static PyObject*
7815 unicode_swapcase(PyUnicodeObject *self)
7816 {
7817     return fixup(self, fixswapcase);
7818 }
7819
7820 PyDoc_STRVAR(translate__doc__,
7821              "S.translate(table) -> unicode\n\
7822 \n\
7823 Return a copy of the string S, where all characters have been mapped\n\
7824 through the given translation table, which must be a mapping of\n\
7825 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7826 Unmapped characters are left untouched. Characters mapped to None\n\
7827 are deleted.");
7828
7829 static PyObject*
7830 unicode_translate(PyUnicodeObject *self, PyObject *table)
7831 {
7832     return PyUnicode_TranslateCharmap(self->str,
7833                                       self->length,
7834                                       table,
7835                                       "ignore");
7836 }
7837
7838 PyDoc_STRVAR(upper__doc__,
7839              "S.upper() -> unicode\n\
7840 \n\
7841 Return a copy of S converted to uppercase.");
7842
7843 static PyObject*
7844 unicode_upper(PyUnicodeObject *self)
7845 {
7846     return fixup(self, fixupper);
7847 }
7848
7849 PyDoc_STRVAR(zfill__doc__,
7850              "S.zfill(width) -> unicode\n\
7851 \n\
7852 Pad a numeric string S with zeros on the left, to fill a field\n\
7853 of the specified width. The string S is never truncated.");
7854
7855 static PyObject *
7856 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7857 {
7858     Py_ssize_t fill;
7859     PyUnicodeObject *u;
7860
7861     Py_ssize_t width;
7862     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7863         return NULL;
7864
7865     if (self->length >= width) {
7866         if (PyUnicode_CheckExact(self)) {
7867             Py_INCREF(self);
7868             return (PyObject*) self;
7869         }
7870         else
7871             return PyUnicode_FromUnicode(
7872                 PyUnicode_AS_UNICODE(self),
7873                 PyUnicode_GET_SIZE(self)
7874                 );
7875     }
7876
7877     fill = width - self->length;
7878
7879     u = pad(self, fill, 0, '0');
7880
7881     if (u == NULL)
7882         return NULL;
7883
7884     if (u->str[fill] == '+' || u->str[fill] == '-') {
7885         /* move sign to beginning of string */
7886         u->str[0] = u->str[fill];
7887         u->str[fill] = '0';
7888     }
7889
7890     return (PyObject*) u;
7891 }
7892
#if 0
/* Compiled out.  Returns the value of numfree as an int object; its
   (also disabled) method-table entry describes it as being used for
   debugging the implementation. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7900
7901 PyDoc_STRVAR(startswith__doc__,
7902              "S.startswith(prefix[, start[, end]]) -> bool\n\
7903 \n\
7904 Return True if S starts with the specified prefix, False otherwise.\n\
7905 With optional start, test S beginning at that position.\n\
7906 With optional end, stop comparing S at that position.\n\
7907 prefix can also be a tuple of strings to try.");
7908
7909 static PyObject *
7910 unicode_startswith(PyUnicodeObject *self,
7911                    PyObject *args)
7912 {
7913     PyObject *subobj;
7914     PyUnicodeObject *substring;
7915     Py_ssize_t start = 0;
7916     Py_ssize_t end = PY_SSIZE_T_MAX;
7917     int result;
7918
7919     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7920                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7921         return NULL;
7922     if (PyTuple_Check(subobj)) {
7923         Py_ssize_t i;
7924         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7925             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7926                 PyTuple_GET_ITEM(subobj, i));
7927             if (substring == NULL)
7928                 return NULL;
7929             result = tailmatch(self, substring, start, end, -1);
7930             Py_DECREF(substring);
7931             if (result) {
7932                 Py_RETURN_TRUE;
7933             }
7934         }
7935         /* nothing matched */
7936         Py_RETURN_FALSE;
7937     }
7938     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7939     if (substring == NULL)
7940         return NULL;
7941     result = tailmatch(self, substring, start, end, -1);
7942     Py_DECREF(substring);
7943     return PyBool_FromLong(result);
7944 }
7945
7946
7947 PyDoc_STRVAR(endswith__doc__,
7948              "S.endswith(suffix[, start[, end]]) -> bool\n\
7949 \n\
7950 Return True if S ends with the specified suffix, False otherwise.\n\
7951 With optional start, test S beginning at that position.\n\
7952 With optional end, stop comparing S at that position.\n\
7953 suffix can also be a tuple of strings to try.");
7954
7955 static PyObject *
7956 unicode_endswith(PyUnicodeObject *self,
7957                  PyObject *args)
7958 {
7959     PyObject *subobj;
7960     PyUnicodeObject *substring;
7961     Py_ssize_t start = 0;
7962     Py_ssize_t end = PY_SSIZE_T_MAX;
7963     int result;
7964
7965     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7966                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7967         return NULL;
7968     if (PyTuple_Check(subobj)) {
7969         Py_ssize_t i;
7970         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7971             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7972                 PyTuple_GET_ITEM(subobj, i));
7973             if (substring == NULL)
7974                 return NULL;
7975             result = tailmatch(self, substring, start, end, +1);
7976             Py_DECREF(substring);
7977             if (result) {
7978                 Py_RETURN_TRUE;
7979             }
7980         }
7981         Py_RETURN_FALSE;
7982     }
7983     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7984     if (substring == NULL)
7985         return NULL;
7986
7987     result = tailmatch(self, substring, start, end, +1);
7988     Py_DECREF(substring);
7989     return PyBool_FromLong(result);
7990 }
7991
7992
7993 /* Implements do_string_format, which is unicode because of stringlib */
7994 #include "stringlib/string_format.h"
7995
7996 PyDoc_STRVAR(format__doc__,
7997              "S.format(*args, **kwargs) -> unicode\n\
7998 \n\
7999 ");
8000
8001 static PyObject *
8002 unicode__format__(PyObject *self, PyObject *args)
8003 {
8004     PyObject *format_spec;
8005     PyObject *result = NULL;
8006     PyObject *tmp = NULL;
8007
8008     /* If 2.x, convert format_spec to the same type as value */
8009     /* This is to allow things like u''.format('') */
8010     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
8011         goto done;
8012     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
8013         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
8014                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
8015         goto done;
8016     }
8017     tmp = PyObject_Unicode(format_spec);
8018     if (tmp == NULL)
8019         goto done;
8020     format_spec = tmp;
8021
8022     result = _PyUnicode_FormatAdvanced(self,
8023                                        PyUnicode_AS_UNICODE(format_spec),
8024                                        PyUnicode_GET_SIZE(format_spec));
8025   done:
8026     Py_XDECREF(tmp);
8027     return result;
8028 }
8029
8030 PyDoc_STRVAR(p_format__doc__,
8031              "S.__format__(format_spec) -> unicode\n\
8032 \n\
8033 ");
8034
8035 static PyObject *
8036 unicode__sizeof__(PyUnicodeObject *v)
8037 {
8038     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
8039                              sizeof(Py_UNICODE) * (v->length + 1));
8040 }
8041
8042 PyDoc_STRVAR(sizeof__doc__,
8043              "S.__sizeof__() -> size of S in memory, in bytes\n\
8044 \n\
8045 ");
8046
8047 static PyObject *
8048 unicode_getnewargs(PyUnicodeObject *v)
8049 {
8050     return Py_BuildValue("(u#)", v->str, v->length);
8051 }
8052
8053
/* Method table for unicode objects.  Each entry gives the Python-level
   method name, the C function implementing it, the calling-convention
   flags that function expects, and (where present) its docstring.  The
   table ends with a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* entries are registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8116
8117 static PyObject *
8118 unicode_mod(PyObject *v, PyObject *w)
8119 {
8120     if (!PyUnicode_Check(v)) {
8121         Py_INCREF(Py_NotImplemented);
8122         return Py_NotImplemented;
8123     }
8124     return PyUnicode_Format(v, w);
8125 }
8126
/* Number protocol slots for unicode.  Only nb_remainder is filled in
   (with unicode_mod); the slots before it are 0 and the remaining ones
   are left to the default zero-initialization of the aggregate. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
8134
/* Sequence protocol slots for unicode.  The two assignment slots are 0,
   so item and slice assignment are not provided through this table. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
8145
8146 static PyObject*
8147 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8148 {
8149     if (PyIndex_Check(item)) {
8150         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8151         if (i == -1 && PyErr_Occurred())
8152             return NULL;
8153         if (i < 0)
8154             i += PyUnicode_GET_SIZE(self);
8155         return unicode_getitem(self, i);
8156     } else if (PySlice_Check(item)) {
8157         Py_ssize_t start, stop, step, slicelength, cur, i;
8158         Py_UNICODE* source_buf;
8159         Py_UNICODE* result_buf;
8160         PyObject* result;
8161
8162         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8163                                  &start, &stop, &step, &slicelength) < 0) {
8164             return NULL;
8165         }
8166
8167         if (slicelength <= 0) {
8168             return PyUnicode_FromUnicode(NULL, 0);
8169         } else if (start == 0 && step == 1 && slicelength == self->length &&
8170                    PyUnicode_CheckExact(self)) {
8171             Py_INCREF(self);
8172             return (PyObject *)self;
8173         } else if (step == 1) {
8174             return PyUnicode_FromUnicode(self->str + start, slicelength);
8175         } else {
8176             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8177             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8178                                                        sizeof(Py_UNICODE));
8179
8180             if (result_buf == NULL)
8181                 return PyErr_NoMemory();
8182
8183             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8184                 result_buf[i] = source_buf[cur];
8185             }
8186
8187             result = PyUnicode_FromUnicode(result_buf, slicelength);
8188             PyObject_FREE(result_buf);
8189             return result;
8190         }
8191     } else {
8192         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8193         return NULL;
8194     }
8195 }
8196
/* Mapping protocol slots for unicode: length and subscript are
   provided; mp_ass_subscript is 0, so subscript assignment is not. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8202
8203 static Py_ssize_t
8204 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8205                           Py_ssize_t index,
8206                           const void **ptr)
8207 {
8208     if (index != 0) {
8209         PyErr_SetString(PyExc_SystemError,
8210                         "accessing non-existent unicode segment");
8211         return -1;
8212     }
8213     *ptr = (void *) self->str;
8214     return PyUnicode_GET_DATA_SIZE(self);
8215 }
8216
8217 static Py_ssize_t
8218 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8219                            const void **ptr)
8220 {
8221     PyErr_SetString(PyExc_TypeError,
8222                     "cannot use unicode as modifiable buffer");
8223     return -1;
8224 }
8225
8226 static int
8227 unicode_buffer_getsegcount(PyUnicodeObject *self,
8228                            Py_ssize_t *lenp)
8229 {
8230     if (lenp)
8231         *lenp = PyUnicode_GET_DATA_SIZE(self);
8232     return 1;
8233 }
8234
8235 static Py_ssize_t
8236 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8237                           Py_ssize_t index,
8238                           const void **ptr)
8239 {
8240     PyObject *str;
8241
8242     if (index != 0) {
8243         PyErr_SetString(PyExc_SystemError,
8244                         "accessing non-existent unicode segment");
8245         return -1;
8246     }
8247     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8248     if (str == NULL)
8249         return -1;
8250     *ptr = (void *) PyString_AS_STRING(str);
8251     return PyString_GET_SIZE(str);
8252 }
8253
8254 /* Helpers for PyUnicode_Format() */
8255
8256 static PyObject *
8257 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8258 {
8259     Py_ssize_t argidx = *p_argidx;
8260     if (argidx < arglen) {
8261         (*p_argidx)++;
8262         if (arglen < 0)
8263             return args;
8264         else
8265             return PyTuple_GetItem(args, argidx);
8266     }
8267     PyErr_SetString(PyExc_TypeError,
8268                     "not enough arguments for format string");
8269     return NULL;
8270 }
8271
/* Bit flags collected while parsing a conversion specification. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
8277
8278 static Py_ssize_t
8279 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8280 {
8281     register Py_ssize_t i;
8282     Py_ssize_t len = strlen(charbuffer);
8283     for (i = len - 1; i >= 0; i--)
8284         buffer[i] = (Py_UNICODE) charbuffer[i];
8285
8286     return len;
8287 }
8288
8289 static int
8290 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8291 {
8292     Py_ssize_t result;
8293
8294     PyOS_snprintf((char *)buffer, len, format, x);
8295     result = strtounicode(buffer, (char *)buffer);
8296     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8297 }
8298
8299 /* XXX To save some code duplication, formatfloat/long/int could have been
8300    shared with stringobject.c, converting from 8-bit to Unicode after the
8301    formatting is done. */
8302
8303 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8304
8305 static PyObject *
8306 formatfloat(PyObject *v, int flags, int prec, int type)
8307 {
8308     char *p;
8309     PyObject *result;
8310     double x;
8311
8312     x = PyFloat_AsDouble(v);
8313     if (x == -1.0 && PyErr_Occurred())
8314         return NULL;
8315
8316     if (prec < 0)
8317         prec = 6;
8318
8319     p = PyOS_double_to_string(x, type, prec,
8320                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8321     if (p == NULL)
8322         return NULL;
8323     result = PyUnicode_FromStringAndSize(p, strlen(p));
8324     PyMem_Free(p);
8325     return result;
8326 }
8327
8328 static PyObject*
8329 formatlong(PyObject *val, int flags, int prec, int type)
8330 {
8331     char *buf;
8332     int i, len;
8333     PyObject *str; /* temporary string object. */
8334     PyUnicodeObject *result;
8335
8336     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8337     if (!str)
8338         return NULL;
8339     result = _PyUnicode_New(len);
8340     if (!result) {
8341         Py_DECREF(str);
8342         return NULL;
8343     }
8344     for (i = 0; i < len; i++)
8345         result->str[i] = buf[i];
8346     result->str[len] = 0;
8347     Py_DECREF(str);
8348     return (PyObject*)result;
8349 }
8350
8351 static int
8352 formatint(Py_UNICODE *buf,
8353           size_t buflen,
8354           int flags,
8355           int prec,
8356           int type,
8357           PyObject *v)
8358 {
8359     /* fmt = '%#.' + `prec` + 'l' + `type`
8360      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8361      *                     + 1 + 1
8362      *                   = 24
8363      */
8364     char fmt[64]; /* plenty big enough! */
8365     char *sign;
8366     long x;
8367
8368     x = PyInt_AsLong(v);
8369     if (x == -1 && PyErr_Occurred())
8370         return -1;
8371     if (x < 0 && type == 'u') {
8372         type = 'd';
8373     }
8374     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8375         sign = "-";
8376     else
8377         sign = "";
8378     if (prec < 0)
8379         prec = 1;
8380
8381     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8382      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8383      */
8384     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8385         PyErr_SetString(PyExc_OverflowError,
8386                         "formatted integer is too long (precision too large?)");
8387         return -1;
8388     }
8389
8390     if ((flags & F_ALT) &&
8391         (type == 'x' || type == 'X')) {
8392         /* When converting under %#x or %#X, there are a number
8393          * of issues that cause pain:
8394          * - when 0 is being converted, the C standard leaves off
8395          *   the '0x' or '0X', which is inconsistent with other
8396          *   %#x/%#X conversions and inconsistent with Python's
8397          *   hex() function
8398          * - there are platforms that violate the standard and
8399          *   convert 0 with the '0x' or '0X'
8400          *   (Metrowerks, Compaq Tru64)
8401          * - there are platforms that give '0x' when converting
8402          *   under %#X, but convert 0 in accordance with the
8403          *   standard (OS/2 EMX)
8404          *
8405          * We can achieve the desired consistency by inserting our
8406          * own '0x' or '0X' prefix, and substituting %x/%X in place
8407          * of %#x/%#X.
8408          *
8409          * Note that this is the same approach as used in
8410          * formatint() in stringobject.c
8411          */
8412         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8413                       sign, type, prec, type);
8414     }
8415     else {
8416         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8417                       sign, (flags&F_ALT) ? "#" : "",
8418                       prec, type);
8419     }
8420     if (sign[0])
8421         return longtounicode(buf, buflen, fmt, -x);
8422     else
8423         return longtounicode(buf, buflen, fmt, x);
8424 }
8425
8426 static int
8427 formatchar(Py_UNICODE *buf,
8428            size_t buflen,
8429            PyObject *v)
8430 {
8431     /* presume that the buffer is at least 2 characters long */
8432     if (PyUnicode_Check(v)) {
8433         if (PyUnicode_GET_SIZE(v) != 1)
8434             goto onError;
8435         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8436     }
8437
8438     else if (PyString_Check(v)) {
8439         if (PyString_GET_SIZE(v) != 1)
8440             goto onError;
8441         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8442     }
8443
8444     else {
8445         /* Integer input truncated to a character */
8446         long x;
8447         x = PyInt_AsLong(v);
8448         if (x == -1 && PyErr_Occurred())
8449             goto onError;
8450 #ifdef Py_UNICODE_WIDE
8451         if (x < 0 || x > 0x10ffff) {
8452             PyErr_SetString(PyExc_OverflowError,
8453                             "%c arg not in range(0x110000) "
8454                             "(wide Python build)");
8455             return -1;
8456         }
8457 #else
8458         if (x < 0 || x > 0xffff) {
8459             PyErr_SetString(PyExc_OverflowError,
8460                             "%c arg not in range(0x10000) "
8461                             "(narrow Python build)");
8462             return -1;
8463         }
8464 #endif
8465         buf[0] = (Py_UNICODE) x;
8466     }
8467     buf[1] = '\0';
8468     return 1;
8469
8470   onError:
8471     PyErr_SetString(PyExc_TypeError,
8472                     "%c requires int or char");
8473     return -1;
8474 }
8475
8476 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8477
8478    FORMATBUFLEN is the length of the buffer in which the ints &
8479    chars are formatted. XXX This is a magic number. Each formatting
8480    routine does bounds checking to ensure no overflow, but a better
8481    solution may be to malloc a buffer of appropriate size for each
8482    format. For now, the current solution is sufficient.
8483 */
8484 #define FORMATBUFLEN (size_t)120
8485
8486 PyObject *PyUnicode_Format(PyObject *format,
8487                            PyObject *args)
8488 {
8489     Py_UNICODE *fmt, *res;
8490     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8491     int args_owned = 0;
8492     PyUnicodeObject *result = NULL;
8493     PyObject *dict = NULL;
8494     PyObject *uformat;
8495
8496     if (format == NULL || args == NULL) {
8497         PyErr_BadInternalCall();
8498         return NULL;
8499     }
8500     uformat = PyUnicode_FromObject(format);
8501     if (uformat == NULL)
8502         return NULL;
8503     fmt = PyUnicode_AS_UNICODE(uformat);
8504     fmtcnt = PyUnicode_GET_SIZE(uformat);
8505
8506     reslen = rescnt = fmtcnt + 100;
8507     result = _PyUnicode_New(reslen);
8508     if (result == NULL)
8509         goto onError;
8510     res = PyUnicode_AS_UNICODE(result);
8511
8512     if (PyTuple_Check(args)) {
8513         arglen = PyTuple_Size(args);
8514         argidx = 0;
8515     }
8516     else {
8517         arglen = -1;
8518         argidx = -2;
8519     }
8520     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8521         !PyObject_TypeCheck(args, &PyBaseString_Type))
8522         dict = args;
8523
8524     while (--fmtcnt >= 0) {
8525         if (*fmt != '%') {
8526             if (--rescnt < 0) {
8527                 rescnt = fmtcnt + 100;
8528                 reslen += rescnt;
8529                 if (_PyUnicode_Resize(&result, reslen) < 0)
8530                     goto onError;
8531                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8532                 --rescnt;
8533             }
8534             *res++ = *fmt++;
8535         }
8536         else {
8537             /* Got a format specifier */
8538             int flags = 0;
8539             Py_ssize_t width = -1;
8540             int prec = -1;
8541             Py_UNICODE c = '\0';
8542             Py_UNICODE fill;
8543             int isnumok;
8544             PyObject *v = NULL;
8545             PyObject *temp = NULL;
8546             Py_UNICODE *pbuf;
8547             Py_UNICODE sign;
8548             Py_ssize_t len;
8549             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8550
8551             fmt++;
8552             if (*fmt == '(') {
8553                 Py_UNICODE *keystart;
8554                 Py_ssize_t keylen;
8555                 PyObject *key;
8556                 int pcount = 1;
8557
8558                 if (dict == NULL) {
8559                     PyErr_SetString(PyExc_TypeError,
8560                                     "format requires a mapping");
8561                     goto onError;
8562                 }
8563                 ++fmt;
8564                 --fmtcnt;
8565                 keystart = fmt;
8566                 /* Skip over balanced parentheses */
8567                 while (pcount > 0 && --fmtcnt >= 0) {
8568                     if (*fmt == ')')
8569                         --pcount;
8570                     else if (*fmt == '(')
8571                         ++pcount;
8572                     fmt++;
8573                 }
8574                 keylen = fmt - keystart - 1;
8575                 if (fmtcnt < 0 || pcount > 0) {
8576                     PyErr_SetString(PyExc_ValueError,
8577                                     "incomplete format key");
8578                     goto onError;
8579                 }
8580 #if 0
8581                 /* keys are converted to strings using UTF-8 and
8582                    then looked up since Python uses strings to hold
8583                    variables names etc. in its namespaces and we
8584                    wouldn't want to break common idioms. */
8585                 key = PyUnicode_EncodeUTF8(keystart,
8586                                            keylen,
8587                                            NULL);
8588 #else
8589                 key = PyUnicode_FromUnicode(keystart, keylen);
8590 #endif
8591                 if (key == NULL)
8592                     goto onError;
8593                 if (args_owned) {
8594                     Py_DECREF(args);
8595                     args_owned = 0;
8596                 }
8597                 args = PyObject_GetItem(dict, key);
8598                 Py_DECREF(key);
8599                 if (args == NULL) {
8600                     goto onError;
8601                 }
8602                 args_owned = 1;
8603                 arglen = -1;
8604                 argidx = -2;
8605             }
8606             while (--fmtcnt >= 0) {
8607                 switch (c = *fmt++) {
8608                 case '-': flags |= F_LJUST; continue;
8609                 case '+': flags |= F_SIGN; continue;
8610                 case ' ': flags |= F_BLANK; continue;
8611                 case '#': flags |= F_ALT; continue;
8612                 case '0': flags |= F_ZERO; continue;
8613                 }
8614                 break;
8615             }
8616             if (c == '*') {
8617                 v = getnextarg(args, arglen, &argidx);
8618                 if (v == NULL)
8619                     goto onError;
8620                 if (!PyInt_Check(v)) {
8621                     PyErr_SetString(PyExc_TypeError,
8622                                     "* wants int");
8623                     goto onError;
8624                 }
8625                 width = PyInt_AsLong(v);
8626                 if (width < 0) {
8627                     flags |= F_LJUST;
8628                     width = -width;
8629                 }
8630                 if (--fmtcnt >= 0)
8631                     c = *fmt++;
8632             }
8633             else if (c >= '0' && c <= '9') {
8634                 width = c - '0';
8635                 while (--fmtcnt >= 0) {
8636                     c = *fmt++;
8637                     if (c < '0' || c > '9')
8638                         break;
8639                     if ((width*10) / 10 != width) {
8640                         PyErr_SetString(PyExc_ValueError,
8641                                         "width too big");
8642                         goto onError;
8643                     }
8644                     width = width*10 + (c - '0');
8645                 }
8646             }
8647             if (c == '.') {
8648                 prec = 0;
8649                 if (--fmtcnt >= 0)
8650                     c = *fmt++;
8651                 if (c == '*') {
8652                     v = getnextarg(args, arglen, &argidx);
8653                     if (v == NULL)
8654                         goto onError;
8655                     if (!PyInt_Check(v)) {
8656                         PyErr_SetString(PyExc_TypeError,
8657                                         "* wants int");
8658                         goto onError;
8659                     }
8660                     prec = PyInt_AsLong(v);
8661                     if (prec < 0)
8662                         prec = 0;
8663                     if (--fmtcnt >= 0)
8664                         c = *fmt++;
8665                 }
8666                 else if (c >= '0' && c <= '9') {
8667                     prec = c - '0';
8668                     while (--fmtcnt >= 0) {
8669                         c = Py_CHARMASK(*fmt++);
8670                         if (c < '0' || c > '9')
8671                             break;
8672                         if ((prec*10) / 10 != prec) {
8673                             PyErr_SetString(PyExc_ValueError,
8674                                             "prec too big");
8675                             goto onError;
8676                         }
8677                         prec = prec*10 + (c - '0');
8678                     }
8679                 }
8680             } /* prec */
8681             if (fmtcnt >= 0) {
8682                 if (c == 'h' || c == 'l' || c == 'L') {
8683                     if (--fmtcnt >= 0)
8684                         c = *fmt++;
8685                 }
8686             }
8687             if (fmtcnt < 0) {
8688                 PyErr_SetString(PyExc_ValueError,
8689                                 "incomplete format");
8690                 goto onError;
8691             }
8692             if (c != '%') {
8693                 v = getnextarg(args, arglen, &argidx);
8694                 if (v == NULL)
8695                     goto onError;
8696             }
8697             sign = 0;
8698             fill = ' ';
8699             switch (c) {
8700
8701             case '%':
8702                 pbuf = formatbuf;
8703                 /* presume that buffer length is at least 1 */
8704                 pbuf[0] = '%';
8705                 len = 1;
8706                 break;
8707
8708             case 's':
8709             case 'r':
8710                 if (PyUnicode_Check(v) && c == 's') {
8711                     temp = v;
8712                     Py_INCREF(temp);
8713                 }
8714                 else {
8715                     PyObject *unicode;
8716                     if (c == 's')
8717                         temp = PyObject_Unicode(v);
8718                     else
8719                         temp = PyObject_Repr(v);
8720                     if (temp == NULL)
8721                         goto onError;
8722                     if (PyUnicode_Check(temp))
8723                         /* nothing to do */;
8724                     else if (PyString_Check(temp)) {
8725                         /* convert to string to Unicode */
8726                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8727                                                    PyString_GET_SIZE(temp),
8728                                                    NULL,
8729                                                    "strict");
8730                         Py_DECREF(temp);
8731                         temp = unicode;
8732                         if (temp == NULL)
8733                             goto onError;
8734                     }
8735                     else {
8736                         Py_DECREF(temp);
8737                         PyErr_SetString(PyExc_TypeError,
8738                                         "%s argument has non-string str()");
8739                         goto onError;
8740                     }
8741                 }
8742                 pbuf = PyUnicode_AS_UNICODE(temp);
8743                 len = PyUnicode_GET_SIZE(temp);
8744                 if (prec >= 0 && len > prec)
8745                     len = prec;
8746                 break;
8747
8748             case 'i':
8749             case 'd':
8750             case 'u':
8751             case 'o':
8752             case 'x':
8753             case 'X':
8754                 if (c == 'i')
8755                     c = 'd';
8756                 isnumok = 0;
8757                 if (PyNumber_Check(v)) {
8758                     PyObject *iobj=NULL;
8759
8760                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8761                         iobj = v;
8762                         Py_INCREF(iobj);
8763                     }
8764                     else {
8765                         iobj = PyNumber_Int(v);
8766                         if (iobj==NULL) iobj = PyNumber_Long(v);
8767                     }
8768                     if (iobj!=NULL) {
8769                         if (PyInt_Check(iobj)) {
8770                             isnumok = 1;
8771                             pbuf = formatbuf;
8772                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8773                                             flags, prec, c, iobj);
8774                             Py_DECREF(iobj);
8775                             if (len < 0)
8776                                 goto onError;
8777                             sign = 1;
8778                         }
8779                         else if (PyLong_Check(iobj)) {
8780                             isnumok = 1;
8781                             temp = formatlong(iobj, flags, prec, c);
8782                             Py_DECREF(iobj);
8783                             if (!temp)
8784                                 goto onError;
8785                             pbuf = PyUnicode_AS_UNICODE(temp);
8786                             len = PyUnicode_GET_SIZE(temp);
8787                             sign = 1;
8788                         }
8789                         else {
8790                             Py_DECREF(iobj);
8791                         }
8792                     }
8793                 }
8794                 if (!isnumok) {
8795                     PyErr_Format(PyExc_TypeError,
8796                                  "%%%c format: a number is required, "
8797                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8798                     goto onError;
8799                 }
8800                 if (flags & F_ZERO)
8801                     fill = '0';
8802                 break;
8803
8804             case 'e':
8805             case 'E':
8806             case 'f':
8807             case 'F':
8808             case 'g':
8809             case 'G':
8810                 temp = formatfloat(v, flags, prec, c);
8811                 if (temp == NULL)
8812                     goto onError;
8813                 pbuf = PyUnicode_AS_UNICODE(temp);
8814                 len = PyUnicode_GET_SIZE(temp);
8815                 sign = 1;
8816                 if (flags & F_ZERO)
8817                     fill = '0';
8818                 break;
8819
8820             case 'c':
8821                 pbuf = formatbuf;
8822                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8823                 if (len < 0)
8824                     goto onError;
8825                 break;
8826
8827             default:
8828                 PyErr_Format(PyExc_ValueError,
8829                              "unsupported format character '%c' (0x%x) "
8830                              "at index %zd",
8831                              (31<=c && c<=126) ? (char)c : '?',
8832                              (int)c,
8833                              (Py_ssize_t)(fmt - 1 -
8834                                           PyUnicode_AS_UNICODE(uformat)));
8835                 goto onError;
8836             }
8837             if (sign) {
8838                 if (*pbuf == '-' || *pbuf == '+') {
8839                     sign = *pbuf++;
8840                     len--;
8841                 }
8842                 else if (flags & F_SIGN)
8843                     sign = '+';
8844                 else if (flags & F_BLANK)
8845                     sign = ' ';
8846                 else
8847                     sign = 0;
8848             }
8849             if (width < len)
8850                 width = len;
8851             if (rescnt - (sign != 0) < width) {
8852                 reslen -= rescnt;
8853                 rescnt = width + fmtcnt + 100;
8854                 reslen += rescnt;
8855                 if (reslen < 0) {
8856                     Py_XDECREF(temp);
8857                     PyErr_NoMemory();
8858                     goto onError;
8859                 }
8860                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8861                     Py_XDECREF(temp);
8862                     goto onError;
8863                 }
8864                 res = PyUnicode_AS_UNICODE(result)
8865                     + reslen - rescnt;
8866             }
8867             if (sign) {
8868                 if (fill != ' ')
8869                     *res++ = sign;
8870                 rescnt--;
8871                 if (width > len)
8872                     width--;
8873             }
8874             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8875                 assert(pbuf[0] == '0');
8876                 assert(pbuf[1] == c);
8877                 if (fill != ' ') {
8878                     *res++ = *pbuf++;
8879                     *res++ = *pbuf++;
8880                 }
8881                 rescnt -= 2;
8882                 width -= 2;
8883                 if (width < 0)
8884                     width = 0;
8885                 len -= 2;
8886             }
8887             if (width > len && !(flags & F_LJUST)) {
8888                 do {
8889                     --rescnt;
8890                     *res++ = fill;
8891                 } while (--width > len);
8892             }
8893             if (fill == ' ') {
8894                 if (sign)
8895                     *res++ = sign;
8896                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8897                     assert(pbuf[0] == '0');
8898                     assert(pbuf[1] == c);
8899                     *res++ = *pbuf++;
8900                     *res++ = *pbuf++;
8901                 }
8902             }
8903             Py_UNICODE_COPY(res, pbuf, len);
8904             res += len;
8905             rescnt -= len;
8906             while (--width >= len) {
8907                 --rescnt;
8908                 *res++ = ' ';
8909             }
8910             if (dict && (argidx < arglen) && c != '%') {
8911                 PyErr_SetString(PyExc_TypeError,
8912                                 "not all arguments converted during string formatting");
8913                 Py_XDECREF(temp);
8914                 goto onError;
8915             }
8916             Py_XDECREF(temp);
8917         } /* '%' */
8918     } /* until end */
8919     if (argidx < arglen && !dict) {
8920         PyErr_SetString(PyExc_TypeError,
8921                         "not all arguments converted during string formatting");
8922         goto onError;
8923     }
8924
8925     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8926         goto onError;
8927     if (args_owned) {
8928         Py_DECREF(args);
8929     }
8930     Py_DECREF(uformat);
8931     return (PyObject *)result;
8932
8933   onError:
8934     Py_XDECREF(result);
8935     Py_DECREF(uformat);
8936     if (args_owned) {
8937         Py_DECREF(args);
8938     }
8939     return NULL;
8940 }
8941
8942 static PyBufferProcs unicode_as_buffer = {
8943     (readbufferproc) unicode_buffer_getreadbuf,
8944     (writebufferproc) unicode_buffer_getwritebuf,
8945     (segcountproc) unicode_buffer_getsegcount,
8946     (charbufferproc) unicode_buffer_getcharbuf,
8947 };
8948
8949 static PyObject *
8950 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8951
8952 static PyObject *
8953 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8954 {
8955     PyObject *x = NULL;
8956     static char *kwlist[] = {"string", "encoding", "errors", 0};
8957     char *encoding = NULL;
8958     char *errors = NULL;
8959
8960     if (type != &PyUnicode_Type)
8961         return unicode_subtype_new(type, args, kwds);
8962     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8963                                      kwlist, &x, &encoding, &errors))
8964         return NULL;
8965     if (x == NULL)
8966         return (PyObject *)_PyUnicode_New(0);
8967     if (encoding == NULL && errors == NULL)
8968         return PyObject_Unicode(x);
8969     else
8970         return PyUnicode_FromEncodedObject(x, encoding, errors);
8971 }
8972
8973 static PyObject *
8974 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8975 {
8976     PyUnicodeObject *tmp, *pnew;
8977     Py_ssize_t n;
8978
8979     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8980     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8981     if (tmp == NULL)
8982         return NULL;
8983     assert(PyUnicode_Check(tmp));
8984     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8985     if (pnew == NULL) {
8986         Py_DECREF(tmp);
8987         return NULL;
8988     }
8989     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8990     if (pnew->str == NULL) {
8991         _Py_ForgetReference((PyObject *)pnew);
8992         PyObject_Del(pnew);
8993         Py_DECREF(tmp);
8994         return PyErr_NoMemory();
8995     }
8996     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8997     pnew->length = n;
8998     pnew->hash = tmp->hash;
8999     Py_DECREF(tmp);
9000     return (PyObject *)pnew;
9001 }
9002
9003 PyDoc_STRVAR(unicode_doc,
9004              "unicode(string [, encoding[, errors]]) -> object\n\
9005 \n\
9006 Create a new Unicode object from the given encoded string.\n\
9007 encoding defaults to the current default string encoding.\n\
9008 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9009
9010 PyTypeObject PyUnicode_Type = {
9011     PyVarObject_HEAD_INIT(&PyType_Type, 0)
9012     "unicode",              /* tp_name */
9013     sizeof(PyUnicodeObject),        /* tp_size */
9014     0,                  /* tp_itemsize */
9015     /* Slots */
9016     (destructor)unicode_dealloc,    /* tp_dealloc */
9017     0,                  /* tp_print */
9018     0,                  /* tp_getattr */
9019     0,                  /* tp_setattr */
9020     0,                  /* tp_compare */
9021     unicode_repr,           /* tp_repr */
9022     &unicode_as_number,         /* tp_as_number */
9023     &unicode_as_sequence,       /* tp_as_sequence */
9024     &unicode_as_mapping,        /* tp_as_mapping */
9025     (hashfunc) unicode_hash,        /* tp_hash*/
9026     0,                  /* tp_call*/
9027     (reprfunc) unicode_str,     /* tp_str */
9028     PyObject_GenericGetAttr,        /* tp_getattro */
9029     0,                  /* tp_setattro */
9030     &unicode_as_buffer,         /* tp_as_buffer */
9031     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
9032     Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
9033     unicode_doc,            /* tp_doc */
9034     0,                  /* tp_traverse */
9035     0,                  /* tp_clear */
9036     PyUnicode_RichCompare,      /* tp_richcompare */
9037     0,                  /* tp_weaklistoffset */
9038     0,                  /* tp_iter */
9039     0,                  /* tp_iternext */
9040     unicode_methods,            /* tp_methods */
9041     0,                  /* tp_members */
9042     0,                  /* tp_getset */
9043     &PyBaseString_Type,         /* tp_base */
9044     0,                  /* tp_dict */
9045     0,                  /* tp_descr_get */
9046     0,                  /* tp_descr_set */
9047     0,                  /* tp_dictoffset */
9048     0,                  /* tp_init */
9049     0,                  /* tp_alloc */
9050     unicode_new,            /* tp_new */
9051     PyObject_Del,           /* tp_free */
9052 };
9053
9054 /* Initialize the Unicode implementation */
9055
9056 void _PyUnicode_Init(void)
9057 {
9058     int i;
9059
9060     /* XXX - move this array to unicodectype.c ? */
9061     Py_UNICODE linebreak[] = {
9062         0x000A, /* LINE FEED */
9063         0x000D, /* CARRIAGE RETURN */
9064         0x001C, /* FILE SEPARATOR */
9065         0x001D, /* GROUP SEPARATOR */
9066         0x001E, /* RECORD SEPARATOR */
9067         0x0085, /* NEXT LINE */
9068         0x2028, /* LINE SEPARATOR */
9069         0x2029, /* PARAGRAPH SEPARATOR */
9070     };
9071
9072     /* Init the implementation */
9073     free_list = NULL;
9074     numfree = 0;
9075     unicode_empty = _PyUnicode_New(0);
9076     if (!unicode_empty)
9077         return;
9078
9079     strcpy(unicode_default_encoding, "ascii");
9080     for (i = 0; i < 256; i++)
9081         unicode_latin1[i] = NULL;
9082     if (PyType_Ready(&PyUnicode_Type) < 0)
9083         Py_FatalError("Can't initialize 'unicode'");
9084
9085     /* initialize the linebreak bloom filter */
9086     bloom_linebreak = make_bloom_mask(
9087         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9088         );
9089
9090     PyType_Ready(&EncodingMapType);
9091 }
9092
9093 /* Finalize the Unicode implementation */
9094
9095 int
9096 PyUnicode_ClearFreeList(void)
9097 {
9098     int freelist_size = numfree;
9099     PyUnicodeObject *u;
9100
9101     for (u = free_list; u != NULL;) {
9102         PyUnicodeObject *v = u;
9103         u = *(PyUnicodeObject **)u;
9104         if (v->str)
9105             PyObject_DEL(v->str);
9106         Py_XDECREF(v->defenc);
9107         PyObject_Del(v);
9108         numfree--;
9109     }
9110     free_list = NULL;
9111     assert(numfree == 0);
9112     return freelist_size;
9113 }
9114
9115 void
9116 _PyUnicode_Fini(void)
9117 {
9118     int i;
9119
9120     Py_XDECREF(unicode_empty);
9121     unicode_empty = NULL;
9122
9123     for (i = 0; i < 256; i++) {
9124         if (unicode_latin1[i]) {
9125             Py_DECREF(unicode_latin1[i]);
9126             unicode_latin1[i] = NULL;
9127         }
9128     }
9129     (void)PyUnicode_ClearFreeList();
9130 }
9131
9132 #ifdef __cplusplus
9133 }
9134 #endif
9135
9136
9137 /*
9138   Local variables:
9139   c-basic-offset: 4
9140   indent-tabs-mode: nil
9141   End:
9142 */