Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * HORIZONTAL TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * VERTICAL TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000D, * CARRIAGE RETURN */
 151     0, 0, 1, 0, 0, 1, 0, 0,
 152     0, 0, 0, 0, 0, 0, 0, 0,
 153 /*         0x001C, * FILE SEPARATOR */
 154 /*         0x001D, * GROUP SEPARATOR */
 155 /*         0x001E, * RECORD SEPARATOR */
 156     0, 0, 0, 0, 1, 1, 1, 0,
 157     0, 0, 0, 0, 0, 0, 0, 0,
 158     0, 0, 0, 0, 0, 0, 0, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163     0, 0, 0, 0, 0, 0, 0, 0,
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177     return 0x10FFFF;
 178 #else
 179     /* This is actually an illegal character, so it should
 180        not be passed to unichr. */
 181     return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the lowest 5
   bits of each Unicode character serving as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
 192
 193 #define BLOOM_MASK unsigned long
 194
 195 static BLOOM_MASK bloom_linebreak;
 196
 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199 #define BLOOM_LINEBREAK(ch)                                             \
 200     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 201      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204 {
 205     /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207     long mask;
 208     Py_ssize_t i;
 209
 210     mask = 0;
 211     for (i = 0; i < len; i++)
 212         mask |= (1 << (ptr[i] & 0x1F));
 213
 214     return mask;
 215 }
 216
 217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218 {
 219     Py_ssize_t i;
 220
 221     for (i = 0; i < setlen; i++)
 222         if (set[i] == chr)
 223             return 1;
 224
 225     return 0;
 226 }
 227
 228 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231 /* --- Unicode Object ----------------------------------------------------- */
 232
 233 static
 234 int unicode_resize(register PyUnicodeObject *unicode,
 235                    Py_ssize_t length)
 236 {
 237     void *oldstr;
 238
 239     /* Shortcut if there's nothing much to do. */
 240     if (unicode->length == length)
 241         goto reset;
 242
 243     /* Resizing shared object (unicode_empty or single character
 244        objects) in-place is not allowed. Use PyUnicode_Resize()
 245        instead ! */
 246
 247     if (unicode == unicode_empty ||
 248         (unicode->length == 1 &&
 249          unicode->str[0] < 256U &&
 250          unicode_latin1[unicode->str[0]] == unicode)) {
 251         PyErr_SetString(PyExc_SystemError,
 252                         "can't resize shared unicode objects");
 253         return -1;
 254     }
 255
 256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257        The overallocation is also used by fastsearch, which assumes that it's
 258        safe to look at str[length] (without making any assumptions about what
 259        it contains). */
 260
 261     oldstr = unicode->str;
 262     unicode->str = PyObject_REALLOC(unicode->str,
 263                                     sizeof(Py_UNICODE) * (length + 1));
 264     if (!unicode->str) {
 265         unicode->str = (Py_UNICODE *)oldstr;
 266         PyErr_NoMemory();
 267         return -1;
 268     }
 269     unicode->str[length] = 0;
 270     unicode->length = length;
 271
 272   reset:
 273     /* Reset the object caches */
 274     if (unicode->defenc) {
 275         Py_DECREF(unicode->defenc);
 276         unicode->defenc = NULL;
 277     }
 278     unicode->hash = -1;
 279
 280     return 0;
 281 }
 282
/* We allocate one extra Py_UNICODE element (not merely one byte) to make
   sure the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
 290
 291 static
 292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293 {
 294     register PyUnicodeObject *unicode;
 295
 296     /* Optimization for empty strings */
 297     if (length == 0 && unicode_empty != NULL) {
 298         Py_INCREF(unicode_empty);
 299         return unicode_empty;
 300     }
 301
 302     /* Ensure we won't overflow the size. */
 303     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 304         return (PyUnicodeObject *)PyErr_NoMemory();
 305     }
 306
 307     /* Unicode freelist & memory allocation */
 308     if (free_list) {
 309         unicode = free_list;
 310         free_list = *(PyUnicodeObject **)unicode;
 311         numfree--;
 312         if (unicode->str) {
 313             /* Keep-Alive optimization: we only upsize the buffer,
 314                never downsize it. */
 315             if ((unicode->length < length) &&
 316                 unicode_resize(unicode, length) < 0) {
 317                 PyObject_DEL(unicode->str);
 318                 unicode->str = NULL;
 319             }
 320         }
 321         else {
 322             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 323             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 324         }
 325         PyObject_INIT(unicode, &PyUnicode_Type);
 326     }
 327     else {
 328         size_t new_size;
 329         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 330         if (unicode == NULL)
 331             return NULL;
 332         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 333         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 334     }
 335
 336     if (!unicode->str) {
 337         PyErr_NoMemory();
 338         goto onError;
 339     }
 340     /* Initialize the first element to guard against cases where
 341      * the caller fails before initializing str -- unicode_resize()
 342      * reads str[0], and the Keep-Alive optimization can keep memory
 343      * allocated for str alive across a call to unicode_dealloc(unicode).
 344      * We don't want unicode_resize to read uninitialized memory in
 345      * that case.
 346      */
 347     unicode->str[0] = 0;
 348     unicode->str[length] = 0;
 349     unicode->length = length;
 350     unicode->hash = -1;
 351     unicode->defenc = NULL;
 352     return unicode;
 353
 354   onError:
 355     /* XXX UNREF/NEWREF interface should be more symmetrical */
 356     _Py_DEC_REFTOTAL;
 357     _Py_ForgetReference((PyObject *)unicode);
 358     PyObject_Del(unicode);
 359     return NULL;
 360 }
 361
 362 static
 363 void unicode_dealloc(register PyUnicodeObject *unicode)
 364 {
 365     if (PyUnicode_CheckExact(unicode) &&
 366         numfree < PyUnicode_MAXFREELIST) {
 367         /* Keep-Alive optimization */
 368         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 369             PyObject_DEL(unicode->str);
 370             unicode->str = NULL;
 371             unicode->length = 0;
 372         }
 373         if (unicode->defenc) {
 374             Py_DECREF(unicode->defenc);
 375             unicode->defenc = NULL;
 376         }
 377         /* Add to free list */
 378         *(PyUnicodeObject **)unicode = free_list;
 379         free_list = unicode;
 380         numfree++;
 381     }
 382     else {
 383         PyObject_DEL(unicode->str);
 384         Py_XDECREF(unicode->defenc);
 385         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 386     }
 387 }
 388
 389 static
 390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 391 {
 392     register PyUnicodeObject *v;
 393
 394     /* Argument checks */
 395     if (unicode == NULL) {
 396         PyErr_BadInternalCall();
 397         return -1;
 398     }
 399     v = *unicode;
 400     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 401         PyErr_BadInternalCall();
 402         return -1;
 403     }
 404
 405     /* Resizing unicode_empty and single character objects is not
 406        possible since these are being shared. We simply return a fresh
 407        copy with the same Unicode content. */
 408     if (v->length != length &&
 409         (v == unicode_empty || v->length == 1)) {
 410         PyUnicodeObject *w = _PyUnicode_New(length);
 411         if (w == NULL)
 412             return -1;
 413         Py_UNICODE_COPY(w->str, v->str,
 414                         length < v->length ? length : v->length);
 415         Py_DECREF(*unicode);
 416         *unicode = w;
 417         return 0;
 418     }
 419
 420     /* Note that we don't have to modify *unicode for unshared Unicode
 421        objects, since we can modify them in-place. */
 422     return unicode_resize(v, length);
 423 }
 424
 425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 426 {
 427     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 428 }
 429
 430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 431                                 Py_ssize_t size)
 432 {
 433     PyUnicodeObject *unicode;
 434
 435     /* If the Unicode data is known at construction time, we can apply
 436        some optimizations which share commonly used objects. */
 437     if (u != NULL) {
 438
 439         /* Optimization for empty strings */
 440         if (size == 0 && unicode_empty != NULL) {
 441             Py_INCREF(unicode_empty);
 442             return (PyObject *)unicode_empty;
 443         }
 444
 445         /* Single character Unicode objects in the Latin-1 range are
 446            shared when using this constructor */
 447         if (size == 1 && *u < 256) {
 448             unicode = unicode_latin1[*u];
 449             if (!unicode) {
 450                 unicode = _PyUnicode_New(1);
 451                 if (!unicode)
 452                     return NULL;
 453                 unicode->str[0] = *u;
 454                 unicode_latin1[*u] = unicode;
 455             }
 456             Py_INCREF(unicode);
 457             return (PyObject *)unicode;
 458         }
 459     }
 460
 461     unicode = _PyUnicode_New(size);
 462     if (!unicode)
 463         return NULL;
 464
 465     /* Copy the Unicode data into the new object */
 466     if (u != NULL)
 467         Py_UNICODE_COPY(unicode->str, u, size);
 468
 469     return (PyObject *)unicode;
 470 }
 471
 472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 473 {
 474     PyUnicodeObject *unicode;
 475
 476     if (size < 0) {
 477         PyErr_SetString(PyExc_SystemError,
 478                         "Negative size passed to PyUnicode_FromStringAndSize");
 479         return NULL;
 480     }
 481
 482     /* If the Unicode data is known at construction time, we can apply
 483        some optimizations which share commonly used objects.
 484        Also, this means the input must be UTF-8, so fall back to the
 485        UTF-8 decoder at the end. */
 486     if (u != NULL) {
 487
 488         /* Optimization for empty strings */
 489         if (size == 0 && unicode_empty != NULL) {
 490             Py_INCREF(unicode_empty);
 491             return (PyObject *)unicode_empty;
 492         }
 493
 494         /* Single characters are shared when using this constructor.
 495            Restrict to ASCII, since the input must be UTF-8. */
 496         if (size == 1 && Py_CHARMASK(*u) < 128) {
 497             unicode = unicode_latin1[Py_CHARMASK(*u)];
 498             if (!unicode) {
 499                 unicode = _PyUnicode_New(1);
 500                 if (!unicode)
 501                     return NULL;
 502                 unicode->str[0] = Py_CHARMASK(*u);
 503                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 504             }
 505             Py_INCREF(unicode);
 506             return (PyObject *)unicode;
 507         }
 508
 509         return PyUnicode_DecodeUTF8(u, size, NULL);
 510     }
 511
 512     unicode = _PyUnicode_New(size);
 513     if (!unicode)
 514         return NULL;
 515
 516     return (PyObject *)unicode;
 517 }
 518
 519 PyObject *PyUnicode_FromString(const char *u)
 520 {
 521     size_t size = strlen(u);
 522     if (size > PY_SSIZE_T_MAX) {
 523         PyErr_SetString(PyExc_OverflowError, "input too long");
 524         return NULL;
 525     }
 526
 527     return PyUnicode_FromStringAndSize(u, size);
 528 }
 529
 530 #ifdef HAVE_WCHAR_H
 531
 532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 533 # define CONVERT_WCHAR_TO_SURROGATES
 534 #endif
 535
 536 #ifdef CONVERT_WCHAR_TO_SURROGATES
 537
 538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 539    to convert from UTF32 to UTF16. */
 540
 541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 542                                  Py_ssize_t size)
 543 {
 544     PyUnicodeObject *unicode;
 545     register Py_ssize_t i;
 546     Py_ssize_t alloc;
 547     const wchar_t *orig_w;
 548
 549     if (w == NULL) {
 550         PyErr_BadInternalCall();
 551         return NULL;
 552     }
 553
 554     alloc = size;
 555     orig_w = w;
 556     for (i = size; i > 0; i--) {
 557         if (*w > 0xFFFF)
 558             alloc++;
 559         w++;
 560     }
 561     w = orig_w;
 562     unicode = _PyUnicode_New(alloc);
 563     if (!unicode)
 564         return NULL;
 565
 566     /* Copy the wchar_t data into the new object */
 567     {
 568         register Py_UNICODE *u;
 569         u = PyUnicode_AS_UNICODE(unicode);
 570         for (i = size; i > 0; i--) {
 571             if (*w > 0xFFFF) {
 572                 wchar_t ordinal = *w++;
 573                 ordinal -= 0x10000;
 574                 *u++ = 0xD800 | (ordinal >> 10);
 575                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 576             }
 577             else
 578                 *u++ = *w++;
 579         }
 580     }
 581     return (PyObject *)unicode;
 582 }
 583
 584 #else
 585
 586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 587                                  Py_ssize_t size)
 588 {
 589     PyUnicodeObject *unicode;
 590
 591     if (w == NULL) {
 592         PyErr_BadInternalCall();
 593         return NULL;
 594     }
 595
 596     unicode = _PyUnicode_New(size);
 597     if (!unicode)
 598         return NULL;
 599
 600     /* Copy the wchar_t data into the new object */
 601 #ifdef HAVE_USABLE_WCHAR_T
 602     memcpy(unicode->str, w, size * sizeof(wchar_t));
 603 #else
 604     {
 605         register Py_UNICODE *u;
 606         register Py_ssize_t i;
 607         u = PyUnicode_AS_UNICODE(unicode);
 608         for (i = size; i > 0; i--)
 609             *u++ = *w++;
 610     }
 611 #endif
 612
 613     return (PyObject *)unicode;
 614 }
 615
 616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 617
 618 #undef CONVERT_WCHAR_TO_SURROGATES
 619
 620 static void
 621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 622 {
 623     *fmt++ = '%';
 624     if (width) {
 625         if (zeropad)
 626             *fmt++ = '0';
 627         fmt += sprintf(fmt, "%d", width);
 628     }
 629     if (precision)
 630         fmt += sprintf(fmt, ".%d", precision);
 631     if (longflag)
 632         *fmt++ = 'l';
 633     else if (size_tflag) {
 634         char *f = PY_FORMAT_SIZE_T;
 635         while (*f)
 636             *fmt++ = *f++;
 637     }
 638     *fmt++ = c;
 639     *fmt = '\0';
 640 }
 641
 642 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 643
 644 PyObject *
 645 PyUnicode_FromFormatV(const char *format, va_list vargs)
 646 {
 647     va_list count;
 648     Py_ssize_t callcount = 0;
 649     PyObject **callresults = NULL;
 650     PyObject **callresult = NULL;
 651     Py_ssize_t n = 0;
 652     int width = 0;
 653     int precision = 0;
 654     int zeropad;
 655     const char* f;
 656     Py_UNICODE *s;
 657     PyObject *string;
 658     /* used by sprintf */
 659     char buffer[21];
 660     /* use abuffer instead of buffer, if we need more space
 661      * (which can happen if there's a format specifier with width). */
 662     char *abuffer = NULL;
 663     char *realbuffer;
 664     Py_ssize_t abuffersize = 0;
 665     char fmt[60]; /* should be enough for %0width.precisionld */
 666     const char *copy;
 667
 668 #ifdef VA_LIST_IS_ARRAY
 669     Py_MEMCPY(count, vargs, sizeof(va_list));
 670 #else
 671 #ifdef  __va_copy
 672     __va_copy(count, vargs);
 673 #else
 674     count = vargs;
 675 #endif
 676 #endif
 677     /* step 1: count the number of %S/%R format specifications
 678      * (we call PyObject_Str()/PyObject_Repr() for these objects
 679      * once during step 3 and put the result in an array) */
 680     for (f = format; *f; f++) {
 681         if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
 682             ++callcount;
 683     }
 684     /* step 2: allocate memory for the results of
 685      * PyObject_Str()/PyObject_Repr() calls */
 686     if (callcount) {
 687         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 688         if (!callresults) {
 689             PyErr_NoMemory();
 690             return NULL;
 691         }
 692         callresult = callresults;
 693     }
 694     /* step 3: figure out how large a buffer we need */
 695     for (f = format; *f; f++) {
 696         if (*f == '%') {
 697             const char* p = f;
 698             width = 0;
 699             while (isdigit((unsigned)*f))
 700                 width = (width*10) + *f++ - '0';
 701             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 702                 ;
 703
 704             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 705              * they don't affect the amount of space we reserve.
 706              */
 707             if ((*f == 'l' || *f == 'z') &&
 708                 (f[1] == 'd' || f[1] == 'u'))
 709                 ++f;
 710
 711             switch (*f) {
 712             case 'c':
 713                 (void)va_arg(count, int);
 714                 /* fall through... */
 715             case '%':
 716                 n++;
 717                 break;
 718             case 'd': case 'u': case 'i': case 'x':
 719                 (void) va_arg(count, int);
 720                 /* 20 bytes is enough to hold a 64-bit
 721                    integer.  Decimal takes the most space.
 722                    This isn't enough for octal.
 723                    If a width is specified we need more
 724                    (which we allocate later). */
 725                 if (width < 20)
 726                     width = 20;
 727                 n += width;
 728                 if (abuffersize < width)
 729                     abuffersize = width;
 730                 break;
 731             case 's':
 732             {
 733                 /* UTF-8 */
 734                 unsigned char*s;
 735                 s = va_arg(count, unsigned char*);
 736                 while (*s) {
 737                     if (*s < 128) {
 738                         n++; s++;
 739                     } else if (*s < 0xc0) {
 740                         /* invalid UTF-8 */
 741                         n++; s++;
 742                     } else if (*s < 0xc0) {
 743                         n++;
 744                         s++; if(!*s)break;
 745                         s++;
 746                     } else if (*s < 0xe0) {
 747                         n++;
 748                         s++; if(!*s)break;
 749                         s++; if(!*s)break;
 750                         s++;
 751                     } else {
 752 #ifdef Py_UNICODE_WIDE
 753                         n++;
 754 #else
 755                         n+=2;
 756 #endif
 757                         s++; if(!*s)break;
 758                         s++; if(!*s)break;
 759                         s++; if(!*s)break;
 760                         s++;
 761                     }
 762                 }
 763                 break;
 764             }
 765             case 'U':
 766             {
 767                 PyObject *obj = va_arg(count, PyObject *);
 768                 assert(obj && PyUnicode_Check(obj));
 769                 n += PyUnicode_GET_SIZE(obj);
 770                 break;
 771             }
 772             case 'V':
 773             {
 774                 PyObject *obj = va_arg(count, PyObject *);
 775                 const char *str = va_arg(count, const char *);
 776                 assert(obj || str);
 777                 assert(!obj || PyUnicode_Check(obj));
 778                 if (obj)
 779                     n += PyUnicode_GET_SIZE(obj);
 780                 else
 781                     n += strlen(str);
 782                 break;
 783             }
 784             case 'S':
 785             {
 786                 PyObject *obj = va_arg(count, PyObject *);
 787                 PyObject *str;
 788                 assert(obj);
 789                 str = PyObject_Str(obj);
 790                 if (!str)
 791                     goto fail;
 792                 n += PyUnicode_GET_SIZE(str);
 793                 /* Remember the str and switch to the next slot */
 794                 *callresult++ = str;
 795                 break;
 796             }
 797             case 'R':
 798             {
 799                 PyObject *obj = va_arg(count, PyObject *);
 800                 PyObject *repr;
 801                 assert(obj);
 802                 repr = PyObject_Repr(obj);
 803                 if (!repr)
 804                     goto fail;
 805                 n += PyUnicode_GET_SIZE(repr);
 806                 /* Remember the repr and switch to the next slot */
 807                 *callresult++ = repr;
 808                 break;
 809             }
 810             case 'p':
 811                 (void) va_arg(count, int);
 812                 /* maximum 64-bit pointer representation:
 813                  * 0xffffffffffffffff
 814                  * so 19 characters is enough.
 815                  * XXX I count 18 -- what's the extra for?
 816                  */
 817                 n += 19;
 818                 break;
 819             default:
 820                 /* if we stumble upon an unknown
 821                    formatting code, copy the rest of
 822                    the format string to the output
 823                    string. (we cannot just skip the
 824                    code, since there's no way to know
 825                    what's in the argument list) */
 826                 n += strlen(p);
 827                 goto expand;
 828             }
 829         } else
 830             n++;
 831     }
 832   expand:
 833     if (abuffersize > 20) {
 834         abuffer = PyObject_Malloc(abuffersize);
 835         if (!abuffer) {
 836             PyErr_NoMemory();
 837             goto fail;
 838         }
 839         realbuffer = abuffer;
 840     }
 841     else
 842         realbuffer = buffer;
 843     /* step 4: fill the buffer */
 844     /* Since we've analyzed how much space we need for the worst case,
 845        we don't have to resize the string.
 846        There can be no errors beyond this point. */
 847     string = PyUnicode_FromUnicode(NULL, n);
 848     if (!string)
 849         goto fail;
 850
 851     s = PyUnicode_AS_UNICODE(string);
 852     callresult = callresults;
 853
 854     for (f = format; *f; f++) {
 855         if (*f == '%') {
 856             const char* p = f++;
 857             int longflag = 0;
 858             int size_tflag = 0;
 859             zeropad = (*f == '0');
 860             /* parse the width.precision part */
 861             width = 0;
 862             while (isdigit((unsigned)*f))
 863                 width = (width*10) + *f++ - '0';
 864             precision = 0;
 865             if (*f == '.') {
 866                 f++;
 867                 while (isdigit((unsigned)*f))
 868                     precision = (precision*10) + *f++ - '0';
 869             }
 870             /* handle the long flag, but only for %ld and %lu.
 871                others can be added when necessary. */
 872             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 873                 longflag = 1;
 874                 ++f;
 875             }
 876             /* handle the size_t flag. */
 877             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 878                 size_tflag = 1;
 879                 ++f;
 880             }
 881
 882             switch (*f) {
 883             case 'c':
 884                 *s++ = va_arg(vargs, int);
 885                 break;
 886             case 'd':
 887                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 888                 if (longflag)
 889                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 890                 else if (size_tflag)
 891                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 892                 else
 893                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 894                 appendstring(realbuffer);
 895                 break;
 896             case 'u':
 897                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 898                 if (longflag)
 899                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 900                 else if (size_tflag)
 901                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 902                 else
 903                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 904                 appendstring(realbuffer);
 905                 break;
 906             case 'i':
 907                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 908                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 909                 appendstring(realbuffer);
 910                 break;
 911             case 'x':
 912                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 913                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 914                 appendstring(realbuffer);
 915                 break;
 916             case 's':
 917             {
 918                 /* Parameter must be UTF-8 encoded.
 919                    In case of encoding errors, use
 920                    the replacement character. */
 921                 PyObject *u;
 922                 p = va_arg(vargs, char*);
 923                 u = PyUnicode_DecodeUTF8(p, strlen(p),
 924                                          "replace");
 925                 if (!u)
 926                     goto fail;
 927                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
 928                                 PyUnicode_GET_SIZE(u));
 929                 s += PyUnicode_GET_SIZE(u);
 930                 Py_DECREF(u);
 931                 break;
 932             }
 933             case 'U':
 934             {
 935                 PyObject *obj = va_arg(vargs, PyObject *);
 936                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 937                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 938                 s += size;
 939                 break;
 940             }
 941             case 'V':
 942             {
 943                 PyObject *obj = va_arg(vargs, PyObject *);
 944                 const char *str = va_arg(vargs, const char *);
 945                 if (obj) {
 946                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 947                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 948                     s += size;
 949                 } else {
 950                     appendstring(str);
 951                 }
 952                 break;
 953             }
 954             case 'S':
 955             case 'R':
 956             {
 957                 Py_UNICODE *ucopy;
 958                 Py_ssize_t usize;
 959                 Py_ssize_t upos;
 960                 /* unused, since we already have the result */
 961                 (void) va_arg(vargs, PyObject *);
 962                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 963                 usize = PyUnicode_GET_SIZE(*callresult);
 964                 for (upos = 0; upos<usize;)
 965                     *s++ = ucopy[upos++];
 966                 /* We're done with the unicode()/repr() => forget it */
 967                 Py_DECREF(*callresult);
 968                 /* switch to next unicode()/repr() result */
 969                 ++callresult;
 970                 break;
 971             }
 972             case 'p':
 973                 sprintf(buffer, "%p", va_arg(vargs, void*));
 974                 /* %p is ill-defined:  ensure leading 0x. */
 975                 if (buffer[1] == 'X')
 976                     buffer[1] = 'x';
 977                 else if (buffer[1] != 'x') {
 978                     memmove(buffer+2, buffer, strlen(buffer)+1);
 979                     buffer[0] = '0';
 980                     buffer[1] = 'x';
 981                 }
 982                 appendstring(buffer);
 983                 break;
 984             case '%':
 985                 *s++ = '%';
 986                 break;
 987             default:
 988                 appendstring(p);
 989                 goto end;
 990             }
 991         } else
 992             *s++ = *f;
 993     }
 994
 995   end:
 996     if (callresults)
 997         PyObject_Free(callresults);
 998     if (abuffer)
 999         PyObject_Free(abuffer);
1000     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1001     return string;
1002   fail:
1003     if (callresults) {
1004         PyObject **callresult2 = callresults;
1005         while (callresult2 < callresult) {
1006             Py_DECREF(*callresult2);
1007             ++callresult2;
1008         }
1009         PyObject_Free(callresults);
1010     }
1011     if (abuffer)
1012         PyObject_Free(abuffer);
1013     return NULL;
1014 }
1015
1016 #undef appendstring
1017
1018 PyObject *
1019 PyUnicode_FromFormat(const char *format, ...)
1020 {
1021     PyObject* ret;
1022     va_list vargs;
1023
1024 #ifdef HAVE_STDARG_PROTOTYPES
1025     va_start(vargs, format);
1026 #else
1027     va_start(vargs);
1028 #endif
1029     ret = PyUnicode_FromFormatV(format, vargs);
1030     va_end(vargs);
1031     return ret;
1032 }
1033
1034 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1035                                 wchar_t *w,
1036                                 Py_ssize_t size)
1037 {
1038     if (unicode == NULL) {
1039         PyErr_BadInternalCall();
1040         return -1;
1041     }
1042
1043     /* If possible, try to copy the 0-termination as well */
1044     if (size > PyUnicode_GET_SIZE(unicode))
1045         size = PyUnicode_GET_SIZE(unicode) + 1;
1046
1047 #ifdef HAVE_USABLE_WCHAR_T
1048     memcpy(w, unicode->str, size * sizeof(wchar_t));
1049 #else
1050     {
1051         register Py_UNICODE *u;
1052         register Py_ssize_t i;
1053         u = PyUnicode_AS_UNICODE(unicode);
1054         for (i = size; i > 0; i--)
1055             *w++ = *u++;
1056     }
1057 #endif
1058
1059     if (size > PyUnicode_GET_SIZE(unicode))
1060         return PyUnicode_GET_SIZE(unicode);
1061     else
1062         return size;
1063 }
1064
1065 #endif
1066
1067 PyObject *PyUnicode_FromOrdinal(int ordinal)
1068 {
1069     Py_UNICODE s[1];
1070
1071 #ifdef Py_UNICODE_WIDE
1072     if (ordinal < 0 || ordinal > 0x10ffff) {
1073         PyErr_SetString(PyExc_ValueError,
1074                         "unichr() arg not in range(0x110000) "
1075                         "(wide Python build)");
1076         return NULL;
1077     }
1078 #else
1079     if (ordinal < 0 || ordinal > 0xffff) {
1080         PyErr_SetString(PyExc_ValueError,
1081                         "unichr() arg not in range(0x10000) "
1082                         "(narrow Python build)");
1083         return NULL;
1084     }
1085 #endif
1086
1087     s[0] = (Py_UNICODE)ordinal;
1088     return PyUnicode_FromUnicode(s, 1);
1089 }
1090
1091 PyObject *PyUnicode_FromObject(register PyObject *obj)
1092 {
1093     /* XXX Perhaps we should make this API an alias of
1094        PyObject_Unicode() instead ?! */
1095     if (PyUnicode_CheckExact(obj)) {
1096         Py_INCREF(obj);
1097         return obj;
1098     }
1099     if (PyUnicode_Check(obj)) {
1100         /* For a Unicode subtype that's not a Unicode object,
1101            return a true Unicode object with the same data. */
1102         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1103                                      PyUnicode_GET_SIZE(obj));
1104     }
1105     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1106 }
1107
1108 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1109                                       const char *encoding,
1110                                       const char *errors)
1111 {
1112     const char *s = NULL;
1113     Py_ssize_t len;
1114     PyObject *v;
1115
1116     if (obj == NULL) {
1117         PyErr_BadInternalCall();
1118         return NULL;
1119     }
1120
1121 #if 0
1122     /* For b/w compatibility we also accept Unicode objects provided
1123        that no encodings is given and then redirect to
1124        PyObject_Unicode() which then applies the additional logic for
1125        Unicode subclasses.
1126
1127        NOTE: This API should really only be used for object which
1128        represent *encoded* Unicode !
1129
1130     */
1131     if (PyUnicode_Check(obj)) {
1132         if (encoding) {
1133             PyErr_SetString(PyExc_TypeError,
1134                             "decoding Unicode is not supported");
1135             return NULL;
1136         }
1137         return PyObject_Unicode(obj);
1138     }
1139 #else
1140     if (PyUnicode_Check(obj)) {
1141         PyErr_SetString(PyExc_TypeError,
1142                         "decoding Unicode is not supported");
1143         return NULL;
1144     }
1145 #endif
1146
1147     /* Coerce object */
1148     if (PyString_Check(obj)) {
1149         s = PyString_AS_STRING(obj);
1150         len = PyString_GET_SIZE(obj);
1151     }
1152     else if (PyByteArray_Check(obj)) {
1153         /* Python 2.x specific */
1154         PyErr_Format(PyExc_TypeError,
1155                      "decoding bytearray is not supported");
1156         return NULL;
1157     }
1158     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1159         /* Overwrite the error message with something more useful in
1160            case of a TypeError. */
1161         if (PyErr_ExceptionMatches(PyExc_TypeError))
1162             PyErr_Format(PyExc_TypeError,
1163                          "coercing to Unicode: need string or buffer, "
1164                          "%.80s found",
1165                          Py_TYPE(obj)->tp_name);
1166         goto onError;
1167     }
1168
1169     /* Convert to Unicode */
1170     if (len == 0) {
1171         Py_INCREF(unicode_empty);
1172         v = (PyObject *)unicode_empty;
1173     }
1174     else
1175         v = PyUnicode_Decode(s, len, encoding, errors);
1176
1177     return v;
1178
1179   onError:
1180     return NULL;
1181 }
1182
1183 PyObject *PyUnicode_Decode(const char *s,
1184                            Py_ssize_t size,
1185                            const char *encoding,
1186                            const char *errors)
1187 {
1188     PyObject *buffer = NULL, *unicode;
1189
1190     if (encoding == NULL)
1191         encoding = PyUnicode_GetDefaultEncoding();
1192
1193     /* Shortcuts for common default encodings */
1194     if (strcmp(encoding, "utf-8") == 0)
1195         return PyUnicode_DecodeUTF8(s, size, errors);
1196     else if (strcmp(encoding, "latin-1") == 0)
1197         return PyUnicode_DecodeLatin1(s, size, errors);
1198 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1199     else if (strcmp(encoding, "mbcs") == 0)
1200         return PyUnicode_DecodeMBCS(s, size, errors);
1201 #endif
1202     else if (strcmp(encoding, "ascii") == 0)
1203         return PyUnicode_DecodeASCII(s, size, errors);
1204
1205     /* Decode via the codec registry */
1206     buffer = PyBuffer_FromMemory((void *)s, size);
1207     if (buffer == NULL)
1208         goto onError;
1209     unicode = PyCodec_Decode(buffer, encoding, errors);
1210     if (unicode == NULL)
1211         goto onError;
1212     if (!PyUnicode_Check(unicode)) {
1213         PyErr_Format(PyExc_TypeError,
1214                      "decoder did not return an unicode object (type=%.400s)",
1215                      Py_TYPE(unicode)->tp_name);
1216         Py_DECREF(unicode);
1217         goto onError;
1218     }
1219     Py_DECREF(buffer);
1220     return unicode;
1221
1222   onError:
1223     Py_XDECREF(buffer);
1224     return NULL;
1225 }
1226
1227 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1228                                     const char *encoding,
1229                                     const char *errors)
1230 {
1231     PyObject *v;
1232
1233     if (!PyUnicode_Check(unicode)) {
1234         PyErr_BadArgument();
1235         goto onError;
1236     }
1237
1238     if (encoding == NULL)
1239         encoding = PyUnicode_GetDefaultEncoding();
1240
1241     /* Decode via the codec registry */
1242     v = PyCodec_Decode(unicode, encoding, errors);
1243     if (v == NULL)
1244         goto onError;
1245     return v;
1246
1247   onError:
1248     return NULL;
1249 }
1250
1251 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1252                            Py_ssize_t size,
1253                            const char *encoding,
1254                            const char *errors)
1255 {
1256     PyObject *v, *unicode;
1257
1258     unicode = PyUnicode_FromUnicode(s, size);
1259     if (unicode == NULL)
1260         return NULL;
1261     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1262     Py_DECREF(unicode);
1263     return v;
1264 }
1265
1266 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1267                                     const char *encoding,
1268                                     const char *errors)
1269 {
1270     PyObject *v;
1271
1272     if (!PyUnicode_Check(unicode)) {
1273         PyErr_BadArgument();
1274         goto onError;
1275     }
1276
1277     if (encoding == NULL)
1278         encoding = PyUnicode_GetDefaultEncoding();
1279
1280     /* Encode via the codec registry */
1281     v = PyCodec_Encode(unicode, encoding, errors);
1282     if (v == NULL)
1283         goto onError;
1284     return v;
1285
1286   onError:
1287     return NULL;
1288 }
1289
1290 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1291                                     const char *encoding,
1292                                     const char *errors)
1293 {
1294     PyObject *v;
1295
1296     if (!PyUnicode_Check(unicode)) {
1297         PyErr_BadArgument();
1298         goto onError;
1299     }
1300
1301     if (encoding == NULL)
1302         encoding = PyUnicode_GetDefaultEncoding();
1303
1304     /* Shortcuts for common default encodings */
1305     if (errors == NULL) {
1306         if (strcmp(encoding, "utf-8") == 0)
1307             return PyUnicode_AsUTF8String(unicode);
1308         else if (strcmp(encoding, "latin-1") == 0)
1309             return PyUnicode_AsLatin1String(unicode);
1310 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1311         else if (strcmp(encoding, "mbcs") == 0)
1312             return PyUnicode_AsMBCSString(unicode);
1313 #endif
1314         else if (strcmp(encoding, "ascii") == 0)
1315             return PyUnicode_AsASCIIString(unicode);
1316     }
1317
1318     /* Encode via the codec registry */
1319     v = PyCodec_Encode(unicode, encoding, errors);
1320     if (v == NULL)
1321         goto onError;
1322     if (!PyString_Check(v)) {
1323         PyErr_Format(PyExc_TypeError,
1324                      "encoder did not return a string object (type=%.400s)",
1325                      Py_TYPE(v)->tp_name);
1326         Py_DECREF(v);
1327         goto onError;
1328     }
1329     return v;
1330
1331   onError:
1332     return NULL;
1333 }
1334
1335 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1336                                             const char *errors)
1337 {
1338     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1339
1340     if (v)
1341         return v;
1342     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1343     if (v && errors == NULL)
1344         ((PyUnicodeObject *)unicode)->defenc = v;
1345     return v;
1346 }
1347
1348 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1349 {
1350     if (!PyUnicode_Check(unicode)) {
1351         PyErr_BadArgument();
1352         goto onError;
1353     }
1354     return PyUnicode_AS_UNICODE(unicode);
1355
1356   onError:
1357     return NULL;
1358 }
1359
1360 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1361 {
1362     if (!PyUnicode_Check(unicode)) {
1363         PyErr_BadArgument();
1364         goto onError;
1365     }
1366     return PyUnicode_GET_SIZE(unicode);
1367
1368   onError:
1369     return -1;
1370 }
1371
1372 const char *PyUnicode_GetDefaultEncoding(void)
1373 {
1374     return unicode_default_encoding;
1375 }
1376
1377 int PyUnicode_SetDefaultEncoding(const char *encoding)
1378 {
1379     PyObject *v;
1380
1381     /* Make sure the encoding is valid. As side effect, this also
1382        loads the encoding into the codec registry cache. */
1383     v = _PyCodec_Lookup(encoding);
1384     if (v == NULL)
1385         goto onError;
1386     Py_DECREF(v);
1387     strncpy(unicode_default_encoding,
1388             encoding,
1389             sizeof(unicode_default_encoding));
1390     return 0;
1391
1392   onError:
1393     return -1;
1394 }
1395
1396 /* error handling callback helper:
1397    build arguments, call the callback and check the arguments,
1398    if no exception occurred, copy the replacement to the output
1399    and adjust various state variables.
1400    return 0 on success, -1 on error
1401 */
1402
1403 static
1404 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1405                                      const char *encoding, const char *reason,
1406                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1407                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1408                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1409 {
1410     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1411
1412     PyObject *restuple = NULL;
1413     PyObject *repunicode = NULL;
1414     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1415     Py_ssize_t requiredsize;
1416     Py_ssize_t newpos;
1417     Py_UNICODE *repptr;
1418     Py_ssize_t repsize;
1419     int res = -1;
1420
1421     if (*errorHandler == NULL) {
1422         *errorHandler = PyCodec_LookupError(errors);
1423         if (*errorHandler == NULL)
1424             goto onError;
1425     }
1426
1427     if (*exceptionObject == NULL) {
1428         *exceptionObject = PyUnicodeDecodeError_Create(
1429             encoding, input, insize, *startinpos, *endinpos, reason);
1430         if (*exceptionObject == NULL)
1431             goto onError;
1432     }
1433     else {
1434         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1435             goto onError;
1436         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1437             goto onError;
1438         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1439             goto onError;
1440     }
1441
1442     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1443     if (restuple == NULL)
1444         goto onError;
1445     if (!PyTuple_Check(restuple)) {
1446         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1447         goto onError;
1448     }
1449     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1450         goto onError;
1451     if (newpos<0)
1452         newpos = insize+newpos;
1453     if (newpos<0 || newpos>insize) {
1454         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1455         goto onError;
1456     }
1457
1458     /* need more space? (at least enough for what we
1459        have+the replacement+the rest of the string (starting
1460        at the new input position), so we won't have to check space
1461        when there are no errors in the rest of the string) */
1462     repptr = PyUnicode_AS_UNICODE(repunicode);
1463     repsize = PyUnicode_GET_SIZE(repunicode);
1464     requiredsize = *outpos + repsize + insize-newpos;
1465     if (requiredsize > outsize) {
1466         if (requiredsize<2*outsize)
1467             requiredsize = 2*outsize;
1468         if (_PyUnicode_Resize(output, requiredsize) < 0)
1469             goto onError;
1470         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1471     }
1472     *endinpos = newpos;
1473     *inptr = input + newpos;
1474     Py_UNICODE_COPY(*outptr, repptr, repsize);
1475     *outptr += repsize;
1476     *outpos += repsize;
1477     /* we made it! */
1478     res = 0;
1479
1480   onError:
1481     Py_XDECREF(restuple);
1482     return res;
1483 }
1484
1485 /* --- UTF-7 Codec -------------------------------------------------------- */
1486
1487 /* see RFC2152 for details */
1488
1489 static
1490 char utf7_special[128] = {
1491     /* indicate whether a UTF-7 character is special i.e. cannot be directly
1492        encoded:
1493        0 - not special
1494        1 - special
1495        2 - whitespace (optional)
1496        3 - RFC2152 Set O (optional) */
         /* The table is indexed by code point 0..127, sixteen entries per
            row.  The SPECIAL() macro treats class 1 as always special and
            classes 2 and 3 as special only when its encodeWS / encodeO
            argument is nonzero. */
1497     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, /* 0x00-0x0f: class 2 at 0x09, 0x0a, 0x0d */
1498     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1f */
1499     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, /* 0x20-0x2f: space ... '/' ('+' and '/' are class 1) */
1500     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, /* 0x30-0x3f: '0' ... '?' */
1501     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40-0x4f: '@', 'A' ... 'O' */
1502     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, /* 0x50-0x5f: 'P' ... '_' (backslash is class 1) */
1503     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60-0x6f: '`', 'a' ... 'o' */
1504     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, /* 0x70-0x7f: 'p' ... 0x7f ('~' and 0x7f are class 1) */
1505
1506 };
1507
1508 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
1509    warnings about the comparison always being false; since
1510    utf7_special[0] is 1, we can safely make that one comparison
1511    true  */
1512
     /* SPECIAL(c, encodeO, encodeWS): nonzero when code point c cannot be
        written directly and must go into a base64 shift sequence.  That is
        the case for anything outside 1..127, for table class 1, for class 2
        when encodeWS is nonzero, and for class 3 when encodeO is nonzero.
        The argument c is evaluated several times. */
1513 #define SPECIAL(c, encodeO, encodeWS)                   \
1514     ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
1515      (encodeWS && (utf7_special[(c)] == 2)) ||          \
1516      (encodeO && (utf7_special[(c)] == 3)))
1517
     /* B64(n): the base64 alphabet character for the low six bits of n. */
1518 #define B64(n)                                                          \
1519     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
     /* B64CHAR(c): nonzero when c is alphanumeric, '+' or '/'. */
1520 #define B64CHAR(c)                              \
1521     (isalnum(c) || (c) == '+' || (c) == '/')
     /* UB64(c): the six-bit value of base64 character c, i.e. the inverse of
        B64(): '+' -> 62, '/' -> 63, 'a'.. -> c-71, 'A'.. -> c-65, and c+4 for
        everything else (the digits).  The arithmetic relies on ASCII
        character values and performs no validity check of its own. */
1522 #define UB64(c)                                         \
1523     ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
1524      (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
1525
     /* ENCODE(out, ch, bits): while at least six bits are pending in the
        accumulator ch, write the topmost six of them to *out as a base64
        character and reduce bits by six.  out and bits are modified; ch is
        only read.  The macro expands to a bare `while` statement. */
1526 #define ENCODE(out, ch, bits)                   \
1527     while (bits >= 6) {                         \
1528         *out++ = B64(ch >> (bits-6));           \
1529         bits -= 6;                              \
1530     }
1531
     /* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
        pending in the accumulator ch, take the topmost sixteen as one code
        unit and reduce bits by sixteen.  A unit in 0xDC00..0xDFFF sets
        surrogate, sets errmsg and jumps to utf7Error; the unit that follows
        while surrogate is set is dropped and clears the flag; any other unit
        is stored to *out.

        The macro can only be used inside a function that has a variable
        named `errmsg` and a label named `utf7Error`.

        NOTE(review): the range tested is 0xDC00..0xDFFF although the comments
        inside the macro speak of the high surrogate -- confirm which half of
        a pair is meant to trigger the error. */
1532 #define DECODE(out, ch, bits, surrogate)                                \
1533     while (bits >= 16) {                                                \
1534         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
1535         bits -= 16;                                                     \
1536         if (surrogate) {                                                \
1537             /* We have already generated an error for the high surrogate \
1538                so let's not bother seeing if the low surrogate is correct or not */ \
1539             surrogate = 0;                                              \
1540         } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
1541             /* This is a surrogate pair. Unfortunately we can't represent \
1542                it in a 16-bit character */                              \
1543             surrogate = 1;                                              \
1544             errmsg = "code pairs are not supported";                    \
1545             goto utf7Error;                                             \
1546         } else {                                                        \
1547             *out++ = outCh;                                             \
1548         }                                                               \
1549     }
1550
1551 PyObject *PyUnicode_DecodeUTF7(const char *s,
1552                                Py_ssize_t size,
1553                                const char *errors)
1554 {
1555     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1556 }
1557
1558 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1559                                        Py_ssize_t size,
1560                                        const char *errors,
1561                                        Py_ssize_t *consumed)
1562 {
         /* Decode the `size` bytes starting at `s` as UTF-7 and return the
            result as a new Unicode object, or NULL on error.

            Malformed input is reported through
            unicode_decode_call_errorhandler() with the handler named by
            `errors` and the encoding name "utf7".

            `consumed` selects stateful operation.  When it is NULL, input
            that ends inside a shift sequence is reported as an
            "unterminated shift sequence" error.  When it is non-NULL no such
            error is raised; instead *consumed receives the value of
            startinpos if a shift sequence is still open at the end of the
            input, and the number of bytes processed otherwise. */
1563     const char *starts = s;
1564     Py_ssize_t startinpos;
1565     Py_ssize_t endinpos;
1566     Py_ssize_t outpos;
1567     const char *e;
1568     PyUnicodeObject *unicode;
1569     Py_UNICODE *p;
1570     const char *errmsg = "";
1571     int inShift = 0;                /* nonzero while inside a '+' shift sequence */
1572     unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
1573     unsigned long charsleft = 0;    /* bit accumulator fed six bits at a time */
1574     int surrogate = 0;              /* state flag maintained by DECODE() */
1575     PyObject *errorHandler = NULL;
1576     PyObject *exc = NULL;
1577
         /* The output is allocated with one code unit per input byte and
            shrunk to the real length at the end; the error handler helper
            may enlarge it. */
1578     unicode = _PyUnicode_New(size);
1579     if (!unicode)
1580         return NULL;
1581     if (size == 0) {
1582         if (consumed)
1583             *consumed = 0;
1584         return (PyObject *)unicode;
1585     }
1586
1587     p = unicode->str;
1588     e = s + size;
1589
1590     while (s < e) {
1591         Py_UNICODE ch;
             /* Re-entry point used after the "unterminated shift sequence"
                handler below has moved s back into the input. */
1592       restart:
1593         ch = (unsigned char) *s;
1594
1595         if (inShift) {
                 /* '-' or any non-base64 character ends the shift sequence. */
1596             if ((ch == '-') || !B64CHAR(ch)) {
1597                 inShift = 0;
1598                 s++;
1599
1600                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1601                 if (bitsleft >= 6) {
1602                     /* The shift sequence has a partial character in it. If
1603                        bitsleft < 6 then we could just classify it as padding
1604                        but that is not the case here */
1605
1606                     errmsg = "partial character in shift sequence";
1607                     goto utf7Error;
1608                 }
1609                 /* According to RFC2152 the remaining bits should be zero. We
1610                    choose to signal an error/insert a replacement character
1611                    here to indicate the potential of a misencoded character. */
1612
1613                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1614                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1615                     errmsg = "non-zero padding bits in shift sequence";
1616                     goto utf7Error;
1617                 }
1618
1619                 if (ch == '-') {
                         /* The terminating '-' itself produces no output.  If
                            another '-' follows, a '-' is emitted and shift mode
                            is switched back on without consuming that byte.
                            NOTE(review): re-entering shift mode here looks
                            surprising -- confirm against RFC 2152. */
1620                     if ((s < e) && (*(s) == '-')) {
1621                         *p++ = '-';
1622                         inShift = 1;
1623                     }
1624                 } else if (SPECIAL(ch,0,0)) {
1625                     errmsg = "unexpected special character";
1626                     goto utf7Error;
1627                 } else  {
                         /* An implicit terminator is itself part of the text. */
1628                     *p++ = ch;
1629                 }
1630             } else {
                     /* Base64 character: shift its six bits into the
                        accumulator and emit any complete 16-bit units. */
1631                 charsleft = (charsleft << 6) | UB64(ch);
1632                 bitsleft += 6;
1633                 s++;
1634                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1635             }
1636         }
1637         else if ( ch == '+' ) {
                 /* Remember where the sequence starts for error reporting
                    and for *consumed. */
1638             startinpos = s-starts;
1639             s++;
1640             if (s < e && *s == '-') {
                     /* "+-" stands for a literal '+'. */
1641                 s++;
1642                 *p++ = '+';
1643             } else
1644             {
1645                 inShift = 1;
1646                 bitsleft = 0;
1647             }
1648         }
1649         else if (SPECIAL(ch,0,0)) {
1650             startinpos = s-starts;
1651             errmsg = "unexpected special character";
1652             s++;
1653             goto utf7Error;
1654         }
1655         else {
                 /* Directly encoded character: copy it through. */
1656             *p++ = ch;
1657             s++;
1658         }
1659         continue;
             /* Common error exit of the loop body (also the target of the
                goto inside DECODE()).  The helper may resize `unicode` and
                updates s, p and outpos; decoding then continues with the
                next loop iteration. */
1660       utf7Error:
1661         outpos = p-PyUnicode_AS_UNICODE(unicode);
1662         endinpos = s-starts;
1663         if (unicode_decode_call_errorhandler(
1664                 errors, &errorHandler,
1665                 "utf7", errmsg,
1666                 starts, size, &startinpos, &endinpos, &exc, &s,
1667                 &unicode, &outpos, &p))
1668             goto onError;
1669     }
1670
         /* Input exhausted inside a shift sequence: an error unless the
            caller asked for stateful decoding. */
1671     if (inShift && !consumed) {
1672         outpos = p-PyUnicode_AS_UNICODE(unicode);
1673         endinpos = size;
1674         if (unicode_decode_call_errorhandler(
1675                 errors, &errorHandler,
1676                 "utf7", "unterminated shift sequence",
1677                 starts, size, &startinpos, &endinpos, &exc, &s,
1678                 &unicode, &outpos, &p))
1679             goto onError;
             /* The handler may have positioned s before the end of the
                input; if so, resume decoding from there. */
1680         if (s < e)
1681             goto restart;
1682     }
1683     if (consumed) {
1684         if(inShift)
1685             *consumed = startinpos;
1686         else
1687             *consumed = s-starts;
1688     }
1689
         /* Trim the output to the number of units actually written. */
1690     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1691         goto onError;
1692
1693     Py_XDECREF(errorHandler);
1694     Py_XDECREF(exc);
1695     return (PyObject *)unicode;
1696
1697   onError:
1698     Py_XDECREF(errorHandler);
1699     Py_XDECREF(exc);
1700     Py_DECREF(unicode);
1701     return NULL;
1702 }
1703
1704
1705 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1706                                Py_ssize_t size,
1707                                int encodeSetO,
1708                                int encodeWhiteSpace,
1709                                const char *errors)
1710 {
         /* Encode the `size` Py_UNICODE units at `s` as UTF-7 and return the
            result as a new str object (NULL on failure).

            encodeSetO and encodeWhiteSpace are passed to SPECIAL(): when
            nonzero, the characters of table class 3 respectively class 2 are
            base64-encoded instead of being written directly.

            `errors` is not referenced anywhere in this function. */
1711     PyObject *v;
1712     /* It might be possible to tighten this worst case */
1713     Py_ssize_t cbAllocated = 5 * size;
1714     int inShift = 0;                /* nonzero while a '+' sequence is open */
1715     Py_ssize_t i = 0;
1716     unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
1717     unsigned long charsleft = 0;    /* bit accumulator fed sixteen bits at a time */
1718     char * out;
1719     char * start;
1720
         /* Detect overflow of the 5 * size multiplication by dividing back. */
1721     if (cbAllocated / 5 != size)
1722         return PyErr_NoMemory();
1723
1724     if (size == 0)
1725         return PyString_FromStringAndSize(NULL, 0);
1726
1727     v = PyString_FromStringAndSize(NULL, cbAllocated);
1728     if (v == NULL)
1729         return NULL;
1730
1731     start = out = PyString_AS_STRING(v);
1732     for (;i < size; ++i) {
1733         Py_UNICODE ch = s[i];
1734
1735         if (!inShift) {
1736             if (ch == '+') {
                     /* A literal '+' is written as "+-". */
1737                 *out++ = '+';
1738                 *out++ = '-';
1739             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                     /* Open a shift sequence and emit as many complete
                        base64 characters of this unit as possible; leftover
                        bits keep the sequence open. */
1740                 charsleft = ch;
1741                 bitsleft = 16;
1742                 *out++ = '+';
1743                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1744                 inShift = bitsleft > 0;
1745             } else {
1746                 *out++ = (char) ch;
1747             }
1748         } else {
1749             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                     /* A directly encodable character ends the sequence:
                        flush the pending bits, left-aligned in a final
                        base64 character. */
1750                 *out++ = B64(charsleft << (6-bitsleft));
1751                 charsleft = 0;
1752                 bitsleft = 0;
1753                 /* Characters not in the BASE64 set implicitly unshift the sequence
1754                    so no '-' is required, except if the character is itself a '-' */
1755                 if (B64CHAR(ch) || ch == '-') {
1756                     *out++ = '-';
1757                 }
1758                 inShift = 0;
1759                 *out++ = (char) ch;
1760             } else {
                     /* Another special character: append its sixteen bits to
                        the accumulator and emit the complete groups. */
1761                 bitsleft += 16;
1762                 charsleft = (charsleft << 16) | ch;
1763                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1764
1765                 /* If the next character is special then we don't need to terminate
1766                    the shift sequence. If the next character is not a BASE64 character
1767                    or '-' then the shift sequence will be terminated implicitly and we
1768                    don't have to insert a '-'. */
1769
1770                 if (bitsleft == 0) {
1771                     if (i + 1 < size) {
1772                         Py_UNICODE ch2 = s[i+1];
1773
1774                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                                 /* stay in the shift sequence */
1775
1776                         } else if (B64CHAR(ch2) || ch2 == '-') {
1777                             *out++ = '-';
1778                             inShift = 0;
1779                         } else {
1780                             inShift = 0;
1781                         }
1782
1783                     }
1784                     else {
                             /* End of input: close the sequence explicitly. */
1785                         *out++ = '-';
1786                         inShift = 0;
1787                     }
1788                 }
1789             }
1790         }
1791     }
         /* Bits still pending after the last character: flush them and
            close the sequence. */
1792     if (bitsleft) {
1793         *out++= B64(charsleft << (6-bitsleft) );
1794         *out++ = '-';
1795     }
1796
         /* Shrink the worst-case allocation to the bytes actually written.
            The return value of _PyString_Resize() is not checked here; v is
            returned as that call leaves it. */
1797     _PyString_Resize(&v, out - start);
1798     return v;
1799 }
1800
1801 #undef SPECIAL
1802 #undef B64
1803 #undef B64CHAR
1804 #undef UB64
1805 #undef ENCODE
1806 #undef DECODE
1807
1808 /* --- UTF-8 Codec -------------------------------------------------------- */
1809
/* Map a UTF-8 lead (first) byte to the total length, in bytes, of the
   sequence it introduces.  Zero means the byte cannot start a sequence:
   0x80-0xBF are continuation bytes, and 0xFE/0xFF never occur in UTF-8.

   The 5- and 6-byte entries (0xF8-0xFD) come from the original UTF-8
   definition in RFC 2279; RFC 3629, which obsoletes it, limits UTF-8 to
   four bytes per character.

   The table is pure lookup data and is never written, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00-0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10-0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20-0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30-0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40-0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50-0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60-0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80-0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90-0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0-0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0-0xBF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0-0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0-0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0-0xEF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0-0xFF */
};
1831
1832 PyObject *PyUnicode_DecodeUTF8(const char *s,
1833                                Py_ssize_t size,
1834                                const char *errors)
1835 {
1836     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1837 }
1838
1839 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1840                                        Py_ssize_t size,
1841                                        const char *errors,
1842                                        Py_ssize_t *consumed)
1843 {
1844     const char *starts = s;
1845     int n;
1846     Py_ssize_t startinpos;
1847     Py_ssize_t endinpos;
1848     Py_ssize_t outpos;
1849     const char *e;
1850     PyUnicodeObject *unicode;
1851     Py_UNICODE *p;
1852     const char *errmsg = "";
1853     PyObject *errorHandler = NULL;
1854     PyObject *exc = NULL;
1855
1856     /* Note: size will always be longer than the resulting Unicode
1857        character count */
1858     unicode = _PyUnicode_New(size);
1859     if (!unicode)
1860         return NULL;
1861     if (size == 0) {
1862         if (consumed)
1863             *consumed = 0;
1864         return (PyObject *)unicode;
1865     }
1866
1867     /* Unpack UTF-8 encoded data */
1868     p = unicode->str;
1869     e = s + size;
1870
1871     while (s < e) {
1872         Py_UCS4 ch = (unsigned char)*s;
1873
1874         if (ch < 0x80) {
1875             *p++ = (Py_UNICODE)ch;
1876             s++;
1877             continue;
1878         }
1879
1880         n = utf8_code_length[ch];
1881
1882         if (s + n > e) {
1883             if (consumed)
1884                 break;
1885             else {
1886                 errmsg = "unexpected end of data";
1887                 startinpos = s-starts;
1888                 endinpos = size;
1889                 goto utf8Error;
1890             }
1891         }
1892
1893         switch (n) {
1894
1895         case 0:
1896             errmsg = "unexpected code byte";
1897             startinpos = s-starts;
1898             endinpos = startinpos+1;
1899             goto utf8Error;
1900
1901         case 1:
1902             errmsg = "internal error";
1903             startinpos = s-starts;
1904             endinpos = startinpos+1;
1905             goto utf8Error;
1906
1907         case 2:
1908             if ((s[1] & 0xc0) != 0x80) {
1909                 errmsg = "invalid data";
1910                 startinpos = s-starts;
1911                 endinpos = startinpos+2;
1912                 goto utf8Error;
1913             }
1914             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1915             if (ch < 0x80) {
1916                 startinpos = s-starts;
1917                 endinpos = startinpos+2;
1918                 errmsg = "illegal encoding";
1919                 goto utf8Error;
1920             }
1921             else
1922                 *p++ = (Py_UNICODE)ch;
1923             break;
1924
1925         case 3:
1926             if ((s[1] & 0xc0) != 0x80 ||
1927                 (s[2] & 0xc0) != 0x80) {
1928                 errmsg = "invalid data";
1929                 startinpos = s-starts;
1930                 endinpos = startinpos+3;
1931                 goto utf8Error;
1932             }
1933             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1934             if (ch < 0x0800) {
1935                 /* Note: UTF-8 encodings of surrogates are considered
1936                    legal UTF-8 sequences;
1937
1938                    XXX For wide builds (UCS-4) we should probably try
1939                    to recombine the surrogates into a single code
1940                    unit.
1941                 */
1942                 errmsg = "illegal encoding";
1943                 startinpos = s-starts;
1944                 endinpos = startinpos+3;
1945                 goto utf8Error;
1946             }
1947             else
1948                 *p++ = (Py_UNICODE)ch;
1949             break;
1950
1951         case 4:
1952             if ((s[1] & 0xc0) != 0x80 ||
1953                 (s[2] & 0xc0) != 0x80 ||
1954                 (s[3] & 0xc0) != 0x80) {
1955                 errmsg = "invalid data";
1956                 startinpos = s-starts;
1957                 endinpos = startinpos+4;
1958                 goto utf8Error;
1959             }
1960             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1961                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1962             /* validate and convert to UTF-16 */
1963             if ((ch < 0x10000)        /* minimum value allowed for 4
1964                                          byte encoding */
1965                 || (ch > 0x10ffff))   /* maximum value allowed for
1966                                          UTF-16 */
1967             {
1968                 errmsg = "illegal encoding";
1969                 startinpos = s-starts;
1970                 endinpos = startinpos+4;
1971                 goto utf8Error;
1972             }
1973 #ifdef Py_UNICODE_WIDE
1974             *p++ = (Py_UNICODE)ch;
1975 #else
1976             /*  compute and append the two surrogates: */
1977
1978             /*  translate from 10000..10FFFF to 0..FFFF */
1979             ch -= 0x10000;
1980
1981             /*  high surrogate = top 10 bits added to D800 */
1982             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1983
1984             /*  low surrogate = bottom 10 bits added to DC00 */
1985             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1986 #endif
1987             break;
1988
1989         default:
1990             /* Other sizes are only needed for UCS-4 */
1991             errmsg = "unsupported Unicode code range";
1992             startinpos = s-starts;
1993             endinpos = startinpos+n;
1994             goto utf8Error;
1995         }
1996         s += n;
1997         continue;
1998
1999       utf8Error:
2000         outpos = p-PyUnicode_AS_UNICODE(unicode);
2001         if (unicode_decode_call_errorhandler(
2002                 errors, &errorHandler,
2003                 "utf8", errmsg,
2004                 starts, size, &startinpos, &endinpos, &exc, &s,
2005                 &unicode, &outpos, &p))
2006             goto onError;
2007     }
2008     if (consumed)
2009         *consumed = s-starts;
2010
2011     /* Adjust length */
2012     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2013         goto onError;
2014
2015     Py_XDECREF(errorHandler);
2016     Py_XDECREF(exc);
2017     return (PyObject *)unicode;
2018
2019   onError:
2020     Py_XDECREF(errorHandler);
2021     Py_XDECREF(exc);
2022     Py_DECREF(unicode);
2023     return NULL;
2024 }
2025
2026 /* Allocation strategy:  if the string is short, convert into a stack buffer
2027    and allocate exactly as much space needed at the end.  Else allocate the
2028    maximum possible needed (4 result bytes per Unicode character), and return
2029    the excess memory at the end.
2030 */
2031 PyObject *
2032 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2033                      Py_ssize_t size,
2034                      const char *errors)
2035 {
2036 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2037
2038     Py_ssize_t i;           /* index into s of next input byte */
2039     PyObject *v;        /* result string object */
2040     char *p;            /* next free byte in output buffer */
2041     Py_ssize_t nallocated;  /* number of result bytes allocated */
2042     Py_ssize_t nneeded;        /* number of result bytes needed */
2043     char stackbuf[MAX_SHORT_UNICHARS * 4];
2044
2045     assert(s != NULL);
2046     assert(size >= 0);
2047
2048     if (size <= MAX_SHORT_UNICHARS) {
2049         /* Write into the stack buffer; nallocated can't overflow.
2050          * At the end, we'll allocate exactly as much heap space as it
2051          * turns out we need.
2052          */
2053         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2054         v = NULL;   /* will allocate after we're done */
2055         p = stackbuf;
2056     }
2057     else {
2058         /* Overallocate on the heap, and give the excess back at the end. */
2059         nallocated = size * 4;
2060         if (nallocated / 4 != size)  /* overflow! */
2061             return PyErr_NoMemory();
2062         v = PyString_FromStringAndSize(NULL, nallocated);
2063         if (v == NULL)
2064             return NULL;
2065         p = PyString_AS_STRING(v);
2066     }
2067
2068     for (i = 0; i < size;) {
2069         Py_UCS4 ch = s[i++];
2070
2071         if (ch < 0x80)
2072             /* Encode ASCII */
2073             *p++ = (char) ch;
2074
2075         else if (ch < 0x0800) {
2076             /* Encode Latin-1 */
2077             *p++ = (char)(0xc0 | (ch >> 6));
2078             *p++ = (char)(0x80 | (ch & 0x3f));
2079         }
2080         else {
2081             /* Encode UCS2 Unicode ordinals */
2082             if (ch < 0x10000) {
2083                 /* Special case: check for high surrogate */
2084                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2085                     Py_UCS4 ch2 = s[i];
2086                     /* Check for low surrogate and combine the two to
2087                        form a UCS4 value */
2088                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2089                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2090                         i++;
2091                         goto encodeUCS4;
2092                     }
2093                     /* Fall through: handles isolated high surrogates */
2094                 }
2095                 *p++ = (char)(0xe0 | (ch >> 12));
2096                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2097                 *p++ = (char)(0x80 | (ch & 0x3f));
2098                 continue;
2099             }
2100           encodeUCS4:
2101             /* Encode UCS4 Unicode ordinals */
2102             *p++ = (char)(0xf0 | (ch >> 18));
2103             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2104             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2105             *p++ = (char)(0x80 | (ch & 0x3f));
2106         }
2107     }
2108
2109     if (v == NULL) {
2110         /* This was stack allocated. */
2111         nneeded = p - stackbuf;
2112         assert(nneeded <= nallocated);
2113         v = PyString_FromStringAndSize(stackbuf, nneeded);
2114     }
2115     else {
2116         /* Cut back to size actually needed. */
2117         nneeded = p - PyString_AS_STRING(v);
2118         assert(nneeded <= nallocated);
2119         _PyString_Resize(&v, nneeded);
2120     }
2121     return v;
2122
2123 #undef MAX_SHORT_UNICHARS
2124 }
2125
2126 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2127 {
2128     if (!PyUnicode_Check(unicode)) {
2129         PyErr_BadArgument();
2130         return NULL;
2131     }
2132     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2133                                 PyUnicode_GET_SIZE(unicode),
2134                                 NULL);
2135 }
2136
2137 /* --- UTF-32 Codec ------------------------------------------------------- */
2138
2139 PyObject *
2140 PyUnicode_DecodeUTF32(const char *s,
2141                       Py_ssize_t size,
2142                       const char *errors,
2143                       int *byteorder)
2144 {
2145     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2146 }
2147
2148 PyObject *
2149 PyUnicode_DecodeUTF32Stateful(const char *s,
2150                               Py_ssize_t size,
2151                               const char *errors,
2152                               int *byteorder,
2153                               Py_ssize_t *consumed)
2154 {
2155     const char *starts = s;
2156     Py_ssize_t startinpos;
2157     Py_ssize_t endinpos;
2158     Py_ssize_t outpos;
2159     PyUnicodeObject *unicode;
2160     Py_UNICODE *p;
2161 #ifndef Py_UNICODE_WIDE
2162     int i, pairs;
2163 #else
2164     const int pairs = 0;
2165 #endif
2166     const unsigned char *q, *e;
2167     int bo = 0;       /* assume native ordering by default */
2168     const char *errmsg = "";
2169     /* Offsets from q for retrieving bytes in the right order. */
2170 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2171     int iorder[] = {0, 1, 2, 3};
2172 #else
2173     int iorder[] = {3, 2, 1, 0};
2174 #endif
2175     PyObject *errorHandler = NULL;
2176     PyObject *exc = NULL;
2177     /* On narrow builds we split characters outside the BMP into two
2178        codepoints => count how much extra space we need. */
2179 #ifndef Py_UNICODE_WIDE
2180     for (i = pairs = 0; i < size/4; i++)
2181         if (((Py_UCS4 *)s)[i] >= 0x10000)
2182             pairs++;
2183 #endif
2184
2185     /* This might be one to much, because of a BOM */
2186     unicode = _PyUnicode_New((size+3)/4+pairs);
2187     if (!unicode)
2188         return NULL;
2189     if (size == 0)
2190         return (PyObject *)unicode;
2191
2192     /* Unpack UTF-32 encoded data */
2193     p = unicode->str;
2194     q = (unsigned char *)s;
2195     e = q + size;
2196
2197     if (byteorder)
2198         bo = *byteorder;
2199
2200     /* Check for BOM marks (U+FEFF) in the input and adjust current
2201        byte order setting accordingly. In native mode, the leading BOM
2202        mark is skipped, in all other modes, it is copied to the output
2203        stream as-is (giving a ZWNBSP character). */
2204     if (bo == 0) {
2205         if (size >= 4) {
2206             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2207                 (q[iorder[1]] << 8) | q[iorder[0]];
2208 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2209             if (bom == 0x0000FEFF) {
2210                 q += 4;
2211                 bo = -1;
2212             }
2213             else if (bom == 0xFFFE0000) {
2214                 q += 4;
2215                 bo = 1;
2216             }
2217 #else
2218             if (bom == 0x0000FEFF) {
2219                 q += 4;
2220                 bo = 1;
2221             }
2222             else if (bom == 0xFFFE0000) {
2223                 q += 4;
2224                 bo = -1;
2225             }
2226 #endif
2227         }
2228     }
2229
2230     if (bo == -1) {
2231         /* force LE */
2232         iorder[0] = 0;
2233         iorder[1] = 1;
2234         iorder[2] = 2;
2235         iorder[3] = 3;
2236     }
2237     else if (bo == 1) {
2238         /* force BE */
2239         iorder[0] = 3;
2240         iorder[1] = 2;
2241         iorder[2] = 1;
2242         iorder[3] = 0;
2243     }
2244
2245     while (q < e) {
2246         Py_UCS4 ch;
2247         /* remaining bytes at the end? (size should be divisible by 4) */
2248         if (e-q<4) {
2249             if (consumed)
2250                 break;
2251             errmsg = "truncated data";
2252             startinpos = ((const char *)q)-starts;
2253             endinpos = ((const char *)e)-starts;
2254             goto utf32Error;
2255             /* The remaining input chars are ignored if the callback
2256                chooses to skip the input */
2257         }
2258         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2259             (q[iorder[1]] << 8) | q[iorder[0]];
2260
2261         if (ch >= 0x110000)
2262         {
2263             errmsg = "codepoint not in range(0x110000)";
2264             startinpos = ((const char *)q)-starts;
2265             endinpos = startinpos+4;
2266             goto utf32Error;
2267         }
2268 #ifndef Py_UNICODE_WIDE
2269         if (ch >= 0x10000)
2270         {
2271             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2272             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2273         }
2274         else
2275 #endif
2276             *p++ = ch;
2277         q += 4;
2278         continue;
2279       utf32Error:
2280         outpos = p-PyUnicode_AS_UNICODE(unicode);
2281         if (unicode_decode_call_errorhandler(
2282                 errors, &errorHandler,
2283                 "utf32", errmsg,
2284                 starts, size, &startinpos, &endinpos, &exc, &s,
2285                 &unicode, &outpos, &p))
2286             goto onError;
2287     }
2288
2289     if (byteorder)
2290         *byteorder = bo;
2291
2292     if (consumed)
2293         *consumed = (const char *)q-starts;
2294
2295     /* Adjust length */
2296     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2297         goto onError;
2298
2299     Py_XDECREF(errorHandler);
2300     Py_XDECREF(exc);
2301     return (PyObject *)unicode;
2302
2303   onError:
2304     Py_DECREF(unicode);
2305     Py_XDECREF(errorHandler);
2306     Py_XDECREF(exc);
2307     return NULL;
2308 }
2309
2310 PyObject *
2311 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2312                       Py_ssize_t size,
2313                       const char *errors,
2314                       int byteorder)
2315 {
2316     PyObject *v;
2317     unsigned char *p;
2318     Py_ssize_t nsize, bytesize;
2319 #ifndef Py_UNICODE_WIDE
2320     Py_ssize_t i, pairs;
2321 #else
2322     const int pairs = 0;
2323 #endif
2324     /* Offsets from p for storing byte pairs in the right order. */
2325 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2326     int iorder[] = {0, 1, 2, 3};
2327 #else
2328     int iorder[] = {3, 2, 1, 0};
2329 #endif
2330
2331 #define STORECHAR(CH)                           \
2332     do {                                        \
2333         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2334         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2335         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2336         p[iorder[0]] = (CH) & 0xff;             \
2337         p += 4;                                 \
2338     } while(0)
2339
2340     /* In narrow builds we can output surrogate pairs as one codepoint,
2341        so we need less space. */
2342 #ifndef Py_UNICODE_WIDE
2343     for (i = pairs = 0; i < size-1; i++)
2344         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2345             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2346             pairs++;
2347 #endif
2348     nsize = (size - pairs + (byteorder == 0));
2349     bytesize = nsize * 4;
2350     if (bytesize / 4 != nsize)
2351         return PyErr_NoMemory();
2352     v = PyString_FromStringAndSize(NULL, bytesize);
2353     if (v == NULL)
2354         return NULL;
2355
2356     p = (unsigned char *)PyString_AS_STRING(v);
2357     if (byteorder == 0)
2358         STORECHAR(0xFEFF);
2359     if (size == 0)
2360         return v;
2361
2362     if (byteorder == -1) {
2363         /* force LE */
2364         iorder[0] = 0;
2365         iorder[1] = 1;
2366         iorder[2] = 2;
2367         iorder[3] = 3;
2368     }
2369     else if (byteorder == 1) {
2370         /* force BE */
2371         iorder[0] = 3;
2372         iorder[1] = 2;
2373         iorder[2] = 1;
2374         iorder[3] = 0;
2375     }
2376
2377     while (size-- > 0) {
2378         Py_UCS4 ch = *s++;
2379 #ifndef Py_UNICODE_WIDE
2380         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2381             Py_UCS4 ch2 = *s;
2382             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2383                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2384                 s++;
2385                 size--;
2386             }
2387         }
2388 #endif
2389         STORECHAR(ch);
2390     }
2391     return v;
2392 #undef STORECHAR
2393 }
2394
2395 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2396 {
2397     if (!PyUnicode_Check(unicode)) {
2398         PyErr_BadArgument();
2399         return NULL;
2400     }
2401     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2402                                  PyUnicode_GET_SIZE(unicode),
2403                                  NULL,
2404                                  0);
2405 }
2406
2407 /* --- UTF-16 Codec ------------------------------------------------------- */
2408
2409 PyObject *
2410 PyUnicode_DecodeUTF16(const char *s,
2411                       Py_ssize_t size,
2412                       const char *errors,
2413                       int *byteorder)
2414 {
2415     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2416 }
2417
2418 PyObject *
2419 PyUnicode_DecodeUTF16Stateful(const char *s,
2420                               Py_ssize_t size,
2421                               const char *errors,
2422                               int *byteorder,
2423                               Py_ssize_t *consumed)
2424 {
2425     const char *starts = s;
2426     Py_ssize_t startinpos;
2427     Py_ssize_t endinpos;
2428     Py_ssize_t outpos;
2429     PyUnicodeObject *unicode;
2430     Py_UNICODE *p;
2431     const unsigned char *q, *e;
2432     int bo = 0;       /* assume native ordering by default */
2433     const char *errmsg = "";
2434     /* Offsets from q for retrieving byte pairs in the right order. */
2435 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2436     int ihi = 1, ilo = 0;
2437 #else
2438     int ihi = 0, ilo = 1;
2439 #endif
2440     PyObject *errorHandler = NULL;
2441     PyObject *exc = NULL;
2442
2443     /* Note: size will always be longer than the resulting Unicode
2444        character count */
2445     unicode = _PyUnicode_New(size);
2446     if (!unicode)
2447         return NULL;
2448     if (size == 0)
2449         return (PyObject *)unicode;
2450
2451     /* Unpack UTF-16 encoded data */
2452     p = unicode->str;
2453     q = (unsigned char *)s;
2454     e = q + size;
2455
2456     if (byteorder)
2457         bo = *byteorder;
2458
2459     /* Check for BOM marks (U+FEFF) in the input and adjust current
2460        byte order setting accordingly. In native mode, the leading BOM
2461        mark is skipped, in all other modes, it is copied to the output
2462        stream as-is (giving a ZWNBSP character). */
2463     if (bo == 0) {
2464         if (size >= 2) {
2465             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2466 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2467             if (bom == 0xFEFF) {
2468                 q += 2;
2469                 bo = -1;
2470             }
2471             else if (bom == 0xFFFE) {
2472                 q += 2;
2473                 bo = 1;
2474             }
2475 #else
2476             if (bom == 0xFEFF) {
2477                 q += 2;
2478                 bo = 1;
2479             }
2480             else if (bom == 0xFFFE) {
2481                 q += 2;
2482                 bo = -1;
2483             }
2484 #endif
2485         }
2486     }
2487
2488     if (bo == -1) {
2489         /* force LE */
2490         ihi = 1;
2491         ilo = 0;
2492     }
2493     else if (bo == 1) {
2494         /* force BE */
2495         ihi = 0;
2496         ilo = 1;
2497     }
2498
2499     while (q < e) {
2500         Py_UNICODE ch;
2501         /* remaining bytes at the end? (size should be even) */
2502         if (e-q<2) {
2503             if (consumed)
2504                 break;
2505             errmsg = "truncated data";
2506             startinpos = ((const char *)q)-starts;
2507             endinpos = ((const char *)e)-starts;
2508             goto utf16Error;
2509             /* The remaining input chars are ignored if the callback
2510                chooses to skip the input */
2511         }
2512         ch = (q[ihi] << 8) | q[ilo];
2513
2514         q += 2;
2515
2516         if (ch < 0xD800 || ch > 0xDFFF) {
2517             *p++ = ch;
2518             continue;
2519         }
2520
2521         /* UTF-16 code pair: */
2522         if (q >= e) {
2523             errmsg = "unexpected end of data";
2524             startinpos = (((const char *)q)-2)-starts;
2525             endinpos = ((const char *)e)-starts;
2526             goto utf16Error;
2527         }
2528         if (0xD800 <= ch && ch <= 0xDBFF) {
2529             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2530             q += 2;
2531             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2532 #ifndef Py_UNICODE_WIDE
2533                 *p++ = ch;
2534                 *p++ = ch2;
2535 #else
2536                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2537 #endif
2538                 continue;
2539             }
2540             else {
2541                 errmsg = "illegal UTF-16 surrogate";
2542                 startinpos = (((const char *)q)-4)-starts;
2543                 endinpos = startinpos+2;
2544                 goto utf16Error;
2545             }
2546
2547         }
2548         errmsg = "illegal encoding";
2549         startinpos = (((const char *)q)-2)-starts;
2550         endinpos = startinpos+2;
2551         /* Fall through to report the error */
2552
2553       utf16Error:
2554         outpos = p-PyUnicode_AS_UNICODE(unicode);
2555         if (unicode_decode_call_errorhandler(
2556                 errors, &errorHandler,
2557                 "utf16", errmsg,
2558                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2559                 &unicode, &outpos, &p))
2560             goto onError;
2561     }
2562
2563     if (byteorder)
2564         *byteorder = bo;
2565
2566     if (consumed)
2567         *consumed = (const char *)q-starts;
2568
2569     /* Adjust length */
2570     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2571         goto onError;
2572
2573     Py_XDECREF(errorHandler);
2574     Py_XDECREF(exc);
2575     return (PyObject *)unicode;
2576
2577   onError:
2578     Py_DECREF(unicode);
2579     Py_XDECREF(errorHandler);
2580     Py_XDECREF(exc);
2581     return NULL;
2582 }
2583
2584 PyObject *
2585 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2586                       Py_ssize_t size,
2587                       const char *errors,
2588                       int byteorder)
2589 {
2590     PyObject *v;
2591     unsigned char *p;
2592     Py_ssize_t nsize, bytesize;
2593 #ifdef Py_UNICODE_WIDE
2594     Py_ssize_t i, pairs;
2595 #else
2596     const int pairs = 0;
2597 #endif
2598     /* Offsets from p for storing byte pairs in the right order. */
2599 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2600     int ihi = 1, ilo = 0;
2601 #else
2602     int ihi = 0, ilo = 1;
2603 #endif
2604
2605 #define STORECHAR(CH)                           \
2606     do {                                        \
2607         p[ihi] = ((CH) >> 8) & 0xff;            \
2608         p[ilo] = (CH) & 0xff;                   \
2609         p += 2;                                 \
2610     } while(0)
2611
2612 #ifdef Py_UNICODE_WIDE
2613     for (i = pairs = 0; i < size; i++)
2614         if (s[i] >= 0x10000)
2615             pairs++;
2616 #endif
2617     /* 2 * (size + pairs + (byteorder == 0)) */
2618     if (size > PY_SSIZE_T_MAX ||
2619         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2620         return PyErr_NoMemory();
2621     nsize = size + pairs + (byteorder == 0);
2622     bytesize = nsize * 2;
2623     if (bytesize / 2 != nsize)
2624         return PyErr_NoMemory();
2625     v = PyString_FromStringAndSize(NULL, bytesize);
2626     if (v == NULL)
2627         return NULL;
2628
2629     p = (unsigned char *)PyString_AS_STRING(v);
2630     if (byteorder == 0)
2631         STORECHAR(0xFEFF);
2632     if (size == 0)
2633         return v;
2634
2635     if (byteorder == -1) {
2636         /* force LE */
2637         ihi = 1;
2638         ilo = 0;
2639     }
2640     else if (byteorder == 1) {
2641         /* force BE */
2642         ihi = 0;
2643         ilo = 1;
2644     }
2645
2646     while (size-- > 0) {
2647         Py_UNICODE ch = *s++;
2648         Py_UNICODE ch2 = 0;
2649 #ifdef Py_UNICODE_WIDE
2650         if (ch >= 0x10000) {
2651             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2652             ch  = 0xD800 | ((ch-0x10000) >> 10);
2653         }
2654 #endif
2655         STORECHAR(ch);
2656         if (ch2)
2657             STORECHAR(ch2);
2658     }
2659     return v;
2660 #undef STORECHAR
2661 }
2662
2663 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2664 {
2665     if (!PyUnicode_Check(unicode)) {
2666         PyErr_BadArgument();
2667         return NULL;
2668     }
2669     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2670                                  PyUnicode_GET_SIZE(unicode),
2671                                  NULL,
2672                                  0);
2673 }
2674
2675 /* --- Unicode Escape Codec ----------------------------------------------- */
2676
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   when it needs character-name lookups -- the assignment is not visible
   here, confirm against the rest of the file. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2678
2679 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2680                                         Py_ssize_t size,
2681                                         const char *errors)
2682 {
2683     const char *starts = s;
2684     Py_ssize_t startinpos;
2685     Py_ssize_t endinpos;
2686     Py_ssize_t outpos;
2687     int i;
2688     PyUnicodeObject *v;
2689     Py_UNICODE *p;
2690     const char *end;
2691     char* message;
2692     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2693     PyObject *errorHandler = NULL;
2694     PyObject *exc = NULL;
2695
2696     /* Escaped strings will always be longer than the resulting
2697        Unicode string, so we start with size here and then reduce the
2698        length after conversion to the true value.
2699        (but if the error callback returns a long replacement string
2700        we'll have to allocate more space) */
2701     v = _PyUnicode_New(size);
2702     if (v == NULL)
2703         goto onError;
2704     if (size == 0)
2705         return (PyObject *)v;
2706
2707     p = PyUnicode_AS_UNICODE(v);
2708     end = s + size;
2709
2710     while (s < end) {
2711         unsigned char c;
2712         Py_UNICODE x;
2713         int digits;
2714
2715         /* Non-escape characters are interpreted as Unicode ordinals */
2716         if (*s != '\\') {
2717             *p++ = (unsigned char) *s++;
2718             continue;
2719         }
2720
2721         startinpos = s-starts;
2722         /* \ - Escapes */
2723         s++;
2724         c = *s++;
2725         if (s > end)
2726             c = '\0'; /* Invalid after \ */
2727         switch (c) {
2728
2729             /* \x escapes */
2730         case '\n': break;
2731         case '\\': *p++ = '\\'; break;
2732         case '\'': *p++ = '\''; break;
2733         case '\"': *p++ = '\"'; break;
2734         case 'b': *p++ = '\b'; break;
2735         case 'f': *p++ = '\014'; break; /* FF */
2736         case 't': *p++ = '\t'; break;
2737         case 'n': *p++ = '\n'; break;
2738         case 'r': *p++ = '\r'; break;
2739         case 'v': *p++ = '\013'; break; /* VT */
2740         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2741
2742             /* \OOO (octal) escapes */
2743         case '0': case '1': case '2': case '3':
2744         case '4': case '5': case '6': case '7':
2745             x = s[-1] - '0';
2746             if (s < end && '0' <= *s && *s <= '7') {
2747                 x = (x<<3) + *s++ - '0';
2748                 if (s < end && '0' <= *s && *s <= '7')
2749                     x = (x<<3) + *s++ - '0';
2750             }
2751             *p++ = x;
2752             break;
2753
2754             /* hex escapes */
2755             /* \xXX */
2756         case 'x':
2757             digits = 2;
2758             message = "truncated \\xXX escape";
2759             goto hexescape;
2760
2761             /* \uXXXX */
2762         case 'u':
2763             digits = 4;
2764             message = "truncated \\uXXXX escape";
2765             goto hexescape;
2766
2767             /* \UXXXXXXXX */
2768         case 'U':
2769             digits = 8;
2770             message = "truncated \\UXXXXXXXX escape";
2771         hexescape:
2772             chr = 0;
2773             outpos = p-PyUnicode_AS_UNICODE(v);
2774             if (s+digits>end) {
2775                 endinpos = size;
2776                 if (unicode_decode_call_errorhandler(
2777                         errors, &errorHandler,
2778                         "unicodeescape", "end of string in escape sequence",
2779                         starts, size, &startinpos, &endinpos, &exc, &s,
2780                         &v, &outpos, &p))
2781                     goto onError;
2782                 goto nextByte;
2783             }
2784             for (i = 0; i < digits; ++i) {
2785                 c = (unsigned char) s[i];
2786                 if (!isxdigit(c)) {
2787                     endinpos = (s+i+1)-starts;
2788                     if (unicode_decode_call_errorhandler(
2789                             errors, &errorHandler,
2790                             "unicodeescape", message,
2791                             starts, size, &startinpos, &endinpos, &exc, &s,
2792                             &v, &outpos, &p))
2793                         goto onError;
2794                     goto nextByte;
2795                 }
2796                 chr = (chr<<4) & ~0xF;
2797                 if (c >= '0' && c <= '9')
2798                     chr += c - '0';
2799                 else if (c >= 'a' && c <= 'f')
2800                     chr += 10 + c - 'a';
2801                 else
2802                     chr += 10 + c - 'A';
2803             }
2804             s += i;
2805             if (chr == 0xffffffff && PyErr_Occurred())
2806                 /* _decoding_error will have already written into the
2807                    target buffer. */
2808                 break;
2809         store:
2810             /* when we get here, chr is a 32-bit unicode character */
2811             if (chr <= 0xffff)
2812                 /* UCS-2 character */
2813                 *p++ = (Py_UNICODE) chr;
2814             else if (chr <= 0x10ffff) {
2815                 /* UCS-4 character. Either store directly, or as
2816                    surrogate pair. */
2817 #ifdef Py_UNICODE_WIDE
2818                 *p++ = chr;
2819 #else
2820                 chr -= 0x10000L;
2821                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2822                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2823 #endif
2824             } else {
2825                 endinpos = s-starts;
2826                 outpos = p-PyUnicode_AS_UNICODE(v);
2827                 if (unicode_decode_call_errorhandler(
2828                         errors, &errorHandler,
2829                         "unicodeescape", "illegal Unicode character",
2830                         starts, size, &startinpos, &endinpos, &exc, &s,
2831                         &v, &outpos, &p))
2832                     goto onError;
2833             }
2834             break;
2835
2836             /* \N{name} */
2837         case 'N':
2838             message = "malformed \\N character escape";
2839             if (ucnhash_CAPI == NULL) {
2840                 /* load the unicode data module */
2841                 PyObject *m, *api;
2842                 m = PyImport_ImportModuleNoBlock("unicodedata");
2843                 if (m == NULL)
2844                     goto ucnhashError;
2845                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2846                 Py_DECREF(m);
2847                 if (api == NULL)
2848                     goto ucnhashError;
2849                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2850                 Py_DECREF(api);
2851                 if (ucnhash_CAPI == NULL)
2852                     goto ucnhashError;
2853             }
2854             if (*s == '{') {
2855                 const char *start = s+1;
2856                 /* look for the closing brace */
2857                 while (*s != '}' && s < end)
2858                     s++;
2859                 if (s > start && s < end && *s == '}') {
2860                     /* found a name.  look it up in the unicode database */
2861                     message = "unknown Unicode character name";
2862                     s++;
2863                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2864                         goto store;
2865                 }
2866             }
2867             endinpos = s-starts;
2868             outpos = p-PyUnicode_AS_UNICODE(v);
2869             if (unicode_decode_call_errorhandler(
2870                     errors, &errorHandler,
2871                     "unicodeescape", message,
2872                     starts, size, &startinpos, &endinpos, &exc, &s,
2873                     &v, &outpos, &p))
2874                 goto onError;
2875             break;
2876
2877         default:
2878             if (s > end) {
2879                 message = "\\ at end of string";
2880                 s--;
2881                 endinpos = s-starts;
2882                 outpos = p-PyUnicode_AS_UNICODE(v);
2883                 if (unicode_decode_call_errorhandler(
2884                         errors, &errorHandler,
2885                         "unicodeescape", message,
2886                         starts, size, &startinpos, &endinpos, &exc, &s,
2887                         &v, &outpos, &p))
2888                     goto onError;
2889             }
2890             else {
2891                 *p++ = '\\';
2892                 *p++ = (unsigned char)s[-1];
2893             }
2894             break;
2895         }
2896       nextByte:
2897         ;
2898     }
2899     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2900         goto onError;
2901     Py_XDECREF(errorHandler);
2902     Py_XDECREF(exc);
2903     return (PyObject *)v;
2904
2905   ucnhashError:
2906     PyErr_SetString(
2907         PyExc_UnicodeError,
2908         "\\N escapes not supported (can't load unicodedata module)"
2909         );
2910     Py_XDECREF(v);
2911     Py_XDECREF(errorHandler);
2912     Py_XDECREF(exc);
2913     return NULL;
2914
2915   onError:
2916     Py_XDECREF(v);
2917     Py_XDECREF(errorHandler);
2918     Py_XDECREF(exc);
2919     return NULL;
2920 }
2921
2922 /* Return a Unicode-Escape string version of the Unicode object.
2923
2924    If quotes is true, the string is enclosed in u"" or u'' quotes as
2925    appropriate.
2926
2927 */
2928
2929 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2930                                              Py_ssize_t size,
2931                                              Py_UNICODE ch)
2932 {
2933     /* like wcschr, but doesn't stop at NULL characters */
2934
2935     while (size-- > 0) {
2936         if (*s == ch)
2937             return s;
2938         s++;
2939     }
2940
2941     return NULL;
2942 }
2943
2944 static
2945 PyObject *unicodeescape_string(const Py_UNICODE *s,
2946                                Py_ssize_t size,
2947                                int quotes)
2948 {
2949     PyObject *repr;
2950     char *p;
2951
2952     static const char *hexdigit = "0123456789abcdef";
2953 #ifdef Py_UNICODE_WIDE
2954     const Py_ssize_t expandsize = 10;
2955 #else
2956     const Py_ssize_t expandsize = 6;
2957 #endif
2958
2959     /* XXX(nnorwitz): rather than over-allocating, it would be
2960        better to choose a different scheme.  Perhaps scan the
2961        first N-chars of the string and allocate based on that size.
2962     */
2963     /* Initial allocation is based on the longest-possible unichr
2964        escape.
2965
2966        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2967        unichr, so in this case it's the longest unichr escape. In
2968        narrow (UTF-16) builds this is five chars per source unichr
2969        since there are two unichrs in the surrogate pair, so in narrow
2970        (UTF-16) builds it's not the longest unichr escape.
2971
2972        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2973        so in the narrow (UTF-16) build case it's the longest unichr
2974        escape.
2975     */
2976
2977     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2978         return PyErr_NoMemory();
2979
2980     repr = PyString_FromStringAndSize(NULL,
2981                                       2
2982                                       + expandsize*size
2983                                       + 1);
2984     if (repr == NULL)
2985         return NULL;
2986
2987     p = PyString_AS_STRING(repr);
2988
2989     if (quotes) {
2990         *p++ = 'u';
2991         *p++ = (findchar(s, size, '\'') &&
2992                 !findchar(s, size, '"')) ? '"' : '\'';
2993     }
2994     while (size-- > 0) {
2995         Py_UNICODE ch = *s++;
2996
2997         /* Escape quotes and backslashes */
2998         if ((quotes &&
2999              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3000             *p++ = '\\';
3001             *p++ = (char) ch;
3002             continue;
3003         }
3004
3005 #ifdef Py_UNICODE_WIDE
3006         /* Map 21-bit characters to '\U00xxxxxx' */
3007         else if (ch >= 0x10000) {
3008             *p++ = '\\';
3009             *p++ = 'U';
3010             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3011             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3012             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3013             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3014             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3015             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3016             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3017             *p++ = hexdigit[ch & 0x0000000F];
3018             continue;
3019         }
3020 #else
3021         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3022         else if (ch >= 0xD800 && ch < 0xDC00) {
3023             Py_UNICODE ch2;
3024             Py_UCS4 ucs;
3025
3026             ch2 = *s++;
3027             size--;
3028             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3029                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3030                 *p++ = '\\';
3031                 *p++ = 'U';
3032                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3033                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3034                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3035                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3036                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3037                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3038                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3039                 *p++ = hexdigit[ucs & 0x0000000F];
3040                 continue;
3041             }
3042             /* Fall through: isolated surrogates are copied as-is */
3043             s--;
3044             size++;
3045         }
3046 #endif
3047
3048         /* Map 16-bit characters to '\uxxxx' */
3049         if (ch >= 256) {
3050             *p++ = '\\';
3051             *p++ = 'u';
3052             *p++ = hexdigit[(ch >> 12) & 0x000F];
3053             *p++ = hexdigit[(ch >> 8) & 0x000F];
3054             *p++ = hexdigit[(ch >> 4) & 0x000F];
3055             *p++ = hexdigit[ch & 0x000F];
3056         }
3057
3058         /* Map special whitespace to '\t', \n', '\r' */
3059         else if (ch == '\t') {
3060             *p++ = '\\';
3061             *p++ = 't';
3062         }
3063         else if (ch == '\n') {
3064             *p++ = '\\';
3065             *p++ = 'n';
3066         }
3067         else if (ch == '\r') {
3068             *p++ = '\\';
3069             *p++ = 'r';
3070         }
3071
3072         /* Map non-printable US ASCII to '\xhh' */
3073         else if (ch < ' ' || ch >= 0x7F) {
3074             *p++ = '\\';
3075             *p++ = 'x';
3076             *p++ = hexdigit[(ch >> 4) & 0x000F];
3077             *p++ = hexdigit[ch & 0x000F];
3078         }
3079
3080         /* Copy everything else as-is */
3081         else
3082             *p++ = (char) ch;
3083     }
3084     if (quotes)
3085         *p++ = PyString_AS_STRING(repr)[1];
3086
3087     *p = '\0';
3088     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3089     return repr;
3090 }
3091
3092 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3093                                         Py_ssize_t size)
3094 {
3095     return unicodeescape_string(s, size, 0);
3096 }
3097
3098 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3099 {
3100     if (!PyUnicode_Check(unicode)) {
3101         PyErr_BadArgument();
3102         return NULL;
3103     }
3104     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3105                                          PyUnicode_GET_SIZE(unicode));
3106 }
3107
3108 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3109
3110 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3111                                            Py_ssize_t size,
3112                                            const char *errors)
3113 {
3114     const char *starts = s;
3115     Py_ssize_t startinpos;
3116     Py_ssize_t endinpos;
3117     Py_ssize_t outpos;
3118     PyUnicodeObject *v;
3119     Py_UNICODE *p;
3120     const char *end;
3121     const char *bs;
3122     PyObject *errorHandler = NULL;
3123     PyObject *exc = NULL;
3124
3125     /* Escaped strings will always be longer than the resulting
3126        Unicode string, so we start with size here and then reduce the
3127        length after conversion to the true value. (But decoding error
3128        handler might have to resize the string) */
3129     v = _PyUnicode_New(size);
3130     if (v == NULL)
3131         goto onError;
3132     if (size == 0)
3133         return (PyObject *)v;
3134     p = PyUnicode_AS_UNICODE(v);
3135     end = s + size;
3136     while (s < end) {
3137         unsigned char c;
3138         Py_UCS4 x;
3139         int i;
3140         int count;
3141
3142         /* Non-escape characters are interpreted as Unicode ordinals */
3143         if (*s != '\\') {
3144             *p++ = (unsigned char)*s++;
3145             continue;
3146         }
3147         startinpos = s-starts;
3148
3149         /* \u-escapes are only interpreted iff the number of leading
3150            backslashes if odd */
3151         bs = s;
3152         for (;s < end;) {
3153             if (*s != '\\')
3154                 break;
3155             *p++ = (unsigned char)*s++;
3156         }
3157         if (((s - bs) & 1) == 0 ||
3158             s >= end ||
3159             (*s != 'u' && *s != 'U')) {
3160             continue;
3161         }
3162         p--;
3163         count = *s=='u' ? 4 : 8;
3164         s++;
3165
3166         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3167         outpos = p-PyUnicode_AS_UNICODE(v);
3168         for (x = 0, i = 0; i < count; ++i, ++s) {
3169             c = (unsigned char)*s;
3170             if (!isxdigit(c)) {
3171                 endinpos = s-starts;
3172                 if (unicode_decode_call_errorhandler(
3173                         errors, &errorHandler,
3174                         "rawunicodeescape", "truncated \\uXXXX",
3175                         starts, size, &startinpos, &endinpos, &exc, &s,
3176                         &v, &outpos, &p))
3177                     goto onError;
3178                 goto nextByte;
3179             }
3180             x = (x<<4) & ~0xF;
3181             if (c >= '0' && c <= '9')
3182                 x += c - '0';
3183             else if (c >= 'a' && c <= 'f')
3184                 x += 10 + c - 'a';
3185             else
3186                 x += 10 + c - 'A';
3187         }
3188         if (x <= 0xffff)
3189             /* UCS-2 character */
3190             *p++ = (Py_UNICODE) x;
3191         else if (x <= 0x10ffff) {
3192             /* UCS-4 character. Either store directly, or as
3193                surrogate pair. */
3194 #ifdef Py_UNICODE_WIDE
3195             *p++ = (Py_UNICODE) x;
3196 #else
3197             x -= 0x10000L;
3198             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3199             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3200 #endif
3201         } else {
3202             endinpos = s-starts;
3203             outpos = p-PyUnicode_AS_UNICODE(v);
3204             if (unicode_decode_call_errorhandler(
3205                     errors, &errorHandler,
3206                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3207                     starts, size, &startinpos, &endinpos, &exc, &s,
3208                     &v, &outpos, &p))
3209                 goto onError;
3210         }
3211       nextByte:
3212         ;
3213     }
3214     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3215         goto onError;
3216     Py_XDECREF(errorHandler);
3217     Py_XDECREF(exc);
3218     return (PyObject *)v;
3219
3220   onError:
3221     Py_XDECREF(v);
3222     Py_XDECREF(errorHandler);
3223     Py_XDECREF(exc);
3224     return NULL;
3225 }
3226
3227 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3228                                            Py_ssize_t size)
3229 {
3230     PyObject *repr;
3231     char *p;
3232     char *q;
3233
3234     static const char *hexdigit = "0123456789abcdef";
3235 #ifdef Py_UNICODE_WIDE
3236     const Py_ssize_t expandsize = 10;
3237 #else
3238     const Py_ssize_t expandsize = 6;
3239 #endif
3240
3241     if (size > PY_SSIZE_T_MAX / expandsize)
3242         return PyErr_NoMemory();
3243
3244     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3245     if (repr == NULL)
3246         return NULL;
3247     if (size == 0)
3248         return repr;
3249
3250     p = q = PyString_AS_STRING(repr);
3251     while (size-- > 0) {
3252         Py_UNICODE ch = *s++;
3253 #ifdef Py_UNICODE_WIDE
3254         /* Map 32-bit characters to '\Uxxxxxxxx' */
3255         if (ch >= 0x10000) {
3256             *p++ = '\\';
3257             *p++ = 'U';
3258             *p++ = hexdigit[(ch >> 28) & 0xf];
3259             *p++ = hexdigit[(ch >> 24) & 0xf];
3260             *p++ = hexdigit[(ch >> 20) & 0xf];
3261             *p++ = hexdigit[(ch >> 16) & 0xf];
3262             *p++ = hexdigit[(ch >> 12) & 0xf];
3263             *p++ = hexdigit[(ch >> 8) & 0xf];
3264             *p++ = hexdigit[(ch >> 4) & 0xf];
3265             *p++ = hexdigit[ch & 15];
3266         }
3267         else
3268 #else
3269             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3270             if (ch >= 0xD800 && ch < 0xDC00) {
3271                 Py_UNICODE ch2;
3272                 Py_UCS4 ucs;
3273
3274                 ch2 = *s++;
3275                 size--;
3276                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3277                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3278                     *p++ = '\\';
3279                     *p++ = 'U';
3280                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3281                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3282                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3283                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3284                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3285                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3286                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3287                     *p++ = hexdigit[ucs & 0xf];
3288                     continue;
3289                 }
3290                 /* Fall through: isolated surrogates are copied as-is */
3291                 s--;
3292                 size++;
3293             }
3294 #endif
3295         /* Map 16-bit characters to '\uxxxx' */
3296         if (ch >= 256) {
3297             *p++ = '\\';
3298             *p++ = 'u';
3299             *p++ = hexdigit[(ch >> 12) & 0xf];
3300             *p++ = hexdigit[(ch >> 8) & 0xf];
3301             *p++ = hexdigit[(ch >> 4) & 0xf];
3302             *p++ = hexdigit[ch & 15];
3303         }
3304         /* Copy everything else as-is */
3305         else
3306             *p++ = (char) ch;
3307     }
3308     *p = '\0';
3309     _PyString_Resize(&repr, p - q);
3310     return repr;
3311 }
3312
3313 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3314 {
3315     if (!PyUnicode_Check(unicode)) {
3316         PyErr_BadArgument();
3317         return NULL;
3318     }
3319     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3320                                             PyUnicode_GET_SIZE(unicode));
3321 }
3322
3323 /* --- Unicode Internal Codec ------------------------------------------- */
3324
3325 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3326                                            Py_ssize_t size,
3327                                            const char *errors)
3328 {
3329     const char *starts = s;
3330     Py_ssize_t startinpos;
3331     Py_ssize_t endinpos;
3332     Py_ssize_t outpos;
3333     PyUnicodeObject *v;
3334     Py_UNICODE *p;
3335     const char *end;
3336     const char *reason;
3337     PyObject *errorHandler = NULL;
3338     PyObject *exc = NULL;
3339
3340 #ifdef Py_UNICODE_WIDE
3341     Py_UNICODE unimax = PyUnicode_GetMax();
3342 #endif
3343
3344     /* XXX overflow detection missing */
3345     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3346     if (v == NULL)
3347         goto onError;
3348     if (PyUnicode_GetSize((PyObject *)v) == 0)
3349         return (PyObject *)v;
3350     p = PyUnicode_AS_UNICODE(v);
3351     end = s + size;
3352
3353     while (s < end) {
3354         memcpy(p, s, sizeof(Py_UNICODE));
3355         /* We have to sanity check the raw data, otherwise doom looms for
3356            some malformed UCS-4 data. */
3357         if (
3358 #ifdef Py_UNICODE_WIDE
3359             *p > unimax || *p < 0 ||
3360 #endif
3361             end-s < Py_UNICODE_SIZE
3362             )
3363         {
3364             startinpos = s - starts;
3365             if (end-s < Py_UNICODE_SIZE) {
3366                 endinpos = end-starts;
3367                 reason = "truncated input";
3368             }
3369             else {
3370                 endinpos = s - starts + Py_UNICODE_SIZE;
3371                 reason = "illegal code point (> 0x10FFFF)";
3372             }
3373             outpos = p - PyUnicode_AS_UNICODE(v);
3374             if (unicode_decode_call_errorhandler(
3375                     errors, &errorHandler,
3376                     "unicode_internal", reason,
3377                     starts, size, &startinpos, &endinpos, &exc, &s,
3378                     &v, &outpos, &p)) {
3379                 goto onError;
3380             }
3381         }
3382         else {
3383             p++;
3384             s += Py_UNICODE_SIZE;
3385         }
3386     }
3387
3388     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3389         goto onError;
3390     Py_XDECREF(errorHandler);
3391     Py_XDECREF(exc);
3392     return (PyObject *)v;
3393
3394   onError:
3395     Py_XDECREF(v);
3396     Py_XDECREF(errorHandler);
3397     Py_XDECREF(exc);
3398     return NULL;
3399 }
3400
3401 /* --- Latin-1 Codec ------------------------------------------------------ */
3402
3403 PyObject *PyUnicode_DecodeLatin1(const char *s,
3404                                  Py_ssize_t size,
3405                                  const char *errors)
3406 {
3407     PyUnicodeObject *v;
3408     Py_UNICODE *p;
3409
3410     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3411     if (size == 1) {
3412         Py_UNICODE r = *(unsigned char*)s;
3413         return PyUnicode_FromUnicode(&r, 1);
3414     }
3415
3416     v = _PyUnicode_New(size);
3417     if (v == NULL)
3418         goto onError;
3419     if (size == 0)
3420         return (PyObject *)v;
3421     p = PyUnicode_AS_UNICODE(v);
3422     while (size-- > 0)
3423         *p++ = (unsigned char)*s++;
3424     return (PyObject *)v;
3425
3426   onError:
3427     Py_XDECREF(v);
3428     return NULL;
3429 }
3430
3431 /* create or adjust a UnicodeEncodeError */
3432 static void make_encode_exception(PyObject **exceptionObject,
3433                                   const char *encoding,
3434                                   const Py_UNICODE *unicode, Py_ssize_t size,
3435                                   Py_ssize_t startpos, Py_ssize_t endpos,
3436                                   const char *reason)
3437 {
3438     if (*exceptionObject == NULL) {
3439         *exceptionObject = PyUnicodeEncodeError_Create(
3440             encoding, unicode, size, startpos, endpos, reason);
3441     }
3442     else {
3443         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3444             goto onError;
3445         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3446             goto onError;
3447         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3448             goto onError;
3449         return;
3450       onError:
3451         Py_DECREF(*exceptionObject);
3452         *exceptionObject = NULL;
3453     }
3454 }
3455
3456 /* raises a UnicodeEncodeError */
3457 static void raise_encode_exception(PyObject **exceptionObject,
3458                                    const char *encoding,
3459                                    const Py_UNICODE *unicode, Py_ssize_t size,
3460                                    Py_ssize_t startpos, Py_ssize_t endpos,
3461                                    const char *reason)
3462 {
3463     make_encode_exception(exceptionObject,
3464                           encoding, unicode, size, startpos, endpos, reason);
3465     if (*exceptionObject != NULL)
3466         PyCodec_StrictErrors(*exceptionObject);
3467 }
3468
3469 /* error handling callback helper:
3470    build arguments, call the callback and check the arguments,
3471    put the result into newpos and return the replacement string, which
3472    has to be freed by the caller */
3473 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3474                                                   PyObject **errorHandler,
3475                                                   const char *encoding, const char *reason,
3476                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3477                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3478                                                   Py_ssize_t *newpos)
3479 {
3480     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3481
3482     PyObject *restuple;
3483     PyObject *resunicode;
3484
3485     if (*errorHandler == NULL) {
3486         *errorHandler = PyCodec_LookupError(errors);
3487         if (*errorHandler == NULL)
3488             return NULL;
3489     }
3490
3491     make_encode_exception(exceptionObject,
3492                           encoding, unicode, size, startpos, endpos, reason);
3493     if (*exceptionObject == NULL)
3494         return NULL;
3495
3496     restuple = PyObject_CallFunctionObjArgs(
3497         *errorHandler, *exceptionObject, NULL);
3498     if (restuple == NULL)
3499         return NULL;
3500     if (!PyTuple_Check(restuple)) {
3501         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3502         Py_DECREF(restuple);
3503         return NULL;
3504     }
3505     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3506                           &resunicode, newpos)) {
3507         Py_DECREF(restuple);
3508         return NULL;
3509     }
3510     if (*newpos<0)
3511         *newpos = size+*newpos;
3512     if (*newpos<0 || *newpos>size) {
3513         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3514         Py_DECREF(restuple);
3515         return NULL;
3516     }
3517     Py_INCREF(resunicode);
3518     Py_DECREF(restuple);
3519     return resunicode;
3520 }
3521
3522 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3523                                      Py_ssize_t size,
3524                                      const char *errors,
3525                                      int limit)
3526 {
3527     /* output object */
3528     PyObject *res;
3529     /* pointers to the beginning and end+1 of input */
3530     const Py_UNICODE *startp = p;
3531     const Py_UNICODE *endp = p + size;
3532     /* pointer to the beginning of the unencodable characters */
3533     /* const Py_UNICODE *badp = NULL; */
3534     /* pointer into the output */
3535     char *str;
3536     /* current output position */
3537     Py_ssize_t respos = 0;
3538     Py_ssize_t ressize;
3539     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3540     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3541     PyObject *errorHandler = NULL;
3542     PyObject *exc = NULL;
3543     /* the following variable is used for caching string comparisons
3544      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3545     int known_errorHandler = -1;
3546
3547     /* allocate enough for a simple encoding without
3548        replacements, if we need more, we'll resize */
3549     res = PyString_FromStringAndSize(NULL, size);
3550     if (res == NULL)
3551         goto onError;
3552     if (size == 0)
3553         return res;
3554     str = PyString_AS_STRING(res);
3555     ressize = size;
3556
3557     while (p<endp) {
3558         Py_UNICODE c = *p;
3559
3560         /* can we encode this? */
3561         if (c<limit) {
3562             /* no overflow check, because we know that the space is enough */
3563             *str++ = (char)c;
3564             ++p;
3565         }
3566         else {
3567             Py_ssize_t unicodepos = p-startp;
3568             Py_ssize_t requiredsize;
3569             PyObject *repunicode;
3570             Py_ssize_t repsize;
3571             Py_ssize_t newpos;
3572             Py_ssize_t respos;
3573             Py_UNICODE *uni2;
3574             /* startpos for collecting unencodable chars */
3575             const Py_UNICODE *collstart = p;
3576             const Py_UNICODE *collend = p;
3577             /* find all unecodable characters */
3578             while ((collend < endp) && ((*collend)>=limit))
3579                 ++collend;
3580             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3581             if (known_errorHandler==-1) {
3582                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3583                     known_errorHandler = 1;
3584                 else if (!strcmp(errors, "replace"))
3585                     known_errorHandler = 2;
3586                 else if (!strcmp(errors, "ignore"))
3587                     known_errorHandler = 3;
3588                 else if (!strcmp(errors, "xmlcharrefreplace"))
3589                     known_errorHandler = 4;
3590                 else
3591                     known_errorHandler = 0;
3592             }
3593             switch (known_errorHandler) {
3594             case 1: /* strict */
3595                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3596                 goto onError;
3597             case 2: /* replace */
3598                 while (collstart++<collend)
3599                     *str++ = '?'; /* fall through */
3600             case 3: /* ignore */
3601                 p = collend;
3602                 break;
3603             case 4: /* xmlcharrefreplace */
3604                 respos = str-PyString_AS_STRING(res);
3605                 /* determine replacement size (temporarily (mis)uses p) */
3606                 for (p = collstart, repsize = 0; p < collend; ++p) {
3607                     if (*p<10)
3608                         repsize += 2+1+1;
3609                     else if (*p<100)
3610                         repsize += 2+2+1;
3611                     else if (*p<1000)
3612                         repsize += 2+3+1;
3613                     else if (*p<10000)
3614                         repsize += 2+4+1;
3615 #ifndef Py_UNICODE_WIDE
3616                     else
3617                         repsize += 2+5+1;
3618 #else
3619                     else if (*p<100000)
3620                         repsize += 2+5+1;
3621                     else if (*p<1000000)
3622                         repsize += 2+6+1;
3623                     else
3624                         repsize += 2+7+1;
3625 #endif
3626                 }
3627                 requiredsize = respos+repsize+(endp-collend);
3628                 if (requiredsize > ressize) {
3629                     if (requiredsize<2*ressize)
3630                         requiredsize = 2*ressize;
3631                     if (_PyString_Resize(&res, requiredsize))
3632                         goto onError;
3633                     str = PyString_AS_STRING(res) + respos;
3634                     ressize = requiredsize;
3635                 }
3636                 /* generate replacement (temporarily (mis)uses p) */
3637                 for (p = collstart; p < collend; ++p) {
3638                     str += sprintf(str, "&#%d;", (int)*p);
3639                 }
3640                 p = collend;
3641                 break;
3642             default:
3643                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3644                                                               encoding, reason, startp, size, &exc,
3645                                                               collstart-startp, collend-startp, &newpos);
3646                 if (repunicode == NULL)
3647                     goto onError;
3648                 /* need more space? (at least enough for what we have+the
3649                    replacement+the rest of the string, so we won't have to
3650                    check space for encodable characters) */
3651                 respos = str-PyString_AS_STRING(res);
3652                 repsize = PyUnicode_GET_SIZE(repunicode);
3653                 requiredsize = respos+repsize+(endp-collend);
3654                 if (requiredsize > ressize) {
3655                     if (requiredsize<2*ressize)
3656                         requiredsize = 2*ressize;
3657                     if (_PyString_Resize(&res, requiredsize)) {
3658                         Py_DECREF(repunicode);
3659                         goto onError;
3660                     }
3661                     str = PyString_AS_STRING(res) + respos;
3662                     ressize = requiredsize;
3663                 }
3664                 /* check if there is anything unencodable in the replacement
3665                    and copy it to the output */
3666                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3667                     c = *uni2;
3668                     if (c >= limit) {
3669                         raise_encode_exception(&exc, encoding, startp, size,
3670                                                unicodepos, unicodepos+1, reason);
3671                         Py_DECREF(repunicode);
3672                         goto onError;
3673                     }
3674                     *str = (char)c;
3675                 }
3676                 p = startp + newpos;
3677                 Py_DECREF(repunicode);
3678             }
3679         }
3680     }
3681     /* Resize if we allocated to much */
3682     respos = str-PyString_AS_STRING(res);
3683     if (respos<ressize)
3684         /* If this falls res will be NULL */
3685         _PyString_Resize(&res, respos);
3686     Py_XDECREF(errorHandler);
3687     Py_XDECREF(exc);
3688     return res;
3689
3690   onError:
3691     Py_XDECREF(res);
3692     Py_XDECREF(errorHandler);
3693     Py_XDECREF(exc);
3694     return NULL;
3695 }
3696
3697 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3698                                  Py_ssize_t size,
3699                                  const char *errors)
3700 {
3701     return unicode_encode_ucs1(p, size, errors, 256);
3702 }
3703
3704 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3705 {
3706     if (!PyUnicode_Check(unicode)) {
3707         PyErr_BadArgument();
3708         return NULL;
3709     }
3710     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3711                                   PyUnicode_GET_SIZE(unicode),
3712                                   NULL);
3713 }
3714
3715 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3716
3717 PyObject *PyUnicode_DecodeASCII(const char *s,
3718                                 Py_ssize_t size,
3719                                 const char *errors)
3720 {
3721     const char *starts = s;
3722     PyUnicodeObject *v;
3723     Py_UNICODE *p;
3724     Py_ssize_t startinpos;
3725     Py_ssize_t endinpos;
3726     Py_ssize_t outpos;
3727     const char *e;
3728     PyObject *errorHandler = NULL;
3729     PyObject *exc = NULL;
3730
3731     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3732     if (size == 1 && *(unsigned char*)s < 128) {
3733         Py_UNICODE r = *(unsigned char*)s;
3734         return PyUnicode_FromUnicode(&r, 1);
3735     }
3736
3737     v = _PyUnicode_New(size);
3738     if (v == NULL)
3739         goto onError;
3740     if (size == 0)
3741         return (PyObject *)v;
3742     p = PyUnicode_AS_UNICODE(v);
3743     e = s + size;
3744     while (s < e) {
3745         register unsigned char c = (unsigned char)*s;
3746         if (c < 128) {
3747             *p++ = c;
3748             ++s;
3749         }
3750         else {
3751             startinpos = s-starts;
3752             endinpos = startinpos + 1;
3753             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3754             if (unicode_decode_call_errorhandler(
3755                     errors, &errorHandler,
3756                     "ascii", "ordinal not in range(128)",
3757                     starts, size, &startinpos, &endinpos, &exc, &s,
3758                     &v, &outpos, &p))
3759                 goto onError;
3760         }
3761     }
3762     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3763         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3764             goto onError;
3765     Py_XDECREF(errorHandler);
3766     Py_XDECREF(exc);
3767     return (PyObject *)v;
3768
3769   onError:
3770     Py_XDECREF(v);
3771     Py_XDECREF(errorHandler);
3772     Py_XDECREF(exc);
3773     return NULL;
3774 }
3775
3776 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3777                                 Py_ssize_t size,
3778                                 const char *errors)
3779 {
3780     return unicode_encode_ucs1(p, size, errors, 128);
3781 }
3782
3783 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3784 {
3785     if (!PyUnicode_Check(unicode)) {
3786         PyErr_BadArgument();
3787         return NULL;
3788     }
3789     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3790                                  PyUnicode_GET_SIZE(unicode),
3791                                  NULL);
3792 }
3793
3794 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3795
3796 /* --- MBCS codecs for Windows -------------------------------------------- */
3797
3798 #if SIZEOF_INT < SIZEOF_SIZE_T
3799 #define NEED_RETRY
3800 #endif
3801
3802 /* XXX This code is limited to "true" double-byte encodings, as
3803    a) it assumes an incomplete character consists of a single byte, and
3804    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3805    encodings, see IsDBCSLeadByteEx documentation. */
3806
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   before it (found with CharPrev) does not make it a trail byte: either
   there is no previous character, the previous character does not start
   with a lead byte, or the previous character is a full two-byte one. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3817
3818 /*
3819  * Decode MBCS string into unicode object. If 'final' is set, converts
3820  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3821  */
3822 static int decode_mbcs(PyUnicodeObject **v,
3823                        const char *s, /* MBCS string */
3824                        int size, /* sizeof MBCS string */
3825                        int final)
3826 {
3827     Py_UNICODE *p;
3828     Py_ssize_t n = 0;
3829     int usize = 0;
3830
3831     assert(size >= 0);
3832
3833     /* Skip trailing lead-byte unless 'final' is set */
3834     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3835         --size;
3836
3837     /* First get the size of the result */
3838     if (size > 0) {
3839         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3840         if (usize == 0) {
3841             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3842             return -1;
3843         }
3844     }
3845
3846     if (*v == NULL) {
3847         /* Create unicode object */
3848         *v = _PyUnicode_New(usize);
3849         if (*v == NULL)
3850             return -1;
3851     }
3852     else {
3853         /* Extend unicode object */
3854         n = PyUnicode_GET_SIZE(*v);
3855         if (_PyUnicode_Resize(v, n + usize) < 0)
3856             return -1;
3857     }
3858
3859     /* Do the conversion */
3860     if (size > 0) {
3861         p = PyUnicode_AS_UNICODE(*v) + n;
3862         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3863             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3864             return -1;
3865         }
3866     }
3867
3868     return size;
3869 }
3870
3871 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3872                                        Py_ssize_t size,
3873                                        const char *errors,
3874                                        Py_ssize_t *consumed)
3875 {
3876     PyUnicodeObject *v = NULL;
3877     int done;
3878
3879     if (consumed)
3880         *consumed = 0;
3881
3882 #ifdef NEED_RETRY
3883   retry:
3884     if (size > INT_MAX)
3885         done = decode_mbcs(&v, s, INT_MAX, 0);
3886     else
3887 #endif
3888         done = decode_mbcs(&v, s, (int)size, !consumed);
3889
3890     if (done < 0) {
3891         Py_XDECREF(v);
3892         return NULL;
3893     }
3894
3895     if (consumed)
3896         *consumed += done;
3897
3898 #ifdef NEED_RETRY
3899     if (size > INT_MAX) {
3900         s += done;
3901         size -= done;
3902         goto retry;
3903     }
3904 #endif
3905
3906     return (PyObject *)v;
3907 }
3908
3909 PyObject *PyUnicode_DecodeMBCS(const char *s,
3910                                Py_ssize_t size,
3911                                const char *errors)
3912 {
3913     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3914 }
3915
3916 /*
3917  * Convert unicode into string object (MBCS).
3918  * Returns 0 if succeed, -1 otherwise.
3919  */
3920 static int encode_mbcs(PyObject **repr,
3921                        const Py_UNICODE *p, /* unicode */
3922                        int size) /* size of unicode */
3923 {
3924     int mbcssize = 0;
3925     Py_ssize_t n = 0;
3926
3927     assert(size >= 0);
3928
3929     /* First get the size of the result */
3930     if (size > 0) {
3931         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3932         if (mbcssize == 0) {
3933             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3934             return -1;
3935         }
3936     }
3937
3938     if (*repr == NULL) {
3939         /* Create string object */
3940         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3941         if (*repr == NULL)
3942             return -1;
3943     }
3944     else {
3945         /* Extend string object */
3946         n = PyString_Size(*repr);
3947         if (_PyString_Resize(repr, n + mbcssize) < 0)
3948             return -1;
3949     }
3950
3951     /* Do the conversion */
3952     if (size > 0) {
3953         char *s = PyString_AS_STRING(*repr) + n;
3954         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3955             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3956             return -1;
3957         }
3958     }
3959
3960     return 0;
3961 }
3962
3963 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3964                                Py_ssize_t size,
3965                                const char *errors)
3966 {
3967     PyObject *repr = NULL;
3968     int ret;
3969
3970 #ifdef NEED_RETRY
3971   retry:
3972     if (size > INT_MAX)
3973         ret = encode_mbcs(&repr, p, INT_MAX);
3974     else
3975 #endif
3976         ret = encode_mbcs(&repr, p, (int)size);
3977
3978     if (ret < 0) {
3979         Py_XDECREF(repr);
3980         return NULL;
3981     }
3982
3983 #ifdef NEED_RETRY
3984     if (size > INT_MAX) {
3985         p += INT_MAX;
3986         size -= INT_MAX;
3987         goto retry;
3988     }
3989 #endif
3990
3991     return repr;
3992 }
3993
3994 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3995 {
3996     if (!PyUnicode_Check(unicode)) {
3997         PyErr_BadArgument();
3998         return NULL;
3999     }
4000     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4001                                 PyUnicode_GET_SIZE(unicode),
4002                                 NULL);
4003 }
4004
4005 #undef NEED_RETRY
4006
4007 #endif /* MS_WINDOWS */
4008
4009 /* --- Character Mapping Codec -------------------------------------------- */
4010
4011 PyObject *PyUnicode_DecodeCharmap(const char *s,
4012                                   Py_ssize_t size,
4013                                   PyObject *mapping,
4014                                   const char *errors)
4015 {
4016     const char *starts = s;
4017     Py_ssize_t startinpos;
4018     Py_ssize_t endinpos;
4019     Py_ssize_t outpos;
4020     const char *e;
4021     PyUnicodeObject *v;
4022     Py_UNICODE *p;
4023     Py_ssize_t extrachars = 0;
4024     PyObject *errorHandler = NULL;
4025     PyObject *exc = NULL;
4026     Py_UNICODE *mapstring = NULL;
4027     Py_ssize_t maplen = 0;
4028
4029     /* Default to Latin-1 */
4030     if (mapping == NULL)
4031         return PyUnicode_DecodeLatin1(s, size, errors);
4032
4033     v = _PyUnicode_New(size);
4034     if (v == NULL)
4035         goto onError;
4036     if (size == 0)
4037         return (PyObject *)v;
4038     p = PyUnicode_AS_UNICODE(v);
4039     e = s + size;
4040     if (PyUnicode_CheckExact(mapping)) {
4041         mapstring = PyUnicode_AS_UNICODE(mapping);
4042         maplen = PyUnicode_GET_SIZE(mapping);
4043         while (s < e) {
4044             unsigned char ch = *s;
4045             Py_UNICODE x = 0xfffe; /* illegal value */
4046
4047             if (ch < maplen)
4048                 x = mapstring[ch];
4049
4050             if (x == 0xfffe) {
4051                 /* undefined mapping */
4052                 outpos = p-PyUnicode_AS_UNICODE(v);
4053                 startinpos = s-starts;
4054                 endinpos = startinpos+1;
4055                 if (unicode_decode_call_errorhandler(
4056                         errors, &errorHandler,
4057                         "charmap", "character maps to <undefined>",
4058                         starts, size, &startinpos, &endinpos, &exc, &s,
4059                         &v, &outpos, &p)) {
4060                     goto onError;
4061                 }
4062                 continue;
4063             }
4064             *p++ = x;
4065             ++s;
4066         }
4067     }
4068     else {
4069         while (s < e) {
4070             unsigned char ch = *s;
4071             PyObject *w, *x;
4072
4073             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4074             w = PyInt_FromLong((long)ch);
4075             if (w == NULL)
4076                 goto onError;
4077             x = PyObject_GetItem(mapping, w);
4078             Py_DECREF(w);
4079             if (x == NULL) {
4080                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4081                     /* No mapping found means: mapping is undefined. */
4082                     PyErr_Clear();
4083                     x = Py_None;
4084                     Py_INCREF(x);
4085                 } else
4086                     goto onError;
4087             }
4088
4089             /* Apply mapping */
4090             if (PyInt_Check(x)) {
4091                 long value = PyInt_AS_LONG(x);
4092                 if (value < 0 || value > 65535) {
4093                     PyErr_SetString(PyExc_TypeError,
4094                                     "character mapping must be in range(65536)");
4095                     Py_DECREF(x);
4096                     goto onError;
4097                 }
4098                 *p++ = (Py_UNICODE)value;
4099             }
4100             else if (x == Py_None) {
4101                 /* undefined mapping */
4102                 outpos = p-PyUnicode_AS_UNICODE(v);
4103                 startinpos = s-starts;
4104                 endinpos = startinpos+1;
4105                 if (unicode_decode_call_errorhandler(
4106                         errors, &errorHandler,
4107                         "charmap", "character maps to <undefined>",
4108                         starts, size, &startinpos, &endinpos, &exc, &s,
4109                         &v, &outpos, &p)) {
4110                     Py_DECREF(x);
4111                     goto onError;
4112                 }
4113                 Py_DECREF(x);
4114                 continue;
4115             }
4116             else if (PyUnicode_Check(x)) {
4117                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4118
4119                 if (targetsize == 1)
4120                     /* 1-1 mapping */
4121                     *p++ = *PyUnicode_AS_UNICODE(x);
4122
4123                 else if (targetsize > 1) {
4124                     /* 1-n mapping */
4125                     if (targetsize > extrachars) {
4126                         /* resize first */
4127                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4128                         Py_ssize_t needed = (targetsize - extrachars) + \
4129                             (targetsize << 2);
4130                         extrachars += needed;
4131                         /* XXX overflow detection missing */
4132                         if (_PyUnicode_Resize(&v,
4133                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4134                             Py_DECREF(x);
4135                             goto onError;
4136                         }
4137                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4138                     }
4139                     Py_UNICODE_COPY(p,
4140                                     PyUnicode_AS_UNICODE(x),
4141                                     targetsize);
4142                     p += targetsize;
4143                     extrachars -= targetsize;
4144                 }
4145                 /* 1-0 mapping: skip the character */
4146             }
4147             else {
4148                 /* wrong return value */
4149                 PyErr_SetString(PyExc_TypeError,
4150                                 "character mapping must return integer, None or unicode");
4151                 Py_DECREF(x);
4152                 goto onError;
4153             }
4154             Py_DECREF(x);
4155             ++s;
4156         }
4157     }
4158     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4159         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4160             goto onError;
4161     Py_XDECREF(errorHandler);
4162     Py_XDECREF(exc);
4163     return (PyObject *)v;
4164
4165   onError:
4166     Py_XDECREF(errorHandler);
4167     Py_XDECREF(exc);
4168     Py_XDECREF(v);
4169     return NULL;
4170 }
4171
4172 /* Charmap encoding: the lookup table */
4173
4174 struct encoding_map{
4175     PyObject_HEAD
4176     unsigned char level1[32];
4177     int count2, count3;
4178     unsigned char level23[1];
4179 };
4180
4181 static PyObject*
4182 encoding_map_size(PyObject *obj, PyObject* args)
4183 {
4184     struct encoding_map *map = (struct encoding_map*)obj;
4185     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4186                           128*map->count3);
4187 }
4188
4189 static PyMethodDef encoding_map_methods[] = {
4190     {"size", encoding_map_size, METH_NOARGS,
4191      PyDoc_STR("Return the size (in bytes) of this object") },
4192     { 0 }
4193 };
4194
4195 static void
4196 encoding_map_dealloc(PyObject* o)
4197 {
4198     PyObject_FREE(o);
4199 }
4200
/* Type object for the encoding maps created by
   PyUnicode_BuildEncodingMap().  Apart from the name, basic size and
   default flags, only tp_dealloc and tp_methods are filled in; every
   other slot is left as 0.  The initializer is positional, so the order
   of the entries below must match the PyTypeObject field order. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4244
4245 PyObject*
4246 PyUnicode_BuildEncodingMap(PyObject* string)
4247 {
4248     Py_UNICODE *decode;
4249     PyObject *result;
4250     struct encoding_map *mresult;
4251     int i;
4252     int need_dict = 0;
4253     unsigned char level1[32];
4254     unsigned char level2[512];
4255     unsigned char *mlevel1, *mlevel2, *mlevel3;
4256     int count2 = 0, count3 = 0;
4257
4258     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4259         PyErr_BadArgument();
4260         return NULL;
4261     }
4262     decode = PyUnicode_AS_UNICODE(string);
4263     memset(level1, 0xFF, sizeof level1);
4264     memset(level2, 0xFF, sizeof level2);
4265
4266     /* If there isn't a one-to-one mapping of NULL to \0,
4267        or if there are non-BMP characters, we need to use
4268        a mapping dictionary. */
4269     if (decode[0] != 0)
4270         need_dict = 1;
4271     for (i = 1; i < 256; i++) {
4272         int l1, l2;
4273         if (decode[i] == 0
4274 #ifdef Py_UNICODE_WIDE
4275             || decode[i] > 0xFFFF
4276 #endif
4277             ) {
4278             need_dict = 1;
4279             break;
4280         }
4281         if (decode[i] == 0xFFFE)
4282             /* unmapped character */
4283             continue;
4284         l1 = decode[i] >> 11;
4285         l2 = decode[i] >> 7;
4286         if (level1[l1] == 0xFF)
4287             level1[l1] = count2++;
4288         if (level2[l2] == 0xFF)
4289             level2[l2] = count3++;
4290     }
4291
4292     if (count2 >= 0xFF || count3 >= 0xFF)
4293         need_dict = 1;
4294
4295     if (need_dict) {
4296         PyObject *result = PyDict_New();
4297         PyObject *key, *value;
4298         if (!result)
4299             return NULL;
4300         for (i = 0; i < 256; i++) {
4301             key = value = NULL;
4302             key = PyInt_FromLong(decode[i]);
4303             value = PyInt_FromLong(i);
4304             if (!key || !value)
4305                 goto failed1;
4306             if (PyDict_SetItem(result, key, value) == -1)
4307                 goto failed1;
4308             Py_DECREF(key);
4309             Py_DECREF(value);
4310         }
4311         return result;
4312       failed1:
4313         Py_XDECREF(key);
4314         Py_XDECREF(value);
4315         Py_DECREF(result);
4316         return NULL;
4317     }
4318
4319     /* Create a three-level trie */
4320     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4321                              16*count2 + 128*count3 - 1);
4322     if (!result)
4323         return PyErr_NoMemory();
4324     PyObject_Init(result, &EncodingMapType);
4325     mresult = (struct encoding_map*)result;
4326     mresult->count2 = count2;
4327     mresult->count3 = count3;
4328     mlevel1 = mresult->level1;
4329     mlevel2 = mresult->level23;
4330     mlevel3 = mresult->level23 + 16*count2;
4331     memcpy(mlevel1, level1, 32);
4332     memset(mlevel2, 0xFF, 16*count2);
4333     memset(mlevel3, 0, 128*count3);
4334     count3 = 0;
4335     for (i = 1; i < 256; i++) {
4336         int o1, o2, o3, i2, i3;
4337         if (decode[i] == 0xFFFE)
4338             /* unmapped character */
4339             continue;
4340         o1 = decode[i]>>11;
4341         o2 = (decode[i]>>7) & 0xF;
4342         i2 = 16*mlevel1[o1] + o2;
4343         if (mlevel2[i2] == 0xFF)
4344             mlevel2[i2] = count3++;
4345         o3 = decode[i] & 0x7F;
4346         i3 = 128*mlevel2[i2] + o3;
4347         mlevel3[i3] = i;
4348     }
4349     return result;
4350 }
4351
4352 static int
4353 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4354 {
4355     struct encoding_map *map = (struct encoding_map*)mapping;
4356     int l1 = c>>11;
4357     int l2 = (c>>7) & 0xF;
4358     int l3 = c & 0x7F;
4359     int i;
4360
4361 #ifdef Py_UNICODE_WIDE
4362     if (c > 0xFFFF) {
4363         return -1;
4364     }
4365 #endif
4366     if (c == 0)
4367         return 0;
4368     /* level 1*/
4369     i = map->level1[l1];
4370     if (i == 0xFF) {
4371         return -1;
4372     }
4373     /* level 2*/
4374     i = map->level23[16*i+l2];
4375     if (i == 0xFF) {
4376         return -1;
4377     }
4378     /* level 3 */
4379     i = map->level23[16*map->count2 + 128*i + l3];
4380     if (i == 0) {
4381         return -1;
4382     }
4383     return i;
4384 }
4385
4386 /* Lookup the character ch in the mapping. If the character
4387    can't be found, Py_None is returned (or NULL, if another
4388    error occurred). */
4389 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4390 {
4391     PyObject *w = PyInt_FromLong((long)c);
4392     PyObject *x;
4393
4394     if (w == NULL)
4395         return NULL;
4396     x = PyObject_GetItem(mapping, w);
4397     Py_DECREF(w);
4398     if (x == NULL) {
4399         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4400             /* No mapping found means: mapping is undefined. */
4401             PyErr_Clear();
4402             x = Py_None;
4403             Py_INCREF(x);
4404             return x;
4405         } else
4406             return NULL;
4407     }
4408     else if (x == Py_None)
4409         return x;
4410     else if (PyInt_Check(x)) {
4411         long value = PyInt_AS_LONG(x);
4412         if (value < 0 || value > 255) {
4413             PyErr_SetString(PyExc_TypeError,
4414                             "character mapping must be in range(256)");
4415             Py_DECREF(x);
4416             return NULL;
4417         }
4418         return x;
4419     }
4420     else if (PyString_Check(x))
4421         return x;
4422     else {
4423         /* wrong return value */
4424         PyErr_SetString(PyExc_TypeError,
4425                         "character mapping must return integer, None or str");
4426         Py_DECREF(x);
4427         return NULL;
4428     }
4429 }
4430
4431 static int
4432 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4433 {
4434     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4435     /* exponentially overallocate to minimize reallocations */
4436     if (requiredsize < 2*outsize)
4437         requiredsize = 2*outsize;
4438     if (_PyString_Resize(outobj, requiredsize)) {
4439         return 0;
4440     }
4441     return 1;
4442 }
4443
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded into the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or reallocation error occurred */
} charmapencode_result;
4447 /* lookup the character, put the result in the output string and adjust
4448    various state variables. Reallocate the output string if not enough
4449    space is available. Return a new reference to the object that
4450    was put in the output buffer, or Py_None, if the mapping was undefined
4451    (in which case no character was written) or NULL, if a
4452    reallocation error occurred. The caller must decref the result */
4453 static
4454 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4455                                           PyObject **outobj, Py_ssize_t *outpos)
4456 {
4457     PyObject *rep;
4458     char *outstart;
4459     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4460
4461     if (Py_TYPE(mapping) == &EncodingMapType) {
4462         int res = encoding_map_lookup(c, mapping);
4463         Py_ssize_t requiredsize = *outpos+1;
4464         if (res == -1)
4465             return enc_FAILED;
4466         if (outsize<requiredsize)
4467             if (!charmapencode_resize(outobj, outpos, requiredsize))
4468                 return enc_EXCEPTION;
4469         outstart = PyString_AS_STRING(*outobj);
4470         outstart[(*outpos)++] = (char)res;
4471         return enc_SUCCESS;
4472     }
4473
4474     rep = charmapencode_lookup(c, mapping);
4475     if (rep==NULL)
4476         return enc_EXCEPTION;
4477     else if (rep==Py_None) {
4478         Py_DECREF(rep);
4479         return enc_FAILED;
4480     } else {
4481         if (PyInt_Check(rep)) {
4482             Py_ssize_t requiredsize = *outpos+1;
4483             if (outsize<requiredsize)
4484                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4485                     Py_DECREF(rep);
4486                     return enc_EXCEPTION;
4487                 }
4488             outstart = PyString_AS_STRING(*outobj);
4489             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4490         }
4491         else {
4492             const char *repchars = PyString_AS_STRING(rep);
4493             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4494             Py_ssize_t requiredsize = *outpos+repsize;
4495             if (outsize<requiredsize)
4496                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4497                     Py_DECREF(rep);
4498                     return enc_EXCEPTION;
4499                 }
4500             outstart = PyString_AS_STRING(*outobj);
4501             memcpy(outstart + *outpos, repchars, repsize);
4502             *outpos += repsize;
4503         }
4504     }
4505     Py_DECREF(rep);
4506     return enc_SUCCESS;
4507 }
4508
4509 /* handle an error in PyUnicode_EncodeCharmap
4510    Return 0 on success, -1 on error */
4511 static
4512 int charmap_encoding_error(
4513     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4514     PyObject **exceptionObject,
4515     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4516     PyObject **res, Py_ssize_t *respos)
4517 {
4518     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4519     Py_ssize_t repsize;
4520     Py_ssize_t newpos;
4521     Py_UNICODE *uni2;
4522     /* startpos for collecting unencodable chars */
4523     Py_ssize_t collstartpos = *inpos;
4524     Py_ssize_t collendpos = *inpos+1;
4525     Py_ssize_t collpos;
4526     char *encoding = "charmap";
4527     char *reason = "character maps to <undefined>";
4528     charmapencode_result x;
4529
4530     /* find all unencodable characters */
4531     while (collendpos < size) {
4532         PyObject *rep;
4533         if (Py_TYPE(mapping) == &EncodingMapType) {
4534             int res = encoding_map_lookup(p[collendpos], mapping);
4535             if (res != -1)
4536                 break;
4537             ++collendpos;
4538             continue;
4539         }
4540
4541         rep = charmapencode_lookup(p[collendpos], mapping);
4542         if (rep==NULL)
4543             return -1;
4544         else if (rep!=Py_None) {
4545             Py_DECREF(rep);
4546             break;
4547         }
4548         Py_DECREF(rep);
4549         ++collendpos;
4550     }
4551     /* cache callback name lookup
4552      * (if not done yet, i.e. it's the first error) */
4553     if (*known_errorHandler==-1) {
4554         if ((errors==NULL) || (!strcmp(errors, "strict")))
4555             *known_errorHandler = 1;
4556         else if (!strcmp(errors, "replace"))
4557             *known_errorHandler = 2;
4558         else if (!strcmp(errors, "ignore"))
4559             *known_errorHandler = 3;
4560         else if (!strcmp(errors, "xmlcharrefreplace"))
4561             *known_errorHandler = 4;
4562         else
4563             *known_errorHandler = 0;
4564     }
4565     switch (*known_errorHandler) {
4566     case 1: /* strict */
4567         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4568         return -1;
4569     case 2: /* replace */
4570         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4571             x = charmapencode_output('?', mapping, res, respos);
4572             if (x==enc_EXCEPTION) {
4573                 return -1;
4574             }
4575             else if (x==enc_FAILED) {
4576                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4577                 return -1;
4578             }
4579         }
4580         /* fall through */
4581     case 3: /* ignore */
4582         *inpos = collendpos;
4583         break;
4584     case 4: /* xmlcharrefreplace */
4585         /* generate replacement (temporarily (mis)uses p) */
4586         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4587             char buffer[2+29+1+1];
4588             char *cp;
4589             sprintf(buffer, "&#%d;", (int)p[collpos]);
4590             for (cp = buffer; *cp; ++cp) {
4591                 x = charmapencode_output(*cp, mapping, res, respos);
4592                 if (x==enc_EXCEPTION)
4593                     return -1;
4594                 else if (x==enc_FAILED) {
4595                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4596                     return -1;
4597                 }
4598             }
4599         }
4600         *inpos = collendpos;
4601         break;
4602     default:
4603         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4604                                                       encoding, reason, p, size, exceptionObject,
4605                                                       collstartpos, collendpos, &newpos);
4606         if (repunicode == NULL)
4607             return -1;
4608         /* generate replacement  */
4609         repsize = PyUnicode_GET_SIZE(repunicode);
4610         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4611             x = charmapencode_output(*uni2, mapping, res, respos);
4612             if (x==enc_EXCEPTION) {
4613                 return -1;
4614             }
4615             else if (x==enc_FAILED) {
4616                 Py_DECREF(repunicode);
4617                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4618                 return -1;
4619             }
4620         }
4621         *inpos = newpos;
4622         Py_DECREF(repunicode);
4623     }
4624     return 0;
4625 }
4626
4627 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4628                                   Py_ssize_t size,
4629                                   PyObject *mapping,
4630                                   const char *errors)
4631 {
4632     /* output object */
4633     PyObject *res = NULL;
4634     /* current input position */
4635     Py_ssize_t inpos = 0;
4636     /* current output position */
4637     Py_ssize_t respos = 0;
4638     PyObject *errorHandler = NULL;
4639     PyObject *exc = NULL;
4640     /* the following variable is used for caching string comparisons
4641      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4642      * 3=ignore, 4=xmlcharrefreplace */
4643     int known_errorHandler = -1;
4644
4645     /* Default to Latin-1 */
4646     if (mapping == NULL)
4647         return PyUnicode_EncodeLatin1(p, size, errors);
4648
4649     /* allocate enough for a simple encoding without
4650        replacements, if we need more, we'll resize */
4651     res = PyString_FromStringAndSize(NULL, size);
4652     if (res == NULL)
4653         goto onError;
4654     if (size == 0)
4655         return res;
4656
4657     while (inpos<size) {
4658         /* try to encode it */
4659         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4660         if (x==enc_EXCEPTION) /* error */
4661             goto onError;
4662         if (x==enc_FAILED) { /* unencodable character */
4663             if (charmap_encoding_error(p, size, &inpos, mapping,
4664                                        &exc,
4665                                        &known_errorHandler, &errorHandler, errors,
4666                                        &res, &respos)) {
4667                 goto onError;
4668             }
4669         }
4670         else
4671             /* done with this character => adjust input position */
4672             ++inpos;
4673     }
4674
4675     /* Resize if we allocated to much */
4676     if (respos<PyString_GET_SIZE(res)) {
4677         if (_PyString_Resize(&res, respos))
4678             goto onError;
4679     }
4680     Py_XDECREF(exc);
4681     Py_XDECREF(errorHandler);
4682     return res;
4683
4684   onError:
4685     Py_XDECREF(res);
4686     Py_XDECREF(exc);
4687     Py_XDECREF(errorHandler);
4688     return NULL;
4689 }
4690
4691 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4692                                     PyObject *mapping)
4693 {
4694     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4695         PyErr_BadArgument();
4696         return NULL;
4697     }
4698     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4699                                    PyUnicode_GET_SIZE(unicode),
4700                                    mapping,
4701                                    NULL);
4702 }
4703
4704 /* create or adjust a UnicodeTranslateError */
4705 static void make_translate_exception(PyObject **exceptionObject,
4706                                      const Py_UNICODE *unicode, Py_ssize_t size,
4707                                      Py_ssize_t startpos, Py_ssize_t endpos,
4708                                      const char *reason)
4709 {
4710     if (*exceptionObject == NULL) {
4711         *exceptionObject = PyUnicodeTranslateError_Create(
4712             unicode, size, startpos, endpos, reason);
4713     }
4714     else {
4715         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4716             goto onError;
4717         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4718             goto onError;
4719         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4720             goto onError;
4721         return;
4722       onError:
4723         Py_DECREF(*exceptionObject);
4724         *exceptionObject = NULL;
4725     }
4726 }
4727
4728 /* raises a UnicodeTranslateError */
4729 static void raise_translate_exception(PyObject **exceptionObject,
4730                                       const Py_UNICODE *unicode, Py_ssize_t size,
4731                                       Py_ssize_t startpos, Py_ssize_t endpos,
4732                                       const char *reason)
4733 {
4734     make_translate_exception(exceptionObject,
4735                              unicode, size, startpos, endpos, reason);
4736     if (*exceptionObject != NULL)
4737         PyCodec_StrictErrors(*exceptionObject);
4738 }
4739
4740 /* error handling callback helper:
4741    build arguments, call the callback and check the arguments,
4742    put the result into newpos and return the replacement string, which
4743    has to be freed by the caller */
4744 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4745                                                      PyObject **errorHandler,
4746                                                      const char *reason,
4747                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4748                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4749                                                      Py_ssize_t *newpos)
4750 {
4751     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4752
4753     Py_ssize_t i_newpos;
4754     PyObject *restuple;
4755     PyObject *resunicode;
4756
4757     if (*errorHandler == NULL) {
4758         *errorHandler = PyCodec_LookupError(errors);
4759         if (*errorHandler == NULL)
4760             return NULL;
4761     }
4762
4763     make_translate_exception(exceptionObject,
4764                              unicode, size, startpos, endpos, reason);
4765     if (*exceptionObject == NULL)
4766         return NULL;
4767
4768     restuple = PyObject_CallFunctionObjArgs(
4769         *errorHandler, *exceptionObject, NULL);
4770     if (restuple == NULL)
4771         return NULL;
4772     if (!PyTuple_Check(restuple)) {
4773         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4774         Py_DECREF(restuple);
4775         return NULL;
4776     }
4777     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4778                           &resunicode, &i_newpos)) {
4779         Py_DECREF(restuple);
4780         return NULL;
4781     }
4782     if (i_newpos<0)
4783         *newpos = size+i_newpos;
4784     else
4785         *newpos = i_newpos;
4786     if (*newpos<0 || *newpos>size) {
4787         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4788         Py_DECREF(restuple);
4789         return NULL;
4790     }
4791     Py_INCREF(resunicode);
4792     Py_DECREF(restuple);
4793     return resunicode;
4794 }
4795
4796 /* Lookup the character ch in the mapping and put the result in result,
4797    which must be decrefed by the caller.
4798    Return 0 on success, -1 on error */
4799 static
4800 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4801 {
4802     PyObject *w = PyInt_FromLong((long)c);
4803     PyObject *x;
4804
4805     if (w == NULL)
4806         return -1;
4807     x = PyObject_GetItem(mapping, w);
4808     Py_DECREF(w);
4809     if (x == NULL) {
4810         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4811             /* No mapping found means: use 1:1 mapping. */
4812             PyErr_Clear();
4813             *result = NULL;
4814             return 0;
4815         } else
4816             return -1;
4817     }
4818     else if (x == Py_None) {
4819         *result = x;
4820         return 0;
4821     }
4822     else if (PyInt_Check(x)) {
4823         long value = PyInt_AS_LONG(x);
4824         long max = PyUnicode_GetMax();
4825         if (value < 0 || value > max) {
4826             PyErr_Format(PyExc_TypeError,
4827                          "character mapping must be in range(0x%lx)", max+1);
4828             Py_DECREF(x);
4829             return -1;
4830         }
4831         *result = x;
4832         return 0;
4833     }
4834     else if (PyUnicode_Check(x)) {
4835         *result = x;
4836         return 0;
4837     }
4838     else {
4839         /* wrong return value */
4840         PyErr_SetString(PyExc_TypeError,
4841                         "character mapping must return integer, None or unicode");
4842         Py_DECREF(x);
4843         return -1;
4844     }
4845 }
4846 /* ensure that *outobj is at least requiredsize characters long,
4847    if not reallocate and adjust various state variables.
4848    Return 0 on success, -1 on error */
4849 static
4850 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4851                                Py_ssize_t requiredsize)
4852 {
4853     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4854     if (requiredsize > oldsize) {
4855         /* remember old output position */
4856         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4857         /* exponentially overallocate to minimize reallocations */
4858         if (requiredsize < 2 * oldsize)
4859             requiredsize = 2 * oldsize;
4860         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4861             return -1;
4862         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4863     }
4864     return 0;
4865 }
4866 /* lookup the character, put the result in the output string and adjust
4867    various state variables. Return a new reference to the object that
4868    was put in the output buffer in *result, or Py_None, if the mapping was
4869    undefined (in which case no character was written).
4870    The called must decref result.
4871    Return 0 on success, -1 on error. */
4872 static
4873 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4874                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4875                             PyObject **res)
4876 {
4877     if (charmaptranslate_lookup(*curinp, mapping, res))
4878         return -1;
4879     if (*res==NULL) {
4880         /* not found => default to 1:1 mapping */
4881         *(*outp)++ = *curinp;
4882     }
4883     else if (*res==Py_None)
4884         ;
4885     else if (PyInt_Check(*res)) {
4886         /* no overflow check, because we know that the space is enough */
4887         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4888     }
4889     else if (PyUnicode_Check(*res)) {
4890         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4891         if (repsize==1) {
4892             /* no overflow check, because we know that the space is enough */
4893             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4894         }
4895         else if (repsize!=0) {
4896             /* more than one character */
4897             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4898                 (insize - (curinp-startinp)) +
4899                 repsize - 1;
4900             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4901                 return -1;
4902             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4903             *outp += repsize;
4904         }
4905     }
4906     else
4907         return -1;
4908     return 0;
4909 }
4910
4911 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4912                                      Py_ssize_t size,
4913                                      PyObject *mapping,
4914                                      const char *errors)
4915 {
4916     /* output object */
4917     PyObject *res = NULL;
4918     /* pointers to the beginning and end+1 of input */
4919     const Py_UNICODE *startp = p;
4920     const Py_UNICODE *endp = p + size;
4921     /* pointer into the output */
4922     Py_UNICODE *str;
4923     /* current output position */
4924     Py_ssize_t respos = 0;
4925     char *reason = "character maps to <undefined>";
4926     PyObject *errorHandler = NULL;
4927     PyObject *exc = NULL;
4928     /* the following variable is used for caching string comparisons
4929      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4930      * 3=ignore, 4=xmlcharrefreplace */
4931     int known_errorHandler = -1;
4932
4933     if (mapping == NULL) {
4934         PyErr_BadArgument();
4935         return NULL;
4936     }
4937
4938     /* allocate enough for a simple 1:1 translation without
4939        replacements, if we need more, we'll resize */
4940     res = PyUnicode_FromUnicode(NULL, size);
4941     if (res == NULL)
4942         goto onError;
4943     if (size == 0)
4944         return res;
4945     str = PyUnicode_AS_UNICODE(res);
4946
4947     while (p<endp) {
4948         /* try to encode it */
4949         PyObject *x = NULL;
4950         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4951             Py_XDECREF(x);
4952             goto onError;
4953         }
4954         Py_XDECREF(x);
4955         if (x!=Py_None) /* it worked => adjust input pointer */
4956             ++p;
4957         else { /* untranslatable character */
4958             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4959             Py_ssize_t repsize;
4960             Py_ssize_t newpos;
4961             Py_UNICODE *uni2;
4962             /* startpos for collecting untranslatable chars */
4963             const Py_UNICODE *collstart = p;
4964             const Py_UNICODE *collend = p+1;
4965             const Py_UNICODE *coll;
4966
4967             /* find all untranslatable characters */
4968             while (collend < endp) {
4969                 if (charmaptranslate_lookup(*collend, mapping, &x))
4970                     goto onError;
4971                 Py_XDECREF(x);
4972                 if (x!=Py_None)
4973                     break;
4974                 ++collend;
4975             }
4976             /* cache callback name lookup
4977              * (if not done yet, i.e. it's the first error) */
4978             if (known_errorHandler==-1) {
4979                 if ((errors==NULL) || (!strcmp(errors, "strict")))
4980                     known_errorHandler = 1;
4981                 else if (!strcmp(errors, "replace"))
4982                     known_errorHandler = 2;
4983                 else if (!strcmp(errors, "ignore"))
4984                     known_errorHandler = 3;
4985                 else if (!strcmp(errors, "xmlcharrefreplace"))
4986                     known_errorHandler = 4;
4987                 else
4988                     known_errorHandler = 0;
4989             }
4990             switch (known_errorHandler) {
4991             case 1: /* strict */
4992                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4993                 goto onError;
4994             case 2: /* replace */
4995                 /* No need to check for space, this is a 1:1 replacement */
4996                 for (coll = collstart; coll<collend; ++coll)
4997                     *str++ = '?';
4998                 /* fall through */
4999             case 3: /* ignore */
5000                 p = collend;
5001                 break;
5002             case 4: /* xmlcharrefreplace */
5003                 /* generate replacement (temporarily (mis)uses p) */
5004                 for (p = collstart; p < collend; ++p) {
5005                     char buffer[2+29+1+1];
5006                     char *cp;
5007                     sprintf(buffer, "&#%d;", (int)*p);
5008                     if (charmaptranslate_makespace(&res, &str,
5009                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5010                         goto onError;
5011                     for (cp = buffer; *cp; ++cp)
5012                         *str++ = *cp;
5013                 }
5014                 p = collend;
5015                 break;
5016             default:
5017                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5018                                                                  reason, startp, size, &exc,
5019                                                                  collstart-startp, collend-startp, &newpos);
5020                 if (repunicode == NULL)
5021                     goto onError;
5022                 /* generate replacement  */
5023                 repsize = PyUnicode_GET_SIZE(repunicode);
5024                 if (charmaptranslate_makespace(&res, &str,
5025                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5026                     Py_DECREF(repunicode);
5027                     goto onError;
5028                 }
5029                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5030                     *str++ = *uni2;
5031                 p = startp + newpos;
5032                 Py_DECREF(repunicode);
5033             }
5034         }
5035     }
5036     /* Resize if we allocated to much */
5037     respos = str-PyUnicode_AS_UNICODE(res);
5038     if (respos<PyUnicode_GET_SIZE(res)) {
5039         if (PyUnicode_Resize(&res, respos) < 0)
5040             goto onError;
5041     }
5042     Py_XDECREF(exc);
5043     Py_XDECREF(errorHandler);
5044     return res;
5045
5046   onError:
5047     Py_XDECREF(res);
5048     Py_XDECREF(exc);
5049     Py_XDECREF(errorHandler);
5050     return NULL;
5051 }
5052
5053 PyObject *PyUnicode_Translate(PyObject *str,
5054                               PyObject *mapping,
5055                               const char *errors)
5056 {
5057     PyObject *result;
5058
5059     str = PyUnicode_FromObject(str);
5060     if (str == NULL)
5061         goto onError;
5062     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5063                                         PyUnicode_GET_SIZE(str),
5064                                         mapping,
5065                                         errors);
5066     Py_DECREF(str);
5067     return result;
5068
5069   onError:
5070     Py_XDECREF(str);
5071     return NULL;
5072 }
5073
5074 /* --- Decimal Encoder ---------------------------------------------------- */
5075
5076 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5077                             Py_ssize_t length,
5078                             char *output,
5079                             const char *errors)
5080 {
5081     Py_UNICODE *p, *end;
5082     PyObject *errorHandler = NULL;
5083     PyObject *exc = NULL;
5084     const char *encoding = "decimal";
5085     const char *reason = "invalid decimal Unicode string";
5086     /* the following variable is used for caching string comparisons
5087      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5088     int known_errorHandler = -1;
5089
5090     if (output == NULL) {
5091         PyErr_BadArgument();
5092         return -1;
5093     }
5094
5095     p = s;
5096     end = s + length;
5097     while (p < end) {
5098         register Py_UNICODE ch = *p;
5099         int decimal;
5100         PyObject *repunicode;
5101         Py_ssize_t repsize;
5102         Py_ssize_t newpos;
5103         Py_UNICODE *uni2;
5104         Py_UNICODE *collstart;
5105         Py_UNICODE *collend;
5106
5107         if (Py_UNICODE_ISSPACE(ch)) {
5108             *output++ = ' ';
5109             ++p;
5110             continue;
5111         }
5112         decimal = Py_UNICODE_TODECIMAL(ch);
5113         if (decimal >= 0) {
5114             *output++ = '0' + decimal;
5115             ++p;
5116             continue;
5117         }
5118         if (0 < ch && ch < 256) {
5119             *output++ = (char)ch;
5120             ++p;
5121             continue;
5122         }
5123         /* All other characters are considered unencodable */
5124         collstart = p;
5125         collend = p+1;
5126         while (collend < end) {
5127             if ((0 < *collend && *collend < 256) ||
5128                 !Py_UNICODE_ISSPACE(*collend) ||
5129                 Py_UNICODE_TODECIMAL(*collend))
5130                 break;
5131         }
5132         /* cache callback name lookup
5133          * (if not done yet, i.e. it's the first error) */
5134         if (known_errorHandler==-1) {
5135             if ((errors==NULL) || (!strcmp(errors, "strict")))
5136                 known_errorHandler = 1;
5137             else if (!strcmp(errors, "replace"))
5138                 known_errorHandler = 2;
5139             else if (!strcmp(errors, "ignore"))
5140                 known_errorHandler = 3;
5141             else if (!strcmp(errors, "xmlcharrefreplace"))
5142                 known_errorHandler = 4;
5143             else
5144                 known_errorHandler = 0;
5145         }
5146         switch (known_errorHandler) {
5147         case 1: /* strict */
5148             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5149             goto onError;
5150         case 2: /* replace */
5151             for (p = collstart; p < collend; ++p)
5152                 *output++ = '?';
5153             /* fall through */
5154         case 3: /* ignore */
5155             p = collend;
5156             break;
5157         case 4: /* xmlcharrefreplace */
5158             /* generate replacement (temporarily (mis)uses p) */
5159             for (p = collstart; p < collend; ++p)
5160                 output += sprintf(output, "&#%d;", (int)*p);
5161             p = collend;
5162             break;
5163         default:
5164             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5165                                                           encoding, reason, s, length, &exc,
5166                                                           collstart-s, collend-s, &newpos);
5167             if (repunicode == NULL)
5168                 goto onError;
5169             /* generate replacement  */
5170             repsize = PyUnicode_GET_SIZE(repunicode);
5171             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5172                 Py_UNICODE ch = *uni2;
5173                 if (Py_UNICODE_ISSPACE(ch))
5174                     *output++ = ' ';
5175                 else {
5176                     decimal = Py_UNICODE_TODECIMAL(ch);
5177                     if (decimal >= 0)
5178                         *output++ = '0' + decimal;
5179                     else if (0 < ch && ch < 256)
5180                         *output++ = (char)ch;
5181                     else {
5182                         Py_DECREF(repunicode);
5183                         raise_encode_exception(&exc, encoding,
5184                                                s, length, collstart-s, collend-s, reason);
5185                         goto onError;
5186                     }
5187                 }
5188             }
5189             p = s + newpos;
5190             Py_DECREF(repunicode);
5191         }
5192     }
5193     /* 0-terminate the output string */
5194     *output++ = '\0';
5195     Py_XDECREF(exc);
5196     Py_XDECREF(errorHandler);
5197     return 0;
5198
5199   onError:
5200     Py_XDECREF(exc);
5201     Py_XDECREF(errorHandler);
5202     return -1;
5203 }
5204
5205 /* --- Helpers ------------------------------------------------------------ */
5206
5207 #include "stringlib/unicodedefs.h"
5208
5209 #define FROM_UNICODE
5210
5211 #include "stringlib/fastsearch.h"
5212
5213 #include "stringlib/count.h"
5214 #include "stringlib/find.h"
5215 #include "stringlib/partition.h"
5216
/* helper macro to fixup start/end slice values

   Operates on the variables named `start` and `end` that are in scope
   where the macro is expanded, using (obj)->length as the length:
     - a negative start has the length added, then is clamped to 0;
     - an end greater than the length is clamped to the length;
     - a negative end has the length added, then is clamped to 0.
   start is not clamped against the length.

   The expansion is a plain sequence of `if` statements (it is not
   wrapped in do { } while (0)), so it must be used as a statement of its
   own and never as the unbraced body of an if/else or loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5229
5230 Py_ssize_t PyUnicode_Count(PyObject *str,
5231                            PyObject *substr,
5232                            Py_ssize_t start,
5233                            Py_ssize_t end)
5234 {
5235     Py_ssize_t result;
5236     PyUnicodeObject* str_obj;
5237     PyUnicodeObject* sub_obj;
5238
5239     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5240     if (!str_obj)
5241         return -1;
5242     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5243     if (!sub_obj) {
5244         Py_DECREF(str_obj);
5245         return -1;
5246     }
5247
5248     FIX_START_END(str_obj);
5249
5250     result = stringlib_count(
5251         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5252         );
5253
5254     Py_DECREF(sub_obj);
5255     Py_DECREF(str_obj);
5256
5257     return result;
5258 }
5259
5260 Py_ssize_t PyUnicode_Find(PyObject *str,
5261                           PyObject *sub,
5262                           Py_ssize_t start,
5263                           Py_ssize_t end,
5264                           int direction)
5265 {
5266     Py_ssize_t result;
5267
5268     str = PyUnicode_FromObject(str);
5269     if (!str)
5270         return -2;
5271     sub = PyUnicode_FromObject(sub);
5272     if (!sub) {
5273         Py_DECREF(str);
5274         return -2;
5275     }
5276
5277     if (direction > 0)
5278         result = stringlib_find_slice(
5279             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5280             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5281             start, end
5282             );
5283     else
5284         result = stringlib_rfind_slice(
5285             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5286             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5287             start, end
5288             );
5289
5290     Py_DECREF(str);
5291     Py_DECREF(sub);
5292
5293     return result;
5294 }
5295
5296 static
5297 int tailmatch(PyUnicodeObject *self,
5298               PyUnicodeObject *substring,
5299               Py_ssize_t start,
5300               Py_ssize_t end,
5301               int direction)
5302 {
5303     if (substring->length == 0)
5304         return 1;
5305
5306     FIX_START_END(self);
5307
5308     end -= substring->length;
5309     if (end < start)
5310         return 0;
5311
5312     if (direction > 0) {
5313         if (Py_UNICODE_MATCH(self, end, substring))
5314             return 1;
5315     } else {
5316         if (Py_UNICODE_MATCH(self, start, substring))
5317             return 1;
5318     }
5319
5320     return 0;
5321 }
5322
5323 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5324                                PyObject *substr,
5325                                Py_ssize_t start,
5326                                Py_ssize_t end,
5327                                int direction)
5328 {
5329     Py_ssize_t result;
5330
5331     str = PyUnicode_FromObject(str);
5332     if (str == NULL)
5333         return -1;
5334     substr = PyUnicode_FromObject(substr);
5335     if (substr == NULL) {
5336         Py_DECREF(str);
5337         return -1;
5338     }
5339
5340     result = tailmatch((PyUnicodeObject *)str,
5341                        (PyUnicodeObject *)substr,
5342                        start, end, direction);
5343     Py_DECREF(str);
5344     Py_DECREF(substr);
5345     return result;
5346 }
5347
5348 /* Apply fixfct filter to the Unicode object self and return a
5349    reference to the modified object */
5350
5351 static
5352 PyObject *fixup(PyUnicodeObject *self,
5353                 int (*fixfct)(PyUnicodeObject *s))
5354 {
5355
5356     PyUnicodeObject *u;
5357
5358     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5359     if (u == NULL)
5360         return NULL;
5361
5362     Py_UNICODE_COPY(u->str, self->str, self->length);
5363
5364     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5365         /* fixfct should return TRUE if it modified the buffer. If
5366            FALSE, return a reference to the original buffer instead
5367            (to save space, not time) */
5368         Py_INCREF(self);
5369         Py_DECREF(u);
5370         return (PyObject*) self;
5371     }
5372     return (PyObject*) u;
5373 }
5374
5375 static
5376 int fixupper(PyUnicodeObject *self)
5377 {
5378     Py_ssize_t len = self->length;
5379     Py_UNICODE *s = self->str;
5380     int status = 0;
5381
5382     while (len-- > 0) {
5383         register Py_UNICODE ch;
5384
5385         ch = Py_UNICODE_TOUPPER(*s);
5386         if (ch != *s) {
5387             status = 1;
5388             *s = ch;
5389         }
5390         s++;
5391     }
5392
5393     return status;
5394 }
5395
5396 static
5397 int fixlower(PyUnicodeObject *self)
5398 {
5399     Py_ssize_t len = self->length;
5400     Py_UNICODE *s = self->str;
5401     int status = 0;
5402
5403     while (len-- > 0) {
5404         register Py_UNICODE ch;
5405
5406         ch = Py_UNICODE_TOLOWER(*s);
5407         if (ch != *s) {
5408             status = 1;
5409             *s = ch;
5410         }
5411         s++;
5412     }
5413
5414     return status;
5415 }
5416
5417 static
5418 int fixswapcase(PyUnicodeObject *self)
5419 {
5420     Py_ssize_t len = self->length;
5421     Py_UNICODE *s = self->str;
5422     int status = 0;
5423
5424     while (len-- > 0) {
5425         if (Py_UNICODE_ISUPPER(*s)) {
5426             *s = Py_UNICODE_TOLOWER(*s);
5427             status = 1;
5428         } else if (Py_UNICODE_ISLOWER(*s)) {
5429             *s = Py_UNICODE_TOUPPER(*s);
5430             status = 1;
5431         }
5432         s++;
5433     }
5434
5435     return status;
5436 }
5437
5438 static
5439 int fixcapitalize(PyUnicodeObject *self)
5440 {
5441     Py_ssize_t len = self->length;
5442     Py_UNICODE *s = self->str;
5443     int status = 0;
5444
5445     if (len == 0)
5446         return 0;
5447     if (Py_UNICODE_ISLOWER(*s)) {
5448         *s = Py_UNICODE_TOUPPER(*s);
5449         status = 1;
5450     }
5451     s++;
5452     while (--len > 0) {
5453         if (Py_UNICODE_ISUPPER(*s)) {
5454             *s = Py_UNICODE_TOLOWER(*s);
5455             status = 1;
5456         }
5457         s++;
5458     }
5459     return status;
5460 }
5461
5462 static
5463 int fixtitle(PyUnicodeObject *self)
5464 {
5465     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5466     register Py_UNICODE *e;
5467     int previous_is_cased;
5468
5469     /* Shortcut for single character strings */
5470     if (PyUnicode_GET_SIZE(self) == 1) {
5471         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5472         if (*p != ch) {
5473             *p = ch;
5474             return 1;
5475         }
5476         else
5477             return 0;
5478     }
5479
5480     e = p + PyUnicode_GET_SIZE(self);
5481     previous_is_cased = 0;
5482     for (; p < e; p++) {
5483         register const Py_UNICODE ch = *p;
5484
5485         if (previous_is_cased)
5486             *p = Py_UNICODE_TOLOWER(ch);
5487         else
5488             *p = Py_UNICODE_TOTITLE(ch);
5489
5490         if (Py_UNICODE_ISLOWER(ch) ||
5491             Py_UNICODE_ISUPPER(ch) ||
5492             Py_UNICODE_ISTITLE(ch))
5493             previous_is_cased = 1;
5494         else
5495             previous_is_cased = 0;
5496     }
5497     return 1;
5498 }
5499
5500 PyObject *
5501 PyUnicode_Join(PyObject *separator, PyObject *seq)
5502 {
5503     PyObject *internal_separator = NULL;
5504     const Py_UNICODE blank = ' ';
5505     const Py_UNICODE *sep = &blank;
5506     Py_ssize_t seplen = 1;
5507     PyUnicodeObject *res = NULL; /* the result */
5508     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5509     Py_ssize_t res_used;         /* # used bytes */
5510     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5511     PyObject *fseq;          /* PySequence_Fast(seq) */
5512     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5513     PyObject *item;
5514     Py_ssize_t i;
5515
5516     fseq = PySequence_Fast(seq, "");
5517     if (fseq == NULL) {
5518         return NULL;
5519     }
5520
5521     /* Grrrr.  A codec may be invoked to convert str objects to
5522      * Unicode, and so it's possible to call back into Python code
5523      * during PyUnicode_FromObject(), and so it's possible for a sick
5524      * codec to change the size of fseq (if seq is a list).  Therefore
5525      * we have to keep refetching the size -- can't assume seqlen
5526      * is invariant.
5527      */
5528     seqlen = PySequence_Fast_GET_SIZE(fseq);
5529     /* If empty sequence, return u"". */
5530     if (seqlen == 0) {
5531         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5532         goto Done;
5533     }
5534     /* If singleton sequence with an exact Unicode, return that. */
5535     if (seqlen == 1) {
5536         item = PySequence_Fast_GET_ITEM(fseq, 0);
5537         if (PyUnicode_CheckExact(item)) {
5538             Py_INCREF(item);
5539             res = (PyUnicodeObject *)item;
5540             goto Done;
5541         }
5542     }
5543
5544     /* At least two items to join, or one that isn't exact Unicode. */
5545     if (seqlen > 1) {
5546         /* Set up sep and seplen -- they're needed. */
5547         if (separator == NULL) {
5548             sep = &blank;
5549             seplen = 1;
5550         }
5551         else {
5552             internal_separator = PyUnicode_FromObject(separator);
5553             if (internal_separator == NULL)
5554                 goto onError;
5555             sep = PyUnicode_AS_UNICODE(internal_separator);
5556             seplen = PyUnicode_GET_SIZE(internal_separator);
5557             /* In case PyUnicode_FromObject() mutated seq. */
5558             seqlen = PySequence_Fast_GET_SIZE(fseq);
5559         }
5560     }
5561
5562     /* Get space. */
5563     res = _PyUnicode_New(res_alloc);
5564     if (res == NULL)
5565         goto onError;
5566     res_p = PyUnicode_AS_UNICODE(res);
5567     res_used = 0;
5568
5569     for (i = 0; i < seqlen; ++i) {
5570         Py_ssize_t itemlen;
5571         Py_ssize_t new_res_used;
5572
5573         item = PySequence_Fast_GET_ITEM(fseq, i);
5574         /* Convert item to Unicode. */
5575         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5576             PyErr_Format(PyExc_TypeError,
5577                          "sequence item %zd: expected string or Unicode,"
5578                          " %.80s found",
5579                          i, Py_TYPE(item)->tp_name);
5580             goto onError;
5581         }
5582         item = PyUnicode_FromObject(item);
5583         if (item == NULL)
5584             goto onError;
5585         /* We own a reference to item from here on. */
5586
5587         /* In case PyUnicode_FromObject() mutated seq. */
5588         seqlen = PySequence_Fast_GET_SIZE(fseq);
5589
5590         /* Make sure we have enough space for the separator and the item. */
5591         itemlen = PyUnicode_GET_SIZE(item);
5592         new_res_used = res_used + itemlen;
5593         if (new_res_used < 0)
5594             goto Overflow;
5595         if (i < seqlen - 1) {
5596             new_res_used += seplen;
5597             if (new_res_used < 0)
5598                 goto Overflow;
5599         }
5600         if (new_res_used > res_alloc) {
5601             /* double allocated size until it's big enough */
5602             do {
5603                 res_alloc += res_alloc;
5604                 if (res_alloc <= 0)
5605                     goto Overflow;
5606             } while (new_res_used > res_alloc);
5607             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5608                 Py_DECREF(item);
5609                 goto onError;
5610             }
5611             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5612         }
5613
5614         /* Copy item, and maybe the separator. */
5615         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5616         res_p += itemlen;
5617         if (i < seqlen - 1) {
5618             Py_UNICODE_COPY(res_p, sep, seplen);
5619             res_p += seplen;
5620         }
5621         Py_DECREF(item);
5622         res_used = new_res_used;
5623     }
5624
5625     /* Shrink res to match the used area; this probably can't fail,
5626      * but it's cheap to check.
5627      */
5628     if (_PyUnicode_Resize(&res, res_used) < 0)
5629         goto onError;
5630
5631   Done:
5632     Py_XDECREF(internal_separator);
5633     Py_DECREF(fseq);
5634     return (PyObject *)res;
5635
5636   Overflow:
5637     PyErr_SetString(PyExc_OverflowError,
5638                     "join() result is too long for a Python string");
5639     Py_DECREF(item);
5640     /* fall through */
5641
5642   onError:
5643     Py_XDECREF(internal_separator);
5644     Py_DECREF(fseq);
5645     Py_XDECREF(res);
5646     return NULL;
5647 }
5648
5649 static
5650 PyUnicodeObject *pad(PyUnicodeObject *self,
5651                      Py_ssize_t left,
5652                      Py_ssize_t right,
5653                      Py_UNICODE fill)
5654 {
5655     PyUnicodeObject *u;
5656
5657     if (left < 0)
5658         left = 0;
5659     if (right < 0)
5660         right = 0;
5661
5662     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5663         Py_INCREF(self);
5664         return self;
5665     }
5666
5667     if (left > PY_SSIZE_T_MAX - self->length ||
5668         right > PY_SSIZE_T_MAX - (left + self->length)) {
5669         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5670         return NULL;
5671     }
5672     u = _PyUnicode_New(left + self->length + right);
5673     if (u) {
5674         if (left)
5675             Py_UNICODE_FILL(u->str, fill, left);
5676         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5677         if (right)
5678             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5679     }
5680
5681     return u;
5682 }
5683
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expanding function must provide a `PyObject *str` scratch
   variable, a `list` object and an `onError` label; control jumps to
   that label if creating the slice or appending it fails.  The body is
   wrapped in do { } while (0) so that the macro behaves like a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5694
5695 static
5696 PyObject *split_whitespace(PyUnicodeObject *self,
5697                            PyObject *list,
5698                            Py_ssize_t maxcount)
5699 {
5700     register Py_ssize_t i;
5701     register Py_ssize_t j;
5702     Py_ssize_t len = self->length;
5703     PyObject *str;
5704     register const Py_UNICODE *buf = self->str;
5705
5706     for (i = j = 0; i < len; ) {
5707         /* find a token */
5708         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5709             i++;
5710         j = i;
5711         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5712             i++;
5713         if (j < i) {
5714             if (maxcount-- <= 0)
5715                 break;
5716             SPLIT_APPEND(buf, j, i);
5717             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5718                 i++;
5719             j = i;
5720         }
5721     }
5722     if (j < len) {
5723         SPLIT_APPEND(buf, j, len);
5724     }
5725     return list;
5726
5727   onError:
5728     Py_DECREF(list);
5729     return NULL;
5730 }
5731
5732 PyObject *PyUnicode_Splitlines(PyObject *string,
5733                                int keepends)
5734 {
5735     register Py_ssize_t i;
5736     register Py_ssize_t j;
5737     Py_ssize_t len;
5738     PyObject *list;
5739     PyObject *str;
5740     Py_UNICODE *data;
5741
5742     string = PyUnicode_FromObject(string);
5743     if (string == NULL)
5744         return NULL;
5745     data = PyUnicode_AS_UNICODE(string);
5746     len = PyUnicode_GET_SIZE(string);
5747
5748     list = PyList_New(0);
5749     if (!list)
5750         goto onError;
5751
5752     for (i = j = 0; i < len; ) {
5753         Py_ssize_t eol;
5754
5755         /* Find a line and append it */
5756         while (i < len && !BLOOM_LINEBREAK(data[i]))
5757             i++;
5758
5759         /* Skip the line break reading CRLF as one line break */
5760         eol = i;
5761         if (i < len) {
5762             if (data[i] == '\r' && i + 1 < len &&
5763                 data[i+1] == '\n')
5764                 i += 2;
5765             else
5766                 i++;
5767             if (keepends)
5768                 eol = i;
5769         }
5770         SPLIT_APPEND(data, j, eol);
5771         j = i;
5772     }
5773     if (j < len) {
5774         SPLIT_APPEND(data, j, len);
5775     }
5776
5777     Py_DECREF(string);
5778     return list;
5779
5780   onError:
5781     Py_XDECREF(list);
5782     Py_DECREF(string);
5783     return NULL;
5784 }
5785
5786 static
5787 PyObject *split_char(PyUnicodeObject *self,
5788                      PyObject *list,
5789                      Py_UNICODE ch,
5790                      Py_ssize_t maxcount)
5791 {
5792     register Py_ssize_t i;
5793     register Py_ssize_t j;
5794     Py_ssize_t len = self->length;
5795     PyObject *str;
5796     register const Py_UNICODE *buf = self->str;
5797
5798     for (i = j = 0; i < len; ) {
5799         if (buf[i] == ch) {
5800             if (maxcount-- <= 0)
5801                 break;
5802             SPLIT_APPEND(buf, j, i);
5803             i = j = i + 1;
5804         } else
5805             i++;
5806     }
5807     if (j <= len) {
5808         SPLIT_APPEND(buf, j, len);
5809     }
5810     return list;
5811
5812   onError:
5813     Py_DECREF(list);
5814     return NULL;
5815 }
5816
5817 static
5818 PyObject *split_substring(PyUnicodeObject *self,
5819                           PyObject *list,
5820                           PyUnicodeObject *substring,
5821                           Py_ssize_t maxcount)
5822 {
5823     register Py_ssize_t i;
5824     register Py_ssize_t j;
5825     Py_ssize_t len = self->length;
5826     Py_ssize_t sublen = substring->length;
5827     PyObject *str;
5828
5829     for (i = j = 0; i <= len - sublen; ) {
5830         if (Py_UNICODE_MATCH(self, i, substring)) {
5831             if (maxcount-- <= 0)
5832                 break;
5833             SPLIT_APPEND(self->str, j, i);
5834             i = j = i + sublen;
5835         } else
5836             i++;
5837     }
5838     if (j <= len) {
5839         SPLIT_APPEND(self->str, j, len);
5840     }
5841     return list;
5842
5843   onError:
5844     Py_DECREF(list);
5845     return NULL;
5846 }
5847
5848 static
5849 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5850                             PyObject *list,
5851                             Py_ssize_t maxcount)
5852 {
5853     register Py_ssize_t i;
5854     register Py_ssize_t j;
5855     Py_ssize_t len = self->length;
5856     PyObject *str;
5857     register const Py_UNICODE *buf = self->str;
5858
5859     for (i = j = len - 1; i >= 0; ) {
5860         /* find a token */
5861         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5862             i--;
5863         j = i;
5864         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5865             i--;
5866         if (j > i) {
5867             if (maxcount-- <= 0)
5868                 break;
5869             SPLIT_APPEND(buf, i + 1, j + 1);
5870             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5871                 i--;
5872             j = i;
5873         }
5874     }
5875     if (j >= 0) {
5876         SPLIT_APPEND(buf, 0, j + 1);
5877     }
5878     if (PyList_Reverse(list) < 0)
5879         goto onError;
5880     return list;
5881
5882   onError:
5883     Py_DECREF(list);
5884     return NULL;
5885 }
5886
5887 static
5888 PyObject *rsplit_char(PyUnicodeObject *self,
5889                       PyObject *list,
5890                       Py_UNICODE ch,
5891                       Py_ssize_t maxcount)
5892 {
5893     register Py_ssize_t i;
5894     register Py_ssize_t j;
5895     Py_ssize_t len = self->length;
5896     PyObject *str;
5897     register const Py_UNICODE *buf = self->str;
5898
5899     for (i = j = len - 1; i >= 0; ) {
5900         if (buf[i] == ch) {
5901             if (maxcount-- <= 0)
5902                 break;
5903             SPLIT_APPEND(buf, i + 1, j + 1);
5904             j = i = i - 1;
5905         } else
5906             i--;
5907     }
5908     if (j >= -1) {
5909         SPLIT_APPEND(buf, 0, j + 1);
5910     }
5911     if (PyList_Reverse(list) < 0)
5912         goto onError;
5913     return list;
5914
5915   onError:
5916     Py_DECREF(list);
5917     return NULL;
5918 }
5919
5920 static
5921 PyObject *rsplit_substring(PyUnicodeObject *self,
5922                            PyObject *list,
5923                            PyUnicodeObject *substring,
5924                            Py_ssize_t maxcount)
5925 {
5926     register Py_ssize_t i;
5927     register Py_ssize_t j;
5928     Py_ssize_t len = self->length;
5929     Py_ssize_t sublen = substring->length;
5930     PyObject *str;
5931
5932     for (i = len - sublen, j = len; i >= 0; ) {
5933         if (Py_UNICODE_MATCH(self, i, substring)) {
5934             if (maxcount-- <= 0)
5935                 break;
5936             SPLIT_APPEND(self->str, i + sublen, j);
5937             j = i;
5938             i -= sublen;
5939         } else
5940             i--;
5941     }
5942     if (j >= 0) {
5943         SPLIT_APPEND(self->str, 0, j);
5944     }
5945     if (PyList_Reverse(list) < 0)
5946         goto onError;
5947     return list;
5948
5949   onError:
5950     Py_DECREF(list);
5951     return NULL;
5952 }
5953
5954 #undef SPLIT_APPEND
5955
5956 static
5957 PyObject *split(PyUnicodeObject *self,
5958                 PyUnicodeObject *substring,
5959                 Py_ssize_t maxcount)
5960 {
5961     PyObject *list;
5962
5963     if (maxcount < 0)
5964         maxcount = PY_SSIZE_T_MAX;
5965
5966     list = PyList_New(0);
5967     if (!list)
5968         return NULL;
5969
5970     if (substring == NULL)
5971         return split_whitespace(self,list,maxcount);
5972
5973     else if (substring->length == 1)
5974         return split_char(self,list,substring->str[0],maxcount);
5975
5976     else if (substring->length == 0) {
5977         Py_DECREF(list);
5978         PyErr_SetString(PyExc_ValueError, "empty separator");
5979         return NULL;
5980     }
5981     else
5982         return split_substring(self,list,substring,maxcount);
5983 }
5984
5985 static
5986 PyObject *rsplit(PyUnicodeObject *self,
5987                  PyUnicodeObject *substring,
5988                  Py_ssize_t maxcount)
5989 {
5990     PyObject *list;
5991
5992     if (maxcount < 0)
5993         maxcount = PY_SSIZE_T_MAX;
5994
5995     list = PyList_New(0);
5996     if (!list)
5997         return NULL;
5998
5999     if (substring == NULL)
6000         return rsplit_whitespace(self,list,maxcount);
6001
6002     else if (substring->length == 1)
6003         return rsplit_char(self,list,substring->str[0],maxcount);
6004
6005     else if (substring->length == 0) {
6006         Py_DECREF(list);
6007         PyErr_SetString(PyExc_ValueError, "empty separator");
6008         return NULL;
6009     }
6010     else
6011         return rsplit_substring(self,list,substring,maxcount);
6012 }
6013
6014 static
6015 PyObject *replace(PyUnicodeObject *self,
6016                   PyUnicodeObject *str1,
6017                   PyUnicodeObject *str2,
6018                   Py_ssize_t maxcount)
6019 {
6020     PyUnicodeObject *u;
6021
6022     if (maxcount < 0)
6023         maxcount = PY_SSIZE_T_MAX;
6024
6025     if (str1->length == str2->length) {
6026         /* same length */
6027         Py_ssize_t i;
6028         if (str1->length == 1) {
6029             /* replace characters */
6030             Py_UNICODE u1, u2;
6031             if (!findchar(self->str, self->length, str1->str[0]))
6032                 goto nothing;
6033             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6034             if (!u)
6035                 return NULL;
6036             Py_UNICODE_COPY(u->str, self->str, self->length);
6037             u1 = str1->str[0];
6038             u2 = str2->str[0];
6039             for (i = 0; i < u->length; i++)
6040                 if (u->str[i] == u1) {
6041                     if (--maxcount < 0)
6042                         break;
6043                     u->str[i] = u2;
6044                 }
6045         } else {
6046             i = fastsearch(
6047                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6048                 );
6049             if (i < 0)
6050                 goto nothing;
6051             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6052             if (!u)
6053                 return NULL;
6054             Py_UNICODE_COPY(u->str, self->str, self->length);
6055             while (i <= self->length - str1->length)
6056                 if (Py_UNICODE_MATCH(self, i, str1)) {
6057                     if (--maxcount < 0)
6058                         break;
6059                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6060                     i += str1->length;
6061                 } else
6062                     i++;
6063         }
6064     } else {
6065
6066         Py_ssize_t n, i, j, e;
6067         Py_ssize_t product, new_size, delta;
6068         Py_UNICODE *p;
6069
6070         /* replace strings */
6071         n = stringlib_count(self->str, self->length, str1->str, str1->length);
6072         if (n > maxcount)
6073             n = maxcount;
6074         if (n == 0)
6075             goto nothing;
6076         /* new_size = self->length + n * (str2->length - str1->length)); */
6077         delta = (str2->length - str1->length);
6078         if (delta == 0) {
6079             new_size = self->length;
6080         } else {
6081             product = n * (str2->length - str1->length);
6082             if ((product / (str2->length - str1->length)) != n) {
6083                 PyErr_SetString(PyExc_OverflowError,
6084                                 "replace string is too long");
6085                 return NULL;
6086             }
6087             new_size = self->length + product;
6088             if (new_size < 0) {
6089                 PyErr_SetString(PyExc_OverflowError,
6090                                 "replace string is too long");
6091                 return NULL;
6092             }
6093         }
6094         u = _PyUnicode_New(new_size);
6095         if (!u)
6096             return NULL;
6097         i = 0;
6098         p = u->str;
6099         e = self->length - str1->length;
6100         if (str1->length > 0) {
6101             while (n-- > 0) {
6102                 /* look for next match */
6103                 j = i;
6104                 while (j <= e) {
6105                     if (Py_UNICODE_MATCH(self, j, str1))
6106                         break;
6107                     j++;
6108                 }
6109                 if (j > i) {
6110                     if (j > e)
6111                         break;
6112                     /* copy unchanged part [i:j] */
6113                     Py_UNICODE_COPY(p, self->str+i, j-i);
6114                     p += j - i;
6115                 }
6116                 /* copy substitution string */
6117                 if (str2->length > 0) {
6118                     Py_UNICODE_COPY(p, str2->str, str2->length);
6119                     p += str2->length;
6120                 }
6121                 i = j + str1->length;
6122             }
6123             if (i < self->length)
6124                 /* copy tail [i:] */
6125                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6126         } else {
6127             /* interleave */
6128             while (n > 0) {
6129                 Py_UNICODE_COPY(p, str2->str, str2->length);
6130                 p += str2->length;
6131                 if (--n <= 0)
6132                     break;
6133                 *p++ = self->str[i++];
6134             }
6135             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6136         }
6137     }
6138     return (PyObject *) u;
6139
6140   nothing:
6141     /* nothing to replace; return original string (when possible) */
6142     if (PyUnicode_CheckExact(self)) {
6143         Py_INCREF(self);
6144         return (PyObject *) self;
6145     }
6146     return PyUnicode_FromUnicode(self->str, self->length);
6147 }
6148
6149 /* --- Unicode Object Methods --------------------------------------------- */
6150
6151 PyDoc_STRVAR(title__doc__,
6152              "S.title() -> unicode\n\
6153 \n\
6154 Return a titlecased version of S, i.e. words start with title case\n\
6155 characters, all remaining cased characters have lower case.");
6156
6157 static PyObject*
6158 unicode_title(PyUnicodeObject *self)
6159 {
6160     return fixup(self, fixtitle);
6161 }
6162
6163 PyDoc_STRVAR(capitalize__doc__,
6164              "S.capitalize() -> unicode\n\
6165 \n\
6166 Return a capitalized version of S, i.e. make the first character\n\
6167 have upper case.");
6168
6169 static PyObject*
6170 unicode_capitalize(PyUnicodeObject *self)
6171 {
6172     return fixup(self, fixcapitalize);
6173 }
6174
#if 0
/* Disabled code: kept for reference, not compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word with fixcapitalize and
   join the words again with PyUnicode_Join(NULL, ...).  Returns NULL on
   failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *result = NULL;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        PyObject *word = PyList_GET_ITEM(list, i);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(list, i, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, list);

  done:
    Py_DECREF(list);
    return result;
}
#endif
6212
6213 /* Argument converter.  Coerces to a single unicode character */
6214
6215 static int
6216 convert_uc(PyObject *obj, void *addr)
6217 {
6218     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6219     PyObject *uniobj;
6220     Py_UNICODE *unistr;
6221
6222     uniobj = PyUnicode_FromObject(obj);
6223     if (uniobj == NULL) {
6224         PyErr_SetString(PyExc_TypeError,
6225                         "The fill character cannot be converted to Unicode");
6226         return 0;
6227     }
6228     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6229         PyErr_SetString(PyExc_TypeError,
6230                         "The fill character must be exactly one character long");
6231         Py_DECREF(uniobj);
6232         return 0;
6233     }
6234     unistr = PyUnicode_AS_UNICODE(uniobj);
6235     *fillcharloc = unistr[0];
6236     Py_DECREF(uniobj);
6237     return 1;
6238 }
6239
6240 PyDoc_STRVAR(center__doc__,
6241              "S.center(width[, fillchar]) -> unicode\n\
6242 \n\
6243 Return S centered in a Unicode string of length width. Padding is\n\
6244 done using the specified fill character (default is a space)");
6245
6246 static PyObject *
6247 unicode_center(PyUnicodeObject *self, PyObject *args)
6248 {
6249     Py_ssize_t marg, left;
6250     Py_ssize_t width;
6251     Py_UNICODE fillchar = ' ';
6252
6253     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6254         return NULL;
6255
6256     if (self->length >= width && PyUnicode_CheckExact(self)) {
6257         Py_INCREF(self);
6258         return (PyObject*) self;
6259     }
6260
6261     marg = width - self->length;
6262     left = marg / 2 + (marg & width & 1);
6263
6264     return (PyObject*) pad(self, left, marg - left, fillchar);
6265 }
6266
6267 #if 0
6268
6269 /* This code should go into some future Unicode collation support
6270    module. The basic comparison should compare ordinals on a naive
6271    basis (this is what Java does and thus JPython too). */
6272
6273 /* speedy UTF-16 code point order comparison */
6274 /* gleaned from: */
6275 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6276
6277 static short utf16Fixup[32] =
6278 {
6279     0, 0, 0, 0, 0, 0, 0, 0,
6280     0, 0, 0, 0, 0, 0, 0, 0,
6281     0, 0, 0, 0, 0, 0, 0, 0,
6282     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6283 };
6284
6285 static int
6286 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6287 {
6288     Py_ssize_t len1, len2;
6289
6290     Py_UNICODE *s1 = str1->str;
6291     Py_UNICODE *s2 = str2->str;
6292
6293     len1 = str1->length;
6294     len2 = str2->length;
6295
6296     while (len1 > 0 && len2 > 0) {
6297         Py_UNICODE c1, c2;
6298
6299         c1 = *s1++;
6300         c2 = *s2++;
6301
6302         if (c1 > (1<<11) * 26)
6303             c1 += utf16Fixup[c1>>11];
6304         if (c2 > (1<<11) * 26)
6305             c2 += utf16Fixup[c2>>11];
6306         /* now c1 and c2 are in UTF-32-compatible order */
6307
6308         if (c1 != c2)
6309             return (c1 < c2) ? -1 : 1;
6310
6311         len1--; len2--;
6312     }
6313
6314     return (len1 < len2) ? -1 : (len1 != len2);
6315 }
6316
6317 #else
6318
6319 static int
6320 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6321 {
6322     register Py_ssize_t len1, len2;
6323
6324     Py_UNICODE *s1 = str1->str;
6325     Py_UNICODE *s2 = str2->str;
6326
6327     len1 = str1->length;
6328     len2 = str2->length;
6329
6330     while (len1 > 0 && len2 > 0) {
6331         Py_UNICODE c1, c2;
6332
6333         c1 = *s1++;
6334         c2 = *s2++;
6335
6336         if (c1 != c2)
6337             return (c1 < c2) ? -1 : 1;
6338
6339         len1--; len2--;
6340     }
6341
6342     return (len1 < len2) ? -1 : (len1 != len2);
6343 }
6344
6345 #endif
6346
6347 int PyUnicode_Compare(PyObject *left,
6348                       PyObject *right)
6349 {
6350     PyUnicodeObject *u = NULL, *v = NULL;
6351     int result;
6352
6353     /* Coerce the two arguments */
6354     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6355     if (u == NULL)
6356         goto onError;
6357     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6358     if (v == NULL)
6359         goto onError;
6360
6361     /* Shortcut for empty or interned objects */
6362     if (v == u) {
6363         Py_DECREF(u);
6364         Py_DECREF(v);
6365         return 0;
6366     }
6367
6368     result = unicode_compare(u, v);
6369
6370     Py_DECREF(u);
6371     Py_DECREF(v);
6372     return result;
6373
6374   onError:
6375     Py_XDECREF(u);
6376     Py_XDECREF(v);
6377     return -1;
6378 }
6379
6380 PyObject *PyUnicode_RichCompare(PyObject *left,
6381                                 PyObject *right,
6382                                 int op)
6383 {
6384     int result;
6385
6386     result = PyUnicode_Compare(left, right);
6387     if (result == -1 && PyErr_Occurred())
6388         goto onError;
6389
6390     /* Convert the return value to a Boolean */
6391     switch (op) {
6392     case Py_EQ:
6393         result = (result == 0);
6394         break;
6395     case Py_NE:
6396         result = (result != 0);
6397         break;
6398     case Py_LE:
6399         result = (result <= 0);
6400         break;
6401     case Py_GE:
6402         result = (result >= 0);
6403         break;
6404     case Py_LT:
6405         result = (result == -1);
6406         break;
6407     case Py_GT:
6408         result = (result == 1);
6409         break;
6410     }
6411     return PyBool_FromLong(result);
6412
6413   onError:
6414
6415     /* Standard case
6416
6417        Type errors mean that PyUnicode_FromObject() could not convert
6418        one of the arguments (usually the right hand side) to Unicode,
6419        ie. we can't handle the comparison request. However, it is
6420        possible that the other object knows a comparison method, which
6421        is why we return Py_NotImplemented to give the other object a
6422        chance.
6423
6424     */
6425     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6426         PyErr_Clear();
6427         Py_INCREF(Py_NotImplemented);
6428         return Py_NotImplemented;
6429     }
6430     if (op != Py_EQ && op != Py_NE)
6431         return NULL;
6432
6433     /* Equality comparison.
6434
6435        This is a special case: we silence any PyExc_UnicodeDecodeError
6436        and instead turn it into a PyErr_UnicodeWarning.
6437
6438     */
6439     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6440         return NULL;
6441     PyErr_Clear();
6442     if (PyErr_Warn(PyExc_UnicodeWarning,
6443                    (op == Py_EQ) ?
6444                    "Unicode equal comparison "
6445                    "failed to convert both arguments to Unicode - "
6446                    "interpreting them as being unequal" :
6447                    "Unicode unequal comparison "
6448                    "failed to convert both arguments to Unicode - "
6449                    "interpreting them as being unequal"
6450             ) < 0)
6451         return NULL;
6452     result = (op == Py_NE);
6453     return PyBool_FromLong(result);
6454 }
6455
6456 int PyUnicode_Contains(PyObject *container,
6457                        PyObject *element)
6458 {
6459     PyObject *str, *sub;
6460     int result;
6461
6462     /* Coerce the two arguments */
6463     sub = PyUnicode_FromObject(element);
6464     if (!sub) {
6465         PyErr_SetString(PyExc_TypeError,
6466                         "'in <string>' requires string as left operand");
6467         return -1;
6468     }
6469
6470     str = PyUnicode_FromObject(container);
6471     if (!str) {
6472         Py_DECREF(sub);
6473         return -1;
6474     }
6475
6476     result = stringlib_contains_obj(str, sub);
6477
6478     Py_DECREF(str);
6479     Py_DECREF(sub);
6480
6481     return result;
6482 }
6483
6484 /* Concat to string or Unicode object giving a new Unicode object. */
6485
6486 PyObject *PyUnicode_Concat(PyObject *left,
6487                            PyObject *right)
6488 {
6489     PyUnicodeObject *u = NULL, *v = NULL, *w;
6490
6491     /* Coerce the two arguments */
6492     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6493     if (u == NULL)
6494         goto onError;
6495     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6496     if (v == NULL)
6497         goto onError;
6498
6499     /* Shortcuts */
6500     if (v == unicode_empty) {
6501         Py_DECREF(v);
6502         return (PyObject *)u;
6503     }
6504     if (u == unicode_empty) {
6505         Py_DECREF(u);
6506         return (PyObject *)v;
6507     }
6508
6509     /* Concat the two Unicode strings */
6510     w = _PyUnicode_New(u->length + v->length);
6511     if (w == NULL)
6512         goto onError;
6513     Py_UNICODE_COPY(w->str, u->str, u->length);
6514     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6515
6516     Py_DECREF(u);
6517     Py_DECREF(v);
6518     return (PyObject *)w;
6519
6520   onError:
6521     Py_XDECREF(u);
6522     Py_XDECREF(v);
6523     return NULL;
6524 }
6525
6526 PyDoc_STRVAR(count__doc__,
6527              "S.count(sub[, start[, end]]) -> int\n\
6528 \n\
6529 Return the number of non-overlapping occurrences of substring sub in\n\
6530 Unicode string S[start:end].  Optional arguments start and end are\n\
6531 interpreted as in slice notation.");
6532
6533 static PyObject *
6534 unicode_count(PyUnicodeObject *self, PyObject *args)
6535 {
6536     PyUnicodeObject *substring;
6537     Py_ssize_t start = 0;
6538     Py_ssize_t end = PY_SSIZE_T_MAX;
6539     PyObject *result;
6540
6541     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6542                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6543         return NULL;
6544
6545     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6546         (PyObject *)substring);
6547     if (substring == NULL)
6548         return NULL;
6549
6550     FIX_START_END(self);
6551
6552     result = PyInt_FromSsize_t(
6553         stringlib_count(self->str + start, end - start,
6554                         substring->str, substring->length)
6555         );
6556
6557     Py_DECREF(substring);
6558
6559     return result;
6560 }
6561
6562 PyDoc_STRVAR(encode__doc__,
6563              "S.encode([encoding[,errors]]) -> string or unicode\n\
6564 \n\
6565 Encodes S using the codec registered for encoding. encoding defaults\n\
6566 to the default encoding. errors may be given to set a different error\n\
6567 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6568 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6569 'xmlcharrefreplace' as well as any other name registered with\n\
6570 codecs.register_error that can handle UnicodeEncodeErrors.");
6571
6572 static PyObject *
6573 unicode_encode(PyUnicodeObject *self, PyObject *args)
6574 {
6575     char *encoding = NULL;
6576     char *errors = NULL;
6577     PyObject *v;
6578
6579     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6580         return NULL;
6581     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6582     if (v == NULL)
6583         goto onError;
6584     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6585         PyErr_Format(PyExc_TypeError,
6586                      "encoder did not return a string/unicode object "
6587                      "(type=%.400s)",
6588                      Py_TYPE(v)->tp_name);
6589         Py_DECREF(v);
6590         return NULL;
6591     }
6592     return v;
6593
6594   onError:
6595     return NULL;
6596 }
6597
6598 PyDoc_STRVAR(decode__doc__,
6599              "S.decode([encoding[,errors]]) -> string or unicode\n\
6600 \n\
6601 Decodes S using the codec registered for encoding. encoding defaults\n\
6602 to the default encoding. errors may be given to set a different error\n\
6603 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6604 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6605 as well as any other name registerd with codecs.register_error that is\n\
6606 able to handle UnicodeDecodeErrors.");
6607
6608 static PyObject *
6609 unicode_decode(PyUnicodeObject *self, PyObject *args)
6610 {
6611     char *encoding = NULL;
6612     char *errors = NULL;
6613     PyObject *v;
6614
6615     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6616         return NULL;
6617     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6618     if (v == NULL)
6619         goto onError;
6620     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6621         PyErr_Format(PyExc_TypeError,
6622                      "decoder did not return a string/unicode object "
6623                      "(type=%.400s)",
6624                      Py_TYPE(v)->tp_name);
6625         Py_DECREF(v);
6626         return NULL;
6627     }
6628     return v;
6629
6630   onError:
6631     return NULL;
6632 }
6633
PyDoc_STRVAR(expandtabs__doc__,
             "S.expandtabs([tabsize]) -> unicode\n\
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
If tabsize is not given, a tab size of 8 characters is assumed.");

/* Implementation of unicode.expandtabs().

   Works in two passes over self: the first computes the length of the
   expanded string and raises OverflowError if it would exceed
   PY_SSIZE_T_MAX; the second allocates the result and fills it in.
   The column counter j restarts after every '\n' or '\r'.  When tabsize
   is not positive, tab characters contribute nothing to the output.
   Returns a new Unicode object, or NULL on failure. */
static PyObject*
unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
{
    Py_UNICODE *e;
    Py_UNICODE *p;
    Py_UNICODE *q;
    Py_UNICODE *qe;
    Py_ssize_t i, j, incr;
    PyUnicodeObject *u;
    int tabsize = 8;

    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
        return NULL;

    /* First pass: determine size of output string */
    i = 0; /* chars up to and including most recent \n or \r */
    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
    e = self->str + self->length; /* end of input */
    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* distance to the next tab stop, in 1..tabsize */
                incr = tabsize - (j % tabsize); /* cannot overflow */
                if (j > PY_SSIZE_T_MAX - incr)
                    goto overflow1;
                j += incr;
            }
        }
        else {
            if (j > PY_SSIZE_T_MAX - 1)
                goto overflow1;
            j++;
            if (*p == '\n' || *p == '\r') {
                /* line finished: fold its length into the running total */
                if (i > PY_SSIZE_T_MAX - j)
                    goto overflow1;
                i += j;
                j = 0;
            }
        }

    /* i + j is the total output length; make sure the sum is safe */
    if (i > PY_SSIZE_T_MAX - j)
        goto overflow1;

    /* Second pass: create output string and fill it */
    u = _PyUnicode_New(i + j);
    if (!u)
        return NULL;

    j = 0; /* same as in first pass */
    q = u->str; /* next output char */
    qe = u->str + u->length; /* end of output */

    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* i is reused here as the number of spaces to emit */
                i = tabsize - (j % tabsize);
                j += i;
                while (i--) {
                    /* defensive bound check against the size from pass one */
                    if (q >= qe)
                        goto overflow2;
                    *q++ = ' ';
                }
            }
        }
        else {
            if (q >= qe)
                goto overflow2;
            *q++ = *p;
            j++;
            if (*p == '\n' || *p == '\r')
                j = 0;
        }

    return (PyObject*) u;

    /* overflow2 additionally releases the partially filled result and
       then falls through to the shared error report. */
  overflow2:
    Py_DECREF(u);
  overflow1:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
6720
6721 PyDoc_STRVAR(find__doc__,
6722              "S.find(sub [,start [,end]]) -> int\n\
6723 \n\
6724 Return the lowest index in S where substring sub is found,\n\
6725 such that sub is contained within s[start:end].  Optional\n\
6726 arguments start and end are interpreted as in slice notation.\n\
6727 \n\
6728 Return -1 on failure.");
6729
6730 static PyObject *
6731 unicode_find(PyUnicodeObject *self, PyObject *args)
6732 {
6733     PyObject *substring;
6734     Py_ssize_t start;
6735     Py_ssize_t end;
6736     Py_ssize_t result;
6737
6738     if (!_ParseTupleFinds(args, &substring, &start, &end))
6739         return NULL;
6740
6741     result = stringlib_find_slice(
6742         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6743         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6744         start, end
6745         );
6746
6747     Py_DECREF(substring);
6748
6749     return PyInt_FromSsize_t(result);
6750 }
6751
6752 static PyObject *
6753 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6754 {
6755     if (index < 0 || index >= self->length) {
6756         PyErr_SetString(PyExc_IndexError, "string index out of range");
6757         return NULL;
6758     }
6759
6760     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6761 }
6762
6763 static long
6764 unicode_hash(PyUnicodeObject *self)
6765 {
6766     /* Since Unicode objects compare equal to their ASCII string
6767        counterparts, they should use the individual character values
6768        as basis for their hash value.  This is needed to assure that
6769        strings and Unicode objects behave in the same way as
6770        dictionary keys. */
6771
6772     register Py_ssize_t len;
6773     register Py_UNICODE *p;
6774     register long x;
6775
6776     if (self->hash != -1)
6777         return self->hash;
6778     len = PyUnicode_GET_SIZE(self);
6779     p = PyUnicode_AS_UNICODE(self);
6780     x = *p << 7;
6781     while (--len >= 0)
6782         x = (1000003*x) ^ *p++;
6783     x ^= PyUnicode_GET_SIZE(self);
6784     if (x == -1)
6785         x = -2;
6786     self->hash = x;
6787     return x;
6788 }
6789
6790 PyDoc_STRVAR(index__doc__,
6791              "S.index(sub [,start [,end]]) -> int\n\
6792 \n\
6793 Like S.find() but raise ValueError when the substring is not found.");
6794
6795 static PyObject *
6796 unicode_index(PyUnicodeObject *self, PyObject *args)
6797 {
6798     Py_ssize_t result;
6799     PyObject *substring;
6800     Py_ssize_t start;
6801     Py_ssize_t end;
6802
6803     if (!_ParseTupleFinds(args, &substring, &start, &end))
6804         return NULL;
6805
6806     result = stringlib_find_slice(
6807         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6808         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6809         start, end
6810         );
6811
6812     Py_DECREF(substring);
6813
6814     if (result < 0) {
6815         PyErr_SetString(PyExc_ValueError, "substring not found");
6816         return NULL;
6817     }
6818
6819     return PyInt_FromSsize_t(result);
6820 }
6821
6822 PyDoc_STRVAR(islower__doc__,
6823              "S.islower() -> bool\n\
6824 \n\
6825 Return True if all cased characters in S are lowercase and there is\n\
6826 at least one cased character in S, False otherwise.");
6827
6828 static PyObject*
6829 unicode_islower(PyUnicodeObject *self)
6830 {
6831     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6832     register const Py_UNICODE *e;
6833     int cased;
6834
6835     /* Shortcut for single character strings */
6836     if (PyUnicode_GET_SIZE(self) == 1)
6837         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6838
6839     /* Special case for empty strings */
6840     if (PyUnicode_GET_SIZE(self) == 0)
6841         return PyBool_FromLong(0);
6842
6843     e = p + PyUnicode_GET_SIZE(self);
6844     cased = 0;
6845     for (; p < e; p++) {
6846         register const Py_UNICODE ch = *p;
6847
6848         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6849             return PyBool_FromLong(0);
6850         else if (!cased && Py_UNICODE_ISLOWER(ch))
6851             cased = 1;
6852     }
6853     return PyBool_FromLong(cased);
6854 }
6855
6856 PyDoc_STRVAR(isupper__doc__,
6857              "S.isupper() -> bool\n\
6858 \n\
6859 Return True if all cased characters in S are uppercase and there is\n\
6860 at least one cased character in S, False otherwise.");
6861
6862 static PyObject*
6863 unicode_isupper(PyUnicodeObject *self)
6864 {
6865     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6866     register const Py_UNICODE *e;
6867     int cased;
6868
6869     /* Shortcut for single character strings */
6870     if (PyUnicode_GET_SIZE(self) == 1)
6871         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6872
6873     /* Special case for empty strings */
6874     if (PyUnicode_GET_SIZE(self) == 0)
6875         return PyBool_FromLong(0);
6876
6877     e = p + PyUnicode_GET_SIZE(self);
6878     cased = 0;
6879     for (; p < e; p++) {
6880         register const Py_UNICODE ch = *p;
6881
6882         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6883             return PyBool_FromLong(0);
6884         else if (!cased && Py_UNICODE_ISUPPER(ch))
6885             cased = 1;
6886     }
6887     return PyBool_FromLong(cased);
6888 }
6889
6890 PyDoc_STRVAR(istitle__doc__,
6891              "S.istitle() -> bool\n\
6892 \n\
6893 Return True if S is a titlecased string and there is at least one\n\
6894 character in S, i.e. upper- and titlecase characters may only\n\
6895 follow uncased characters and lowercase characters only cased ones.\n\
6896 Return False otherwise.");
6897
6898 static PyObject*
6899 unicode_istitle(PyUnicodeObject *self)
6900 {
6901     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6902     register const Py_UNICODE *e;
6903     int cased, previous_is_cased;
6904
6905     /* Shortcut for single character strings */
6906     if (PyUnicode_GET_SIZE(self) == 1)
6907         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6908                                (Py_UNICODE_ISUPPER(*p) != 0));
6909
6910     /* Special case for empty strings */
6911     if (PyUnicode_GET_SIZE(self) == 0)
6912         return PyBool_FromLong(0);
6913
6914     e = p + PyUnicode_GET_SIZE(self);
6915     cased = 0;
6916     previous_is_cased = 0;
6917     for (; p < e; p++) {
6918         register const Py_UNICODE ch = *p;
6919
6920         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6921             if (previous_is_cased)
6922                 return PyBool_FromLong(0);
6923             previous_is_cased = 1;
6924             cased = 1;
6925         }
6926         else if (Py_UNICODE_ISLOWER(ch)) {
6927             if (!previous_is_cased)
6928                 return PyBool_FromLong(0);
6929             previous_is_cased = 1;
6930             cased = 1;
6931         }
6932         else
6933             previous_is_cased = 0;
6934     }
6935     return PyBool_FromLong(cased);
6936 }
6937
6938 PyDoc_STRVAR(isspace__doc__,
6939              "S.isspace() -> bool\n\
6940 \n\
6941 Return True if all characters in S are whitespace\n\
6942 and there is at least one character in S, False otherwise.");
6943
6944 static PyObject*
6945 unicode_isspace(PyUnicodeObject *self)
6946 {
6947     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948     register const Py_UNICODE *e;
6949
6950     /* Shortcut for single character strings */
6951     if (PyUnicode_GET_SIZE(self) == 1 &&
6952         Py_UNICODE_ISSPACE(*p))
6953         return PyBool_FromLong(1);
6954
6955     /* Special case for empty strings */
6956     if (PyUnicode_GET_SIZE(self) == 0)
6957         return PyBool_FromLong(0);
6958
6959     e = p + PyUnicode_GET_SIZE(self);
6960     for (; p < e; p++) {
6961         if (!Py_UNICODE_ISSPACE(*p))
6962             return PyBool_FromLong(0);
6963     }
6964     return PyBool_FromLong(1);
6965 }
6966
6967 PyDoc_STRVAR(isalpha__doc__,
6968              "S.isalpha() -> bool\n\
6969 \n\
6970 Return True if all characters in S are alphabetic\n\
6971 and there is at least one character in S, False otherwise.");
6972
6973 static PyObject*
6974 unicode_isalpha(PyUnicodeObject *self)
6975 {
6976     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977     register const Py_UNICODE *e;
6978
6979     /* Shortcut for single character strings */
6980     if (PyUnicode_GET_SIZE(self) == 1 &&
6981         Py_UNICODE_ISALPHA(*p))
6982         return PyBool_FromLong(1);
6983
6984     /* Special case for empty strings */
6985     if (PyUnicode_GET_SIZE(self) == 0)
6986         return PyBool_FromLong(0);
6987
6988     e = p + PyUnicode_GET_SIZE(self);
6989     for (; p < e; p++) {
6990         if (!Py_UNICODE_ISALPHA(*p))
6991             return PyBool_FromLong(0);
6992     }
6993     return PyBool_FromLong(1);
6994 }
6995
6996 PyDoc_STRVAR(isalnum__doc__,
6997              "S.isalnum() -> bool\n\
6998 \n\
6999 Return True if all characters in S are alphanumeric\n\
7000 and there is at least one character in S, False otherwise.");
7001
7002 static PyObject*
7003 unicode_isalnum(PyUnicodeObject *self)
7004 {
7005     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7006     register const Py_UNICODE *e;
7007
7008     /* Shortcut for single character strings */
7009     if (PyUnicode_GET_SIZE(self) == 1 &&
7010         Py_UNICODE_ISALNUM(*p))
7011         return PyBool_FromLong(1);
7012
7013     /* Special case for empty strings */
7014     if (PyUnicode_GET_SIZE(self) == 0)
7015         return PyBool_FromLong(0);
7016
7017     e = p + PyUnicode_GET_SIZE(self);
7018     for (; p < e; p++) {
7019         if (!Py_UNICODE_ISALNUM(*p))
7020             return PyBool_FromLong(0);
7021     }
7022     return PyBool_FromLong(1);
7023 }
7024
7025 PyDoc_STRVAR(isdecimal__doc__,
7026              "S.isdecimal() -> bool\n\
7027 \n\
7028 Return True if there are only decimal characters in S,\n\
7029 False otherwise.");
7030
7031 static PyObject*
7032 unicode_isdecimal(PyUnicodeObject *self)
7033 {
7034     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7035     register const Py_UNICODE *e;
7036
7037     /* Shortcut for single character strings */
7038     if (PyUnicode_GET_SIZE(self) == 1 &&
7039         Py_UNICODE_ISDECIMAL(*p))
7040         return PyBool_FromLong(1);
7041
7042     /* Special case for empty strings */
7043     if (PyUnicode_GET_SIZE(self) == 0)
7044         return PyBool_FromLong(0);
7045
7046     e = p + PyUnicode_GET_SIZE(self);
7047     for (; p < e; p++) {
7048         if (!Py_UNICODE_ISDECIMAL(*p))
7049             return PyBool_FromLong(0);
7050     }
7051     return PyBool_FromLong(1);
7052 }
7053
7054 PyDoc_STRVAR(isdigit__doc__,
7055              "S.isdigit() -> bool\n\
7056 \n\
7057 Return True if all characters in S are digits\n\
7058 and there is at least one character in S, False otherwise.");
7059
7060 static PyObject*
7061 unicode_isdigit(PyUnicodeObject *self)
7062 {
7063     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7064     register const Py_UNICODE *e;
7065
7066     /* Shortcut for single character strings */
7067     if (PyUnicode_GET_SIZE(self) == 1 &&
7068         Py_UNICODE_ISDIGIT(*p))
7069         return PyBool_FromLong(1);
7070
7071     /* Special case for empty strings */
7072     if (PyUnicode_GET_SIZE(self) == 0)
7073         return PyBool_FromLong(0);
7074
7075     e = p + PyUnicode_GET_SIZE(self);
7076     for (; p < e; p++) {
7077         if (!Py_UNICODE_ISDIGIT(*p))
7078             return PyBool_FromLong(0);
7079     }
7080     return PyBool_FromLong(1);
7081 }
7082
7083 PyDoc_STRVAR(isnumeric__doc__,
7084              "S.isnumeric() -> bool\n\
7085 \n\
7086 Return True if there are only numeric characters in S,\n\
7087 False otherwise.");
7088
7089 static PyObject*
7090 unicode_isnumeric(PyUnicodeObject *self)
7091 {
7092     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7093     register const Py_UNICODE *e;
7094
7095     /* Shortcut for single character strings */
7096     if (PyUnicode_GET_SIZE(self) == 1 &&
7097         Py_UNICODE_ISNUMERIC(*p))
7098         return PyBool_FromLong(1);
7099
7100     /* Special case for empty strings */
7101     if (PyUnicode_GET_SIZE(self) == 0)
7102         return PyBool_FromLong(0);
7103
7104     e = p + PyUnicode_GET_SIZE(self);
7105     for (; p < e; p++) {
7106         if (!Py_UNICODE_ISNUMERIC(*p))
7107             return PyBool_FromLong(0);
7108     }
7109     return PyBool_FromLong(1);
7110 }
7111
PyDoc_STRVAR(join__doc__,
             "S.join(sequence) -> unicode\n\
\n\
Return a string which is the concatenation of the strings in the\n\
sequence.  The separator between elements is S.");

/* Implementation of unicode.join(): forwards self (the separator) and
   data (the sequence) unchanged to PyUnicode_Join() and returns its
   result. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
7123
/* Return the value of the object's length field. */
static Py_ssize_t
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
7129
7130 PyDoc_STRVAR(ljust__doc__,
7131              "S.ljust(width[, fillchar]) -> int\n\
7132 \n\
7133 Return S left-justified in a Unicode string of length width. Padding is\n\
7134 done using the specified fill character (default is a space).");
7135
7136 static PyObject *
7137 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7138 {
7139     Py_ssize_t width;
7140     Py_UNICODE fillchar = ' ';
7141
7142     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7143         return NULL;
7144
7145     if (self->length >= width && PyUnicode_CheckExact(self)) {
7146         Py_INCREF(self);
7147         return (PyObject*) self;
7148     }
7149
7150     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7151 }
7152
PyDoc_STRVAR(lower__doc__,
             "S.lower() -> unicode\n\
\n\
Return a copy of the string S converted to lowercase.");

/* Implementation of unicode.lower(): delegates to fixup() with the
   fixlower callback and returns its result. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
7163
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string minus its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
7172
7173 /* externally visible for str.strip(unicode) */
7174 PyObject *
7175 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7176 {
7177     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7178     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7179     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7180     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7181     Py_ssize_t i, j;
7182
7183     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7184
7185     i = 0;
7186     if (striptype != RIGHTSTRIP) {
7187         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7188             i++;
7189         }
7190     }
7191
7192     j = len;
7193     if (striptype != LEFTSTRIP) {
7194         do {
7195             j--;
7196         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7197         j++;
7198     }
7199
7200     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7201         Py_INCREF(self);
7202         return (PyObject*)self;
7203     }
7204     else
7205         return PyUnicode_FromUnicode(s+i, j-i);
7206 }
7207
7208
7209 static PyObject *
7210 do_strip(PyUnicodeObject *self, int striptype)
7211 {
7212     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7213     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7214
7215     i = 0;
7216     if (striptype != RIGHTSTRIP) {
7217         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7218             i++;
7219         }
7220     }
7221
7222     j = len;
7223     if (striptype != LEFTSTRIP) {
7224         do {
7225             j--;
7226         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7227         j++;
7228     }
7229
7230     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7231         Py_INCREF(self);
7232         return (PyObject*)self;
7233     }
7234     else
7235         return PyUnicode_FromUnicode(s+i, j-i);
7236 }
7237
7238
7239 static PyObject *
7240 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7241 {
7242     PyObject *sep = NULL;
7243
7244     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7245         return NULL;
7246
7247     if (sep != NULL && sep != Py_None) {
7248         if (PyUnicode_Check(sep))
7249             return _PyUnicode_XStrip(self, striptype, sep);
7250         else if (PyString_Check(sep)) {
7251             PyObject *res;
7252             sep = PyUnicode_FromObject(sep);
7253             if (sep==NULL)
7254                 return NULL;
7255             res = _PyUnicode_XStrip(self, striptype, sep);
7256             Py_DECREF(sep);
7257             return res;
7258         }
7259         else {
7260             PyErr_Format(PyExc_TypeError,
7261                          "%s arg must be None, unicode or str",
7262                          STRIPNAME(striptype));
7263             return NULL;
7264         }
7265     }
7266
7267     return do_strip(self, striptype);
7268 }
7269
7270
7271 PyDoc_STRVAR(strip__doc__,
7272              "S.strip([chars]) -> unicode\n\
7273 \n\
7274 Return a copy of the string S with leading and trailing\n\
7275 whitespace removed.\n\
7276 If chars is given and not None, remove characters in chars instead.\n\
7277 If chars is a str, it will be converted to unicode before stripping");
7278
7279 static PyObject *
7280 unicode_strip(PyUnicodeObject *self, PyObject *args)
7281 {
7282     if (PyTuple_GET_SIZE(args) == 0)
7283         return do_strip(self, BOTHSTRIP); /* Common case */
7284     else
7285         return do_argstrip(self, BOTHSTRIP, args);
7286 }
7287
7288
7289 PyDoc_STRVAR(lstrip__doc__,
7290              "S.lstrip([chars]) -> unicode\n\
7291 \n\
7292 Return a copy of the string S with leading whitespace removed.\n\
7293 If chars is given and not None, remove characters in chars instead.\n\
7294 If chars is a str, it will be converted to unicode before stripping");
7295
7296 static PyObject *
7297 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7298 {
7299     if (PyTuple_GET_SIZE(args) == 0)
7300         return do_strip(self, LEFTSTRIP); /* Common case */
7301     else
7302         return do_argstrip(self, LEFTSTRIP, args);
7303 }
7304
7305
7306 PyDoc_STRVAR(rstrip__doc__,
7307              "S.rstrip([chars]) -> unicode\n\
7308 \n\
7309 Return a copy of the string S with trailing whitespace removed.\n\
7310 If chars is given and not None, remove characters in chars instead.\n\
7311 If chars is a str, it will be converted to unicode before stripping");
7312
7313 static PyObject *
7314 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7315 {
7316     if (PyTuple_GET_SIZE(args) == 0)
7317         return do_strip(self, RIGHTSTRIP); /* Common case */
7318     else
7319         return do_argstrip(self, RIGHTSTRIP, args);
7320 }
7321
7322
7323 static PyObject*
7324 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7325 {
7326     PyUnicodeObject *u;
7327     Py_UNICODE *p;
7328     Py_ssize_t nchars;
7329     size_t nbytes;
7330
7331     if (len < 0)
7332         len = 0;
7333
7334     if (len == 1 && PyUnicode_CheckExact(str)) {
7335         /* no repeat, return original string */
7336         Py_INCREF(str);
7337         return (PyObject*) str;
7338     }
7339
7340     /* ensure # of chars needed doesn't overflow int and # of bytes
7341      * needed doesn't overflow size_t
7342      */
7343     nchars = len * str->length;
7344     if (len && nchars / len != str->length) {
7345         PyErr_SetString(PyExc_OverflowError,
7346                         "repeated string is too long");
7347         return NULL;
7348     }
7349     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7350     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7351         PyErr_SetString(PyExc_OverflowError,
7352                         "repeated string is too long");
7353         return NULL;
7354     }
7355     u = _PyUnicode_New(nchars);
7356     if (!u)
7357         return NULL;
7358
7359     p = u->str;
7360
7361     if (str->length == 1 && len > 0) {
7362         Py_UNICODE_FILL(p, str->str[0], len);
7363     } else {
7364         Py_ssize_t done = 0; /* number of characters copied this far */
7365         if (done < nchars) {
7366             Py_UNICODE_COPY(p, str->str, str->length);
7367             done = str->length;
7368         }
7369         while (done < nchars) {
7370             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7371             Py_UNICODE_COPY(p+done, p, n);
7372             done += n;
7373         }
7374     }
7375
7376     return (PyObject*) u;
7377 }
7378
7379 PyObject *PyUnicode_Replace(PyObject *obj,
7380                             PyObject *subobj,
7381                             PyObject *replobj,
7382                             Py_ssize_t maxcount)
7383 {
7384     PyObject *self;
7385     PyObject *str1;
7386     PyObject *str2;
7387     PyObject *result;
7388
7389     self = PyUnicode_FromObject(obj);
7390     if (self == NULL)
7391         return NULL;
7392     str1 = PyUnicode_FromObject(subobj);
7393     if (str1 == NULL) {
7394         Py_DECREF(self);
7395         return NULL;
7396     }
7397     str2 = PyUnicode_FromObject(replobj);
7398     if (str2 == NULL) {
7399         Py_DECREF(self);
7400         Py_DECREF(str1);
7401         return NULL;
7402     }
7403     result = replace((PyUnicodeObject *)self,
7404                      (PyUnicodeObject *)str1,
7405                      (PyUnicodeObject *)str2,
7406                      maxcount);
7407     Py_DECREF(self);
7408     Py_DECREF(str1);
7409     Py_DECREF(str2);
7410     return result;
7411 }
7412
7413 PyDoc_STRVAR(replace__doc__,
7414              "S.replace (old, new[, count]) -> unicode\n\
7415 \n\
7416 Return a copy of S with all occurrences of substring\n\
7417 old replaced by new.  If the optional argument count is\n\
7418 given, only the first count occurrences are replaced.");
7419
7420 static PyObject*
7421 unicode_replace(PyUnicodeObject *self, PyObject *args)
7422 {
7423     PyUnicodeObject *str1;
7424     PyUnicodeObject *str2;
7425     Py_ssize_t maxcount = -1;
7426     PyObject *result;
7427
7428     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7429         return NULL;
7430     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7431     if (str1 == NULL)
7432         return NULL;
7433     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7434     if (str2 == NULL) {
7435         Py_DECREF(str1);
7436         return NULL;
7437     }
7438
7439     result = replace(self, str1, str2, maxcount);
7440
7441     Py_DECREF(str1);
7442     Py_DECREF(str2);
7443     return result;
7444 }
7445
7446 static
7447 PyObject *unicode_repr(PyObject *unicode)
7448 {
7449     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7450                                 PyUnicode_GET_SIZE(unicode),
7451                                 1);
7452 }
7453
7454 PyDoc_STRVAR(rfind__doc__,
7455              "S.rfind(sub [,start [,end]]) -> int\n\
7456 \n\
7457 Return the highest index in S where substring sub is found,\n\
7458 such that sub is contained within s[start:end].  Optional\n\
7459 arguments start and end are interpreted as in slice notation.\n\
7460 \n\
7461 Return -1 on failure.");
7462
7463 static PyObject *
7464 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7465 {
7466     PyObject *substring;
7467     Py_ssize_t start;
7468     Py_ssize_t end;
7469     Py_ssize_t result;
7470
7471     if (!_ParseTupleFinds(args, &substring, &start, &end))
7472         return NULL;
7473
7474     result = stringlib_rfind_slice(
7475         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7476         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7477         start, end
7478         );
7479
7480     Py_DECREF(substring);
7481
7482     return PyInt_FromSsize_t(result);
7483 }
7484
7485 PyDoc_STRVAR(rindex__doc__,
7486              "S.rindex(sub [,start [,end]]) -> int\n\
7487 \n\
7488 Like S.rfind() but raise ValueError when the substring is not found.");
7489
7490 static PyObject *
7491 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7492 {
7493     PyObject *substring;
7494     Py_ssize_t start;
7495     Py_ssize_t end;
7496     Py_ssize_t result;
7497
7498     if (!_ParseTupleFinds(args, &substring, &start, &end))
7499         return NULL;
7500
7501     result = stringlib_rfind_slice(
7502         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7503         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7504         start, end
7505         );
7506
7507     Py_DECREF(substring);
7508
7509     if (result < 0) {
7510         PyErr_SetString(PyExc_ValueError, "substring not found");
7511         return NULL;
7512     }
7513     return PyInt_FromSsize_t(result);
7514 }
7515
7516 PyDoc_STRVAR(rjust__doc__,
7517              "S.rjust(width[, fillchar]) -> unicode\n\
7518 \n\
7519 Return S right-justified in a Unicode string of length width. Padding is\n\
7520 done using the specified fill character (default is a space).");
7521
7522 static PyObject *
7523 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7524 {
7525     Py_ssize_t width;
7526     Py_UNICODE fillchar = ' ';
7527
7528     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7529         return NULL;
7530
7531     if (self->length >= width && PyUnicode_CheckExact(self)) {
7532         Py_INCREF(self);
7533         return (PyObject*) self;
7534     }
7535
7536     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7537 }
7538
7539 static PyObject*
7540 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7541 {
7542     /* standard clamping */
7543     if (start < 0)
7544         start = 0;
7545     if (end < 0)
7546         end = 0;
7547     if (end > self->length)
7548         end = self->length;
7549     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7550         /* full slice, return original string */
7551         Py_INCREF(self);
7552         return (PyObject*) self;
7553     }
7554     if (start > end)
7555         start = end;
7556     /* copy slice */
7557     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7558                                              end - start);
7559 }
7560
7561 PyObject *PyUnicode_Split(PyObject *s,
7562                           PyObject *sep,
7563                           Py_ssize_t maxsplit)
7564 {
7565     PyObject *result;
7566
7567     s = PyUnicode_FromObject(s);
7568     if (s == NULL)
7569         return NULL;
7570     if (sep != NULL) {
7571         sep = PyUnicode_FromObject(sep);
7572         if (sep == NULL) {
7573             Py_DECREF(s);
7574             return NULL;
7575         }
7576     }
7577
7578     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7579
7580     Py_DECREF(s);
7581     Py_XDECREF(sep);
7582     return result;
7583 }
7584
7585 PyDoc_STRVAR(split__doc__,
7586              "S.split([sep [,maxsplit]]) -> list of strings\n\
7587 \n\
7588 Return a list of the words in S, using sep as the\n\
7589 delimiter string.  If maxsplit is given, at most maxsplit\n\
7590 splits are done. If sep is not specified or is None, any\n\
7591 whitespace string is a separator and empty strings are\n\
7592 removed from the result.");
7593
7594 static PyObject*
7595 unicode_split(PyUnicodeObject *self, PyObject *args)
7596 {
7597     PyObject *substring = Py_None;
7598     Py_ssize_t maxcount = -1;
7599
7600     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7601         return NULL;
7602
7603     if (substring == Py_None)
7604         return split(self, NULL, maxcount);
7605     else if (PyUnicode_Check(substring))
7606         return split(self, (PyUnicodeObject *)substring, maxcount);
7607     else
7608         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7609 }
7610
7611 PyObject *
7612 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7613 {
7614     PyObject* str_obj;
7615     PyObject* sep_obj;
7616     PyObject* out;
7617
7618     str_obj = PyUnicode_FromObject(str_in);
7619     if (!str_obj)
7620         return NULL;
7621     sep_obj = PyUnicode_FromObject(sep_in);
7622     if (!sep_obj) {
7623         Py_DECREF(str_obj);
7624         return NULL;
7625     }
7626
7627     out = stringlib_partition(
7628         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7629         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7630         );
7631
7632     Py_DECREF(sep_obj);
7633     Py_DECREF(str_obj);
7634
7635     return out;
7636 }
7637
7638
7639 PyObject *
7640 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7641 {
7642     PyObject* str_obj;
7643     PyObject* sep_obj;
7644     PyObject* out;
7645
7646     str_obj = PyUnicode_FromObject(str_in);
7647     if (!str_obj)
7648         return NULL;
7649     sep_obj = PyUnicode_FromObject(sep_in);
7650     if (!sep_obj) {
7651         Py_DECREF(str_obj);
7652         return NULL;
7653     }
7654
7655     out = stringlib_rpartition(
7656         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7657         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7658         );
7659
7660     Py_DECREF(sep_obj);
7661     Py_DECREF(str_obj);
7662
7663     return out;
7664 }
7665
7666 PyDoc_STRVAR(partition__doc__,
7667              "S.partition(sep) -> (head, sep, tail)\n\
7668 \n\
7669 Search for the separator sep in S, and return the part before it,\n\
7670 the separator itself, and the part after it.  If the separator is not\n\
7671 found, return S and two empty strings.");
7672
7673 static PyObject*
7674 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7675 {
7676     return PyUnicode_Partition((PyObject *)self, separator);
7677 }
7678
7679 PyDoc_STRVAR(rpartition__doc__,
7680              "S.rpartition(sep) -> (tail, sep, head)\n\
7681 \n\
7682 Search for the separator sep in S, starting at the end of S, and return\n\
7683 the part before it, the separator itself, and the part after it.  If the\n\
7684 separator is not found, return two empty strings and S.");
7685
7686 static PyObject*
7687 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7688 {
7689     return PyUnicode_RPartition((PyObject *)self, separator);
7690 }
7691
7692 PyObject *PyUnicode_RSplit(PyObject *s,
7693                            PyObject *sep,
7694                            Py_ssize_t maxsplit)
7695 {
7696     PyObject *result;
7697
7698     s = PyUnicode_FromObject(s);
7699     if (s == NULL)
7700         return NULL;
7701     if (sep != NULL) {
7702         sep = PyUnicode_FromObject(sep);
7703         if (sep == NULL) {
7704             Py_DECREF(s);
7705             return NULL;
7706         }
7707     }
7708
7709     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7710
7711     Py_DECREF(s);
7712     Py_XDECREF(sep);
7713     return result;
7714 }
7715
7716 PyDoc_STRVAR(rsplit__doc__,
7717              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7718 \n\
7719 Return a list of the words in S, using sep as the\n\
7720 delimiter string, starting at the end of the string and\n\
7721 working to the front.  If maxsplit is given, at most maxsplit\n\
7722 splits are done. If sep is not specified, any whitespace string\n\
7723 is a separator.");
7724
7725 static PyObject*
7726 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7727 {
7728     PyObject *substring = Py_None;
7729     Py_ssize_t maxcount = -1;
7730
7731     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7732         return NULL;
7733
7734     if (substring == Py_None)
7735         return rsplit(self, NULL, maxcount);
7736     else if (PyUnicode_Check(substring))
7737         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7738     else
7739         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7740 }
7741
7742 PyDoc_STRVAR(splitlines__doc__,
7743              "S.splitlines([keepends]) -> list of strings\n\
7744 \n\
7745 Return a list of the lines in S, breaking at line boundaries.\n\
7746 Line breaks are not included in the resulting list unless keepends\n\
7747 is given and true.");
7748
7749 static PyObject*
7750 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7751 {
7752     int keepends = 0;
7753
7754     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7755         return NULL;
7756
7757     return PyUnicode_Splitlines((PyObject *)self, keepends);
7758 }
7759
7760 static
7761 PyObject *unicode_str(PyUnicodeObject *self)
7762 {
7763     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7764 }
7765
7766 PyDoc_STRVAR(swapcase__doc__,
7767              "S.swapcase() -> unicode\n\
7768 \n\
7769 Return a copy of S with uppercase characters converted to lowercase\n\
7770 and vice versa.");
7771
7772 static PyObject*
7773 unicode_swapcase(PyUnicodeObject *self)
7774 {
7775     return fixup(self, fixswapcase);
7776 }
7777
7778 PyDoc_STRVAR(translate__doc__,
7779              "S.translate(table) -> unicode\n\
7780 \n\
7781 Return a copy of the string S, where all characters have been mapped\n\
7782 through the given translation table, which must be a mapping of\n\
7783 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7784 Unmapped characters are left untouched. Characters mapped to None\n\
7785 are deleted.");
7786
7787 static PyObject*
7788 unicode_translate(PyUnicodeObject *self, PyObject *table)
7789 {
7790     return PyUnicode_TranslateCharmap(self->str,
7791                                       self->length,
7792                                       table,
7793                                       "ignore");
7794 }
7795
7796 PyDoc_STRVAR(upper__doc__,
7797              "S.upper() -> unicode\n\
7798 \n\
7799 Return a copy of S converted to uppercase.");
7800
7801 static PyObject*
7802 unicode_upper(PyUnicodeObject *self)
7803 {
7804     return fixup(self, fixupper);
7805 }
7806
7807 PyDoc_STRVAR(zfill__doc__,
7808              "S.zfill(width) -> unicode\n\
7809 \n\
7810 Pad a numeric string S with zeros on the left, to fill a field\n\
7811 of the specified width. The string S is never truncated.");
7812
7813 static PyObject *
7814 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7815 {
7816     Py_ssize_t fill;
7817     PyUnicodeObject *u;
7818
7819     Py_ssize_t width;
7820     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7821         return NULL;
7822
7823     if (self->length >= width) {
7824         if (PyUnicode_CheckExact(self)) {
7825             Py_INCREF(self);
7826             return (PyObject*) self;
7827         }
7828         else
7829             return PyUnicode_FromUnicode(
7830                 PyUnicode_AS_UNICODE(self),
7831                 PyUnicode_GET_SIZE(self)
7832                 );
7833     }
7834
7835     fill = width - self->length;
7836
7837     u = pad(self, fill, 0, '0');
7838
7839     if (u == NULL)
7840         return NULL;
7841
7842     if (u->str[fill] == '+' || u->str[fill] == '-') {
7843         /* move sign to beginning of string */
7844         u->str[0] = u->str[fill];
7845         u->str[fill] = '0';
7846     }
7847
7848     return (PyObject*) u;
7849 }
7850
#if 0
/* Compiled out.  Returns the value of `numfree` as a Python int; the
   matching (also disabled) method-table entry "freelistsize" describes it
   as being used only for debugging the implementation. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7858
7859 PyDoc_STRVAR(startswith__doc__,
7860              "S.startswith(prefix[, start[, end]]) -> bool\n\
7861 \n\
7862 Return True if S starts with the specified prefix, False otherwise.\n\
7863 With optional start, test S beginning at that position.\n\
7864 With optional end, stop comparing S at that position.\n\
7865 prefix can also be a tuple of strings to try.");
7866
7867 static PyObject *
7868 unicode_startswith(PyUnicodeObject *self,
7869                    PyObject *args)
7870 {
7871     PyObject *subobj;
7872     PyUnicodeObject *substring;
7873     Py_ssize_t start = 0;
7874     Py_ssize_t end = PY_SSIZE_T_MAX;
7875     int result;
7876
7877     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7878                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7879         return NULL;
7880     if (PyTuple_Check(subobj)) {
7881         Py_ssize_t i;
7882         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7883             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7884                 PyTuple_GET_ITEM(subobj, i));
7885             if (substring == NULL)
7886                 return NULL;
7887             result = tailmatch(self, substring, start, end, -1);
7888             Py_DECREF(substring);
7889             if (result) {
7890                 Py_RETURN_TRUE;
7891             }
7892         }
7893         /* nothing matched */
7894         Py_RETURN_FALSE;
7895     }
7896     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7897     if (substring == NULL)
7898         return NULL;
7899     result = tailmatch(self, substring, start, end, -1);
7900     Py_DECREF(substring);
7901     return PyBool_FromLong(result);
7902 }
7903
7904
7905 PyDoc_STRVAR(endswith__doc__,
7906              "S.endswith(suffix[, start[, end]]) -> bool\n\
7907 \n\
7908 Return True if S ends with the specified suffix, False otherwise.\n\
7909 With optional start, test S beginning at that position.\n\
7910 With optional end, stop comparing S at that position.\n\
7911 suffix can also be a tuple of strings to try.");
7912
7913 static PyObject *
7914 unicode_endswith(PyUnicodeObject *self,
7915                  PyObject *args)
7916 {
7917     PyObject *subobj;
7918     PyUnicodeObject *substring;
7919     Py_ssize_t start = 0;
7920     Py_ssize_t end = PY_SSIZE_T_MAX;
7921     int result;
7922
7923     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7924                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7925         return NULL;
7926     if (PyTuple_Check(subobj)) {
7927         Py_ssize_t i;
7928         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7929             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7930                 PyTuple_GET_ITEM(subobj, i));
7931             if (substring == NULL)
7932                 return NULL;
7933             result = tailmatch(self, substring, start, end, +1);
7934             Py_DECREF(substring);
7935             if (result) {
7936                 Py_RETURN_TRUE;
7937             }
7938         }
7939         Py_RETURN_FALSE;
7940     }
7941     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7942     if (substring == NULL)
7943         return NULL;
7944
7945     result = tailmatch(self, substring, start, end, +1);
7946     Py_DECREF(substring);
7947     return PyBool_FromLong(result);
7948 }
7949
7950
7951 /* Implements do_string_format, which is unicode because of stringlib */
7952 #include "stringlib/string_format.h"
7953
/* Docstring used by the "format" entry of unicode_methods (implemented by
   do_string_format).  It carries only the signature line; no descriptive
   text follows it. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> unicode\n\
\n\
");
7958
7959 static PyObject *
7960 unicode__format__(PyObject *self, PyObject *args)
7961 {
7962     PyObject *format_spec;
7963     PyObject *result = NULL;
7964     PyObject *tmp = NULL;
7965
7966     /* If 2.x, convert format_spec to the same type as value */
7967     /* This is to allow things like u''.format('') */
7968     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7969         goto done;
7970     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7971         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7972                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7973         goto done;
7974     }
7975     tmp = PyObject_Unicode(format_spec);
7976     if (tmp == NULL)
7977         goto done;
7978     format_spec = tmp;
7979
7980     result = _PyUnicode_FormatAdvanced(self,
7981                                        PyUnicode_AS_UNICODE(format_spec),
7982                                        PyUnicode_GET_SIZE(format_spec));
7983   done:
7984     Py_XDECREF(tmp);
7985     return result;
7986 }
7987
7988 PyDoc_STRVAR(p_format__doc__,
7989              "S.__format__(format_spec) -> unicode\n\
7990 \n\
7991 ");
7992
7993 static PyObject *
7994 unicode__sizeof__(PyUnicodeObject *v)
7995 {
7996     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7997                              sizeof(Py_UNICODE) * (v->length + 1));
7998 }
7999
8000 PyDoc_STRVAR(sizeof__doc__,
8001              "S.__sizeof__() -> size of S in memory, in bytes\n\
8002 \n\
8003 ");
8004
8005 static PyObject *
8006 unicode_getnewargs(PyUnicodeObject *v)
8007 {
8008     return Py_BuildValue("(u#)", v->str, v->length);
8009 }
8010
8011
/* Method table for unicode objects.  Each entry gives the Python-level
   method name, the C function implementing it, the calling-convention
   flags (METH_NOARGS, METH_O, METH_VARARGS, optionally with
   METH_KEYWORDS) and, where present, the docstring.  The {NULL, NULL}
   entry terminates the table.
   NOTE(review): presumably installed as the type's tp_methods -- the type
   object is not visible in this part of the file. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* The only entry that also accepts keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The next two entries have no docstring; the omitted trailing field
       is zero-initialized. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8074
8075 static PyObject *
8076 unicode_mod(PyObject *v, PyObject *w)
8077 {
8078     if (!PyUnicode_Check(v)) {
8079         Py_INCREF(Py_NotImplemented);
8080         return Py_NotImplemented;
8081     }
8082     return PyUnicode_Format(v, w);
8083 }
8084
/* Number protocol table.  Only nb_remainder (the % operator) is filled
   in, with unicode_mod; the four slots before it are explicitly 0 and all
   slots after it are left out of the initializer and therefore
   zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
8092
/* Sequence protocol table.  Length, concatenation, repetition, item
   access, simple slicing and containment are provided; the two assignment
   slots are 0.  The casts adapt functions declared with a
   PyUnicodeObject * first parameter to the generic slot signatures. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
8103
8104 static PyObject*
8105 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8106 {
8107     if (PyIndex_Check(item)) {
8108         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8109         if (i == -1 && PyErr_Occurred())
8110             return NULL;
8111         if (i < 0)
8112             i += PyUnicode_GET_SIZE(self);
8113         return unicode_getitem(self, i);
8114     } else if (PySlice_Check(item)) {
8115         Py_ssize_t start, stop, step, slicelength, cur, i;
8116         Py_UNICODE* source_buf;
8117         Py_UNICODE* result_buf;
8118         PyObject* result;
8119
8120         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8121                                  &start, &stop, &step, &slicelength) < 0) {
8122             return NULL;
8123         }
8124
8125         if (slicelength <= 0) {
8126             return PyUnicode_FromUnicode(NULL, 0);
8127         } else if (start == 0 && step == 1 && slicelength == self->length &&
8128                    PyUnicode_CheckExact(self)) {
8129             Py_INCREF(self);
8130             return (PyObject *)self;
8131         } else if (step == 1) {
8132             return PyUnicode_FromUnicode(self->str + start, slicelength);
8133         } else {
8134             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8135             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8136                                                        sizeof(Py_UNICODE));
8137
8138             if (result_buf == NULL)
8139                 return PyErr_NoMemory();
8140
8141             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8142                 result_buf[i] = source_buf[cur];
8143             }
8144
8145             result = PyUnicode_FromUnicode(result_buf, slicelength);
8146             PyObject_FREE(result_buf);
8147             return result;
8148         }
8149     } else {
8150         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8151         return NULL;
8152     }
8153 }
8154
/* Mapping protocol table: length and subscript (index or slice) are
   provided; item assignment is not (slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8160
8161 static Py_ssize_t
8162 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8163                           Py_ssize_t index,
8164                           const void **ptr)
8165 {
8166     if (index != 0) {
8167         PyErr_SetString(PyExc_SystemError,
8168                         "accessing non-existent unicode segment");
8169         return -1;
8170     }
8171     *ptr = (void *) self->str;
8172     return PyUnicode_GET_DATA_SIZE(self);
8173 }
8174
8175 static Py_ssize_t
8176 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8177                            const void **ptr)
8178 {
8179     PyErr_SetString(PyExc_TypeError,
8180                     "cannot use unicode as modifiable buffer");
8181     return -1;
8182 }
8183
8184 static int
8185 unicode_buffer_getsegcount(PyUnicodeObject *self,
8186                            Py_ssize_t *lenp)
8187 {
8188     if (lenp)
8189         *lenp = PyUnicode_GET_DATA_SIZE(self);
8190     return 1;
8191 }
8192
8193 static Py_ssize_t
8194 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8195                           Py_ssize_t index,
8196                           const void **ptr)
8197 {
8198     PyObject *str;
8199
8200     if (index != 0) {
8201         PyErr_SetString(PyExc_SystemError,
8202                         "accessing non-existent unicode segment");
8203         return -1;
8204     }
8205     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8206     if (str == NULL)
8207         return -1;
8208     *ptr = (void *) PyString_AS_STRING(str);
8209     return PyString_GET_SIZE(str);
8210 }
8211
8212 /* Helpers for PyUnicode_Format() */
8213
8214 static PyObject *
8215 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8216 {
8217     Py_ssize_t argidx = *p_argidx;
8218     if (argidx < arglen) {
8219         (*p_argidx)++;
8220         if (arglen < 0)
8221             return args;
8222         else
8223             return PyTuple_GetItem(args, argidx);
8224     }
8225     PyErr_SetString(PyExc_TypeError,
8226                     "not enough arguments for format string");
8227     return NULL;
8228 }
8229
/* Formatting flags.  Each occupies its own bit so that several can be
   combined in one int and tested with `flags & F_xxx`.
   NOTE(review): the names suggest the '-', '+', ' ', '#' and '0'
   conversion flags of %-formatting; only the F_ALT <-> "#" pairing is
   visible nearby (in formatfloat) -- confirm the rest at the use sites. */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
8235
/* Widen the NUL-terminated byte string `charbuffer` into `buffer`, one
   Py_UNICODE per char, and return the number of characters converted
   (strlen of the input).  No terminator is stored in `buffer`.

   The copy deliberately runs from the last character down to the first.
   The callers below (doubletounicode, longtounicode) pass the same memory
   for both arguments -- a char string formatted into the front of the
   Py_UNICODE buffer -- so going backwards lets each byte be read before a
   wider element written over that region can clobber it. */
static Py_ssize_t
strtounicode(Py_UNICODE *buffer, const char *charbuffer)
{
    register Py_ssize_t i;
    Py_ssize_t len = strlen(charbuffer);
    /* back to front: see the note on overlapping buffers above */
    for (i = len - 1; i >= 0; i--)
        buffer[i] = (Py_UNICODE) charbuffer[i];

    return len;
}
8246
8247 static int
8248 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8249 {
8250     Py_ssize_t result;
8251
8252     PyOS_ascii_formatd((char *)buffer, len, format, x);
8253     result = strtounicode(buffer, (char *)buffer);
8254     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8255 }
8256
8257 static int
8258 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8259 {
8260     Py_ssize_t result;
8261
8262     PyOS_snprintf((char *)buffer, len, format, x);
8263     result = strtounicode(buffer, (char *)buffer);
8264     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8265 }
8266
8267 /* XXX To save some code duplication, formatfloat/long/int could have been
8268    shared with stringobject.c, converting from 8-bit to Unicode after the
8269    formatting is done. */
8270
8271 static int
8272 formatfloat(Py_UNICODE *buf,
8273             size_t buflen,
8274             int flags,
8275             int prec,
8276             int type,
8277             PyObject *v)
8278 {
8279     /* fmt = '%#.' + `prec` + `type`
8280        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8281     char fmt[20];
8282     double x;
8283
8284     x = PyFloat_AsDouble(v);
8285     if (x == -1.0 && PyErr_Occurred())
8286         return -1;
8287     if (prec < 0)
8288         prec = 6;
8289     /* make sure that the decimal representation of precision really does
8290        need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8291     if (prec > 0x7fffffffL) {
8292         PyErr_SetString(PyExc_OverflowError,
8293                         "outrageously large precision "
8294                         "for formatted float");
8295         return -1;
8296     }
8297
8298     if (type == 'f' && fabs(x) >= 1e50)
8299         type = 'g';
8300     /* Worst case length calc to ensure no buffer overrun:
8301
8302        'g' formats:
8303        fmt = %#.<prec>g
8304        buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8305        for any double rep.)
8306        len = 1 + prec + 1 + 2 + 5 = 9 + prec
8307
8308        'f' formats:
8309        buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8310        len = 1 + 50 + 1 + prec = 52 + prec
8311
8312        If prec=0 the effective precision is 1 (the leading digit is
8313        always given), therefore increase the length by one.
8314
8315     */
8316     if (((type == 'g' || type == 'G') &&
8317          buflen <= (size_t)10 + (size_t)prec) ||
8318         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8319         PyErr_SetString(PyExc_OverflowError,
8320                         "formatted float is too long (precision too large?)");
8321         return -1;
8322     }
8323     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8324                   (flags&F_ALT) ? "#" : "",
8325                   prec, type);
8326     return doubletounicode(buf, buflen, fmt, x);
8327 }
8328
8329 static PyObject*
8330 formatlong(PyObject *val, int flags, int prec, int type)
8331 {
8332     char *buf;
8333     int i, len;
8334     PyObject *str; /* temporary string object. */
8335     PyUnicodeObject *result;
8336
8337     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8338     if (!str)
8339         return NULL;
8340     result = _PyUnicode_New(len);
8341     if (!result) {
8342         Py_DECREF(str);
8343         return NULL;
8344     }
8345     for (i = 0; i < len; i++)
8346         result->str[i] = buf[i];
8347     result->str[len] = 0;
8348     Py_DECREF(str);
8349     return (PyObject*)result;
8350 }
8351
8352 static int
8353 formatint(Py_UNICODE *buf,
8354           size_t buflen,
8355           int flags,
8356           int prec,
8357           int type,
8358           PyObject *v)
8359 {
8360     /* fmt = '%#.' + `prec` + 'l' + `type`
8361      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8362      *                     + 1 + 1
8363      *                   = 24
8364      */
8365     char fmt[64]; /* plenty big enough! */
8366     char *sign;
8367     long x;
8368
8369     x = PyInt_AsLong(v);
8370     if (x == -1 && PyErr_Occurred())
8371         return -1;
8372     if (x < 0 && type == 'u') {
8373         type = 'd';
8374     }
8375     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8376         sign = "-";
8377     else
8378         sign = "";
8379     if (prec < 0)
8380         prec = 1;
8381
8382     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8383      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8384      */
8385     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8386         PyErr_SetString(PyExc_OverflowError,
8387                         "formatted integer is too long (precision too large?)");
8388         return -1;
8389     }
8390
8391     if ((flags & F_ALT) &&
8392         (type == 'x' || type == 'X')) {
8393         /* When converting under %#x or %#X, there are a number
8394          * of issues that cause pain:
8395          * - when 0 is being converted, the C standard leaves off
8396          *   the '0x' or '0X', which is inconsistent with other
8397          *   %#x/%#X conversions and inconsistent with Python's
8398          *   hex() function
8399          * - there are platforms that violate the standard and
8400          *   convert 0 with the '0x' or '0X'
8401          *   (Metrowerks, Compaq Tru64)
8402          * - there are platforms that give '0x' when converting
8403          *   under %#X, but convert 0 in accordance with the
8404          *   standard (OS/2 EMX)
8405          *
8406          * We can achieve the desired consistency by inserting our
8407          * own '0x' or '0X' prefix, and substituting %x/%X in place
8408          * of %#x/%#X.
8409          *
8410          * Note that this is the same approach as used in
8411          * formatint() in stringobject.c
8412          */
8413         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8414                       sign, type, prec, type);
8415     }
8416     else {
8417         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8418                       sign, (flags&F_ALT) ? "#" : "",
8419                       prec, type);
8420     }
8421     if (sign[0])
8422         return longtounicode(buf, buflen, fmt, -x);
8423     else
8424         return longtounicode(buf, buflen, fmt, x);
8425 }
8426
8427 static int
8428 formatchar(Py_UNICODE *buf,
8429            size_t buflen,
8430            PyObject *v)
8431 {
8432     /* presume that the buffer is at least 2 characters long */
8433     if (PyUnicode_Check(v)) {
8434         if (PyUnicode_GET_SIZE(v) != 1)
8435             goto onError;
8436         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8437     }
8438
8439     else if (PyString_Check(v)) {
8440         if (PyString_GET_SIZE(v) != 1)
8441             goto onError;
8442         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8443     }
8444
8445     else {
8446         /* Integer input truncated to a character */
8447         long x;
8448         x = PyInt_AsLong(v);
8449         if (x == -1 && PyErr_Occurred())
8450             goto onError;
8451 #ifdef Py_UNICODE_WIDE
8452         if (x < 0 || x > 0x10ffff) {
8453             PyErr_SetString(PyExc_OverflowError,
8454                             "%c arg not in range(0x110000) "
8455                             "(wide Python build)");
8456             return -1;
8457         }
8458 #else
8459         if (x < 0 || x > 0xffff) {
8460             PyErr_SetString(PyExc_OverflowError,
8461                             "%c arg not in range(0x10000) "
8462                             "(narrow Python build)");
8463             return -1;
8464         }
8465 #endif
8466         buf[0] = (Py_UNICODE) x;
8467     }
8468     buf[1] = '\0';
8469     return 1;
8470
8471   onError:
8472     PyErr_SetString(PyExc_TypeError,
8473                     "%c requires int or char");
8474     return -1;
8475 }
8476
8477 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8478
8479    FORMATBUFLEN is the length (counted in Py_UNICODE elements) of the
8480    buffer in which the floats, ints, & chars are formatted. XXX This is a
8481    magic number. Each formatting routine does bounds checking to ensure no
8482    overflow, but a better solution may be to malloc a buffer of appropriate
8483    size for each format. For now, the current solution is sufficient.
8484 */
8485 #define FORMATBUFLEN (size_t)120
8486
8487 PyObject *PyUnicode_Format(PyObject *format,
8488                            PyObject *args)
8489 {
8490     Py_UNICODE *fmt, *res;
8491     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8492     int args_owned = 0;
8493     PyUnicodeObject *result = NULL;
8494     PyObject *dict = NULL;
8495     PyObject *uformat;
8496
8497     if (format == NULL || args == NULL) {
8498         PyErr_BadInternalCall();
8499         return NULL;
8500     }
8501     uformat = PyUnicode_FromObject(format);
8502     if (uformat == NULL)
8503         return NULL;
8504     fmt = PyUnicode_AS_UNICODE(uformat);
8505     fmtcnt = PyUnicode_GET_SIZE(uformat);
8506
8507     reslen = rescnt = fmtcnt + 100;
8508     result = _PyUnicode_New(reslen);
8509     if (result == NULL)
8510         goto onError;
8511     res = PyUnicode_AS_UNICODE(result);
8512
8513     if (PyTuple_Check(args)) {
8514         arglen = PyTuple_Size(args);
8515         argidx = 0;
8516     }
8517     else {
8518         arglen = -1;
8519         argidx = -2;
8520     }
8521     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8522         !PyObject_TypeCheck(args, &PyBaseString_Type))
8523         dict = args;
8524
8525     while (--fmtcnt >= 0) {
8526         if (*fmt != '%') {
8527             if (--rescnt < 0) {
8528                 rescnt = fmtcnt + 100;
8529                 reslen += rescnt;
8530                 if (_PyUnicode_Resize(&result, reslen) < 0)
8531                     goto onError;
8532                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8533                 --rescnt;
8534             }
8535             *res++ = *fmt++;
8536         }
8537         else {
8538             /* Got a format specifier */
8539             int flags = 0;
8540             Py_ssize_t width = -1;
8541             int prec = -1;
8542             Py_UNICODE c = '\0';
8543             Py_UNICODE fill;
8544             int isnumok;
8545             PyObject *v = NULL;
8546             PyObject *temp = NULL;
8547             Py_UNICODE *pbuf;
8548             Py_UNICODE sign;
8549             Py_ssize_t len;
8550             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8551
8552             fmt++;
8553             if (*fmt == '(') {
8554                 Py_UNICODE *keystart;
8555                 Py_ssize_t keylen;
8556                 PyObject *key;
8557                 int pcount = 1;
8558
8559                 if (dict == NULL) {
8560                     PyErr_SetString(PyExc_TypeError,
8561                                     "format requires a mapping");
8562                     goto onError;
8563                 }
8564                 ++fmt;
8565                 --fmtcnt;
8566                 keystart = fmt;
8567                 /* Skip over balanced parentheses */
8568                 while (pcount > 0 && --fmtcnt >= 0) {
8569                     if (*fmt == ')')
8570                         --pcount;
8571                     else if (*fmt == '(')
8572                         ++pcount;
8573                     fmt++;
8574                 }
8575                 keylen = fmt - keystart - 1;
8576                 if (fmtcnt < 0 || pcount > 0) {
8577                     PyErr_SetString(PyExc_ValueError,
8578                                     "incomplete format key");
8579                     goto onError;
8580                 }
8581 #if 0
8582                 /* keys are converted to strings using UTF-8 and
8583                    then looked up since Python uses strings to hold
8584                    variables names etc. in its namespaces and we
8585                    wouldn't want to break common idioms. */
8586                 key = PyUnicode_EncodeUTF8(keystart,
8587                                            keylen,
8588                                            NULL);
8589 #else
8590                 key = PyUnicode_FromUnicode(keystart, keylen);
8591 #endif
8592                 if (key == NULL)
8593                     goto onError;
8594                 if (args_owned) {
8595                     Py_DECREF(args);
8596                     args_owned = 0;
8597                 }
8598                 args = PyObject_GetItem(dict, key);
8599                 Py_DECREF(key);
8600                 if (args == NULL) {
8601                     goto onError;
8602                 }
8603                 args_owned = 1;
8604                 arglen = -1;
8605                 argidx = -2;
8606             }
8607             while (--fmtcnt >= 0) {
8608                 switch (c = *fmt++) {
8609                 case '-': flags |= F_LJUST; continue;
8610                 case '+': flags |= F_SIGN; continue;
8611                 case ' ': flags |= F_BLANK; continue;
8612                 case '#': flags |= F_ALT; continue;
8613                 case '0': flags |= F_ZERO; continue;
8614                 }
8615                 break;
8616             }
8617             if (c == '*') {
8618                 v = getnextarg(args, arglen, &argidx);
8619                 if (v == NULL)
8620                     goto onError;
8621                 if (!PyInt_Check(v)) {
8622                     PyErr_SetString(PyExc_TypeError,
8623                                     "* wants int");
8624                     goto onError;
8625                 }
8626                 width = PyInt_AsLong(v);
8627                 if (width < 0) {
8628                     flags |= F_LJUST;
8629                     width = -width;
8630                 }
8631                 if (--fmtcnt >= 0)
8632                     c = *fmt++;
8633             }
8634             else if (c >= '0' && c <= '9') {
8635                 width = c - '0';
8636                 while (--fmtcnt >= 0) {
8637                     c = *fmt++;
8638                     if (c < '0' || c > '9')
8639                         break;
8640                     if ((width*10) / 10 != width) {
8641                         PyErr_SetString(PyExc_ValueError,
8642                                         "width too big");
8643                         goto onError;
8644                     }
8645                     width = width*10 + (c - '0');
8646                 }
8647             }
8648             if (c == '.') {
8649                 prec = 0;
8650                 if (--fmtcnt >= 0)
8651                     c = *fmt++;
8652                 if (c == '*') {
8653                     v = getnextarg(args, arglen, &argidx);
8654                     if (v == NULL)
8655                         goto onError;
8656                     if (!PyInt_Check(v)) {
8657                         PyErr_SetString(PyExc_TypeError,
8658                                         "* wants int");
8659                         goto onError;
8660                     }
8661                     prec = PyInt_AsLong(v);
8662                     if (prec < 0)
8663                         prec = 0;
8664                     if (--fmtcnt >= 0)
8665                         c = *fmt++;
8666                 }
8667                 else if (c >= '0' && c <= '9') {
8668                     prec = c - '0';
8669                     while (--fmtcnt >= 0) {
8670                         c = Py_CHARMASK(*fmt++);
8671                         if (c < '0' || c > '9')
8672                             break;
8673                         if ((prec*10) / 10 != prec) {
8674                             PyErr_SetString(PyExc_ValueError,
8675                                             "prec too big");
8676                             goto onError;
8677                         }
8678                         prec = prec*10 + (c - '0');
8679                     }
8680                 }
8681             } /* prec */
8682             if (fmtcnt >= 0) {
8683                 if (c == 'h' || c == 'l' || c == 'L') {
8684                     if (--fmtcnt >= 0)
8685                         c = *fmt++;
8686                 }
8687             }
8688             if (fmtcnt < 0) {
8689                 PyErr_SetString(PyExc_ValueError,
8690                                 "incomplete format");
8691                 goto onError;
8692             }
8693             if (c != '%') {
8694                 v = getnextarg(args, arglen, &argidx);
8695                 if (v == NULL)
8696                     goto onError;
8697             }
8698             sign = 0;
8699             fill = ' ';
8700             switch (c) {
8701
8702             case '%':
8703                 pbuf = formatbuf;
8704                 /* presume that buffer length is at least 1 */
8705                 pbuf[0] = '%';
8706                 len = 1;
8707                 break;
8708
8709             case 's':
8710             case 'r':
8711                 if (PyUnicode_Check(v) && c == 's') {
8712                     temp = v;
8713                     Py_INCREF(temp);
8714                 }
8715                 else {
8716                     PyObject *unicode;
8717                     if (c == 's')
8718                         temp = PyObject_Unicode(v);
8719                     else
8720                         temp = PyObject_Repr(v);
8721                     if (temp == NULL)
8722                         goto onError;
8723                     if (PyUnicode_Check(temp))
8724                         /* nothing to do */;
8725                     else if (PyString_Check(temp)) {
8726                         /* convert to string to Unicode */
8727                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8728                                                    PyString_GET_SIZE(temp),
8729                                                    NULL,
8730                                                    "strict");
8731                         Py_DECREF(temp);
8732                         temp = unicode;
8733                         if (temp == NULL)
8734                             goto onError;
8735                     }
8736                     else {
8737                         Py_DECREF(temp);
8738                         PyErr_SetString(PyExc_TypeError,
8739                                         "%s argument has non-string str()");
8740                         goto onError;
8741                     }
8742                 }
8743                 pbuf = PyUnicode_AS_UNICODE(temp);
8744                 len = PyUnicode_GET_SIZE(temp);
8745                 if (prec >= 0 && len > prec)
8746                     len = prec;
8747                 break;
8748
8749             case 'i':
8750             case 'd':
8751             case 'u':
8752             case 'o':
8753             case 'x':
8754             case 'X':
8755                 if (c == 'i')
8756                     c = 'd';
8757                 isnumok = 0;
8758                 if (PyNumber_Check(v)) {
8759                     PyObject *iobj=NULL;
8760
8761                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8762                         iobj = v;
8763                         Py_INCREF(iobj);
8764                     }
8765                     else {
8766                         iobj = PyNumber_Int(v);
8767                         if (iobj==NULL) iobj = PyNumber_Long(v);
8768                     }
8769                     if (iobj!=NULL) {
8770                         if (PyInt_Check(iobj)) {
8771                             isnumok = 1;
8772                             pbuf = formatbuf;
8773                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8774                                             flags, prec, c, iobj);
8775                             Py_DECREF(iobj);
8776                             if (len < 0)
8777                                 goto onError;
8778                             sign = 1;
8779                         }
8780                         else if (PyLong_Check(iobj)) {
8781                             isnumok = 1;
8782                             temp = formatlong(iobj, flags, prec, c);
8783                             Py_DECREF(iobj);
8784                             if (!temp)
8785                                 goto onError;
8786                             pbuf = PyUnicode_AS_UNICODE(temp);
8787                             len = PyUnicode_GET_SIZE(temp);
8788                             sign = 1;
8789                         }
8790                         else {
8791                             Py_DECREF(iobj);
8792                         }
8793                     }
8794                 }
8795                 if (!isnumok) {
8796                     PyErr_Format(PyExc_TypeError,
8797                                  "%%%c format: a number is required, "
8798                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8799                     goto onError;
8800                 }
8801                 if (flags & F_ZERO)
8802                     fill = '0';
8803                 break;
8804
8805             case 'e':
8806             case 'E':
8807             case 'f':
8808             case 'F':
8809             case 'g':
8810             case 'G':
8811                 if (c == 'F')
8812                     c = 'f';
8813                 pbuf = formatbuf;
8814                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8815                                   flags, prec, c, v);
8816                 if (len < 0)
8817                     goto onError;
8818                 sign = 1;
8819                 if (flags & F_ZERO)
8820                     fill = '0';
8821                 break;
8822
8823             case 'c':
8824                 pbuf = formatbuf;
8825                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8826                 if (len < 0)
8827                     goto onError;
8828                 break;
8829
8830             default:
8831                 PyErr_Format(PyExc_ValueError,
8832                              "unsupported format character '%c' (0x%x) "
8833                              "at index %zd",
8834                              (31<=c && c<=126) ? (char)c : '?',
8835                              (int)c,
8836                              (Py_ssize_t)(fmt - 1 -
8837                                           PyUnicode_AS_UNICODE(uformat)));
8838                 goto onError;
8839             }
8840             if (sign) {
8841                 if (*pbuf == '-' || *pbuf == '+') {
8842                     sign = *pbuf++;
8843                     len--;
8844                 }
8845                 else if (flags & F_SIGN)
8846                     sign = '+';
8847                 else if (flags & F_BLANK)
8848                     sign = ' ';
8849                 else
8850                     sign = 0;
8851             }
8852             if (width < len)
8853                 width = len;
8854             if (rescnt - (sign != 0) < width) {
8855                 reslen -= rescnt;
8856                 rescnt = width + fmtcnt + 100;
8857                 reslen += rescnt;
8858                 if (reslen < 0) {
8859                     Py_XDECREF(temp);
8860                     PyErr_NoMemory();
8861                     goto onError;
8862                 }
8863                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8864                     Py_XDECREF(temp);
8865                     goto onError;
8866                 }
8867                 res = PyUnicode_AS_UNICODE(result)
8868                     + reslen - rescnt;
8869             }
8870             if (sign) {
8871                 if (fill != ' ')
8872                     *res++ = sign;
8873                 rescnt--;
8874                 if (width > len)
8875                     width--;
8876             }
8877             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8878                 assert(pbuf[0] == '0');
8879                 assert(pbuf[1] == c);
8880                 if (fill != ' ') {
8881                     *res++ = *pbuf++;
8882                     *res++ = *pbuf++;
8883                 }
8884                 rescnt -= 2;
8885                 width -= 2;
8886                 if (width < 0)
8887                     width = 0;
8888                 len -= 2;
8889             }
8890             if (width > len && !(flags & F_LJUST)) {
8891                 do {
8892                     --rescnt;
8893                     *res++ = fill;
8894                 } while (--width > len);
8895             }
8896             if (fill == ' ') {
8897                 if (sign)
8898                     *res++ = sign;
8899                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8900                     assert(pbuf[0] == '0');
8901                     assert(pbuf[1] == c);
8902                     *res++ = *pbuf++;
8903                     *res++ = *pbuf++;
8904                 }
8905             }
8906             Py_UNICODE_COPY(res, pbuf, len);
8907             res += len;
8908             rescnt -= len;
8909             while (--width >= len) {
8910                 --rescnt;
8911                 *res++ = ' ';
8912             }
8913             if (dict && (argidx < arglen) && c != '%') {
8914                 PyErr_SetString(PyExc_TypeError,
8915                                 "not all arguments converted during string formatting");
8916                 Py_XDECREF(temp);
8917                 goto onError;
8918             }
8919             Py_XDECREF(temp);
8920         } /* '%' */
8921     } /* until end */
8922     if (argidx < arglen && !dict) {
8923         PyErr_SetString(PyExc_TypeError,
8924                         "not all arguments converted during string formatting");
8925         goto onError;
8926     }
8927
8928     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8929         goto onError;
8930     if (args_owned) {
8931         Py_DECREF(args);
8932     }
8933     Py_DECREF(uformat);
8934     return (PyObject *)result;
8935
8936   onError:
8937     Py_XDECREF(result);
8938     Py_DECREF(uformat);
8939     if (args_owned) {
8940         Py_DECREF(args);
8941     }
8942     return NULL;
8943 }
8944
8945 static PyBufferProcs unicode_as_buffer = {  /* buffer slot table; installed as tp_as_buffer of PyUnicode_Type */
8946     (readbufferproc) unicode_buffer_getreadbuf,    /* read buffer handler */
8947     (writebufferproc) unicode_buffer_getwritebuf,  /* write buffer handler */
8948     (segcountproc) unicode_buffer_getsegcount,     /* segment count handler */
8949     (charbufferproc) unicode_buffer_getcharbuf,    /* character buffer handler */
8950 };
8951
8952 static PyObject *
8953 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8954
8955 static PyObject *
8956 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8957 {
8958     PyObject *x = NULL;
8959     static char *kwlist[] = {"string", "encoding", "errors", 0};
8960     char *encoding = NULL;
8961     char *errors = NULL;
8962
8963     if (type != &PyUnicode_Type)
8964         return unicode_subtype_new(type, args, kwds);
8965     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8966                                      kwlist, &x, &encoding, &errors))
8967         return NULL;
8968     if (x == NULL)
8969         return (PyObject *)_PyUnicode_New(0);
8970     if (encoding == NULL && errors == NULL)
8971         return PyObject_Unicode(x);
8972     else
8973         return PyUnicode_FromEncodedObject(x, encoding, errors);
8974 }
8975
8976 static PyObject *
8977 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8978 {
8979     PyUnicodeObject *tmp, *pnew;
8980     Py_ssize_t n;
8981
8982     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8983     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8984     if (tmp == NULL)
8985         return NULL;
8986     assert(PyUnicode_Check(tmp));
8987     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8988     if (pnew == NULL) {
8989         Py_DECREF(tmp);
8990         return NULL;
8991     }
8992     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8993     if (pnew->str == NULL) {
8994         _Py_ForgetReference((PyObject *)pnew);
8995         PyObject_Del(pnew);
8996         Py_DECREF(tmp);
8997         return PyErr_NoMemory();
8998     }
8999     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9000     pnew->length = n;
9001     pnew->hash = tmp->hash;
9002     Py_DECREF(tmp);
9003     return (PyObject *)pnew;
9004 }
9005
9006 PyDoc_STRVAR(unicode_doc,  /* docstring of the unicode type; installed as tp_doc of PyUnicode_Type */
9007              "unicode(string [, encoding[, errors]]) -> object\n\
9008 \n\
9009 Create a new Unicode object from the given encoded string.\n\
9010 encoding defaults to the current default string encoding.\n\
9011 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9012
9013 PyTypeObject PyUnicode_Type = {  /* type object for 'unicode'; made ready by _PyUnicode_Init() */
9014     PyVarObject_HEAD_INIT(&PyType_Type, 0)
9015     "unicode",              /* tp_name */
9016     sizeof(PyUnicodeObject),        /* tp_basicsize */
9017     0,                  /* tp_itemsize */
9018     /* Slots */
9019     (destructor)unicode_dealloc,    /* tp_dealloc */
9020     0,                  /* tp_print */
9021     0,                  /* tp_getattr */
9022     0,                  /* tp_setattr */
9023     0,                  /* tp_compare */
9024     unicode_repr,           /* tp_repr */
9025     &unicode_as_number,         /* tp_as_number */
9026     &unicode_as_sequence,       /* tp_as_sequence */
9027     &unicode_as_mapping,        /* tp_as_mapping */
9028     (hashfunc) unicode_hash,        /* tp_hash */
9029     0,                  /* tp_call */
9030     (reprfunc) unicode_str,     /* tp_str */
9031     PyObject_GenericGetAttr,        /* tp_getattro */
9032     0,                  /* tp_setattro */
9033     &unicode_as_buffer,         /* tp_as_buffer */
9034     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
9035     Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
9036     unicode_doc,            /* tp_doc */
9037     0,                  /* tp_traverse */
9038     0,                  /* tp_clear */
9039     PyUnicode_RichCompare,      /* tp_richcompare */
9040     0,                  /* tp_weaklistoffset */
9041     0,                  /* tp_iter */
9042     0,                  /* tp_iternext */
9043     unicode_methods,            /* tp_methods */
9044     0,                  /* tp_members */
9045     0,                  /* tp_getset */
9046     &PyBaseString_Type,         /* tp_base */
9047     0,                  /* tp_dict */
9048     0,                  /* tp_descr_get */
9049     0,                  /* tp_descr_set */
9050     0,                  /* tp_dictoffset */
9051     0,                  /* tp_init */
9052     0,                  /* tp_alloc */
9053     unicode_new,            /* tp_new */
9054     PyObject_Del,           /* tp_free */
9055 };
9056
9057 /* Initialize the Unicode implementation */
9058
9059 void _PyUnicode_Init(void)
9060 {
9061     int i;
9062
9063     /* XXX - move this array to unicodectype.c ? */
9064     Py_UNICODE linebreak[] = {
9065         0x000A, /* LINE FEED */
9066         0x000D, /* CARRIAGE RETURN */
9067         0x001C, /* FILE SEPARATOR */
9068         0x001D, /* GROUP SEPARATOR */
9069         0x001E, /* RECORD SEPARATOR */
9070         0x0085, /* NEXT LINE */
9071         0x2028, /* LINE SEPARATOR */
9072         0x2029, /* PARAGRAPH SEPARATOR */
9073     };
9074
9075     /* Init the implementation */
9076     free_list = NULL;
9077     numfree = 0;
9078     unicode_empty = _PyUnicode_New(0);
9079     if (!unicode_empty)
9080         return;
9081
9082     strcpy(unicode_default_encoding, "ascii");
9083     for (i = 0; i < 256; i++)
9084         unicode_latin1[i] = NULL;
9085     if (PyType_Ready(&PyUnicode_Type) < 0)
9086         Py_FatalError("Can't initialize 'unicode'");
9087
9088     /* initialize the linebreak bloom filter */
9089     bloom_linebreak = make_bloom_mask(
9090         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9091         );
9092
9093     PyType_Ready(&EncodingMapType);
9094 }
9095
9096 /* Finalize the Unicode implementation */
9097
9098 int
9099 PyUnicode_ClearFreeList(void)
9100 {
9101     int freelist_size = numfree;
9102     PyUnicodeObject *u;
9103
9104     for (u = free_list; u != NULL;) {
9105         PyUnicodeObject *v = u;
9106         u = *(PyUnicodeObject **)u;
9107         if (v->str)
9108             PyObject_DEL(v->str);
9109         Py_XDECREF(v->defenc);
9110         PyObject_Del(v);
9111         numfree--;
9112     }
9113     free_list = NULL;
9114     assert(numfree == 0);
9115     return freelist_size;
9116 }
9117
9118 void
9119 _PyUnicode_Fini(void)
9120 {
9121     int i;
9122
9123     Py_XDECREF(unicode_empty);
9124     unicode_empty = NULL;
9125
9126     for (i = 0; i < 256; i++) {
9127         if (unicode_latin1[i]) {
9128             Py_DECREF(unicode_latin1[i]);
9129             unicode_latin1[i] = NULL;
9130         }
9131     }
9132     (void)PyUnicode_ClearFreeList();
9133 }
9134
9135 #ifdef __cplusplus
9136 }
9137 #endif
9138
9139
9140 /*
9141   Local variables:
9142   c-basic-offset: 4
9143   indent-tabs-mode: nil
9144   End:
9145 */