Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15     Copyright (c) 1999 by Secret Labs AB
  16     Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117         0, 0, 0, 0, 0, 0, 0, 0,
 118 //     case 0x0009: /* HORIZONTAL TABULATION */
 119 //     case 0x000A: /* LINE FEED */
 120 //     case 0x000B: /* VERTICAL TABULATION */
 121 //     case 0x000C: /* FORM FEED */
 122 //     case 0x000D: /* CARRIAGE RETURN */
 123         0, 1, 1, 1, 1, 1, 0, 0,
 124         0, 0, 0, 0, 0, 0, 0, 0,
 125 //     case 0x001C: /* FILE SEPARATOR */
 126 //     case 0x001D: /* GROUP SEPARATOR */
 127 //     case 0x001E: /* RECORD SEPARATOR */
 128 //     case 0x001F: /* UNIT SEPARATOR */
 129         0, 0, 0, 0, 1, 1, 1, 1,
 130 //     case 0x0020: /* SPACE */
 131         1, 0, 0, 0, 0, 0, 0, 0,
 132         0, 0, 0, 0, 0, 0, 0, 0,
 133         0, 0, 0, 0, 0, 0, 0, 0,
 134         0, 0, 0, 0, 0, 0, 0, 0,
 135
 136         0, 0, 0, 0, 0, 0, 0, 0,
 137         0, 0, 0, 0, 0, 0, 0, 0,
 138         0, 0, 0, 0, 0, 0, 0, 0,
 139         0, 0, 0, 0, 0, 0, 0, 0,
 140         0, 0, 0, 0, 0, 0, 0, 0,
 141         0, 0, 0, 0, 0, 0, 0, 0,
 142         0, 0, 0, 0, 0, 0, 0, 0,
 143         0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148         0, 0, 0, 0, 0, 0, 0, 0,
 149 //         0x000A, /* LINE FEED */
 150 //         0x000D, /* CARRIAGE RETURN */
 151         0, 0, 1, 0, 0, 1, 0, 0,
 152         0, 0, 0, 0, 0, 0, 0, 0,
 153 //         0x001C, /* FILE SEPARATOR */
 154 //         0x001D, /* GROUP SEPARATOR */
 155 //         0x001E, /* RECORD SEPARATOR */
 156         0, 0, 0, 0, 1, 1, 1, 0,
 157         0, 0, 0, 0, 0, 0, 0, 0,
 158         0, 0, 0, 0, 0, 0, 0, 0,
 159         0, 0, 0, 0, 0, 0, 0, 0,
 160         0, 0, 0, 0, 0, 0, 0, 0,
 161
 162         0, 0, 0, 0, 0, 0, 0, 0,
 163         0, 0, 0, 0, 0, 0, 0, 0,
 164         0, 0, 0, 0, 0, 0, 0, 0,
 165         0, 0, 0, 0, 0, 0, 0, 0,
 166         0, 0, 0, 0, 0, 0, 0, 0,
 167         0, 0, 0, 0, 0, 0, 0, 0,
 168         0, 0, 0, 0, 0, 0, 0, 0,
 169         0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177         return 0x10FFFF;
 178 #else
 179         /* This is actually an illegal character, so it should
 180            not be passed to unichr. */
 181         return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
 187 /* stuff to implement simple "bloom filters" for Unicode characters.
 188    to keep things simple, we use a single bitmask, using the least 5
 189    bits from each unicode characters as the bit index. */
 190
 191 /* the linebreak mask is set up by Unicode_Init below */
 192
 193 #define BLOOM_MASK unsigned long
 194
 195 static BLOOM_MASK bloom_linebreak;
 196
 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199 #define BLOOM_LINEBREAK(ch) \
 200     ((ch) < 128U ? ascii_linebreak[(ch)] : \
 201     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204 {
 205     /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207     long mask;
 208     Py_ssize_t i;
 209
 210     mask = 0;
 211     for (i = 0; i < len; i++)
 212         mask |= (1 << (ptr[i] & 0x1F));
 213
 214     return mask;
 215 }
 216
 217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218 {
 219     Py_ssize_t i;
 220
 221     for (i = 0; i < setlen; i++)
 222         if (set[i] == chr)
 223             return 1;
 224
 225     return 0;
 226 }
 227
 228 #define BLOOM_MEMBER(mask, chr, set, setlen)\
 229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231 /* --- Unicode Object ----------------------------------------------------- */
 232
 233 static
 234 int unicode_resize(register PyUnicodeObject *unicode,
 235                       Py_ssize_t length)
 236 {
 237     void *oldstr;
 238
 239     /* Shortcut if there's nothing much to do. */
 240     if (unicode->length == length)
 241         goto reset;
 242
 243     /* Resizing shared object (unicode_empty or single character
 244        objects) in-place is not allowed. Use PyUnicode_Resize()
 245        instead ! */
 246
 247     if (unicode == unicode_empty ||
 248         (unicode->length == 1 &&
 249          unicode->str[0] < 256U &&
 250          unicode_latin1[unicode->str[0]] == unicode)) {
 251         PyErr_SetString(PyExc_SystemError,
 252                         "can't resize shared unicode objects");
 253         return -1;
 254     }
 255
 256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257        The overallocation is also used by fastsearch, which assumes that it's
 258        safe to look at str[length] (without making any assumptions about what
 259        it contains). */
 260
 261     oldstr = unicode->str;
 262     unicode->str = PyObject_REALLOC(unicode->str,
 263                                     sizeof(Py_UNICODE) * (length + 1));
 264     if (!unicode->str) {
 265         unicode->str = (Py_UNICODE *)oldstr;
 266         PyErr_NoMemory();
 267         return -1;
 268     }
 269     unicode->str[length] = 0;
 270     unicode->length = length;
 271
 272  reset:
 273     /* Reset the object caches */
 274     if (unicode->defenc) {
 275         Py_DECREF(unicode->defenc);
 276         unicode->defenc = NULL;
 277     }
 278     unicode->hash = -1;
 279
 280     return 0;
 281 }
 282
 283 /* We allocate one more byte to make sure the string is
 284    Ux0000 terminated -- XXX is this needed ?
 285
 286    XXX This allocator could further be enhanced by assuring that the
 287        free list never reduces its size below 1.
 288
 289 */
 290
 291 static
 292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293 {
 294     register PyUnicodeObject *unicode;
 295
 296     /* Optimization for empty strings */
 297     if (length == 0 && unicode_empty != NULL) {
 298         Py_INCREF(unicode_empty);
 299         return unicode_empty;
 300     }
 301
 302     /* Unicode freelist & memory allocation */
 303     if (free_list) {
 304         unicode = free_list;
 305         free_list = *(PyUnicodeObject **)unicode;
 306         numfree--;
 307         if (unicode->str) {
 308             /* Keep-Alive optimization: we only upsize the buffer,
 309                never downsize it. */
 310             if ((unicode->length < length) &&
 311                 unicode_resize(unicode, length) < 0) {
 312                 PyObject_DEL(unicode->str);
 313                 goto onError;
 314             }
 315         }
 316         else {
 317             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 318             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 319         }
 320         PyObject_INIT(unicode, &PyUnicode_Type);
 321     }
 322     else {
 323         size_t new_size;
 324         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 325         if (unicode == NULL)
 326             return NULL;
 327         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 328         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 329     }
 330
 331     if (!unicode->str) {
 332         PyErr_NoMemory();
 333         goto onError;
 334     }
 335     /* Initialize the first element to guard against cases where
 336      * the caller fails before initializing str -- unicode_resize()
 337      * reads str[0], and the Keep-Alive optimization can keep memory
 338      * allocated for str alive across a call to unicode_dealloc(unicode).
 339      * We don't want unicode_resize to read uninitialized memory in
 340      * that case.
 341      */
 342     unicode->str[0] = 0;
 343     unicode->str[length] = 0;
 344     unicode->length = length;
 345     unicode->hash = -1;
 346     unicode->defenc = NULL;
 347     return unicode;
 348
 349  onError:
 350     _Py_ForgetReference((PyObject *)unicode);
 351     PyObject_Del(unicode);
 352     return NULL;
 353 }
 354
 355 static
 356 void unicode_dealloc(register PyUnicodeObject *unicode)
 357 {
 358     if (PyUnicode_CheckExact(unicode) &&
 359         numfree < PyUnicode_MAXFREELIST) {
 360         /* Keep-Alive optimization */
 361         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 362             PyObject_DEL(unicode->str);
 363             unicode->str = NULL;
 364             unicode->length = 0;
 365         }
 366         if (unicode->defenc) {
 367             Py_DECREF(unicode->defenc);
 368             unicode->defenc = NULL;
 369         }
 370         /* Add to free list */
 371         *(PyUnicodeObject **)unicode = free_list;
 372         free_list = unicode;
 373         numfree++;
 374     }
 375     else {
 376         PyObject_DEL(unicode->str);
 377         Py_XDECREF(unicode->defenc);
 378         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 379     }
 380 }
 381
 382 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 383 {
 384     register PyUnicodeObject *v;
 385
 386     /* Argument checks */
 387     if (unicode == NULL) {
 388         PyErr_BadInternalCall();
 389         return -1;
 390     }
 391     v = (PyUnicodeObject *)*unicode;
 392     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 393         PyErr_BadInternalCall();
 394         return -1;
 395     }
 396
 397     /* Resizing unicode_empty and single character objects is not
 398        possible since these are being shared. We simply return a fresh
 399        copy with the same Unicode content. */
 400     if (v->length != length &&
 401         (v == unicode_empty || v->length == 1)) {
 402         PyUnicodeObject *w = _PyUnicode_New(length);
 403         if (w == NULL)
 404             return -1;
 405         Py_UNICODE_COPY(w->str, v->str,
 406                         length < v->length ? length : v->length);
 407         Py_DECREF(*unicode);
 408         *unicode = (PyObject *)w;
 409         return 0;
 410     }
 411
 412     /* Note that we don't have to modify *unicode for unshared Unicode
 413        objects, since we can modify them in-place. */
 414     return unicode_resize(v, length);
 415 }
 416
 417 /* Internal API for use in unicodeobject.c only ! */
 418 #define _PyUnicode_Resize(unicodevar, length) \
 419         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 420
 421 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 422                                 Py_ssize_t size)
 423 {
 424     PyUnicodeObject *unicode;
 425
 426     /* If the Unicode data is known at construction time, we can apply
 427        some optimizations which share commonly used objects. */
 428     if (u != NULL) {
 429
 430         /* Optimization for empty strings */
 431         if (size == 0 && unicode_empty != NULL) {
 432             Py_INCREF(unicode_empty);
 433             return (PyObject *)unicode_empty;
 434         }
 435
 436         /* Single character Unicode objects in the Latin-1 range are
 437            shared when using this constructor */
 438         if (size == 1 && *u < 256) {
 439             unicode = unicode_latin1[*u];
 440             if (!unicode) {
 441                 unicode = _PyUnicode_New(1);
 442                 if (!unicode)
 443                     return NULL;
 444                 unicode->str[0] = *u;
 445                 unicode_latin1[*u] = unicode;
 446             }
 447             Py_INCREF(unicode);
 448             return (PyObject *)unicode;
 449         }
 450     }
 451
 452     unicode = _PyUnicode_New(size);
 453     if (!unicode)
 454         return NULL;
 455
 456     /* Copy the Unicode data into the new object */
 457     if (u != NULL)
 458         Py_UNICODE_COPY(unicode->str, u, size);
 459
 460     return (PyObject *)unicode;
 461 }
 462
 463 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 464 {
 465     PyUnicodeObject *unicode;
 466
 467         if (size < 0) {
 468                 PyErr_SetString(PyExc_SystemError,
 469                     "Negative size passed to PyUnicode_FromStringAndSize");
 470                 return NULL;
 471         }
 472
 473     /* If the Unicode data is known at construction time, we can apply
 474        some optimizations which share commonly used objects.
 475        Also, this means the input must be UTF-8, so fall back to the
 476        UTF-8 decoder at the end. */
 477     if (u != NULL) {
 478
 479         /* Optimization for empty strings */
 480         if (size == 0 && unicode_empty != NULL) {
 481             Py_INCREF(unicode_empty);
 482             return (PyObject *)unicode_empty;
 483         }
 484
 485         /* Single characters are shared when using this constructor.
 486            Restrict to ASCII, since the input must be UTF-8. */
 487         if (size == 1 && Py_CHARMASK(*u) < 128) {
 488             unicode = unicode_latin1[Py_CHARMASK(*u)];
 489             if (!unicode) {
 490                 unicode = _PyUnicode_New(1);
 491                 if (!unicode)
 492                     return NULL;
 493                 unicode->str[0] = Py_CHARMASK(*u);
 494                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 495             }
 496             Py_INCREF(unicode);
 497             return (PyObject *)unicode;
 498         }
 499
 500         return PyUnicode_DecodeUTF8(u, size, NULL);
 501     }
 502
 503     unicode = _PyUnicode_New(size);
 504     if (!unicode)
 505         return NULL;
 506
 507     return (PyObject *)unicode;
 508 }
 509
 510 PyObject *PyUnicode_FromString(const char *u)
 511 {
 512     size_t size = strlen(u);
 513     if (size > PY_SSIZE_T_MAX) {
 514         PyErr_SetString(PyExc_OverflowError, "input too long");
 515         return NULL;
 516     }
 517
 518     return PyUnicode_FromStringAndSize(u, size);
 519 }
 520
 521 #ifdef HAVE_WCHAR_H
 522
 523 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 524                                  Py_ssize_t size)
 525 {
 526     PyUnicodeObject *unicode;
 527
 528     if (w == NULL) {
 529         PyErr_BadInternalCall();
 530         return NULL;
 531     }
 532
 533     unicode = _PyUnicode_New(size);
 534     if (!unicode)
 535         return NULL;
 536
 537     /* Copy the wchar_t data into the new object */
 538 #ifdef HAVE_USABLE_WCHAR_T
 539     memcpy(unicode->str, w, size * sizeof(wchar_t));
 540 #else
 541     {
 542         register Py_UNICODE *u;
 543         register Py_ssize_t i;
 544         u = PyUnicode_AS_UNICODE(unicode);
 545         for (i = size; i > 0; i--)
 546             *u++ = *w++;
 547     }
 548 #endif
 549
 550     return (PyObject *)unicode;
 551 }
 552
 553 static void
 554 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 555 {
 556         *fmt++ = '%';
 557         if (width) {
 558                 if (zeropad)
 559                         *fmt++ = '0';
 560                 fmt += sprintf(fmt, "%d", width);
 561         }
 562         if (precision)
 563                 fmt += sprintf(fmt, ".%d", precision);
 564         if (longflag)
 565                 *fmt++ = 'l';
 566         else if (size_tflag) {
 567                 char *f = PY_FORMAT_SIZE_T;
 568                 while (*f)
 569                         *fmt++ = *f++;
 570         }
 571         *fmt++ = c;
 572         *fmt = '\0';
 573 }
 574
 575 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 576
 577 PyObject *
 578 PyUnicode_FromFormatV(const char *format, va_list vargs)
 579 {
 580         va_list count;
 581         Py_ssize_t callcount = 0;
 582         PyObject **callresults = NULL;
 583         PyObject **callresult = NULL;
 584         Py_ssize_t n = 0;
 585         int width = 0;
 586         int precision = 0;
 587         int zeropad;
 588         const char* f;
 589         Py_UNICODE *s;
 590         PyObject *string;
 591         /* used by sprintf */
 592         char buffer[21];
 593         /* use abuffer instead of buffer, if we need more space
 594          * (which can happen if there's a format specifier with width). */
 595         char *abuffer = NULL;
 596         char *realbuffer;
 597         Py_ssize_t abuffersize = 0;
 598         char fmt[60]; /* should be enough for %0width.precisionld */
 599         const char *copy;
 600
 601 #ifdef VA_LIST_IS_ARRAY
 602         Py_MEMCPY(count, vargs, sizeof(va_list));
 603 #else
 604 #ifdef  __va_copy
 605         __va_copy(count, vargs);
 606 #else
 607         count = vargs;
 608 #endif
 609 #endif
 610         /* step 1: count the number of %S/%R format specifications
 611          * (we call PyObject_Str()/PyObject_Repr() for these objects
 612          * once during step 3 and put the result in an array) */
 613         for (f = format; *f; f++) {
 614                 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
 615                         ++callcount;
 616         }
 617         /* step 2: allocate memory for the results of
 618          * PyObject_Str()/PyObject_Repr() calls */
 619         if (callcount) {
 620                 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 621                 if (!callresults) {
 622                         PyErr_NoMemory();
 623                         return NULL;
 624                 }
 625                 callresult = callresults;
 626         }
 627         /* step 3: figure out how large a buffer we need */
 628         for (f = format; *f; f++) {
 629                 if (*f == '%') {
 630                         const char* p = f;
 631                         width = 0;
 632                         while (isdigit((unsigned)*f))
 633                                 width = (width*10) + *f++ - '0';
 634                         while (*++f && *f != '%' && !isalpha((unsigned)*f))
 635                                 ;
 636
 637                         /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 638                          * they don't affect the amount of space we reserve.
 639                          */
 640                         if ((*f == 'l' || *f == 'z') &&
 641                                         (f[1] == 'd' || f[1] == 'u'))
 642                                 ++f;
 643
 644                         switch (*f) {
 645                         case 'c':
 646                                 (void)va_arg(count, int);
 647                                 /* fall through... */
 648                         case '%':
 649                                 n++;
 650                                 break;
 651                         case 'd': case 'u': case 'i': case 'x':
 652                                 (void) va_arg(count, int);
 653                                 /* 20 bytes is enough to hold a 64-bit
 654                                    integer.  Decimal takes the most space.
 655                                    This isn't enough for octal.
 656                                    If a width is specified we need more
 657                                    (which we allocate later). */
 658                                 if (width < 20)
 659                                         width = 20;
 660                                 n += width;
 661                                 if (abuffersize < width)
 662                                         abuffersize = width;
 663                                 break;
 664                         case 's':
 665                         {
 666                                 /* UTF-8 */
 667                                 unsigned char*s;
 668                                 s = va_arg(count, unsigned char*);
 669                                 while (*s) {
 670                                         if (*s < 128) {
 671                                                 n++; s++;
 672                                         } else if (*s < 0xc0) {
 673                                                 /* invalid UTF-8 */
 674                                                 n++; s++;
 675                                         } else if (*s < 0xc0) {
 676                                                 n++;
 677                                                 s++; if(!*s)break;
 678                                                 s++;
 679                                         } else if (*s < 0xe0) {
 680                                                 n++;
 681                                                 s++; if(!*s)break;
 682                                                 s++; if(!*s)break;
 683                                                 s++;
 684                                         } else {
 685                                                 #ifdef Py_UNICODE_WIDE
 686                                                 n++;
 687                                                 #else
 688                                                 n+=2;
 689                                                 #endif
 690                                                 s++; if(!*s)break;
 691                                                 s++; if(!*s)break;
 692                                                 s++; if(!*s)break;
 693                                                 s++;
 694                                         }
 695                                 }
 696                                 break;
 697                         }
 698                         case 'U':
 699                         {
 700                                 PyObject *obj = va_arg(count, PyObject *);
 701                                 assert(obj && PyUnicode_Check(obj));
 702                                 n += PyUnicode_GET_SIZE(obj);
 703                                 break;
 704                         }
 705                         case 'V':
 706                         {
 707                                 PyObject *obj = va_arg(count, PyObject *);
 708                                 const char *str = va_arg(count, const char *);
 709                                 assert(obj || str);
 710                                 assert(!obj || PyUnicode_Check(obj));
 711                                 if (obj)
 712                                         n += PyUnicode_GET_SIZE(obj);
 713                                 else
 714                                         n += strlen(str);
 715                                 break;
 716                         }
 717                         case 'S':
 718                         {
 719                                 PyObject *obj = va_arg(count, PyObject *);
 720                                 PyObject *str;
 721                                 assert(obj);
 722                                 str = PyObject_Str(obj);
 723                                 if (!str)
 724                                         goto fail;
 725                                 n += PyUnicode_GET_SIZE(str);
 726                                 /* Remember the str and switch to the next slot */
 727                                 *callresult++ = str;
 728                                 break;
 729                         }
 730                         case 'R':
 731                         {
 732                                 PyObject *obj = va_arg(count, PyObject *);
 733                                 PyObject *repr;
 734                                 assert(obj);
 735                                 repr = PyObject_Repr(obj);
 736                                 if (!repr)
 737                                         goto fail;
 738                                 n += PyUnicode_GET_SIZE(repr);
 739                                 /* Remember the repr and switch to the next slot */
 740                                 *callresult++ = repr;
 741                                 break;
 742                         }
 743                         case 'p':
 744                                 (void) va_arg(count, int);
 745                                 /* maximum 64-bit pointer representation:
 746                                  * 0xffffffffffffffff
 747                                  * so 19 characters is enough.
 748                                  * XXX I count 18 -- what's the extra for?
 749                                  */
 750                                 n += 19;
 751                                 break;
 752                         default:
 753                                 /* if we stumble upon an unknown
 754                                    formatting code, copy the rest of
 755                                    the format string to the output
 756                                    string. (we cannot just skip the
 757                                    code, since there's no way to know
 758                                    what's in the argument list) */
 759                                 n += strlen(p);
 760                                 goto expand;
 761                         }
 762                 } else
 763                         n++;
 764         }
 765  expand:
 766         if (abuffersize > 20) {
 767                 abuffer = PyObject_Malloc(abuffersize);
 768                 if (!abuffer) {
 769                         PyErr_NoMemory();
 770                         goto fail;
 771                 }
 772                 realbuffer = abuffer;
 773         }
 774         else
 775                 realbuffer = buffer;
 776         /* step 4: fill the buffer */
 777         /* Since we've analyzed how much space we need for the worst case,
 778            we don't have to resize the string.
 779            There can be no errors beyond this point. */
 780         string = PyUnicode_FromUnicode(NULL, n);
 781         if (!string)
 782                 goto fail;
 783
 784         s = PyUnicode_AS_UNICODE(string);
 785         callresult = callresults;
 786
 787         for (f = format; *f; f++) {
 788                 if (*f == '%') {
 789                         const char* p = f++;
 790                         int longflag = 0;
 791                         int size_tflag = 0;
 792                         zeropad = (*f == '0');
 793                         /* parse the width.precision part */
 794                         width = 0;
 795                         while (isdigit((unsigned)*f))
 796                                 width = (width*10) + *f++ - '0';
 797                         precision = 0;
 798                         if (*f == '.') {
 799                                 f++;
 800                                 while (isdigit((unsigned)*f))
 801                                         precision = (precision*10) + *f++ - '0';
 802                         }
 803                         /* handle the long flag, but only for %ld and %lu.
 804                            others can be added when necessary. */
 805                         if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 806                                 longflag = 1;
 807                                 ++f;
 808                         }
 809                         /* handle the size_t flag. */
 810                         if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 811                                 size_tflag = 1;
 812                                 ++f;
 813                         }
 814
 815                         switch (*f) {
 816                         case 'c':
 817                                 *s++ = va_arg(vargs, int);
 818                                 break;
 819                         case 'd':
 820                                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 821                                 if (longflag)
 822                                         sprintf(realbuffer, fmt, va_arg(vargs, long));
 823                                 else if (size_tflag)
 824                                         sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 825                                 else
 826                                         sprintf(realbuffer, fmt, va_arg(vargs, int));
 827                                 appendstring(realbuffer);
 828                                 break;
 829                         case 'u':
 830                                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 831                                 if (longflag)
 832                                         sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 833                                 else if (size_tflag)
 834                                         sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 835                                 else
 836                                         sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 837                                 appendstring(realbuffer);
 838                                 break;
 839                         case 'i':
 840                                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 841                                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 842                                 appendstring(realbuffer);
 843                                 break;
 844                         case 'x':
 845                                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 846                                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 847                                 appendstring(realbuffer);
 848                                 break;
 849                         case 's':
 850                         {
 851                                 /* Parameter must be UTF-8 encoded.
 852                                    In case of encoding errors, use
 853                                    the replacement character. */
 854                                 PyObject *u;
 855                                 p = va_arg(vargs, char*);
 856                                 u = PyUnicode_DecodeUTF8(p, strlen(p),
 857                                                          "replace");
 858                                 if (!u)
 859                                         goto fail;
 860                                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
 861                                                 PyUnicode_GET_SIZE(u));
 862                                 s += PyUnicode_GET_SIZE(u);
 863                                 Py_DECREF(u);
 864                                 break;
 865                         }
 866                         case 'U':
 867                         {
 868                                 PyObject *obj = va_arg(vargs, PyObject *);
 869                                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 870                                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 871                                 s += size;
 872                                 break;
 873                         }
 874                         case 'V':
 875                         {
 876                                 PyObject *obj = va_arg(vargs, PyObject *);
 877                                 const char *str = va_arg(vargs, const char *);
 878                                 if (obj) {
 879                                         Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 880                                         Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 881                                         s += size;
 882                                 } else {
 883                                         appendstring(str);
 884                                 }
 885                                 break;
 886                         }
 887                         case 'S':
 888                         case 'R':
 889                         {
 890                                 Py_UNICODE *ucopy;
 891                                 Py_ssize_t usize;
 892                                 Py_ssize_t upos;
 893                                 /* unused, since we already have the result */
 894                                 (void) va_arg(vargs, PyObject *);
 895                                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 896                                 usize = PyUnicode_GET_SIZE(*callresult);
 897                                 for (upos = 0; upos<usize;)
 898                                         *s++ = ucopy[upos++];
 899                                 /* We're done with the unicode()/repr() => forget it */
 900                                 Py_DECREF(*callresult);
 901                                 /* switch to next unicode()/repr() result */
 902                                 ++callresult;
 903                                 break;
 904                         }
 905                         case 'p':
 906                                 sprintf(buffer, "%p", va_arg(vargs, void*));
 907                                 /* %p is ill-defined:  ensure leading 0x. */
 908                                 if (buffer[1] == 'X')
 909                                         buffer[1] = 'x';
 910                                 else if (buffer[1] != 'x') {
 911                                         memmove(buffer+2, buffer, strlen(buffer)+1);
 912                                         buffer[0] = '0';
 913                                         buffer[1] = 'x';
 914                                 }
 915                                 appendstring(buffer);
 916                                 break;
 917                         case '%':
 918                                 *s++ = '%';
 919                                 break;
 920                         default:
 921                                 appendstring(p);
 922                                 goto end;
 923                         }
 924                 } else
 925                         *s++ = *f;
 926         }
 927
 928  end:
 929         if (callresults)
 930                 PyObject_Free(callresults);
 931         if (abuffer)
 932                 PyObject_Free(abuffer);
 933         _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 934         return string;
 935  fail:
 936         if (callresults) {
 937                 PyObject **callresult2 = callresults;
 938                 while (callresult2 < callresult) {
 939                         Py_DECREF(*callresult2);
 940                         ++callresult2;
 941                 }
 942                 PyObject_Free(callresults);
 943         }
 944         if (abuffer)
 945                 PyObject_Free(abuffer);
 946         return NULL;
 947 }
 948
 949 #undef appendstring
 950
 951 PyObject *
 952 PyUnicode_FromFormat(const char *format, ...)
 953 {
 954         PyObject* ret;
 955         va_list vargs;
 956
 957 #ifdef HAVE_STDARG_PROTOTYPES
 958         va_start(vargs, format);
 959 #else
 960         va_start(vargs);
 961 #endif
 962         ret = PyUnicode_FromFormatV(format, vargs);
 963         va_end(vargs);
 964         return ret;
 965 }
 966
 967 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 968                                 wchar_t *w,
 969                                 Py_ssize_t size)
 970 {
 971     if (unicode == NULL) {
 972         PyErr_BadInternalCall();
 973         return -1;
 974     }
 975
 976     /* If possible, try to copy the 0-termination as well */
 977     if (size > PyUnicode_GET_SIZE(unicode))
 978         size = PyUnicode_GET_SIZE(unicode) + 1;
 979
 980 #ifdef HAVE_USABLE_WCHAR_T
 981     memcpy(w, unicode->str, size * sizeof(wchar_t));
 982 #else
 983     {
 984         register Py_UNICODE *u;
 985         register Py_ssize_t i;
 986         u = PyUnicode_AS_UNICODE(unicode);
 987         for (i = size; i > 0; i--)
 988             *w++ = *u++;
 989     }
 990 #endif
 991
 992     if (size > PyUnicode_GET_SIZE(unicode))
 993         return PyUnicode_GET_SIZE(unicode);
 994     else
 995     return size;
 996 }
 997
 998 #endif
 999
1000 PyObject *PyUnicode_FromOrdinal(int ordinal)
1001 {
1002     Py_UNICODE s[1];
1003
1004 #ifdef Py_UNICODE_WIDE
1005     if (ordinal < 0 || ordinal > 0x10ffff) {
1006         PyErr_SetString(PyExc_ValueError,
1007                         "unichr() arg not in range(0x110000) "
1008                         "(wide Python build)");
1009         return NULL;
1010     }
1011 #else
1012     if (ordinal < 0 || ordinal > 0xffff) {
1013         PyErr_SetString(PyExc_ValueError,
1014                         "unichr() arg not in range(0x10000) "
1015                         "(narrow Python build)");
1016         return NULL;
1017     }
1018 #endif
1019
1020     s[0] = (Py_UNICODE)ordinal;
1021     return PyUnicode_FromUnicode(s, 1);
1022 }
1023
1024 PyObject *PyUnicode_FromObject(register PyObject *obj)
1025 {
1026     /* XXX Perhaps we should make this API an alias of
1027            PyObject_Unicode() instead ?! */
1028     if (PyUnicode_CheckExact(obj)) {
1029         Py_INCREF(obj);
1030         return obj;
1031     }
1032     if (PyUnicode_Check(obj)) {
1033         /* For a Unicode subtype that's not a Unicode object,
1034            return a true Unicode object with the same data. */
1035         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1036                                      PyUnicode_GET_SIZE(obj));
1037     }
1038     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1039 }
1040
1041 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1042                                       const char *encoding,
1043                                       const char *errors)
1044 {
1045     const char *s = NULL;
1046     Py_ssize_t len;
1047     PyObject *v;
1048
1049     if (obj == NULL) {
1050         PyErr_BadInternalCall();
1051         return NULL;
1052     }
1053
1054 #if 0
1055     /* For b/w compatibility we also accept Unicode objects provided
1056        that no encodings is given and then redirect to
1057        PyObject_Unicode() which then applies the additional logic for
1058        Unicode subclasses.
1059
1060        NOTE: This API should really only be used for object which
1061              represent *encoded* Unicode !
1062
1063     */
1064         if (PyUnicode_Check(obj)) {
1065             if (encoding) {
1066                 PyErr_SetString(PyExc_TypeError,
1067                                 "decoding Unicode is not supported");
1068             return NULL;
1069             }
1070         return PyObject_Unicode(obj);
1071             }
1072 #else
1073     if (PyUnicode_Check(obj)) {
1074         PyErr_SetString(PyExc_TypeError,
1075                         "decoding Unicode is not supported");
1076         return NULL;
1077         }
1078 #endif
1079
1080     /* Coerce object */
1081     if (PyString_Check(obj)) {
1082             s = PyString_AS_STRING(obj);
1083             len = PyString_GET_SIZE(obj);
1084     }
1085     else if (PyByteArray_Check(obj)) {
1086         /* Python 2.x specific */
1087         PyErr_Format(PyExc_TypeError,
1088                      "decoding bytearray is not supported");
1089         return NULL;
1090     }
1091     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1092         /* Overwrite the error message with something more useful in
1093            case of a TypeError. */
1094         if (PyErr_ExceptionMatches(PyExc_TypeError))
1095         PyErr_Format(PyExc_TypeError,
1096                          "coercing to Unicode: need string or buffer, "
1097                          "%.80s found",
1098                      Py_TYPE(obj)->tp_name);
1099         goto onError;
1100     }
1101
1102     /* Convert to Unicode */
1103     if (len == 0) {
1104         Py_INCREF(unicode_empty);
1105         v = (PyObject *)unicode_empty;
1106     }
1107     else
1108         v = PyUnicode_Decode(s, len, encoding, errors);
1109
1110     return v;
1111
1112  onError:
1113     return NULL;
1114 }
1115
1116 PyObject *PyUnicode_Decode(const char *s,
1117                            Py_ssize_t size,
1118                            const char *encoding,
1119                            const char *errors)
1120 {
1121     PyObject *buffer = NULL, *unicode;
1122
1123     if (encoding == NULL)
1124         encoding = PyUnicode_GetDefaultEncoding();
1125
1126     /* Shortcuts for common default encodings */
1127     if (strcmp(encoding, "utf-8") == 0)
1128         return PyUnicode_DecodeUTF8(s, size, errors);
1129     else if (strcmp(encoding, "latin-1") == 0)
1130         return PyUnicode_DecodeLatin1(s, size, errors);
1131 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132     else if (strcmp(encoding, "mbcs") == 0)
1133         return PyUnicode_DecodeMBCS(s, size, errors);
1134 #endif
1135     else if (strcmp(encoding, "ascii") == 0)
1136         return PyUnicode_DecodeASCII(s, size, errors);
1137
1138     /* Decode via the codec registry */
1139     buffer = PyBuffer_FromMemory((void *)s, size);
1140     if (buffer == NULL)
1141         goto onError;
1142     unicode = PyCodec_Decode(buffer, encoding, errors);
1143     if (unicode == NULL)
1144         goto onError;
1145     if (!PyUnicode_Check(unicode)) {
1146         PyErr_Format(PyExc_TypeError,
1147                      "decoder did not return an unicode object (type=%.400s)",
1148                      Py_TYPE(unicode)->tp_name);
1149         Py_DECREF(unicode);
1150         goto onError;
1151     }
1152     Py_DECREF(buffer);
1153     return unicode;
1154
1155  onError:
1156     Py_XDECREF(buffer);
1157     return NULL;
1158 }
1159
1160 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1161                                     const char *encoding,
1162                                     const char *errors)
1163 {
1164     PyObject *v;
1165
1166     if (!PyUnicode_Check(unicode)) {
1167         PyErr_BadArgument();
1168         goto onError;
1169     }
1170
1171     if (encoding == NULL)
1172         encoding = PyUnicode_GetDefaultEncoding();
1173
1174     /* Decode via the codec registry */
1175     v = PyCodec_Decode(unicode, encoding, errors);
1176     if (v == NULL)
1177         goto onError;
1178     return v;
1179
1180  onError:
1181     return NULL;
1182 }
1183
1184 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1185                            Py_ssize_t size,
1186                            const char *encoding,
1187                            const char *errors)
1188 {
1189     PyObject *v, *unicode;
1190
1191     unicode = PyUnicode_FromUnicode(s, size);
1192     if (unicode == NULL)
1193         return NULL;
1194     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1195     Py_DECREF(unicode);
1196     return v;
1197 }
1198
1199 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1200                                     const char *encoding,
1201                                     const char *errors)
1202 {
1203     PyObject *v;
1204
1205     if (!PyUnicode_Check(unicode)) {
1206         PyErr_BadArgument();
1207         goto onError;
1208     }
1209
1210     if (encoding == NULL)
1211         encoding = PyUnicode_GetDefaultEncoding();
1212
1213     /* Encode via the codec registry */
1214     v = PyCodec_Encode(unicode, encoding, errors);
1215     if (v == NULL)
1216         goto onError;
1217     return v;
1218
1219  onError:
1220     return NULL;
1221 }
1222
1223 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1224                                     const char *encoding,
1225                                     const char *errors)
1226 {
1227     PyObject *v;
1228
1229     if (!PyUnicode_Check(unicode)) {
1230         PyErr_BadArgument();
1231         goto onError;
1232     }
1233
1234     if (encoding == NULL)
1235         encoding = PyUnicode_GetDefaultEncoding();
1236
1237     /* Shortcuts for common default encodings */
1238     if (errors == NULL) {
1239         if (strcmp(encoding, "utf-8") == 0)
1240             return PyUnicode_AsUTF8String(unicode);
1241         else if (strcmp(encoding, "latin-1") == 0)
1242             return PyUnicode_AsLatin1String(unicode);
1243 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1244         else if (strcmp(encoding, "mbcs") == 0)
1245             return PyUnicode_AsMBCSString(unicode);
1246 #endif
1247         else if (strcmp(encoding, "ascii") == 0)
1248             return PyUnicode_AsASCIIString(unicode);
1249     }
1250
1251     /* Encode via the codec registry */
1252     v = PyCodec_Encode(unicode, encoding, errors);
1253     if (v == NULL)
1254         goto onError;
1255     if (!PyString_Check(v)) {
1256         PyErr_Format(PyExc_TypeError,
1257                      "encoder did not return a string object (type=%.400s)",
1258                      Py_TYPE(v)->tp_name);
1259         Py_DECREF(v);
1260         goto onError;
1261     }
1262     return v;
1263
1264  onError:
1265     return NULL;
1266 }
1267
1268 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1269                                             const char *errors)
1270 {
1271     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1272
1273     if (v)
1274         return v;
1275     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1276     if (v && errors == NULL)
1277         ((PyUnicodeObject *)unicode)->defenc = v;
1278     return v;
1279 }
1280
1281 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1282 {
1283     if (!PyUnicode_Check(unicode)) {
1284         PyErr_BadArgument();
1285         goto onError;
1286     }
1287     return PyUnicode_AS_UNICODE(unicode);
1288
1289  onError:
1290     return NULL;
1291 }
1292
1293 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1294 {
1295     if (!PyUnicode_Check(unicode)) {
1296         PyErr_BadArgument();
1297         goto onError;
1298     }
1299     return PyUnicode_GET_SIZE(unicode);
1300
1301  onError:
1302     return -1;
1303 }
1304
1305 const char *PyUnicode_GetDefaultEncoding(void)
1306 {
1307     return unicode_default_encoding;
1308 }
1309
1310 int PyUnicode_SetDefaultEncoding(const char *encoding)
1311 {
1312     PyObject *v;
1313
1314     /* Make sure the encoding is valid. As side effect, this also
1315        loads the encoding into the codec registry cache. */
1316     v = _PyCodec_Lookup(encoding);
1317     if (v == NULL)
1318         goto onError;
1319     Py_DECREF(v);
1320     strncpy(unicode_default_encoding,
1321             encoding,
1322             sizeof(unicode_default_encoding));
1323     return 0;
1324
1325  onError:
1326     return -1;
1327 }
1328
1329 /* error handling callback helper:
1330    build arguments, call the callback and check the arguments,
1331    if no exception occurred, copy the replacement to the output
1332    and adjust various state variables.
1333    return 0 on success, -1 on error
1334 */
1335
1336 static
1337 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1338                  const char *encoding, const char *reason,
1339                  const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1340                  Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1341                  PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1342 {
1343     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1344
1345     PyObject *restuple = NULL;
1346     PyObject *repunicode = NULL;
1347     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1348     Py_ssize_t requiredsize;
1349     Py_ssize_t newpos;
1350     Py_UNICODE *repptr;
1351     Py_ssize_t repsize;
1352     int res = -1;
1353
1354     if (*errorHandler == NULL) {
1355         *errorHandler = PyCodec_LookupError(errors);
1356         if (*errorHandler == NULL)
1357            goto onError;
1358     }
1359
1360     if (*exceptionObject == NULL) {
1361         *exceptionObject = PyUnicodeDecodeError_Create(
1362             encoding, input, insize, *startinpos, *endinpos, reason);
1363         if (*exceptionObject == NULL)
1364            goto onError;
1365     }
1366     else {
1367         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1368             goto onError;
1369         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1370             goto onError;
1371         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1372             goto onError;
1373     }
1374
1375     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1376     if (restuple == NULL)
1377         goto onError;
1378     if (!PyTuple_Check(restuple)) {
1379         PyErr_Format(PyExc_TypeError, &argparse[4]);
1380         goto onError;
1381     }
1382     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1383         goto onError;
1384     if (newpos<0)
1385         newpos = insize+newpos;
1386     if (newpos<0 || newpos>insize) {
1387         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1388         goto onError;
1389     }
1390
1391     /* need more space? (at least enough for what we
1392        have+the replacement+the rest of the string (starting
1393        at the new input position), so we won't have to check space
1394        when there are no errors in the rest of the string) */
1395     repptr = PyUnicode_AS_UNICODE(repunicode);
1396     repsize = PyUnicode_GET_SIZE(repunicode);
1397     requiredsize = *outpos + repsize + insize-newpos;
1398     if (requiredsize > outsize) {
1399         if (requiredsize<2*outsize)
1400             requiredsize = 2*outsize;
1401         if (PyUnicode_Resize(output, requiredsize) < 0)
1402             goto onError;
1403         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1404     }
1405     *endinpos = newpos;
1406     *inptr = input + newpos;
1407     Py_UNICODE_COPY(*outptr, repptr, repsize);
1408     *outptr += repsize;
1409     *outpos += repsize;
1410     /* we made it! */
1411     res = 0;
1412
1413     onError:
1414     Py_XDECREF(restuple);
1415     return res;
1416 }
1417
1418 /* --- UTF-7 Codec -------------------------------------------------------- */
1419
1420 /* see RFC2152 for details */
1421
/* Per-character classification for the 128 ASCII code points, telling
   whether a character is special, i.e. cannot be directly encoded in
   UTF-7:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1440
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* SPECIAL(c, encodeO, encodeWS): true when c may not be written
   directly.  That is the case when c lies outside 1..127, when
   utf7_special[c] is 1, when it is 2 (whitespace) and encodeWS is set,
   or when it is 3 (Set O) and encodeO is set.  The argument c is
   evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): non-zero when c is alphanumeric (per isalnum()), '+' or '/'. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): six-bit value of the base64 character c: '+' -> 62,
   '/' -> 63, 'a'.. -> c - 71, 'A'.. -> c - 65, anything else -> c + 4
   (which maps the digits '0'..'9' to 52..61).  No validation is done;
   callers are expected to have checked c first. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the base64 character for the topmost six through out (advancing
   it) and reduce bits by six.  Expands to a bare while loop that modifies
   out and bits in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the topmost sixteen as one character and reduce
   bits by sixteen.  If surrogate is set, the character is dropped and the
   flag cleared.  If the character is in 0xDC00..0xDFFF, surrogate is set,
   errmsg is assigned and control jumps to utf7Error.  Otherwise the
   character is stored through out.  The expansion site must therefore
   provide a variable `errmsg` and a label `utf7Error`.
   NOTE(review): only the range 0xDC00..0xDFFF is tested here. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1483
1484 PyObject *PyUnicode_DecodeUTF7(const char *s,
1485                                Py_ssize_t size,
1486                                const char *errors)
1487 {
1488     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1489 }
1490
/* Decode `size` bytes of UTF-7 data at `s` into a new unicode object.

   s, size    input buffer and its length in bytes
   errors     error scheme name, handed to unicode_decode_call_errorhandler()
   consumed   if non-NULL, receives the number of input bytes accounted
              for; in that case input ending inside a shift sequence is
              not treated as an error

   The decoder is a small state machine: outside a shift sequence
   characters are copied directly, '+' opens a base64 shift sequence (or
   stands for a literal '+' when followed by '-'), and inside a shift
   sequence six bits per input character are accumulated in `charsleft`
   (with `bitsleft` counting them) and emitted sixteen at a time by the
   DECODE macro.

   Returns the decoded object, resized to the number of characters
   written, or NULL on failure. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start with room for one output character per input byte; the
       error handler helper resizes the object if a replacement needs
       more. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the end-of-input error handler below
           has moved s back inside the buffer. */
        restart:
        ch = *s;

        if (inShift) {
            /* '-' or any non-base64 character ends the shift sequence */
            if ((ch == '-') || !B64CHAR(ch)) {
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' itself is not output.  If the
                       next input character is also '-', a '-' is emitted
                       and the shift state is entered again. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                        goto utf7Error;
                } else  {
                    /* The terminating character is ordinary data */
                    *p++ = ch;
                }
            } else {
                /* Accumulate six more bits and emit any complete 16-bit
                   units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where this sequence began for error reporting */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+' */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
                goto utf7Error;
        }
        else {
            /* Directly encoded character */
            *p++ = ch;
            s++;
        }
        continue;
        /* Error path, reached only by goto (including from inside the
           DECODE macro).  The helper may replace the offending input,
           resize the output and reposition s and p; decoding then
           continues with the loop condition. */
    utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", errmsg,
             starts, size, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }

    /* Input ended inside a shift sequence.  This is an error only when
       the caller did not ask for the consumed count. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", "unterminated shift sequence",
             starts, size, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have chosen a resume position before the end */
        if (s < e)
           goto restart;
    }
    if (consumed) {
        /* When still inside a shift sequence, report startinpos rather
           than the full length.
           NOTE(review): startinpos is assigned only in the '+' and
           special-character branches above - presumably it holds the
           start of the open shift sequence here; confirm. */
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the output to the number of characters actually written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1636
1637
1638 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1639                    Py_ssize_t size,
1640                    int encodeSetO,
1641                    int encodeWhiteSpace,
1642                    const char *errors)
1643 {
1644     PyObject *v;
1645     /* It might be possible to tighten this worst case */
1646     Py_ssize_t cbAllocated = 5 * size;
1647     int inShift = 0;
1648     Py_ssize_t i = 0;
1649     unsigned int bitsleft = 0;
1650     unsigned long charsleft = 0;
1651     char * out;
1652     char * start;
1653
1654     if (size == 0)
1655                 return PyString_FromStringAndSize(NULL, 0);
1656
1657     v = PyString_FromStringAndSize(NULL, cbAllocated);
1658     if (v == NULL)
1659         return NULL;
1660
1661     start = out = PyString_AS_STRING(v);
1662     for (;i < size; ++i) {
1663         Py_UNICODE ch = s[i];
1664
1665         if (!inShift) {
1666             if (ch == '+') {
1667                 *out++ = '+';
1668                 *out++ = '-';
1669             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1670                 charsleft = ch;
1671                 bitsleft = 16;
1672                 *out++ = '+';
1673                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1674                 inShift = bitsleft > 0;
1675             } else {
1676                 *out++ = (char) ch;
1677             }
1678         } else {
1679             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1680                 *out++ = B64(charsleft << (6-bitsleft));
1681                 charsleft = 0;
1682                 bitsleft = 0;
1683                 /* Characters not in the BASE64 set implicitly unshift the sequence
1684                    so no '-' is required, except if the character is itself a '-' */
1685                 if (B64CHAR(ch) || ch == '-') {
1686                     *out++ = '-';
1687                 }
1688                 inShift = 0;
1689                 *out++ = (char) ch;
1690             } else {
1691                 bitsleft += 16;
1692                 charsleft = (charsleft << 16) | ch;
1693                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1694
1695                 /* If the next character is special then we dont' need to terminate
1696                    the shift sequence. If the next character is not a BASE64 character
1697                    or '-' then the shift sequence will be terminated implicitly and we
1698                    don't have to insert a '-'. */
1699
1700                 if (bitsleft == 0) {
1701                     if (i + 1 < size) {
1702                         Py_UNICODE ch2 = s[i+1];
1703
1704                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1705
1706                         } else if (B64CHAR(ch2) || ch2 == '-') {
1707                             *out++ = '-';
1708                             inShift = 0;
1709                         } else {
1710                             inShift = 0;
1711                         }
1712
1713                     }
1714                     else {
1715                         *out++ = '-';
1716                         inShift = 0;
1717                     }
1718                 }
1719             }
1720         }
1721     }
1722     if (bitsleft) {
1723         *out++= B64(charsleft << (6-bitsleft) );
1724         *out++ = '-';
1725     }
1726
1727     _PyString_Resize(&v, out - start);
1728     return v;
1729 }
1730
1731 #undef SPECIAL
1732 #undef B64
1733 #undef B64CHAR
1734 #undef UB64
1735 #undef ENCODE
1736 #undef DECODE
1737
1738 /* --- UTF-8 Codec -------------------------------------------------------- */
1739
/* Length in bytes of the UTF-8 sequence introduced by each possible
   prefix byte; zero means the byte is an illegal prefix.  See RFC 2279
   for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00-0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10-0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20-0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30-0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40-0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50-0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60-0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70-0x7F */
    /* 0x80-0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80-0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90-0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0-0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0-0xBF */
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0-0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0-0xDF */
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0-0xEF */
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six bytes;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0-0xFF */
};
1761
1762 PyObject *PyUnicode_DecodeUTF8(const char *s,
1763                                Py_ssize_t size,
1764                                const char *errors)
1765 {
1766     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1767 }
1768
1769 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1770                                         Py_ssize_t size,
1771                                         const char *errors,
1772                                         Py_ssize_t *consumed)
1773 {
1774     const char *starts = s;
1775     int n;
1776     Py_ssize_t startinpos;
1777     Py_ssize_t endinpos;
1778     Py_ssize_t outpos;
1779     const char *e;
1780     PyUnicodeObject *unicode;
1781     Py_UNICODE *p;
1782     const char *errmsg = "";
1783     PyObject *errorHandler = NULL;
1784     PyObject *exc = NULL;
1785
1786     /* Note: size will always be longer than the resulting Unicode
1787        character count */
1788     unicode = _PyUnicode_New(size);
1789     if (!unicode)
1790         return NULL;
1791     if (size == 0) {
1792         if (consumed)
1793             *consumed = 0;
1794         return (PyObject *)unicode;
1795     }
1796
1797     /* Unpack UTF-8 encoded data */
1798     p = unicode->str;
1799     e = s + size;
1800
1801     while (s < e) {
1802         Py_UCS4 ch = (unsigned char)*s;
1803
1804         if (ch < 0x80) {
1805             *p++ = (Py_UNICODE)ch;
1806             s++;
1807             continue;
1808         }
1809
1810         n = utf8_code_length[ch];
1811
1812         if (s + n > e) {
1813             if (consumed)
1814                 break;
1815             else {
1816                 errmsg = "unexpected end of data";
1817                 startinpos = s-starts;
1818                 endinpos = size;
1819                 goto utf8Error;
1820             }
1821         }
1822
1823         switch (n) {
1824
1825         case 0:
1826             errmsg = "unexpected code byte";
1827             startinpos = s-starts;
1828             endinpos = startinpos+1;
1829             goto utf8Error;
1830
1831         case 1:
1832             errmsg = "internal error";
1833             startinpos = s-starts;
1834             endinpos = startinpos+1;
1835             goto utf8Error;
1836
1837         case 2:
1838             if ((s[1] & 0xc0) != 0x80) {
1839                 errmsg = "invalid data";
1840                 startinpos = s-starts;
1841                 endinpos = startinpos+2;
1842                 goto utf8Error;
1843             }
1844             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1845             if (ch < 0x80) {
1846                 startinpos = s-starts;
1847                 endinpos = startinpos+2;
1848                 errmsg = "illegal encoding";
1849                 goto utf8Error;
1850             }
1851             else
1852                 *p++ = (Py_UNICODE)ch;
1853             break;
1854
1855         case 3:
1856             if ((s[1] & 0xc0) != 0x80 ||
1857                 (s[2] & 0xc0) != 0x80) {
1858                 errmsg = "invalid data";
1859                 startinpos = s-starts;
1860                 endinpos = startinpos+3;
1861                 goto utf8Error;
1862             }
1863             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1864             if (ch < 0x0800) {
1865                 /* Note: UTF-8 encodings of surrogates are considered
1866                    legal UTF-8 sequences;
1867
1868                    XXX For wide builds (UCS-4) we should probably try
1869                        to recombine the surrogates into a single code
1870                        unit.
1871                 */
1872                 errmsg = "illegal encoding";
1873                 startinpos = s-starts;
1874                 endinpos = startinpos+3;
1875                 goto utf8Error;
1876             }
1877             else
1878                 *p++ = (Py_UNICODE)ch;
1879             break;
1880
1881         case 4:
1882             if ((s[1] & 0xc0) != 0x80 ||
1883                 (s[2] & 0xc0) != 0x80 ||
1884                 (s[3] & 0xc0) != 0x80) {
1885                 errmsg = "invalid data";
1886                 startinpos = s-starts;
1887                 endinpos = startinpos+4;
1888                 goto utf8Error;
1889             }
1890             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1891                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1892             /* validate and convert to UTF-16 */
1893             if ((ch < 0x10000)        /* minimum value allowed for 4
1894                                          byte encoding */
1895                 || (ch > 0x10ffff))   /* maximum value allowed for
1896                                          UTF-16 */
1897             {
1898                 errmsg = "illegal encoding";
1899                 startinpos = s-starts;
1900                 endinpos = startinpos+4;
1901                 goto utf8Error;
1902             }
1903 #ifdef Py_UNICODE_WIDE
1904             *p++ = (Py_UNICODE)ch;
1905 #else
1906             /*  compute and append the two surrogates: */
1907
1908             /*  translate from 10000..10FFFF to 0..FFFF */
1909             ch -= 0x10000;
1910
1911             /*  high surrogate = top 10 bits added to D800 */
1912             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1913
1914             /*  low surrogate = bottom 10 bits added to DC00 */
1915             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1916 #endif
1917             break;
1918
1919         default:
1920             /* Other sizes are only needed for UCS-4 */
1921             errmsg = "unsupported Unicode code range";
1922             startinpos = s-starts;
1923             endinpos = startinpos+n;
1924             goto utf8Error;
1925         }
1926         s += n;
1927         continue;
1928
1929     utf8Error:
1930     outpos = p-PyUnicode_AS_UNICODE(unicode);
1931     if (unicode_decode_call_errorhandler(
1932              errors, &errorHandler,
1933              "utf8", errmsg,
1934              starts, size, &startinpos, &endinpos, &exc, &s,
1935              (PyObject **)&unicode, &outpos, &p))
1936         goto onError;
1937     }
1938     if (consumed)
1939         *consumed = s-starts;
1940
1941     /* Adjust length */
1942     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1943         goto onError;
1944
1945     Py_XDECREF(errorHandler);
1946     Py_XDECREF(exc);
1947     return (PyObject *)unicode;
1948
1949 onError:
1950     Py_XDECREF(errorHandler);
1951     Py_XDECREF(exc);
1952     Py_DECREF(unicode);
1953     return NULL;
1954 }
1955
1956 /* Allocation strategy:  if the string is short, convert into a stack buffer
1957    and allocate exactly as much space needed at the end.  Else allocate the
1958    maximum possible needed (4 result bytes per Unicode character), and return
1959    the excess memory at the end.
1960 */
1961 PyObject *
1962 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1963                      Py_ssize_t size,
1964                      const char *errors)
1965 {
1966 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1967
1968     Py_ssize_t i;           /* index into s of next input byte */
1969     PyObject *v;        /* result string object */
1970     char *p;            /* next free byte in output buffer */
1971     Py_ssize_t nallocated;  /* number of result bytes allocated */
1972     Py_ssize_t nneeded;        /* number of result bytes needed */
1973     char stackbuf[MAX_SHORT_UNICHARS * 4];
1974
1975     assert(s != NULL);
1976     assert(size >= 0);
1977
1978     if (size <= MAX_SHORT_UNICHARS) {
1979         /* Write into the stack buffer; nallocated can't overflow.
1980          * At the end, we'll allocate exactly as much heap space as it
1981          * turns out we need.
1982          */
1983         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1984         v = NULL;   /* will allocate after we're done */
1985         p = stackbuf;
1986     }
1987     else {
1988         /* Overallocate on the heap, and give the excess back at the end. */
1989         nallocated = size * 4;
1990         if (nallocated / 4 != size)  /* overflow! */
1991             return PyErr_NoMemory();
1992         v = PyString_FromStringAndSize(NULL, nallocated);
1993         if (v == NULL)
1994             return NULL;
1995         p = PyString_AS_STRING(v);
1996     }
1997
1998     for (i = 0; i < size;) {
1999         Py_UCS4 ch = s[i++];
2000
2001         if (ch < 0x80)
2002             /* Encode ASCII */
2003             *p++ = (char) ch;
2004
2005         else if (ch < 0x0800) {
2006             /* Encode Latin-1 */
2007             *p++ = (char)(0xc0 | (ch >> 6));
2008             *p++ = (char)(0x80 | (ch & 0x3f));
2009         }
2010         else {
2011             /* Encode UCS2 Unicode ordinals */
2012             if (ch < 0x10000) {
2013                 /* Special case: check for high surrogate */
2014                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2015                     Py_UCS4 ch2 = s[i];
2016                     /* Check for low surrogate and combine the two to
2017                        form a UCS4 value */
2018                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2019                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2020                         i++;
2021                         goto encodeUCS4;
2022                     }
2023                     /* Fall through: handles isolated high surrogates */
2024                 }
2025                 *p++ = (char)(0xe0 | (ch >> 12));
2026                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2027                 *p++ = (char)(0x80 | (ch & 0x3f));
2028                 continue;
2029             }
2030 encodeUCS4:
2031             /* Encode UCS4 Unicode ordinals */
2032             *p++ = (char)(0xf0 | (ch >> 18));
2033             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2034             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2035             *p++ = (char)(0x80 | (ch & 0x3f));
2036         }
2037     }
2038
2039     if (v == NULL) {
2040         /* This was stack allocated. */
2041         nneeded = p - stackbuf;
2042         assert(nneeded <= nallocated);
2043         v = PyString_FromStringAndSize(stackbuf, nneeded);
2044     }
2045     else {
2046         /* Cut back to size actually needed. */
2047         nneeded = p - PyString_AS_STRING(v);
2048         assert(nneeded <= nallocated);
2049         _PyString_Resize(&v, nneeded);
2050     }
2051     return v;
2052
2053 #undef MAX_SHORT_UNICHARS
2054 }
2055
2056 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2057 {
2058     if (!PyUnicode_Check(unicode)) {
2059         PyErr_BadArgument();
2060         return NULL;
2061     }
2062     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2063                                 PyUnicode_GET_SIZE(unicode),
2064                                 NULL);
2065 }
2066
2067 /* --- UTF-32 Codec ------------------------------------------------------- */
2068
2069 PyObject *
2070 PyUnicode_DecodeUTF32(const char *s,
2071                       Py_ssize_t size,
2072                       const char *errors,
2073                       int *byteorder)
2074 {
2075     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2076 }
2077
2078 PyObject *
2079 PyUnicode_DecodeUTF32Stateful(const char *s,
2080                               Py_ssize_t size,
2081                               const char *errors,
2082                               int *byteorder,
2083                               Py_ssize_t *consumed)
2084 {
2085     const char *starts = s;
2086     Py_ssize_t startinpos;
2087     Py_ssize_t endinpos;
2088     Py_ssize_t outpos;
2089     PyUnicodeObject *unicode;
2090     Py_UNICODE *p;
2091 #ifndef Py_UNICODE_WIDE
2092     int i, pairs;
2093 #else
2094     const int pairs = 0;
2095 #endif
2096     const unsigned char *q, *e;
2097     int bo = 0;       /* assume native ordering by default */
2098     const char *errmsg = "";
2099     /* Offsets from q for retrieving bytes in the right order. */
2100 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2101     int iorder[] = {0, 1, 2, 3};
2102 #else
2103     int iorder[] = {3, 2, 1, 0};
2104 #endif
2105     PyObject *errorHandler = NULL;
2106     PyObject *exc = NULL;
2107     /* On narrow builds we split characters outside the BMP into two
2108        codepoints => count how much extra space we need. */
2109 #ifndef Py_UNICODE_WIDE
2110     for (i = pairs = 0; i < size/4; i++)
2111         if (((Py_UCS4 *)s)[i] >= 0x10000)
2112             pairs++;
2113 #endif
2114
2115     /* This might be one to much, because of a BOM */
2116     unicode = _PyUnicode_New((size+3)/4+pairs);
2117     if (!unicode)
2118         return NULL;
2119     if (size == 0)
2120         return (PyObject *)unicode;
2121
2122     /* Unpack UTF-32 encoded data */
2123     p = unicode->str;
2124     q = (unsigned char *)s;
2125     e = q + size;
2126
2127     if (byteorder)
2128         bo = *byteorder;
2129
2130     /* Check for BOM marks (U+FEFF) in the input and adjust current
2131        byte order setting accordingly. In native mode, the leading BOM
2132        mark is skipped, in all other modes, it is copied to the output
2133        stream as-is (giving a ZWNBSP character). */
2134     if (bo == 0) {
2135         if (size >= 4) {
2136             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2137                                 (q[iorder[1]] << 8) | q[iorder[0]];
2138 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2139             if (bom == 0x0000FEFF) {
2140                 q += 4;
2141                 bo = -1;
2142             }
2143             else if (bom == 0xFFFE0000) {
2144                 q += 4;
2145                 bo = 1;
2146             }
2147 #else
2148             if (bom == 0x0000FEFF) {
2149                 q += 4;
2150                 bo = 1;
2151             }
2152             else if (bom == 0xFFFE0000) {
2153                 q += 4;
2154                 bo = -1;
2155             }
2156 #endif
2157         }
2158     }
2159
2160     if (bo == -1) {
2161         /* force LE */
2162         iorder[0] = 0;
2163         iorder[1] = 1;
2164         iorder[2] = 2;
2165         iorder[3] = 3;
2166     }
2167     else if (bo == 1) {
2168         /* force BE */
2169         iorder[0] = 3;
2170         iorder[1] = 2;
2171         iorder[2] = 1;
2172         iorder[3] = 0;
2173     }
2174
2175     while (q < e) {
2176         Py_UCS4 ch;
2177         /* remaining bytes at the end? (size should be divisible by 4) */
2178         if (e-q<4) {
2179             if (consumed)
2180                 break;
2181             errmsg = "truncated data";
2182             startinpos = ((const char *)q)-starts;
2183             endinpos = ((const char *)e)-starts;
2184             goto utf32Error;
2185             /* The remaining input chars are ignored if the callback
2186                chooses to skip the input */
2187         }
2188         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2189              (q[iorder[1]] << 8) | q[iorder[0]];
2190
2191         if (ch >= 0x110000)
2192         {
2193             errmsg = "codepoint not in range(0x110000)";
2194             startinpos = ((const char *)q)-starts;
2195             endinpos = startinpos+4;
2196             goto utf32Error;
2197         }
2198 #ifndef Py_UNICODE_WIDE
2199         if (ch >= 0x10000)
2200         {
2201             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2202             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2203         }
2204         else
2205 #endif
2206             *p++ = ch;
2207         q += 4;
2208         continue;
2209     utf32Error:
2210         outpos = p-PyUnicode_AS_UNICODE(unicode);
2211     if (unicode_decode_call_errorhandler(
2212          errors, &errorHandler,
2213          "utf32", errmsg,
2214          starts, size, &startinpos, &endinpos, &exc, &s,
2215          (PyObject **)&unicode, &outpos, &p))
2216             goto onError;
2217     }
2218
2219     if (byteorder)
2220         *byteorder = bo;
2221
2222     if (consumed)
2223         *consumed = (const char *)q-starts;
2224
2225     /* Adjust length */
2226     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2227         goto onError;
2228
2229     Py_XDECREF(errorHandler);
2230     Py_XDECREF(exc);
2231     return (PyObject *)unicode;
2232
2233 onError:
2234     Py_DECREF(unicode);
2235     Py_XDECREF(errorHandler);
2236     Py_XDECREF(exc);
2237     return NULL;
2238 }
2239
2240 PyObject *
2241 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2242                       Py_ssize_t size,
2243                       const char *errors,
2244                       int byteorder)
2245 {
2246     PyObject *v;
2247     unsigned char *p;
2248 #ifndef Py_UNICODE_WIDE
2249     int i, pairs;
2250 #else
2251     const int pairs = 0;
2252 #endif
2253     /* Offsets from p for storing byte pairs in the right order. */
2254 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2255     int iorder[] = {0, 1, 2, 3};
2256 #else
2257     int iorder[] = {3, 2, 1, 0};
2258 #endif
2259
2260 #define STORECHAR(CH)                       \
2261     do {                                    \
2262         p[iorder[3]] = ((CH) >> 24) & 0xff; \
2263         p[iorder[2]] = ((CH) >> 16) & 0xff; \
2264         p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2265         p[iorder[0]] = (CH) & 0xff;         \
2266         p += 4;                             \
2267     } while(0)
2268
2269     /* In narrow builds we can output surrogate pairs as one codepoint,
2270        so we need less space. */
2271 #ifndef Py_UNICODE_WIDE
2272     for (i = pairs = 0; i < size-1; i++)
2273         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2274             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2275             pairs++;
2276 #endif
2277     v = PyString_FromStringAndSize(NULL,
2278                   4 * (size - pairs + (byteorder == 0)));
2279     if (v == NULL)
2280         return NULL;
2281
2282     p = (unsigned char *)PyString_AS_STRING(v);
2283     if (byteorder == 0)
2284         STORECHAR(0xFEFF);
2285     if (size == 0)
2286         return v;
2287
2288     if (byteorder == -1) {
2289         /* force LE */
2290         iorder[0] = 0;
2291         iorder[1] = 1;
2292         iorder[2] = 2;
2293         iorder[3] = 3;
2294     }
2295     else if (byteorder == 1) {
2296         /* force BE */
2297         iorder[0] = 3;
2298         iorder[1] = 2;
2299         iorder[2] = 1;
2300         iorder[3] = 0;
2301     }
2302
2303     while (size-- > 0) {
2304         Py_UCS4 ch = *s++;
2305 #ifndef Py_UNICODE_WIDE
2306         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2307             Py_UCS4 ch2 = *s;
2308             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2309                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2310                 s++;
2311                 size--;
2312             }
2313         }
2314 #endif
2315         STORECHAR(ch);
2316     }
2317     return v;
2318 #undef STORECHAR
2319 }
2320
2321 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2322 {
2323     if (!PyUnicode_Check(unicode)) {
2324         PyErr_BadArgument();
2325         return NULL;
2326     }
2327     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2328                                  PyUnicode_GET_SIZE(unicode),
2329                                  NULL,
2330                                  0);
2331 }
2332
2333 /* --- UTF-16 Codec ------------------------------------------------------- */
2334
2335 PyObject *
2336 PyUnicode_DecodeUTF16(const char *s,
2337                       Py_ssize_t size,
2338                       const char *errors,
2339                       int *byteorder)
2340 {
2341     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2342 }
2343
2344 PyObject *
2345 PyUnicode_DecodeUTF16Stateful(const char *s,
2346                               Py_ssize_t size,
2347                               const char *errors,
2348                               int *byteorder,
2349                               Py_ssize_t *consumed)
2350 {
2351     const char *starts = s;
2352     Py_ssize_t startinpos;
2353     Py_ssize_t endinpos;
2354     Py_ssize_t outpos;
2355     PyUnicodeObject *unicode;
2356     Py_UNICODE *p;
2357     const unsigned char *q, *e;
2358     int bo = 0;       /* assume native ordering by default */
2359     const char *errmsg = "";
2360     /* Offsets from q for retrieving byte pairs in the right order. */
2361 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2362     int ihi = 1, ilo = 0;
2363 #else
2364     int ihi = 0, ilo = 1;
2365 #endif
2366     PyObject *errorHandler = NULL;
2367     PyObject *exc = NULL;
2368
2369     /* Note: size will always be longer than the resulting Unicode
2370        character count */
2371     unicode = _PyUnicode_New(size);
2372     if (!unicode)
2373         return NULL;
2374     if (size == 0)
2375         return (PyObject *)unicode;
2376
2377     /* Unpack UTF-16 encoded data */
2378     p = unicode->str;
2379     q = (unsigned char *)s;
2380     e = q + size;
2381
2382     if (byteorder)
2383         bo = *byteorder;
2384
2385     /* Check for BOM marks (U+FEFF) in the input and adjust current
2386        byte order setting accordingly. In native mode, the leading BOM
2387        mark is skipped, in all other modes, it is copied to the output
2388        stream as-is (giving a ZWNBSP character). */
2389     if (bo == 0) {
2390         if (size >= 2) {
2391             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2392 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2393             if (bom == 0xFEFF) {
2394                 q += 2;
2395                 bo = -1;
2396             }
2397             else if (bom == 0xFFFE) {
2398                 q += 2;
2399                 bo = 1;
2400             }
2401 #else
2402             if (bom == 0xFEFF) {
2403                 q += 2;
2404                 bo = 1;
2405             }
2406             else if (bom == 0xFFFE) {
2407                 q += 2;
2408                 bo = -1;
2409             }
2410 #endif
2411         }
2412     }
2413
2414     if (bo == -1) {
2415         /* force LE */
2416         ihi = 1;
2417         ilo = 0;
2418     }
2419     else if (bo == 1) {
2420         /* force BE */
2421         ihi = 0;
2422         ilo = 1;
2423     }
2424
2425     while (q < e) {
2426         Py_UNICODE ch;
2427         /* remaining bytes at the end? (size should be even) */
2428         if (e-q<2) {
2429             if (consumed)
2430                 break;
2431             errmsg = "truncated data";
2432             startinpos = ((const char *)q)-starts;
2433             endinpos = ((const char *)e)-starts;
2434             goto utf16Error;
2435             /* The remaining input chars are ignored if the callback
2436                chooses to skip the input */
2437         }
2438         ch = (q[ihi] << 8) | q[ilo];
2439
2440         q += 2;
2441
2442         if (ch < 0xD800 || ch > 0xDFFF) {
2443             *p++ = ch;
2444             continue;
2445         }
2446
2447         /* UTF-16 code pair: */
2448         if (q >= e) {
2449             errmsg = "unexpected end of data";
2450             startinpos = (((const char *)q)-2)-starts;
2451             endinpos = ((const char *)e)-starts;
2452             goto utf16Error;
2453         }
2454         if (0xD800 <= ch && ch <= 0xDBFF) {
2455             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2456             q += 2;
2457             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2458 #ifndef Py_UNICODE_WIDE
2459                 *p++ = ch;
2460                 *p++ = ch2;
2461 #else
2462                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2463 #endif
2464                 continue;
2465             }
2466             else {
2467                 errmsg = "illegal UTF-16 surrogate";
2468                 startinpos = (((const char *)q)-4)-starts;
2469                 endinpos = startinpos+2;
2470                 goto utf16Error;
2471             }
2472
2473         }
2474         errmsg = "illegal encoding";
2475         startinpos = (((const char *)q)-2)-starts;
2476         endinpos = startinpos+2;
2477         /* Fall through to report the error */
2478
2479     utf16Error:
2480         outpos = p-PyUnicode_AS_UNICODE(unicode);
2481         if (unicode_decode_call_errorhandler(
2482                  errors, &errorHandler,
2483                  "utf16", errmsg,
2484                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2485                  (PyObject **)&unicode, &outpos, &p))
2486             goto onError;
2487     }
2488
2489     if (byteorder)
2490         *byteorder = bo;
2491
2492     if (consumed)
2493         *consumed = (const char *)q-starts;
2494
2495     /* Adjust length */
2496     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2497         goto onError;
2498
2499     Py_XDECREF(errorHandler);
2500     Py_XDECREF(exc);
2501     return (PyObject *)unicode;
2502
2503 onError:
2504     Py_DECREF(unicode);
2505     Py_XDECREF(errorHandler);
2506     Py_XDECREF(exc);
2507     return NULL;
2508 }
2509
2510 PyObject *
2511 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2512                       Py_ssize_t size,
2513                       const char *errors,
2514                       int byteorder)
2515 {
2516     PyObject *v;
2517     unsigned char *p;
2518 #ifdef Py_UNICODE_WIDE
2519     int i, pairs;
2520 #else
2521     const int pairs = 0;
2522 #endif
2523     /* Offsets from p for storing byte pairs in the right order. */
2524 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2525     int ihi = 1, ilo = 0;
2526 #else
2527     int ihi = 0, ilo = 1;
2528 #endif
2529
2530 #define STORECHAR(CH)                   \
2531     do {                                \
2532         p[ihi] = ((CH) >> 8) & 0xff;    \
2533         p[ilo] = (CH) & 0xff;           \
2534         p += 2;                         \
2535     } while(0)
2536
2537 #ifdef Py_UNICODE_WIDE
2538     for (i = pairs = 0; i < size; i++)
2539         if (s[i] >= 0x10000)
2540             pairs++;
2541 #endif
2542     v = PyString_FromStringAndSize(NULL,
2543                   2 * (size + pairs + (byteorder == 0)));
2544     if (v == NULL)
2545         return NULL;
2546
2547     p = (unsigned char *)PyString_AS_STRING(v);
2548     if (byteorder == 0)
2549         STORECHAR(0xFEFF);
2550     if (size == 0)
2551         return v;
2552
2553     if (byteorder == -1) {
2554         /* force LE */
2555         ihi = 1;
2556         ilo = 0;
2557     }
2558     else if (byteorder == 1) {
2559         /* force BE */
2560         ihi = 0;
2561         ilo = 1;
2562     }
2563
2564     while (size-- > 0) {
2565         Py_UNICODE ch = *s++;
2566         Py_UNICODE ch2 = 0;
2567 #ifdef Py_UNICODE_WIDE
2568         if (ch >= 0x10000) {
2569             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2570             ch  = 0xD800 | ((ch-0x10000) >> 10);
2571         }
2572 #endif
2573         STORECHAR(ch);
2574         if (ch2)
2575             STORECHAR(ch2);
2576     }
2577     return v;
2578 #undef STORECHAR
2579 }
2580
2581 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2582 {
2583     if (!PyUnicode_Check(unicode)) {
2584         PyErr_BadArgument();
2585         return NULL;
2586     }
2587     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2588                                  PyUnicode_GET_SIZE(unicode),
2589                                  NULL,
2590                                  0);
2591 }
2592
2593 /* --- Unicode Escape Codec ----------------------------------------------- */
2594
2595 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2596
2597 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2598                                         Py_ssize_t size,
2599                                         const char *errors)
2600 {
2601     const char *starts = s;
2602     Py_ssize_t startinpos;
2603     Py_ssize_t endinpos;
2604     Py_ssize_t outpos;
2605     int i;
2606     PyUnicodeObject *v;
2607     Py_UNICODE *p;
2608     const char *end;
2609     char* message;
2610     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2611     PyObject *errorHandler = NULL;
2612     PyObject *exc = NULL;
2613
2614     /* Escaped strings will always be longer than the resulting
2615        Unicode string, so we start with size here and then reduce the
2616        length after conversion to the true value.
2617        (but if the error callback returns a long replacement string
2618        we'll have to allocate more space) */
2619     v = _PyUnicode_New(size);
2620     if (v == NULL)
2621         goto onError;
2622     if (size == 0)
2623         return (PyObject *)v;
2624
2625     p = PyUnicode_AS_UNICODE(v);
2626     end = s + size;
2627
2628     while (s < end) {
2629         unsigned char c;
2630         Py_UNICODE x;
2631         int digits;
2632
2633         /* Non-escape characters are interpreted as Unicode ordinals */
2634         if (*s != '\\') {
2635             *p++ = (unsigned char) *s++;
2636             continue;
2637         }
2638
2639         startinpos = s-starts;
2640         /* \ - Escapes */
2641         s++;
2642         c = *s++;
2643         if (s > end)
2644             c = '\0'; /* Invalid after \ */
2645         switch (c) {
2646
2647         /* \x escapes */
2648         case '\n': break;
2649         case '\\': *p++ = '\\'; break;
2650         case '\'': *p++ = '\''; break;
2651         case '\"': *p++ = '\"'; break;
2652         case 'b': *p++ = '\b'; break;
2653         case 'f': *p++ = '\014'; break; /* FF */
2654         case 't': *p++ = '\t'; break;
2655         case 'n': *p++ = '\n'; break;
2656         case 'r': *p++ = '\r'; break;
2657         case 'v': *p++ = '\013'; break; /* VT */
2658         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2659
2660         /* \OOO (octal) escapes */
2661         case '0': case '1': case '2': case '3':
2662         case '4': case '5': case '6': case '7':
2663             x = s[-1] - '0';
2664             if (s < end && '0' <= *s && *s <= '7') {
2665                 x = (x<<3) + *s++ - '0';
2666                 if (s < end && '0' <= *s && *s <= '7')
2667                     x = (x<<3) + *s++ - '0';
2668             }
2669             *p++ = x;
2670             break;
2671
2672         /* hex escapes */
2673         /* \xXX */
2674         case 'x':
2675             digits = 2;
2676             message = "truncated \\xXX escape";
2677             goto hexescape;
2678
2679         /* \uXXXX */
2680         case 'u':
2681             digits = 4;
2682             message = "truncated \\uXXXX escape";
2683             goto hexescape;
2684
2685         /* \UXXXXXXXX */
2686         case 'U':
2687             digits = 8;
2688             message = "truncated \\UXXXXXXXX escape";
2689         hexescape:
2690             chr = 0;
2691             outpos = p-PyUnicode_AS_UNICODE(v);
2692             if (s+digits>end) {
2693                 endinpos = size;
2694                 if (unicode_decode_call_errorhandler(
2695                     errors, &errorHandler,
2696                     "unicodeescape", "end of string in escape sequence",
2697                     starts, size, &startinpos, &endinpos, &exc, &s,
2698                     (PyObject **)&v, &outpos, &p))
2699                     goto onError;
2700                 goto nextByte;
2701             }
2702             for (i = 0; i < digits; ++i) {
2703                 c = (unsigned char) s[i];
2704                 if (!isxdigit(c)) {
2705                     endinpos = (s+i+1)-starts;
2706                     if (unicode_decode_call_errorhandler(
2707                         errors, &errorHandler,
2708                         "unicodeescape", message,
2709                         starts, size, &startinpos, &endinpos, &exc, &s,
2710                         (PyObject **)&v, &outpos, &p))
2711                         goto onError;
2712                     goto nextByte;
2713                 }
2714                 chr = (chr<<4) & ~0xF;
2715                 if (c >= '0' && c <= '9')
2716                     chr += c - '0';
2717                 else if (c >= 'a' && c <= 'f')
2718                     chr += 10 + c - 'a';
2719                 else
2720                     chr += 10 + c - 'A';
2721             }
2722             s += i;
2723             if (chr == 0xffffffff && PyErr_Occurred())
2724                 /* _decoding_error will have already written into the
2725                    target buffer. */
2726                 break;
2727         store:
2728             /* when we get here, chr is a 32-bit unicode character */
2729             if (chr <= 0xffff)
2730                 /* UCS-2 character */
2731                 *p++ = (Py_UNICODE) chr;
2732             else if (chr <= 0x10ffff) {
2733                 /* UCS-4 character. Either store directly, or as
2734                    surrogate pair. */
2735 #ifdef Py_UNICODE_WIDE
2736                 *p++ = chr;
2737 #else
2738                 chr -= 0x10000L;
2739                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2740                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2741 #endif
2742             } else {
2743                 endinpos = s-starts;
2744                 outpos = p-PyUnicode_AS_UNICODE(v);
2745                 if (unicode_decode_call_errorhandler(
2746                     errors, &errorHandler,
2747                     "unicodeescape", "illegal Unicode character",
2748                     starts, size, &startinpos, &endinpos, &exc, &s,
2749                     (PyObject **)&v, &outpos, &p))
2750                     goto onError;
2751             }
2752             break;
2753
2754         /* \N{name} */
2755         case 'N':
2756             message = "malformed \\N character escape";
2757             if (ucnhash_CAPI == NULL) {
2758                 /* load the unicode data module */
2759                 PyObject *m, *api;
2760                 m = PyImport_ImportModuleNoBlock("unicodedata");
2761                 if (m == NULL)
2762                     goto ucnhashError;
2763                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2764                 Py_DECREF(m);
2765                 if (api == NULL)
2766                     goto ucnhashError;
2767                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2768                 Py_DECREF(api);
2769                 if (ucnhash_CAPI == NULL)
2770                     goto ucnhashError;
2771             }
2772             if (*s == '{') {
2773                 const char *start = s+1;
2774                 /* look for the closing brace */
2775                 while (*s != '}' && s < end)
2776                     s++;
2777                 if (s > start && s < end && *s == '}') {
2778                     /* found a name.  look it up in the unicode database */
2779                     message = "unknown Unicode character name";
2780                     s++;
2781                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2782                         goto store;
2783                 }
2784             }
2785             endinpos = s-starts;
2786             outpos = p-PyUnicode_AS_UNICODE(v);
2787             if (unicode_decode_call_errorhandler(
2788                 errors, &errorHandler,
2789                 "unicodeescape", message,
2790                 starts, size, &startinpos, &endinpos, &exc, &s,
2791                 (PyObject **)&v, &outpos, &p))
2792                 goto onError;
2793             break;
2794
2795         default:
2796             if (s > end) {
2797                 message = "\\ at end of string";
2798                 s--;
2799                 endinpos = s-starts;
2800                 outpos = p-PyUnicode_AS_UNICODE(v);
2801                 if (unicode_decode_call_errorhandler(
2802                     errors, &errorHandler,
2803                     "unicodeescape", message,
2804                     starts, size, &startinpos, &endinpos, &exc, &s,
2805                     (PyObject **)&v, &outpos, &p))
2806                     goto onError;
2807             }
2808             else {
2809                 *p++ = '\\';
2810                 *p++ = (unsigned char)s[-1];
2811             }
2812             break;
2813         }
2814         nextByte:
2815         ;
2816     }
2817     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2818         goto onError;
2819     Py_XDECREF(errorHandler);
2820     Py_XDECREF(exc);
2821     return (PyObject *)v;
2822
2823 ucnhashError:
2824     PyErr_SetString(
2825         PyExc_UnicodeError,
2826         "\\N escapes not supported (can't load unicodedata module)"
2827         );
2828     Py_XDECREF(v);
2829     Py_XDECREF(errorHandler);
2830     Py_XDECREF(exc);
2831     return NULL;
2832
2833 onError:
2834     Py_XDECREF(v);
2835     Py_XDECREF(errorHandler);
2836     Py_XDECREF(exc);
2837     return NULL;
2838 }
2839
/* unicodeescape_string() below returns a Unicode-Escape string version
   of the Unicode data it is given.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2846
2847 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2848                                       Py_ssize_t size,
2849                                       Py_UNICODE ch)
2850 {
2851     /* like wcschr, but doesn't stop at NULL characters */
2852
2853     while (size-- > 0) {
2854         if (*s == ch)
2855             return s;
2856         s++;
2857     }
2858
2859     return NULL;
2860 }
2861
2862 static
2863 PyObject *unicodeescape_string(const Py_UNICODE *s,
2864                                Py_ssize_t size,
2865                                int quotes)
2866 {
2867     PyObject *repr;
2868     char *p;
2869
2870     static const char *hexdigit = "0123456789abcdef";
2871
2872     /* XXX(nnorwitz): rather than over-allocating, it would be
2873        better to choose a different scheme.  Perhaps scan the
2874        first N-chars of the string and allocate based on that size.
2875     */
2876     /* Initial allocation is based on the longest-possible unichr
2877        escape.
2878
2879        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2880        unichr, so in this case it's the longest unichr escape. In
2881        narrow (UTF-16) builds this is five chars per source unichr
2882        since there are two unichrs in the surrogate pair, so in narrow
2883        (UTF-16) builds it's not the longest unichr escape.
2884
2885        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2886        so in the narrow (UTF-16) build case it's the longest unichr
2887        escape.
2888     */
2889
2890     repr = PyString_FromStringAndSize(NULL,
2891         2
2892 #ifdef Py_UNICODE_WIDE
2893         + 10*size
2894 #else
2895         + 6*size
2896 #endif
2897         + 1);
2898     if (repr == NULL)
2899         return NULL;
2900
2901     p = PyString_AS_STRING(repr);
2902
2903     if (quotes) {
2904         *p++ = 'u';
2905         *p++ = (findchar(s, size, '\'') &&
2906                 !findchar(s, size, '"')) ? '"' : '\'';
2907     }
2908     while (size-- > 0) {
2909         Py_UNICODE ch = *s++;
2910
2911         /* Escape quotes and backslashes */
2912         if ((quotes &&
2913              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2914             *p++ = '\\';
2915             *p++ = (char) ch;
2916             continue;
2917         }
2918
2919 #ifdef Py_UNICODE_WIDE
2920         /* Map 21-bit characters to '\U00xxxxxx' */
2921         else if (ch >= 0x10000) {
2922             *p++ = '\\';
2923             *p++ = 'U';
2924             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2925             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2926             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2927             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2928             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2929             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2930             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2931             *p++ = hexdigit[ch & 0x0000000F];
2932             continue;
2933         }
2934 #else
2935         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2936         else if (ch >= 0xD800 && ch < 0xDC00) {
2937             Py_UNICODE ch2;
2938             Py_UCS4 ucs;
2939
2940             ch2 = *s++;
2941             size--;
2942             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2943                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2944                 *p++ = '\\';
2945                 *p++ = 'U';
2946                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2947                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2948                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2949                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2950                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2951                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2952                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2953                 *p++ = hexdigit[ucs & 0x0000000F];
2954                 continue;
2955             }
2956             /* Fall through: isolated surrogates are copied as-is */
2957             s--;
2958             size++;
2959         }
2960 #endif
2961
2962         /* Map 16-bit characters to '\uxxxx' */
2963         if (ch >= 256) {
2964             *p++ = '\\';
2965             *p++ = 'u';
2966             *p++ = hexdigit[(ch >> 12) & 0x000F];
2967             *p++ = hexdigit[(ch >> 8) & 0x000F];
2968             *p++ = hexdigit[(ch >> 4) & 0x000F];
2969             *p++ = hexdigit[ch & 0x000F];
2970         }
2971
2972         /* Map special whitespace to '\t', \n', '\r' */
2973         else if (ch == '\t') {
2974             *p++ = '\\';
2975             *p++ = 't';
2976         }
2977         else if (ch == '\n') {
2978             *p++ = '\\';
2979             *p++ = 'n';
2980         }
2981         else if (ch == '\r') {
2982             *p++ = '\\';
2983             *p++ = 'r';
2984         }
2985
2986         /* Map non-printable US ASCII to '\xhh' */
2987         else if (ch < ' ' || ch >= 0x7F) {
2988             *p++ = '\\';
2989             *p++ = 'x';
2990             *p++ = hexdigit[(ch >> 4) & 0x000F];
2991             *p++ = hexdigit[ch & 0x000F];
2992         }
2993
2994         /* Copy everything else as-is */
2995         else
2996             *p++ = (char) ch;
2997     }
2998     if (quotes)
2999         *p++ = PyString_AS_STRING(repr)[1];
3000
3001     *p = '\0';
3002     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3003     return repr;
3004 }
3005
3006 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3007                                         Py_ssize_t size)
3008 {
3009     return unicodeescape_string(s, size, 0);
3010 }
3011
3012 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3013 {
3014     if (!PyUnicode_Check(unicode)) {
3015         PyErr_BadArgument();
3016         return NULL;
3017     }
3018     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3019                                          PyUnicode_GET_SIZE(unicode));
3020 }
3021
3022 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3023
3024 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3025                                            Py_ssize_t size,
3026                                            const char *errors)
3027 {
3028     const char *starts = s;
3029     Py_ssize_t startinpos;
3030     Py_ssize_t endinpos;
3031     Py_ssize_t outpos;
3032     PyUnicodeObject *v;
3033     Py_UNICODE *p;
3034     const char *end;
3035     const char *bs;
3036     PyObject *errorHandler = NULL;
3037     PyObject *exc = NULL;
3038
3039     /* Escaped strings will always be longer than the resulting
3040        Unicode string, so we start with size here and then reduce the
3041        length after conversion to the true value. (But decoding error
3042        handler might have to resize the string) */
3043     v = _PyUnicode_New(size);
3044     if (v == NULL)
3045         goto onError;
3046     if (size == 0)
3047         return (PyObject *)v;
3048     p = PyUnicode_AS_UNICODE(v);
3049     end = s + size;
3050     while (s < end) {
3051         unsigned char c;
3052         Py_UCS4 x;
3053         int i;
3054         int count;
3055
3056         /* Non-escape characters are interpreted as Unicode ordinals */
3057         if (*s != '\\') {
3058             *p++ = (unsigned char)*s++;
3059             continue;
3060         }
3061         startinpos = s-starts;
3062
3063         /* \u-escapes are only interpreted iff the number of leading
3064            backslashes if odd */
3065         bs = s;
3066         for (;s < end;) {
3067             if (*s != '\\')
3068                 break;
3069             *p++ = (unsigned char)*s++;
3070         }
3071         if (((s - bs) & 1) == 0 ||
3072             s >= end ||
3073             (*s != 'u' && *s != 'U')) {
3074             continue;
3075         }
3076         p--;
3077         count = *s=='u' ? 4 : 8;
3078         s++;
3079
3080         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3081         outpos = p-PyUnicode_AS_UNICODE(v);
3082         for (x = 0, i = 0; i < count; ++i, ++s) {
3083             c = (unsigned char)*s;
3084             if (!isxdigit(c)) {
3085                 endinpos = s-starts;
3086                 if (unicode_decode_call_errorhandler(
3087                     errors, &errorHandler,
3088                     "rawunicodeescape", "truncated \\uXXXX",
3089                     starts, size, &startinpos, &endinpos, &exc, &s,
3090                     (PyObject **)&v, &outpos, &p))
3091                     goto onError;
3092                 goto nextByte;
3093             }
3094             x = (x<<4) & ~0xF;
3095             if (c >= '0' && c <= '9')
3096                 x += c - '0';
3097             else if (c >= 'a' && c <= 'f')
3098                 x += 10 + c - 'a';
3099             else
3100                 x += 10 + c - 'A';
3101         }
3102         if (x <= 0xffff)
3103                 /* UCS-2 character */
3104                 *p++ = (Py_UNICODE) x;
3105         else if (x <= 0x10ffff) {
3106                 /* UCS-4 character. Either store directly, or as
3107                    surrogate pair. */
3108 #ifdef Py_UNICODE_WIDE
3109                 *p++ = (Py_UNICODE) x;
3110 #else
3111                 x -= 0x10000L;
3112                 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3113                 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3114 #endif
3115         } else {
3116             endinpos = s-starts;
3117             outpos = p-PyUnicode_AS_UNICODE(v);
3118             if (unicode_decode_call_errorhandler(
3119                     errors, &errorHandler,
3120                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3121                     starts, size, &startinpos, &endinpos, &exc, &s,
3122                     (PyObject **)&v, &outpos, &p))
3123                     goto onError;
3124         }
3125         nextByte:
3126         ;
3127     }
3128     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3129         goto onError;
3130     Py_XDECREF(errorHandler);
3131     Py_XDECREF(exc);
3132     return (PyObject *)v;
3133
3134  onError:
3135     Py_XDECREF(v);
3136     Py_XDECREF(errorHandler);
3137     Py_XDECREF(exc);
3138     return NULL;
3139 }
3140
3141 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3142                                            Py_ssize_t size)
3143 {
3144     PyObject *repr;
3145     char *p;
3146     char *q;
3147
3148     static const char *hexdigit = "0123456789abcdef";
3149
3150 #ifdef Py_UNICODE_WIDE
3151     repr = PyString_FromStringAndSize(NULL, 10 * size);
3152 #else
3153     repr = PyString_FromStringAndSize(NULL, 6 * size);
3154 #endif
3155     if (repr == NULL)
3156         return NULL;
3157     if (size == 0)
3158         return repr;
3159
3160     p = q = PyString_AS_STRING(repr);
3161     while (size-- > 0) {
3162         Py_UNICODE ch = *s++;
3163 #ifdef Py_UNICODE_WIDE
3164         /* Map 32-bit characters to '\Uxxxxxxxx' */
3165         if (ch >= 0x10000) {
3166             *p++ = '\\';
3167             *p++ = 'U';
3168             *p++ = hexdigit[(ch >> 28) & 0xf];
3169             *p++ = hexdigit[(ch >> 24) & 0xf];
3170             *p++ = hexdigit[(ch >> 20) & 0xf];
3171             *p++ = hexdigit[(ch >> 16) & 0xf];
3172             *p++ = hexdigit[(ch >> 12) & 0xf];
3173             *p++ = hexdigit[(ch >> 8) & 0xf];
3174             *p++ = hexdigit[(ch >> 4) & 0xf];
3175             *p++ = hexdigit[ch & 15];
3176         }
3177         else
3178 #else
3179         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3180         if (ch >= 0xD800 && ch < 0xDC00) {
3181             Py_UNICODE ch2;
3182             Py_UCS4 ucs;
3183
3184             ch2 = *s++;
3185             size--;
3186             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3187                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3188                 *p++ = '\\';
3189                 *p++ = 'U';
3190                 *p++ = hexdigit[(ucs >> 28) & 0xf];
3191                 *p++ = hexdigit[(ucs >> 24) & 0xf];
3192                 *p++ = hexdigit[(ucs >> 20) & 0xf];
3193                 *p++ = hexdigit[(ucs >> 16) & 0xf];
3194                 *p++ = hexdigit[(ucs >> 12) & 0xf];
3195                 *p++ = hexdigit[(ucs >> 8) & 0xf];
3196                 *p++ = hexdigit[(ucs >> 4) & 0xf];
3197                 *p++ = hexdigit[ucs & 0xf];
3198                 continue;
3199             }
3200             /* Fall through: isolated surrogates are copied as-is */
3201             s--;
3202             size++;
3203         }
3204 #endif
3205         /* Map 16-bit characters to '\uxxxx' */
3206         if (ch >= 256) {
3207             *p++ = '\\';
3208             *p++ = 'u';
3209             *p++ = hexdigit[(ch >> 12) & 0xf];
3210             *p++ = hexdigit[(ch >> 8) & 0xf];
3211             *p++ = hexdigit[(ch >> 4) & 0xf];
3212             *p++ = hexdigit[ch & 15];
3213         }
3214         /* Copy everything else as-is */
3215         else
3216             *p++ = (char) ch;
3217     }
3218     *p = '\0';
3219     _PyString_Resize(&repr, p - q);
3220     return repr;
3221 }
3222
3223 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3224 {
3225     if (!PyUnicode_Check(unicode)) {
3226         PyErr_BadArgument();
3227         return NULL;
3228     }
3229     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3230                                             PyUnicode_GET_SIZE(unicode));
3231 }
3232
3233 /* --- Unicode Internal Codec ------------------------------------------- */
3234
3235 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3236                                            Py_ssize_t size,
3237                                            const char *errors)
3238 {
3239     const char *starts = s;
3240     Py_ssize_t startinpos;
3241     Py_ssize_t endinpos;
3242     Py_ssize_t outpos;
3243     PyUnicodeObject *v;
3244     Py_UNICODE *p;
3245     const char *end;
3246     const char *reason;
3247     PyObject *errorHandler = NULL;
3248     PyObject *exc = NULL;
3249
3250 #ifdef Py_UNICODE_WIDE
3251     Py_UNICODE unimax = PyUnicode_GetMax();
3252 #endif
3253
3254     /* XXX overflow detection missing */
3255     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3256     if (v == NULL)
3257         goto onError;
3258     if (PyUnicode_GetSize((PyObject *)v) == 0)
3259         return (PyObject *)v;
3260     p = PyUnicode_AS_UNICODE(v);
3261     end = s + size;
3262
3263     while (s < end) {
3264         memcpy(p, s, sizeof(Py_UNICODE));
3265         /* We have to sanity check the raw data, otherwise doom looms for
3266            some malformed UCS-4 data. */
3267         if (
3268             #ifdef Py_UNICODE_WIDE
3269             *p > unimax || *p < 0 ||
3270             #endif
3271             end-s < Py_UNICODE_SIZE
3272             )
3273             {
3274             startinpos = s - starts;
3275             if (end-s < Py_UNICODE_SIZE) {
3276                 endinpos = end-starts;
3277                 reason = "truncated input";
3278             }
3279             else {
3280                 endinpos = s - starts + Py_UNICODE_SIZE;
3281                 reason = "illegal code point (> 0x10FFFF)";
3282             }
3283             outpos = p - PyUnicode_AS_UNICODE(v);
3284             if (unicode_decode_call_errorhandler(
3285                     errors, &errorHandler,
3286                     "unicode_internal", reason,
3287                     starts, size, &startinpos, &endinpos, &exc, &s,
3288                     (PyObject **)&v, &outpos, &p)) {
3289                 goto onError;
3290             }
3291         }
3292         else {
3293             p++;
3294             s += Py_UNICODE_SIZE;
3295         }
3296     }
3297
3298     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3299         goto onError;
3300     Py_XDECREF(errorHandler);
3301     Py_XDECREF(exc);
3302     return (PyObject *)v;
3303
3304  onError:
3305     Py_XDECREF(v);
3306     Py_XDECREF(errorHandler);
3307     Py_XDECREF(exc);
3308     return NULL;
3309 }
3310
3311 /* --- Latin-1 Codec ------------------------------------------------------ */
3312
3313 PyObject *PyUnicode_DecodeLatin1(const char *s,
3314                                  Py_ssize_t size,
3315                                  const char *errors)
3316 {
3317     PyUnicodeObject *v;
3318     Py_UNICODE *p;
3319
3320     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3321     if (size == 1) {
3322         Py_UNICODE r = *(unsigned char*)s;
3323         return PyUnicode_FromUnicode(&r, 1);
3324     }
3325
3326     v = _PyUnicode_New(size);
3327     if (v == NULL)
3328         goto onError;
3329     if (size == 0)
3330         return (PyObject *)v;
3331     p = PyUnicode_AS_UNICODE(v);
3332     while (size-- > 0)
3333         *p++ = (unsigned char)*s++;
3334     return (PyObject *)v;
3335
3336  onError:
3337     Py_XDECREF(v);
3338     return NULL;
3339 }
3340
3341 /* create or adjust a UnicodeEncodeError */
3342 static void make_encode_exception(PyObject **exceptionObject,
3343     const char *encoding,
3344     const Py_UNICODE *unicode, Py_ssize_t size,
3345     Py_ssize_t startpos, Py_ssize_t endpos,
3346     const char *reason)
3347 {
3348     if (*exceptionObject == NULL) {
3349         *exceptionObject = PyUnicodeEncodeError_Create(
3350             encoding, unicode, size, startpos, endpos, reason);
3351     }
3352     else {
3353         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3354             goto onError;
3355         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3356             goto onError;
3357         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3358             goto onError;
3359         return;
3360         onError:
3361         Py_DECREF(*exceptionObject);
3362         *exceptionObject = NULL;
3363     }
3364 }
3365
3366 /* raises a UnicodeEncodeError */
3367 static void raise_encode_exception(PyObject **exceptionObject,
3368     const char *encoding,
3369     const Py_UNICODE *unicode, Py_ssize_t size,
3370     Py_ssize_t startpos, Py_ssize_t endpos,
3371     const char *reason)
3372 {
3373     make_encode_exception(exceptionObject,
3374         encoding, unicode, size, startpos, endpos, reason);
3375     if (*exceptionObject != NULL)
3376         PyCodec_StrictErrors(*exceptionObject);
3377 }
3378
3379 /* error handling callback helper:
3380    build arguments, call the callback and check the arguments,
3381    put the result into newpos and return the replacement string, which
3382    has to be freed by the caller */
3383 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3384     PyObject **errorHandler,
3385     const char *encoding, const char *reason,
3386     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3387     Py_ssize_t startpos, Py_ssize_t endpos,
3388     Py_ssize_t *newpos)
3389 {
3390     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3391
3392     PyObject *restuple;
3393     PyObject *resunicode;
3394
3395     if (*errorHandler == NULL) {
3396         *errorHandler = PyCodec_LookupError(errors);
3397         if (*errorHandler == NULL)
3398             return NULL;
3399     }
3400
3401     make_encode_exception(exceptionObject,
3402         encoding, unicode, size, startpos, endpos, reason);
3403     if (*exceptionObject == NULL)
3404         return NULL;
3405
3406     restuple = PyObject_CallFunctionObjArgs(
3407         *errorHandler, *exceptionObject, NULL);
3408     if (restuple == NULL)
3409         return NULL;
3410     if (!PyTuple_Check(restuple)) {
3411         PyErr_Format(PyExc_TypeError, &argparse[4]);
3412         Py_DECREF(restuple);
3413         return NULL;
3414     }
3415     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3416         &resunicode, newpos)) {
3417         Py_DECREF(restuple);
3418         return NULL;
3419     }
3420     if (*newpos<0)
3421         *newpos = size+*newpos;
3422     if (*newpos<0 || *newpos>size) {
3423         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3424         Py_DECREF(restuple);
3425         return NULL;
3426     }
3427     Py_INCREF(resunicode);
3428     Py_DECREF(restuple);
3429     return resunicode;
3430 }
3431
3432 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3433                                  Py_ssize_t size,
3434                                  const char *errors,
3435                                  int limit)
3436 {
3437     /* output object */
3438     PyObject *res;
3439     /* pointers to the beginning and end+1 of input */
3440     const Py_UNICODE *startp = p;
3441     const Py_UNICODE *endp = p + size;
3442     /* pointer to the beginning of the unencodable characters */
3443     /* const Py_UNICODE *badp = NULL; */
3444     /* pointer into the output */
3445     char *str;
3446     /* current output position */
3447     Py_ssize_t respos = 0;
3448     Py_ssize_t ressize;
3449     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3450     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3451     PyObject *errorHandler = NULL;
3452     PyObject *exc = NULL;
3453     /* the following variable is used for caching string comparisons
3454      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3455     int known_errorHandler = -1;
3456
3457     /* allocate enough for a simple encoding without
3458        replacements, if we need more, we'll resize */
3459     res = PyString_FromStringAndSize(NULL, size);
3460     if (res == NULL)
3461         goto onError;
3462     if (size == 0)
3463         return res;
3464     str = PyString_AS_STRING(res);
3465     ressize = size;
3466
3467     while (p<endp) {
3468         Py_UNICODE c = *p;
3469
3470         /* can we encode this? */
3471         if (c<limit) {
3472             /* no overflow check, because we know that the space is enough */
3473             *str++ = (char)c;
3474             ++p;
3475         }
3476         else {
3477             Py_ssize_t unicodepos = p-startp;
3478             Py_ssize_t requiredsize;
3479             PyObject *repunicode;
3480             Py_ssize_t repsize;
3481             Py_ssize_t newpos;
3482             Py_ssize_t respos;
3483             Py_UNICODE *uni2;
3484             /* startpos for collecting unencodable chars */
3485             const Py_UNICODE *collstart = p;
3486             const Py_UNICODE *collend = p;
3487             /* find all unecodable characters */
3488             while ((collend < endp) && ((*collend)>=limit))
3489                 ++collend;
3490             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3491             if (known_errorHandler==-1) {
3492                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3493                     known_errorHandler = 1;
3494                 else if (!strcmp(errors, "replace"))
3495                     known_errorHandler = 2;
3496                 else if (!strcmp(errors, "ignore"))
3497                     known_errorHandler = 3;
3498                 else if (!strcmp(errors, "xmlcharrefreplace"))
3499                     known_errorHandler = 4;
3500                 else
3501                     known_errorHandler = 0;
3502             }
3503             switch (known_errorHandler) {
3504                 case 1: /* strict */
3505                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3506                     goto onError;
3507                 case 2: /* replace */
3508                     while (collstart++<collend)
3509                         *str++ = '?'; /* fall through */
3510                 case 3: /* ignore */
3511                     p = collend;
3512                     break;
3513                 case 4: /* xmlcharrefreplace */
3514                     respos = str-PyString_AS_STRING(res);
3515                     /* determine replacement size (temporarily (mis)uses p) */
3516                     for (p = collstart, repsize = 0; p < collend; ++p) {
3517                         if (*p<10)
3518                             repsize += 2+1+1;
3519                         else if (*p<100)
3520                             repsize += 2+2+1;
3521                         else if (*p<1000)
3522                             repsize += 2+3+1;
3523                         else if (*p<10000)
3524                             repsize += 2+4+1;
3525 #ifndef Py_UNICODE_WIDE
3526                         else
3527                             repsize += 2+5+1;
3528 #else
3529                         else if (*p<100000)
3530                             repsize += 2+5+1;
3531                         else if (*p<1000000)
3532                             repsize += 2+6+1;
3533                         else
3534                             repsize += 2+7+1;
3535 #endif
3536                     }
3537                     requiredsize = respos+repsize+(endp-collend);
3538                     if (requiredsize > ressize) {
3539                         if (requiredsize<2*ressize)
3540                             requiredsize = 2*ressize;
3541                         if (_PyString_Resize(&res, requiredsize))
3542                             goto onError;
3543                         str = PyString_AS_STRING(res) + respos;
3544                         ressize = requiredsize;
3545                     }
3546                     /* generate replacement (temporarily (mis)uses p) */
3547                     for (p = collstart; p < collend; ++p) {
3548                         str += sprintf(str, "&#%d;", (int)*p);
3549                     }
3550                     p = collend;
3551                     break;
3552                 default:
3553                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3554                         encoding, reason, startp, size, &exc,
3555                         collstart-startp, collend-startp, &newpos);
3556                     if (repunicode == NULL)
3557                         goto onError;
3558                     /* need more space? (at least enough for what we
3559                        have+the replacement+the rest of the string, so
3560                        we won't have to check space for encodable characters) */
3561                     respos = str-PyString_AS_STRING(res);
3562                     repsize = PyUnicode_GET_SIZE(repunicode);
3563                     requiredsize = respos+repsize+(endp-collend);
3564                     if (requiredsize > ressize) {
3565                         if (requiredsize<2*ressize)
3566                             requiredsize = 2*ressize;
3567                         if (_PyString_Resize(&res, requiredsize)) {
3568                             Py_DECREF(repunicode);
3569                             goto onError;
3570                         }
3571                         str = PyString_AS_STRING(res) + respos;
3572                         ressize = requiredsize;
3573                     }
3574                     /* check if there is anything unencodable in the replacement
3575                        and copy it to the output */
3576                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3577                         c = *uni2;
3578                         if (c >= limit) {
3579                             raise_encode_exception(&exc, encoding, startp, size,
3580                                 unicodepos, unicodepos+1, reason);
3581                             Py_DECREF(repunicode);
3582                             goto onError;
3583                         }
3584                         *str = (char)c;
3585                     }
3586                     p = startp + newpos;
3587                     Py_DECREF(repunicode);
3588             }
3589         }
3590     }
3591     /* Resize if we allocated to much */
3592     respos = str-PyString_AS_STRING(res);
3593     if (respos<ressize)
3594        /* If this falls res will be NULL */
3595         _PyString_Resize(&res, respos);
3596     Py_XDECREF(errorHandler);
3597     Py_XDECREF(exc);
3598     return res;
3599
3600     onError:
3601     Py_XDECREF(res);
3602     Py_XDECREF(errorHandler);
3603     Py_XDECREF(exc);
3604     return NULL;
3605 }
3606
3607 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3608                                  Py_ssize_t size,
3609                                  const char *errors)
3610 {
3611     return unicode_encode_ucs1(p, size, errors, 256);
3612 }
3613
3614 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3615 {
3616     if (!PyUnicode_Check(unicode)) {
3617         PyErr_BadArgument();
3618         return NULL;
3619     }
3620     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3621                                   PyUnicode_GET_SIZE(unicode),
3622                                   NULL);
3623 }
3624
3625 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3626
3627 PyObject *PyUnicode_DecodeASCII(const char *s,
3628                                 Py_ssize_t size,
3629                                 const char *errors)
3630 {
3631     const char *starts = s;
3632     PyUnicodeObject *v;
3633     Py_UNICODE *p;
3634     Py_ssize_t startinpos;
3635     Py_ssize_t endinpos;
3636     Py_ssize_t outpos;
3637     const char *e;
3638     PyObject *errorHandler = NULL;
3639     PyObject *exc = NULL;
3640
3641     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3642     if (size == 1 && *(unsigned char*)s < 128) {
3643         Py_UNICODE r = *(unsigned char*)s;
3644         return PyUnicode_FromUnicode(&r, 1);
3645     }
3646
3647     v = _PyUnicode_New(size);
3648     if (v == NULL)
3649         goto onError;
3650     if (size == 0)
3651         return (PyObject *)v;
3652     p = PyUnicode_AS_UNICODE(v);
3653     e = s + size;
3654     while (s < e) {
3655         register unsigned char c = (unsigned char)*s;
3656         if (c < 128) {
3657             *p++ = c;
3658             ++s;
3659         }
3660         else {
3661             startinpos = s-starts;
3662             endinpos = startinpos + 1;
3663             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3664             if (unicode_decode_call_errorhandler(
3665                  errors, &errorHandler,
3666                  "ascii", "ordinal not in range(128)",
3667                  starts, size, &startinpos, &endinpos, &exc, &s,
3668                  (PyObject **)&v, &outpos, &p))
3669                 goto onError;
3670         }
3671     }
3672     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3673         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3674             goto onError;
3675     Py_XDECREF(errorHandler);
3676     Py_XDECREF(exc);
3677     return (PyObject *)v;
3678
3679  onError:
3680     Py_XDECREF(v);
3681     Py_XDECREF(errorHandler);
3682     Py_XDECREF(exc);
3683     return NULL;
3684 }
3685
3686 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3687                                 Py_ssize_t size,
3688                                 const char *errors)
3689 {
3690     return unicode_encode_ucs1(p, size, errors, 128);
3691 }
3692
3693 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3694 {
3695     if (!PyUnicode_Check(unicode)) {
3696         PyErr_BadArgument();
3697         return NULL;
3698     }
3699     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3700                                  PyUnicode_GET_SIZE(unicode),
3701                                  NULL);
3702 }
3703
3704 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3705
3706 /* --- MBCS codecs for Windows -------------------------------------------- */
3707
3708 #if SIZEOF_INT < SIZEOF_SSIZE_T
3709 #define NEED_RETRY
3710 #endif
3711
3712 /* XXX This code is limited to "true" double-byte encodings, as
3713    a) it assumes an incomplete character consists of a single byte, and
3714    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3715       encodings, see IsDBCSLeadByteEx documentation. */
3716
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when the character CharPrev() finds before it is the same position,
   does not start with a lead byte, or is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3727
3728 /*
3729  * Decode MBCS string into unicode object. If 'final' is set, converts
3730  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3731  */
3732 static int decode_mbcs(PyUnicodeObject **v,
3733                         const char *s, /* MBCS string */
3734                         int size, /* sizeof MBCS string */
3735                         int final)
3736 {
3737     Py_UNICODE *p;
3738     Py_ssize_t n = 0;
3739     int usize = 0;
3740
3741     assert(size >= 0);
3742
3743     /* Skip trailing lead-byte unless 'final' is set */
3744     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3745         --size;
3746
3747     /* First get the size of the result */
3748     if (size > 0) {
3749         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3750         if (usize == 0) {
3751             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3752             return -1;
3753         }
3754     }
3755
3756     if (*v == NULL) {
3757         /* Create unicode object */
3758         *v = _PyUnicode_New(usize);
3759         if (*v == NULL)
3760             return -1;
3761     }
3762     else {
3763         /* Extend unicode object */
3764         n = PyUnicode_GET_SIZE(*v);
3765         if (_PyUnicode_Resize(v, n + usize) < 0)
3766             return -1;
3767     }
3768
3769     /* Do the conversion */
3770     if (size > 0) {
3771         p = PyUnicode_AS_UNICODE(*v) + n;
3772         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3773             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3774             return -1;
3775         }
3776     }
3777
3778     return size;
3779 }
3780
3781 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3782                                         Py_ssize_t size,
3783                                         const char *errors,
3784                                         Py_ssize_t *consumed)
3785 {
3786     PyUnicodeObject *v = NULL;
3787     int done;
3788
3789     if (consumed)
3790         *consumed = 0;
3791
3792 #ifdef NEED_RETRY
3793   retry:
3794     if (size > INT_MAX)
3795         done = decode_mbcs(&v, s, INT_MAX, 0);
3796     else
3797 #endif
3798         done = decode_mbcs(&v, s, (int)size, !consumed);
3799
3800     if (done < 0) {
3801         Py_XDECREF(v);
3802         return NULL;
3803     }
3804
3805     if (consumed)
3806         *consumed += done;
3807
3808 #ifdef NEED_RETRY
3809     if (size > INT_MAX) {
3810         s += done;
3811         size -= done;
3812         goto retry;
3813     }
3814 #endif
3815
3816     return (PyObject *)v;
3817 }
3818
3819 PyObject *PyUnicode_DecodeMBCS(const char *s,
3820                                 Py_ssize_t size,
3821                                 const char *errors)
3822 {
3823     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3824 }
3825
3826 /*
3827  * Convert unicode into string object (MBCS).
3828  * Returns 0 if succeed, -1 otherwise.
3829  */
3830 static int encode_mbcs(PyObject **repr,
3831                         const Py_UNICODE *p, /* unicode */
3832                         int size) /* size of unicode */
3833 {
3834     int mbcssize = 0;
3835     Py_ssize_t n = 0;
3836
3837     assert(size >= 0);
3838
3839     /* First get the size of the result */
3840     if (size > 0) {
3841         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3842         if (mbcssize == 0) {
3843             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3844             return -1;
3845         }
3846     }
3847
3848     if (*repr == NULL) {
3849         /* Create string object */
3850         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3851         if (*repr == NULL)
3852             return -1;
3853     }
3854     else {
3855         /* Extend string object */
3856         n = PyString_Size(*repr);
3857         if (_PyString_Resize(repr, n + mbcssize) < 0)
3858             return -1;
3859     }
3860
3861     /* Do the conversion */
3862     if (size > 0) {
3863         char *s = PyString_AS_STRING(*repr) + n;
3864         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3865             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3866             return -1;
3867         }
3868     }
3869
3870     return 0;
3871 }
3872
3873 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3874                                 Py_ssize_t size,
3875                                 const char *errors)
3876 {
3877     PyObject *repr = NULL;
3878     int ret;
3879
3880 #ifdef NEED_RETRY
3881  retry:
3882     if (size > INT_MAX)
3883         ret = encode_mbcs(&repr, p, INT_MAX);
3884     else
3885 #endif
3886         ret = encode_mbcs(&repr, p, (int)size);
3887
3888     if (ret < 0) {
3889         Py_XDECREF(repr);
3890         return NULL;
3891     }
3892
3893 #ifdef NEED_RETRY
3894     if (size > INT_MAX) {
3895         p += INT_MAX;
3896         size -= INT_MAX;
3897         goto retry;
3898     }
3899 #endif
3900
3901     return repr;
3902 }
3903
3904 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3905 {
3906     if (!PyUnicode_Check(unicode)) {
3907         PyErr_BadArgument();
3908         return NULL;
3909     }
3910     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3911                                 PyUnicode_GET_SIZE(unicode),
3912                                 NULL);
3913 }
3914
3915 #undef NEED_RETRY
3916
3917 #endif /* MS_WINDOWS */
3918
3919 /* --- Character Mapping Codec -------------------------------------------- */
3920
3921 PyObject *PyUnicode_DecodeCharmap(const char *s,
3922                                   Py_ssize_t size,
3923                                   PyObject *mapping,
3924                                   const char *errors)
3925 {
3926     const char *starts = s;
3927     Py_ssize_t startinpos;
3928     Py_ssize_t endinpos;
3929     Py_ssize_t outpos;
3930     const char *e;
3931     PyUnicodeObject *v;
3932     Py_UNICODE *p;
3933     Py_ssize_t extrachars = 0;
3934     PyObject *errorHandler = NULL;
3935     PyObject *exc = NULL;
3936     Py_UNICODE *mapstring = NULL;
3937     Py_ssize_t maplen = 0;
3938
3939     /* Default to Latin-1 */
3940     if (mapping == NULL)
3941         return PyUnicode_DecodeLatin1(s, size, errors);
3942
3943     v = _PyUnicode_New(size);
3944     if (v == NULL)
3945         goto onError;
3946     if (size == 0)
3947         return (PyObject *)v;
3948     p = PyUnicode_AS_UNICODE(v);
3949     e = s + size;
3950     if (PyUnicode_CheckExact(mapping)) {
3951         mapstring = PyUnicode_AS_UNICODE(mapping);
3952         maplen = PyUnicode_GET_SIZE(mapping);
3953         while (s < e) {
3954             unsigned char ch = *s;
3955             Py_UNICODE x = 0xfffe; /* illegal value */
3956
3957             if (ch < maplen)
3958                 x = mapstring[ch];
3959
3960             if (x == 0xfffe) {
3961                 /* undefined mapping */
3962                 outpos = p-PyUnicode_AS_UNICODE(v);
3963                 startinpos = s-starts;
3964                 endinpos = startinpos+1;
3965                 if (unicode_decode_call_errorhandler(
3966                      errors, &errorHandler,
3967                      "charmap", "character maps to <undefined>",
3968                      starts, size, &startinpos, &endinpos, &exc, &s,
3969                      (PyObject **)&v, &outpos, &p)) {
3970                     goto onError;
3971                 }
3972                 continue;
3973             }
3974             *p++ = x;
3975             ++s;
3976         }
3977     }
3978     else {
3979         while (s < e) {
3980             unsigned char ch = *s;
3981             PyObject *w, *x;
3982
3983             /* Get mapping (char ordinal -> integer, Unicode char or None) */
3984             w = PyInt_FromLong((long)ch);
3985             if (w == NULL)
3986                 goto onError;
3987             x = PyObject_GetItem(mapping, w);
3988             Py_DECREF(w);
3989             if (x == NULL) {
3990                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3991                     /* No mapping found means: mapping is undefined. */
3992                     PyErr_Clear();
3993                     x = Py_None;
3994                     Py_INCREF(x);
3995                 } else
3996                     goto onError;
3997             }
3998
3999             /* Apply mapping */
4000             if (PyInt_Check(x)) {
4001                 long value = PyInt_AS_LONG(x);
4002                 if (value < 0 || value > 65535) {
4003                     PyErr_SetString(PyExc_TypeError,
4004                                     "character mapping must be in range(65536)");
4005                     Py_DECREF(x);
4006                     goto onError;
4007                 }
4008                 *p++ = (Py_UNICODE)value;
4009             }
4010             else if (x == Py_None) {
4011                 /* undefined mapping */
4012                 outpos = p-PyUnicode_AS_UNICODE(v);
4013                 startinpos = s-starts;
4014                 endinpos = startinpos+1;
4015                 if (unicode_decode_call_errorhandler(
4016                      errors, &errorHandler,
4017                      "charmap", "character maps to <undefined>",
4018                      starts, size, &startinpos, &endinpos, &exc, &s,
4019                      (PyObject **)&v, &outpos, &p)) {
4020                     Py_DECREF(x);
4021                     goto onError;
4022                 }
4023                 Py_DECREF(x);
4024                 continue;
4025             }
4026             else if (PyUnicode_Check(x)) {
4027                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4028
4029                 if (targetsize == 1)
4030                     /* 1-1 mapping */
4031                     *p++ = *PyUnicode_AS_UNICODE(x);
4032
4033                 else if (targetsize > 1) {
4034                     /* 1-n mapping */
4035                     if (targetsize > extrachars) {
4036                         /* resize first */
4037                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4038                         Py_ssize_t needed = (targetsize - extrachars) + \
4039                                      (targetsize << 2);
4040                         extrachars += needed;
4041                         /* XXX overflow detection missing */
4042                         if (_PyUnicode_Resize(&v,
4043                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
4044                             Py_DECREF(x);
4045                             goto onError;
4046                         }
4047                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4048                     }
4049                     Py_UNICODE_COPY(p,
4050                                     PyUnicode_AS_UNICODE(x),
4051                                     targetsize);
4052                     p += targetsize;
4053                     extrachars -= targetsize;
4054                 }
4055                 /* 1-0 mapping: skip the character */
4056             }
4057             else {
4058                 /* wrong return value */
4059                 PyErr_SetString(PyExc_TypeError,
4060                       "character mapping must return integer, None or unicode");
4061                 Py_DECREF(x);
4062                 goto onError;
4063             }
4064             Py_DECREF(x);
4065             ++s;
4066         }
4067     }
4068     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4069         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4070             goto onError;
4071     Py_XDECREF(errorHandler);
4072     Py_XDECREF(exc);
4073     return (PyObject *)v;
4074
4075  onError:
4076     Py_XDECREF(errorHandler);
4077     Py_XDECREF(exc);
4078     Py_XDECREF(v);
4079     return NULL;
4080 }
4081
4082 /* Charmap encoding: the lookup table */
4083
4084 struct encoding_map{
4085   PyObject_HEAD
4086   unsigned char level1[32];
4087   int count2, count3;
4088   unsigned char level23[1];
4089 };
4090
4091 static PyObject*
4092 encoding_map_size(PyObject *obj, PyObject* args)
4093 {
4094     struct encoding_map *map = (struct encoding_map*)obj;
4095     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4096                           128*map->count3);
4097 }
4098
4099 static PyMethodDef encoding_map_methods[] = {
4100         {"size", encoding_map_size, METH_NOARGS,
4101          PyDoc_STR("Return the size (in bytes) of this object") },
4102         { 0 }
4103 };
4104
4105 static void
4106 encoding_map_dealloc(PyObject* o)
4107 {
4108         PyObject_FREE(o);
4109 }
4110
4111 static PyTypeObject EncodingMapType = {
4112         PyVarObject_HEAD_INIT(NULL, 0)
4113         "EncodingMap",          /*tp_name*/
4114         sizeof(struct encoding_map),   /*tp_basicsize*/
4115         0,                      /*tp_itemsize*/
4116         /* methods */
4117         encoding_map_dealloc,   /*tp_dealloc*/
4118         0,                      /*tp_print*/
4119         0,                      /*tp_getattr*/
4120         0,                      /*tp_setattr*/
4121         0,                      /*tp_compare*/
4122         0,                      /*tp_repr*/
4123         0,                      /*tp_as_number*/
4124         0,                      /*tp_as_sequence*/
4125         0,                      /*tp_as_mapping*/
4126         0,                      /*tp_hash*/
4127         0,                      /*tp_call*/
4128         0,                      /*tp_str*/
4129         0,                      /*tp_getattro*/
4130         0,                      /*tp_setattro*/
4131         0,                      /*tp_as_buffer*/
4132         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4133         0,                      /*tp_doc*/
4134         0,                      /*tp_traverse*/
4135         0,                      /*tp_clear*/
4136         0,                      /*tp_richcompare*/
4137         0,                      /*tp_weaklistoffset*/
4138         0,                      /*tp_iter*/
4139         0,                      /*tp_iternext*/
4140         encoding_map_methods,   /*tp_methods*/
4141         0,                      /*tp_members*/
4142         0,                      /*tp_getset*/
4143         0,                      /*tp_base*/
4144         0,                      /*tp_dict*/
4145         0,                      /*tp_descr_get*/
4146         0,                      /*tp_descr_set*/
4147         0,                      /*tp_dictoffset*/
4148         0,                      /*tp_init*/
4149         0,                      /*tp_alloc*/
4150         0,                      /*tp_new*/
4151         0,                      /*tp_free*/
4152         0,                      /*tp_is_gc*/
4153 };
4154
4155 PyObject*
4156 PyUnicode_BuildEncodingMap(PyObject* string)
4157 {
4158     Py_UNICODE *decode;
4159     PyObject *result;
4160     struct encoding_map *mresult;
4161     int i;
4162     int need_dict = 0;
4163     unsigned char level1[32];
4164     unsigned char level2[512];
4165     unsigned char *mlevel1, *mlevel2, *mlevel3;
4166     int count2 = 0, count3 = 0;
4167
4168     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4169         PyErr_BadArgument();
4170         return NULL;
4171     }
4172     decode = PyUnicode_AS_UNICODE(string);
4173     memset(level1, 0xFF, sizeof level1);
4174     memset(level2, 0xFF, sizeof level2);
4175
4176     /* If there isn't a one-to-one mapping of NULL to \0,
4177        or if there are non-BMP characters, we need to use
4178        a mapping dictionary. */
4179     if (decode[0] != 0)
4180         need_dict = 1;
4181     for (i = 1; i < 256; i++) {
4182         int l1, l2;
4183         if (decode[i] == 0
4184             #ifdef Py_UNICODE_WIDE
4185             || decode[i] > 0xFFFF
4186             #endif
4187         ) {
4188             need_dict = 1;
4189             break;
4190         }
4191         if (decode[i] == 0xFFFE)
4192             /* unmapped character */
4193             continue;
4194         l1 = decode[i] >> 11;
4195         l2 = decode[i] >> 7;
4196         if (level1[l1] == 0xFF)
4197             level1[l1] = count2++;
4198         if (level2[l2] == 0xFF)
4199             level2[l2] = count3++;
4200     }
4201
4202     if (count2 >= 0xFF || count3 >= 0xFF)
4203         need_dict = 1;
4204
4205     if (need_dict) {
4206         PyObject *result = PyDict_New();
4207         PyObject *key, *value;
4208         if (!result)
4209             return NULL;
4210         for (i = 0; i < 256; i++) {
4211             key = value = NULL;
4212             key = PyInt_FromLong(decode[i]);
4213             value = PyInt_FromLong(i);
4214             if (!key || !value)
4215                 goto failed1;
4216             if (PyDict_SetItem(result, key, value) == -1)
4217                 goto failed1;
4218             Py_DECREF(key);
4219             Py_DECREF(value);
4220         }
4221         return result;
4222       failed1:
4223         Py_XDECREF(key);
4224         Py_XDECREF(value);
4225         Py_DECREF(result);
4226         return NULL;
4227     }
4228
4229     /* Create a three-level trie */
4230     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4231                              16*count2 + 128*count3 - 1);
4232     if (!result)
4233         return PyErr_NoMemory();
4234     PyObject_Init(result, &EncodingMapType);
4235     mresult = (struct encoding_map*)result;
4236     mresult->count2 = count2;
4237     mresult->count3 = count3;
4238     mlevel1 = mresult->level1;
4239     mlevel2 = mresult->level23;
4240     mlevel3 = mresult->level23 + 16*count2;
4241     memcpy(mlevel1, level1, 32);
4242     memset(mlevel2, 0xFF, 16*count2);
4243     memset(mlevel3, 0, 128*count3);
4244     count3 = 0;
4245     for (i = 1; i < 256; i++) {
4246         int o1, o2, o3, i2, i3;
4247         if (decode[i] == 0xFFFE)
4248             /* unmapped character */
4249             continue;
4250         o1 = decode[i]>>11;
4251         o2 = (decode[i]>>7) & 0xF;
4252         i2 = 16*mlevel1[o1] + o2;
4253         if (mlevel2[i2] == 0xFF)
4254             mlevel2[i2] = count3++;
4255         o3 = decode[i] & 0x7F;
4256         i3 = 128*mlevel2[i2] + o3;
4257         mlevel3[i3] = i;
4258     }
4259     return result;
4260 }
4261
4262 static int
4263 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4264 {
4265     struct encoding_map *map = (struct encoding_map*)mapping;
4266     int l1 = c>>11;
4267     int l2 = (c>>7) & 0xF;
4268     int l3 = c & 0x7F;
4269     int i;
4270
4271 #ifdef Py_UNICODE_WIDE
4272     if (c > 0xFFFF) {
4273         return -1;
4274     }
4275 #endif
4276     if (c == 0)
4277         return 0;
4278     /* level 1*/
4279     i = map->level1[l1];
4280     if (i == 0xFF) {
4281         return -1;
4282     }
4283     /* level 2*/
4284     i = map->level23[16*i+l2];
4285     if (i == 0xFF) {
4286         return -1;
4287     }
4288     /* level 3 */
4289     i = map->level23[16*map->count2 + 128*i + l3];
4290     if (i == 0) {
4291         return -1;
4292     }
4293     return i;
4294 }
4295
4296 /* Lookup the character ch in the mapping. If the character
4297    can't be found, Py_None is returned (or NULL, if another
4298    error occurred). */
4299 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4300 {
4301     PyObject *w = PyInt_FromLong((long)c);
4302     PyObject *x;
4303
4304     if (w == NULL)
4305          return NULL;
4306     x = PyObject_GetItem(mapping, w);
4307     Py_DECREF(w);
4308     if (x == NULL) {
4309         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4310             /* No mapping found means: mapping is undefined. */
4311             PyErr_Clear();
4312             x = Py_None;
4313             Py_INCREF(x);
4314             return x;
4315         } else
4316             return NULL;
4317     }
4318     else if (x == Py_None)
4319         return x;
4320     else if (PyInt_Check(x)) {
4321         long value = PyInt_AS_LONG(x);
4322         if (value < 0 || value > 255) {
4323             PyErr_SetString(PyExc_TypeError,
4324                              "character mapping must be in range(256)");
4325             Py_DECREF(x);
4326             return NULL;
4327         }
4328         return x;
4329     }
4330     else if (PyString_Check(x))
4331         return x;
4332     else {
4333         /* wrong return value */
4334         PyErr_SetString(PyExc_TypeError,
4335               "character mapping must return integer, None or str");
4336         Py_DECREF(x);
4337         return NULL;
4338     }
4339 }
4340
4341 static int
4342 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4343 {
4344         Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4345         /* exponentially overallocate to minimize reallocations */
4346         if (requiredsize < 2*outsize)
4347             requiredsize = 2*outsize;
4348         if (_PyString_Resize(outobj, requiredsize)) {
4349             return 0;
4350         }
4351         return 1;
4352 }
4353
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* the lookup or a resize failed */
} charmapencode_result;
4357 /* lookup the character, put the result in the output string and adjust
4358    various state variables. Reallocate the output string if not enough
4359    space is available. Return a new reference to the object that
4360    was put in the output buffer, or Py_None, if the mapping was undefined
4361    (in which case no character was written) or NULL, if a
4362    reallocation error occurred. The caller must decref the result */
4363 static
4364 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4365     PyObject **outobj, Py_ssize_t *outpos)
4366 {
4367     PyObject *rep;
4368     char *outstart;
4369     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4370
4371     if (Py_TYPE(mapping) == &EncodingMapType) {
4372         int res = encoding_map_lookup(c, mapping);
4373         Py_ssize_t requiredsize = *outpos+1;
4374         if (res == -1)
4375             return enc_FAILED;
4376         if (outsize<requiredsize)
4377             if (!charmapencode_resize(outobj, outpos, requiredsize))
4378                 return enc_EXCEPTION;
4379         outstart = PyString_AS_STRING(*outobj);
4380         outstart[(*outpos)++] = (char)res;
4381         return enc_SUCCESS;
4382     }
4383
4384     rep = charmapencode_lookup(c, mapping);
4385     if (rep==NULL)
4386         return enc_EXCEPTION;
4387     else if (rep==Py_None) {
4388         Py_DECREF(rep);
4389         return enc_FAILED;
4390     } else {
4391         if (PyInt_Check(rep)) {
4392             Py_ssize_t requiredsize = *outpos+1;
4393             if (outsize<requiredsize)
4394                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4395                     Py_DECREF(rep);
4396                     return enc_EXCEPTION;
4397                 }
4398             outstart = PyString_AS_STRING(*outobj);
4399             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4400         }
4401         else {
4402             const char *repchars = PyString_AS_STRING(rep);
4403             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4404             Py_ssize_t requiredsize = *outpos+repsize;
4405             if (outsize<requiredsize)
4406                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4407                     Py_DECREF(rep);
4408                     return enc_EXCEPTION;
4409                 }
4410             outstart = PyString_AS_STRING(*outobj);
4411             memcpy(outstart + *outpos, repchars, repsize);
4412             *outpos += repsize;
4413         }
4414     }
4415     Py_DECREF(rep);
4416     return enc_SUCCESS;
4417 }
4418
4419 /* handle an error in PyUnicode_EncodeCharmap
4420    Return 0 on success, -1 on error */
4421 static
4422 int charmap_encoding_error(
4423     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4424     PyObject **exceptionObject,
4425     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4426     PyObject **res, Py_ssize_t *respos)
4427 {
4428     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4429     Py_ssize_t repsize;
4430     Py_ssize_t newpos;
4431     Py_UNICODE *uni2;
4432     /* startpos for collecting unencodable chars */
4433     Py_ssize_t collstartpos = *inpos;
4434     Py_ssize_t collendpos = *inpos+1;
4435     Py_ssize_t collpos;
4436     char *encoding = "charmap";
4437     char *reason = "character maps to <undefined>";
4438     charmapencode_result x;
4439
4440     /* find all unencodable characters */
4441     while (collendpos < size) {
4442         PyObject *rep;
4443         if (Py_TYPE(mapping) == &EncodingMapType) {
4444             int res = encoding_map_lookup(p[collendpos], mapping);
4445             if (res != -1)
4446                 break;
4447             ++collendpos;
4448             continue;
4449         }
4450
4451         rep = charmapencode_lookup(p[collendpos], mapping);
4452         if (rep==NULL)
4453             return -1;
4454         else if (rep!=Py_None) {
4455             Py_DECREF(rep);
4456             break;
4457         }
4458         Py_DECREF(rep);
4459         ++collendpos;
4460     }
4461     /* cache callback name lookup
4462      * (if not done yet, i.e. it's the first error) */
4463     if (*known_errorHandler==-1) {
4464         if ((errors==NULL) || (!strcmp(errors, "strict")))
4465             *known_errorHandler = 1;
4466         else if (!strcmp(errors, "replace"))
4467             *known_errorHandler = 2;
4468         else if (!strcmp(errors, "ignore"))
4469             *known_errorHandler = 3;
4470         else if (!strcmp(errors, "xmlcharrefreplace"))
4471             *known_errorHandler = 4;
4472         else
4473             *known_errorHandler = 0;
4474     }
4475     switch (*known_errorHandler) {
4476         case 1: /* strict */
4477             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4478             return -1;
4479         case 2: /* replace */
4480             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4481                 x = charmapencode_output('?', mapping, res, respos);
4482                 if (x==enc_EXCEPTION) {
4483                     return -1;
4484                 }
4485                 else if (x==enc_FAILED) {
4486                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4487                     return -1;
4488                 }
4489             }
4490             /* fall through */
4491         case 3: /* ignore */
4492             *inpos = collendpos;
4493             break;
4494         case 4: /* xmlcharrefreplace */
4495             /* generate replacement (temporarily (mis)uses p) */
4496             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4497                 char buffer[2+29+1+1];
4498                 char *cp;
4499                 sprintf(buffer, "&#%d;", (int)p[collpos]);
4500                 for (cp = buffer; *cp; ++cp) {
4501                     x = charmapencode_output(*cp, mapping, res, respos);
4502                     if (x==enc_EXCEPTION)
4503                         return -1;
4504                     else if (x==enc_FAILED) {
4505                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4506                         return -1;
4507                     }
4508                 }
4509             }
4510             *inpos = collendpos;
4511             break;
4512         default:
4513             repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4514                 encoding, reason, p, size, exceptionObject,
4515                 collstartpos, collendpos, &newpos);
4516             if (repunicode == NULL)
4517                 return -1;
4518             /* generate replacement  */
4519             repsize = PyUnicode_GET_SIZE(repunicode);
4520             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4521                 x = charmapencode_output(*uni2, mapping, res, respos);
4522                 if (x==enc_EXCEPTION) {
4523                     return -1;
4524                 }
4525                 else if (x==enc_FAILED) {
4526                     Py_DECREF(repunicode);
4527                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4528                     return -1;
4529                 }
4530             }
4531             *inpos = newpos;
4532             Py_DECREF(repunicode);
4533     }
4534     return 0;
4535 }
4536
4537 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4538                                   Py_ssize_t size,
4539                                   PyObject *mapping,
4540                                   const char *errors)
4541 {
4542     /* output object */
4543     PyObject *res = NULL;
4544     /* current input position */
4545     Py_ssize_t inpos = 0;
4546     /* current output position */
4547     Py_ssize_t respos = 0;
4548     PyObject *errorHandler = NULL;
4549     PyObject *exc = NULL;
4550     /* the following variable is used for caching string comparisons
4551      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4552      * 3=ignore, 4=xmlcharrefreplace */
4553     int known_errorHandler = -1;
4554
4555     /* Default to Latin-1 */
4556     if (mapping == NULL)
4557         return PyUnicode_EncodeLatin1(p, size, errors);
4558
4559     /* allocate enough for a simple encoding without
4560        replacements, if we need more, we'll resize */
4561     res = PyString_FromStringAndSize(NULL, size);
4562     if (res == NULL)
4563         goto onError;
4564     if (size == 0)
4565         return res;
4566
4567     while (inpos<size) {
4568         /* try to encode it */
4569         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4570         if (x==enc_EXCEPTION) /* error */
4571             goto onError;
4572         if (x==enc_FAILED) { /* unencodable character */
4573             if (charmap_encoding_error(p, size, &inpos, mapping,
4574                 &exc,
4575                 &known_errorHandler, &errorHandler, errors,
4576                 &res, &respos)) {
4577                 goto onError;
4578             }
4579         }
4580         else
4581             /* done with this character => adjust input position */
4582             ++inpos;
4583     }
4584
4585     /* Resize if we allocated to much */
4586     if (respos<PyString_GET_SIZE(res)) {
4587         if (_PyString_Resize(&res, respos))
4588             goto onError;
4589     }
4590     Py_XDECREF(exc);
4591     Py_XDECREF(errorHandler);
4592     return res;
4593
4594     onError:
4595     Py_XDECREF(res);
4596     Py_XDECREF(exc);
4597     Py_XDECREF(errorHandler);
4598     return NULL;
4599 }
4600
4601 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4602                                     PyObject *mapping)
4603 {
4604     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4605         PyErr_BadArgument();
4606         return NULL;
4607     }
4608     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4609                                    PyUnicode_GET_SIZE(unicode),
4610                                    mapping,
4611                                    NULL);
4612 }
4613
4614 /* create or adjust a UnicodeTranslateError */
4615 static void make_translate_exception(PyObject **exceptionObject,
4616     const Py_UNICODE *unicode, Py_ssize_t size,
4617     Py_ssize_t startpos, Py_ssize_t endpos,
4618     const char *reason)
4619 {
4620     if (*exceptionObject == NULL) {
4621         *exceptionObject = PyUnicodeTranslateError_Create(
4622             unicode, size, startpos, endpos, reason);
4623     }
4624     else {
4625         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4626             goto onError;
4627         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4628             goto onError;
4629         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4630             goto onError;
4631         return;
4632         onError:
4633         Py_DECREF(*exceptionObject);
4634         *exceptionObject = NULL;
4635     }
4636 }
4637
4638 /* raises a UnicodeTranslateError */
4639 static void raise_translate_exception(PyObject **exceptionObject,
4640     const Py_UNICODE *unicode, Py_ssize_t size,
4641     Py_ssize_t startpos, Py_ssize_t endpos,
4642     const char *reason)
4643 {
4644     make_translate_exception(exceptionObject,
4645         unicode, size, startpos, endpos, reason);
4646     if (*exceptionObject != NULL)
4647         PyCodec_StrictErrors(*exceptionObject);
4648 }
4649
4650 /* error handling callback helper:
4651    build arguments, call the callback and check the arguments,
4652    put the result into newpos and return the replacement string, which
4653    has to be freed by the caller */
4654 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4655     PyObject **errorHandler,
4656     const char *reason,
4657     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4658     Py_ssize_t startpos, Py_ssize_t endpos,
4659     Py_ssize_t *newpos)
4660 {
4661     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4662
4663     Py_ssize_t i_newpos;
4664     PyObject *restuple;
4665     PyObject *resunicode;
4666
4667     if (*errorHandler == NULL) {
4668         *errorHandler = PyCodec_LookupError(errors);
4669         if (*errorHandler == NULL)
4670             return NULL;
4671     }
4672
4673     make_translate_exception(exceptionObject,
4674         unicode, size, startpos, endpos, reason);
4675     if (*exceptionObject == NULL)
4676         return NULL;
4677
4678     restuple = PyObject_CallFunctionObjArgs(
4679         *errorHandler, *exceptionObject, NULL);
4680     if (restuple == NULL)
4681         return NULL;
4682     if (!PyTuple_Check(restuple)) {
4683         PyErr_Format(PyExc_TypeError, &argparse[4]);
4684         Py_DECREF(restuple);
4685         return NULL;
4686     }
4687     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4688         &resunicode, &i_newpos)) {
4689         Py_DECREF(restuple);
4690         return NULL;
4691     }
4692     if (i_newpos<0)
4693         *newpos = size+i_newpos;
4694     else
4695         *newpos = i_newpos;
4696     if (*newpos<0 || *newpos>size) {
4697         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4698         Py_DECREF(restuple);
4699         return NULL;
4700     }
4701     Py_INCREF(resunicode);
4702     Py_DECREF(restuple);
4703     return resunicode;
4704 }
4705
4706 /* Lookup the character ch in the mapping and put the result in result,
4707    which must be decrefed by the caller.
4708    Return 0 on success, -1 on error */
4709 static
4710 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4711 {
4712     PyObject *w = PyInt_FromLong((long)c);
4713     PyObject *x;
4714
4715     if (w == NULL)
4716          return -1;
4717     x = PyObject_GetItem(mapping, w);
4718     Py_DECREF(w);
4719     if (x == NULL) {
4720         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4721             /* No mapping found means: use 1:1 mapping. */
4722             PyErr_Clear();
4723             *result = NULL;
4724             return 0;
4725         } else
4726             return -1;
4727     }
4728     else if (x == Py_None) {
4729         *result = x;
4730         return 0;
4731     }
4732     else if (PyInt_Check(x)) {
4733         long value = PyInt_AS_LONG(x);
4734         long max = PyUnicode_GetMax();
4735         if (value < 0 || value > max) {
4736             PyErr_Format(PyExc_TypeError,
4737                              "character mapping must be in range(0x%lx)", max+1);
4738             Py_DECREF(x);
4739             return -1;
4740         }
4741         *result = x;
4742         return 0;
4743     }
4744     else if (PyUnicode_Check(x)) {
4745         *result = x;
4746         return 0;
4747     }
4748     else {
4749         /* wrong return value */
4750         PyErr_SetString(PyExc_TypeError,
4751               "character mapping must return integer, None or unicode");
4752         Py_DECREF(x);
4753         return -1;
4754     }
4755 }
4756 /* ensure that *outobj is at least requiredsize characters long,
4757 if not reallocate and adjust various state variables.
4758 Return 0 on success, -1 on error */
4759 static
4760 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4761     Py_ssize_t requiredsize)
4762 {
4763     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4764     if (requiredsize > oldsize) {
4765         /* remember old output position */
4766         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4767         /* exponentially overallocate to minimize reallocations */
4768         if (requiredsize < 2 * oldsize)
4769             requiredsize = 2 * oldsize;
4770         if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4771             return -1;
4772         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4773     }
4774     return 0;
4775 }
4776 /* lookup the character, put the result in the output string and adjust
4777    various state variables. Return a new reference to the object that
4778    was put in the output buffer in *result, or Py_None, if the mapping was
4779    undefined (in which case no character was written).
4780    The called must decref result.
4781    Return 0 on success, -1 on error. */
4782 static
4783 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4784     Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4785     PyObject **res)
4786 {
4787     if (charmaptranslate_lookup(*curinp, mapping, res))
4788         return -1;
4789     if (*res==NULL) {
4790         /* not found => default to 1:1 mapping */
4791         *(*outp)++ = *curinp;
4792     }
4793     else if (*res==Py_None)
4794         ;
4795     else if (PyInt_Check(*res)) {
4796         /* no overflow check, because we know that the space is enough */
4797         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4798     }
4799     else if (PyUnicode_Check(*res)) {
4800         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4801         if (repsize==1) {
4802             /* no overflow check, because we know that the space is enough */
4803             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4804         }
4805         else if (repsize!=0) {
4806             /* more than one character */
4807             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4808                 (insize - (curinp-startinp)) +
4809                 repsize - 1;
4810             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4811                 return -1;
4812             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4813             *outp += repsize;
4814         }
4815     }
4816     else
4817         return -1;
4818     return 0;
4819 }
4820
4821 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4822                                      Py_ssize_t size,
4823                                      PyObject *mapping,
4824                                      const char *errors)
4825 {
4826     /* output object */
4827     PyObject *res = NULL;
4828     /* pointers to the beginning and end+1 of input */
4829     const Py_UNICODE *startp = p;
4830     const Py_UNICODE *endp = p + size;
4831     /* pointer into the output */
4832     Py_UNICODE *str;
4833     /* current output position */
4834     Py_ssize_t respos = 0;
4835     char *reason = "character maps to <undefined>";
4836     PyObject *errorHandler = NULL;
4837     PyObject *exc = NULL;
4838     /* the following variable is used for caching string comparisons
4839      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4840      * 3=ignore, 4=xmlcharrefreplace */
4841     int known_errorHandler = -1;
4842
4843     if (mapping == NULL) {
4844         PyErr_BadArgument();
4845         return NULL;
4846     }
4847
4848     /* allocate enough for a simple 1:1 translation without
4849        replacements, if we need more, we'll resize */
4850     res = PyUnicode_FromUnicode(NULL, size);
4851     if (res == NULL)
4852         goto onError;
4853     if (size == 0)
4854         return res;
4855     str = PyUnicode_AS_UNICODE(res);
4856
4857     while (p<endp) {
4858         /* try to encode it */
4859         PyObject *x = NULL;
4860         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4861             Py_XDECREF(x);
4862             goto onError;
4863         }
4864         Py_XDECREF(x);
4865         if (x!=Py_None) /* it worked => adjust input pointer */
4866             ++p;
4867         else { /* untranslatable character */
4868             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4869             Py_ssize_t repsize;
4870             Py_ssize_t newpos;
4871             Py_UNICODE *uni2;
4872             /* startpos for collecting untranslatable chars */
4873             const Py_UNICODE *collstart = p;
4874             const Py_UNICODE *collend = p+1;
4875             const Py_UNICODE *coll;
4876
4877             /* find all untranslatable characters */
4878             while (collend < endp) {
4879                 if (charmaptranslate_lookup(*collend, mapping, &x))
4880                     goto onError;
4881                 Py_XDECREF(x);
4882                 if (x!=Py_None)
4883                     break;
4884                 ++collend;
4885             }
4886             /* cache callback name lookup
4887              * (if not done yet, i.e. it's the first error) */
4888             if (known_errorHandler==-1) {
4889                 if ((errors==NULL) || (!strcmp(errors, "strict")))
4890                     known_errorHandler = 1;
4891                 else if (!strcmp(errors, "replace"))
4892                     known_errorHandler = 2;
4893                 else if (!strcmp(errors, "ignore"))
4894                     known_errorHandler = 3;
4895                 else if (!strcmp(errors, "xmlcharrefreplace"))
4896                     known_errorHandler = 4;
4897                 else
4898                     known_errorHandler = 0;
4899             }
4900             switch (known_errorHandler) {
4901                 case 1: /* strict */
4902                     raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4903                     goto onError;
4904                 case 2: /* replace */
4905                     /* No need to check for space, this is a 1:1 replacement */
4906                     for (coll = collstart; coll<collend; ++coll)
4907                         *str++ = '?';
4908                     /* fall through */
4909                 case 3: /* ignore */
4910                     p = collend;
4911                     break;
4912                 case 4: /* xmlcharrefreplace */
4913                     /* generate replacement (temporarily (mis)uses p) */
4914                     for (p = collstart; p < collend; ++p) {
4915                         char buffer[2+29+1+1];
4916                         char *cp;
4917                         sprintf(buffer, "&#%d;", (int)*p);
4918                         if (charmaptranslate_makespace(&res, &str,
4919                             (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4920                             goto onError;
4921                         for (cp = buffer; *cp; ++cp)
4922                             *str++ = *cp;
4923                     }
4924                     p = collend;
4925                     break;
4926                 default:
4927                     repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4928                         reason, startp, size, &exc,
4929                         collstart-startp, collend-startp, &newpos);
4930                     if (repunicode == NULL)
4931                         goto onError;
4932                     /* generate replacement  */
4933                     repsize = PyUnicode_GET_SIZE(repunicode);
4934                     if (charmaptranslate_makespace(&res, &str,
4935                         (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4936                         Py_DECREF(repunicode);
4937                         goto onError;
4938                     }
4939                     for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4940                         *str++ = *uni2;
4941                     p = startp + newpos;
4942                     Py_DECREF(repunicode);
4943             }
4944         }
4945     }
4946     /* Resize if we allocated to much */
4947     respos = str-PyUnicode_AS_UNICODE(res);
4948     if (respos<PyUnicode_GET_SIZE(res)) {
4949         if (_PyUnicode_Resize(&res, respos) < 0)
4950             goto onError;
4951     }
4952     Py_XDECREF(exc);
4953     Py_XDECREF(errorHandler);
4954     return res;
4955
4956     onError:
4957     Py_XDECREF(res);
4958     Py_XDECREF(exc);
4959     Py_XDECREF(errorHandler);
4960     return NULL;
4961 }
4962
4963 PyObject *PyUnicode_Translate(PyObject *str,
4964                               PyObject *mapping,
4965                               const char *errors)
4966 {
4967     PyObject *result;
4968
4969     str = PyUnicode_FromObject(str);
4970     if (str == NULL)
4971         goto onError;
4972     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4973                                         PyUnicode_GET_SIZE(str),
4974                                         mapping,
4975                                         errors);
4976     Py_DECREF(str);
4977     return result;
4978
4979  onError:
4980     Py_XDECREF(str);
4981     return NULL;
4982 }
4983
4984 /* --- Decimal Encoder ---------------------------------------------------- */
4985
4986 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4987                             Py_ssize_t length,
4988                             char *output,
4989                             const char *errors)
4990 {
4991     Py_UNICODE *p, *end;
4992     PyObject *errorHandler = NULL;
4993     PyObject *exc = NULL;
4994     const char *encoding = "decimal";
4995     const char *reason = "invalid decimal Unicode string";
4996     /* the following variable is used for caching string comparisons
4997      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4998     int known_errorHandler = -1;
4999
5000     if (output == NULL) {
5001         PyErr_BadArgument();
5002         return -1;
5003     }
5004
5005     p = s;
5006     end = s + length;
5007     while (p < end) {
5008         register Py_UNICODE ch = *p;
5009         int decimal;
5010         PyObject *repunicode;
5011         Py_ssize_t repsize;
5012         Py_ssize_t newpos;
5013         Py_UNICODE *uni2;
5014         Py_UNICODE *collstart;
5015         Py_UNICODE *collend;
5016
5017         if (Py_UNICODE_ISSPACE(ch)) {
5018             *output++ = ' ';
5019             ++p;
5020             continue;
5021         }
5022         decimal = Py_UNICODE_TODECIMAL(ch);
5023         if (decimal >= 0) {
5024             *output++ = '0' + decimal;
5025             ++p;
5026             continue;
5027         }
5028         if (0 < ch && ch < 256) {
5029             *output++ = (char)ch;
5030             ++p;
5031             continue;
5032         }
5033         /* All other characters are considered unencodable */
5034         collstart = p;
5035         collend = p+1;
5036         while (collend < end) {
5037             if ((0 < *collend && *collend < 256) ||
5038                 !Py_UNICODE_ISSPACE(*collend) ||
5039                 Py_UNICODE_TODECIMAL(*collend))
5040                 break;
5041         }
5042         /* cache callback name lookup
5043          * (if not done yet, i.e. it's the first error) */
5044         if (known_errorHandler==-1) {
5045             if ((errors==NULL) || (!strcmp(errors, "strict")))
5046                 known_errorHandler = 1;
5047             else if (!strcmp(errors, "replace"))
5048                 known_errorHandler = 2;
5049             else if (!strcmp(errors, "ignore"))
5050                 known_errorHandler = 3;
5051             else if (!strcmp(errors, "xmlcharrefreplace"))
5052                 known_errorHandler = 4;
5053             else
5054                 known_errorHandler = 0;
5055         }
5056         switch (known_errorHandler) {
5057             case 1: /* strict */
5058                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5059                 goto onError;
5060             case 2: /* replace */
5061                 for (p = collstart; p < collend; ++p)
5062                     *output++ = '?';
5063                 /* fall through */
5064             case 3: /* ignore */
5065                 p = collend;
5066                 break;
5067             case 4: /* xmlcharrefreplace */
5068                 /* generate replacement (temporarily (mis)uses p) */
5069                 for (p = collstart; p < collend; ++p)
5070                     output += sprintf(output, "&#%d;", (int)*p);
5071                 p = collend;
5072                 break;
5073             default:
5074                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5075                     encoding, reason, s, length, &exc,
5076                     collstart-s, collend-s, &newpos);
5077                 if (repunicode == NULL)
5078                     goto onError;
5079                 /* generate replacement  */
5080                 repsize = PyUnicode_GET_SIZE(repunicode);
5081                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5082                     Py_UNICODE ch = *uni2;
5083                     if (Py_UNICODE_ISSPACE(ch))
5084                         *output++ = ' ';
5085                     else {
5086                         decimal = Py_UNICODE_TODECIMAL(ch);
5087                         if (decimal >= 0)
5088                             *output++ = '0' + decimal;
5089                         else if (0 < ch && ch < 256)
5090                             *output++ = (char)ch;
5091                         else {
5092                             Py_DECREF(repunicode);
5093                             raise_encode_exception(&exc, encoding,
5094                                 s, length, collstart-s, collend-s, reason);
5095                             goto onError;
5096                         }
5097                     }
5098                 }
5099                 p = s + newpos;
5100                 Py_DECREF(repunicode);
5101         }
5102     }
5103     /* 0-terminate the output string */
5104     *output++ = '\0';
5105     Py_XDECREF(exc);
5106     Py_XDECREF(errorHandler);
5107     return 0;
5108
5109  onError:
5110     Py_XDECREF(exc);
5111     Py_XDECREF(errorHandler);
5112     return -1;
5113 }
5114
5115 /* --- Helpers ------------------------------------------------------------ */
5116
5117 #include "stringlib/unicodedefs.h"
5118
5119 #define FROM_UNICODE
5120
5121 #include "stringlib/fastsearch.h"
5122
5123 #include "stringlib/count.h"
5124 #include "stringlib/find.h"
5125 #include "stringlib/partition.h"
5126
/* helper macro to fix up start/end slice values against (obj)->length.
   It operates on variables named start and end that must be in scope at
   the point of use: a negative value is taken relative to the length and
   then clamped to 0, and end is additionally capped at the length.
   The expansion is a sequence of plain statements, so it must not be used
   as the body of an unbraced if/else. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5139
5140 Py_ssize_t PyUnicode_Count(PyObject *str,
5141                            PyObject *substr,
5142                            Py_ssize_t start,
5143                            Py_ssize_t end)
5144 {
5145     Py_ssize_t result;
5146     PyUnicodeObject* str_obj;
5147     PyUnicodeObject* sub_obj;
5148
5149     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5150     if (!str_obj)
5151         return -1;
5152     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5153     if (!sub_obj) {
5154         Py_DECREF(str_obj);
5155         return -1;
5156     }
5157
5158     FIX_START_END(str_obj);
5159
5160     result = stringlib_count(
5161         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5162         );
5163
5164     Py_DECREF(sub_obj);
5165     Py_DECREF(str_obj);
5166
5167     return result;
5168 }
5169
5170 Py_ssize_t PyUnicode_Find(PyObject *str,
5171                           PyObject *sub,
5172                           Py_ssize_t start,
5173                           Py_ssize_t end,
5174                           int direction)
5175 {
5176     Py_ssize_t result;
5177
5178     str = PyUnicode_FromObject(str);
5179     if (!str)
5180         return -2;
5181     sub = PyUnicode_FromObject(sub);
5182     if (!sub) {
5183         Py_DECREF(str);
5184         return -2;
5185     }
5186
5187     if (direction > 0)
5188         result = stringlib_find_slice(
5189             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5190             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5191             start, end
5192             );
5193     else
5194         result = stringlib_rfind_slice(
5195             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5196             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5197             start, end
5198             );
5199
5200     Py_DECREF(str);
5201     Py_DECREF(sub);
5202
5203     return result;
5204 }
5205
5206 static
5207 int tailmatch(PyUnicodeObject *self,
5208               PyUnicodeObject *substring,
5209               Py_ssize_t start,
5210               Py_ssize_t end,
5211               int direction)
5212 {
5213     if (substring->length == 0)
5214         return 1;
5215
5216     FIX_START_END(self);
5217
5218     end -= substring->length;
5219     if (end < start)
5220         return 0;
5221
5222     if (direction > 0) {
5223         if (Py_UNICODE_MATCH(self, end, substring))
5224             return 1;
5225     } else {
5226         if (Py_UNICODE_MATCH(self, start, substring))
5227             return 1;
5228     }
5229
5230     return 0;
5231 }
5232
5233 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5234                         PyObject *substr,
5235                         Py_ssize_t start,
5236                         Py_ssize_t end,
5237                         int direction)
5238 {
5239     Py_ssize_t result;
5240
5241     str = PyUnicode_FromObject(str);
5242     if (str == NULL)
5243         return -1;
5244     substr = PyUnicode_FromObject(substr);
5245     if (substr == NULL) {
5246         Py_DECREF(str);
5247         return -1;
5248     }
5249
5250     result = tailmatch((PyUnicodeObject *)str,
5251                        (PyUnicodeObject *)substr,
5252                        start, end, direction);
5253     Py_DECREF(str);
5254     Py_DECREF(substr);
5255     return result;
5256 }
5257
5258 /* Apply fixfct filter to the Unicode object self and return a
5259    reference to the modified object */
5260
5261 static
5262 PyObject *fixup(PyUnicodeObject *self,
5263                 int (*fixfct)(PyUnicodeObject *s))
5264 {
5265
5266     PyUnicodeObject *u;
5267
5268     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5269     if (u == NULL)
5270         return NULL;
5271
5272     Py_UNICODE_COPY(u->str, self->str, self->length);
5273
5274     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5275         /* fixfct should return TRUE if it modified the buffer. If
5276            FALSE, return a reference to the original buffer instead
5277            (to save space, not time) */
5278         Py_INCREF(self);
5279         Py_DECREF(u);
5280         return (PyObject*) self;
5281     }
5282     return (PyObject*) u;
5283 }
5284
5285 static
5286 int fixupper(PyUnicodeObject *self)
5287 {
5288     Py_ssize_t len = self->length;
5289     Py_UNICODE *s = self->str;
5290     int status = 0;
5291
5292     while (len-- > 0) {
5293         register Py_UNICODE ch;
5294
5295         ch = Py_UNICODE_TOUPPER(*s);
5296         if (ch != *s) {
5297             status = 1;
5298             *s = ch;
5299         }
5300         s++;
5301     }
5302
5303     return status;
5304 }
5305
5306 static
5307 int fixlower(PyUnicodeObject *self)
5308 {
5309     Py_ssize_t len = self->length;
5310     Py_UNICODE *s = self->str;
5311     int status = 0;
5312
5313     while (len-- > 0) {
5314         register Py_UNICODE ch;
5315
5316         ch = Py_UNICODE_TOLOWER(*s);
5317         if (ch != *s) {
5318             status = 1;
5319             *s = ch;
5320         }
5321         s++;
5322     }
5323
5324     return status;
5325 }
5326
5327 static
5328 int fixswapcase(PyUnicodeObject *self)
5329 {
5330     Py_ssize_t len = self->length;
5331     Py_UNICODE *s = self->str;
5332     int status = 0;
5333
5334     while (len-- > 0) {
5335         if (Py_UNICODE_ISUPPER(*s)) {
5336             *s = Py_UNICODE_TOLOWER(*s);
5337             status = 1;
5338         } else if (Py_UNICODE_ISLOWER(*s)) {
5339             *s = Py_UNICODE_TOUPPER(*s);
5340             status = 1;
5341         }
5342         s++;
5343     }
5344
5345     return status;
5346 }
5347
5348 static
5349 int fixcapitalize(PyUnicodeObject *self)
5350 {
5351     Py_ssize_t len = self->length;
5352     Py_UNICODE *s = self->str;
5353     int status = 0;
5354
5355     if (len == 0)
5356         return 0;
5357     if (Py_UNICODE_ISLOWER(*s)) {
5358         *s = Py_UNICODE_TOUPPER(*s);
5359         status = 1;
5360     }
5361     s++;
5362     while (--len > 0) {
5363         if (Py_UNICODE_ISUPPER(*s)) {
5364             *s = Py_UNICODE_TOLOWER(*s);
5365             status = 1;
5366         }
5367         s++;
5368     }
5369     return status;
5370 }
5371
5372 static
5373 int fixtitle(PyUnicodeObject *self)
5374 {
5375     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5376     register Py_UNICODE *e;
5377     int previous_is_cased;
5378
5379     /* Shortcut for single character strings */
5380     if (PyUnicode_GET_SIZE(self) == 1) {
5381         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5382         if (*p != ch) {
5383             *p = ch;
5384             return 1;
5385         }
5386         else
5387             return 0;
5388     }
5389
5390     e = p + PyUnicode_GET_SIZE(self);
5391     previous_is_cased = 0;
5392     for (; p < e; p++) {
5393         register const Py_UNICODE ch = *p;
5394
5395         if (previous_is_cased)
5396             *p = Py_UNICODE_TOLOWER(ch);
5397         else
5398             *p = Py_UNICODE_TOTITLE(ch);
5399
5400         if (Py_UNICODE_ISLOWER(ch) ||
5401             Py_UNICODE_ISUPPER(ch) ||
5402             Py_UNICODE_ISTITLE(ch))
5403             previous_is_cased = 1;
5404         else
5405             previous_is_cased = 0;
5406     }
5407     return 1;
5408 }
5409
5410 PyObject *
5411 PyUnicode_Join(PyObject *separator, PyObject *seq)
5412 {
5413     PyObject *internal_separator = NULL;
5414     const Py_UNICODE blank = ' ';
5415     const Py_UNICODE *sep = &blank;
5416     Py_ssize_t seplen = 1;
5417     PyUnicodeObject *res = NULL; /* the result */
5418     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5419     Py_ssize_t res_used;         /* # used bytes */
5420     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5421     PyObject *fseq;          /* PySequence_Fast(seq) */
5422     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5423     PyObject *item;
5424     Py_ssize_t i;
5425
5426     fseq = PySequence_Fast(seq, "");
5427     if (fseq == NULL) {
5428         return NULL;
5429     }
5430
5431     /* Grrrr.  A codec may be invoked to convert str objects to
5432      * Unicode, and so it's possible to call back into Python code
5433      * during PyUnicode_FromObject(), and so it's possible for a sick
5434      * codec to change the size of fseq (if seq is a list).  Therefore
5435      * we have to keep refetching the size -- can't assume seqlen
5436      * is invariant.
5437      */
5438     seqlen = PySequence_Fast_GET_SIZE(fseq);
5439     /* If empty sequence, return u"". */
5440     if (seqlen == 0) {
5441         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5442         goto Done;
5443     }
5444     /* If singleton sequence with an exact Unicode, return that. */
5445     if (seqlen == 1) {
5446         item = PySequence_Fast_GET_ITEM(fseq, 0);
5447         if (PyUnicode_CheckExact(item)) {
5448             Py_INCREF(item);
5449             res = (PyUnicodeObject *)item;
5450             goto Done;
5451         }
5452     }
5453
5454     /* At least two items to join, or one that isn't exact Unicode. */
5455     if (seqlen > 1) {
5456         /* Set up sep and seplen -- they're needed. */
5457         if (separator == NULL) {
5458             sep = &blank;
5459             seplen = 1;
5460         }
5461         else {
5462             internal_separator = PyUnicode_FromObject(separator);
5463             if (internal_separator == NULL)
5464                 goto onError;
5465             sep = PyUnicode_AS_UNICODE(internal_separator);
5466             seplen = PyUnicode_GET_SIZE(internal_separator);
5467             /* In case PyUnicode_FromObject() mutated seq. */
5468             seqlen = PySequence_Fast_GET_SIZE(fseq);
5469         }
5470     }
5471
5472     /* Get space. */
5473     res = _PyUnicode_New(res_alloc);
5474     if (res == NULL)
5475         goto onError;
5476     res_p = PyUnicode_AS_UNICODE(res);
5477     res_used = 0;
5478
5479     for (i = 0; i < seqlen; ++i) {
5480         Py_ssize_t itemlen;
5481         Py_ssize_t new_res_used;
5482
5483         item = PySequence_Fast_GET_ITEM(fseq, i);
5484         /* Convert item to Unicode. */
5485         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5486             PyErr_Format(PyExc_TypeError,
5487                          "sequence item %zd: expected string or Unicode,"
5488                          " %.80s found",
5489                          i, Py_TYPE(item)->tp_name);
5490             goto onError;
5491         }
5492         item = PyUnicode_FromObject(item);
5493         if (item == NULL)
5494             goto onError;
5495         /* We own a reference to item from here on. */
5496
5497         /* In case PyUnicode_FromObject() mutated seq. */
5498         seqlen = PySequence_Fast_GET_SIZE(fseq);
5499
5500         /* Make sure we have enough space for the separator and the item. */
5501         itemlen = PyUnicode_GET_SIZE(item);
5502         new_res_used = res_used + itemlen;
5503         if (new_res_used < 0)
5504             goto Overflow;
5505         if (i < seqlen - 1) {
5506             new_res_used += seplen;
5507             if (new_res_used < 0)
5508                 goto Overflow;
5509         }
5510         if (new_res_used > res_alloc) {
5511             /* double allocated size until it's big enough */
5512             do {
5513                 res_alloc += res_alloc;
5514                 if (res_alloc <= 0)
5515                     goto Overflow;
5516             } while (new_res_used > res_alloc);
5517             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5518                 Py_DECREF(item);
5519                 goto onError;
5520             }
5521             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5522         }
5523
5524         /* Copy item, and maybe the separator. */
5525         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5526         res_p += itemlen;
5527         if (i < seqlen - 1) {
5528             Py_UNICODE_COPY(res_p, sep, seplen);
5529             res_p += seplen;
5530         }
5531         Py_DECREF(item);
5532         res_used = new_res_used;
5533     }
5534
5535     /* Shrink res to match the used area; this probably can't fail,
5536      * but it's cheap to check.
5537      */
5538     if (_PyUnicode_Resize(&res, res_used) < 0)
5539         goto onError;
5540
5541  Done:
5542     Py_XDECREF(internal_separator);
5543     Py_DECREF(fseq);
5544     return (PyObject *)res;
5545
5546  Overflow:
5547     PyErr_SetString(PyExc_OverflowError,
5548                     "join() result is too long for a Python string");
5549     Py_DECREF(item);
5550     /* fall through */
5551
5552  onError:
5553     Py_XDECREF(internal_separator);
5554     Py_DECREF(fseq);
5555     Py_XDECREF(res);
5556     return NULL;
5557 }
5558
5559 static
5560 PyUnicodeObject *pad(PyUnicodeObject *self,
5561                      Py_ssize_t left,
5562                      Py_ssize_t right,
5563                      Py_UNICODE fill)
5564 {
5565     PyUnicodeObject *u;
5566
5567     if (left < 0)
5568         left = 0;
5569     if (right < 0)
5570         right = 0;
5571
5572     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5573         Py_INCREF(self);
5574         return self;
5575     }
5576
5577     u = _PyUnicode_New(left + self->length + right);
5578     if (u) {
5579         if (left)
5580             Py_UNICODE_FILL(u->str, fill, left);
5581         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5582         if (right)
5583             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5584     }
5585
5586     return u;
5587 }
5588
/* Helper for the split functions: build a Unicode object from
 * data[left:right] and append it to the list.
 *
 * Relies on names in the expanding function: a local `str` (scratch
 * object pointer), a local `list` (the result list) and a label
 * `onError`, which is jumped to if either the object creation or the
 * append fails.  The temporary reference held in `str` is released in
 * both the success and the failure case.
 *
 * Wrapped in do { } while (0) so that the expansion is one statement
 * and cannot capture a following `else`.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5599
5600 static
5601 PyObject *split_whitespace(PyUnicodeObject *self,
5602                            PyObject *list,
5603                            Py_ssize_t maxcount)
5604 {
5605     register Py_ssize_t i;
5606     register Py_ssize_t j;
5607     Py_ssize_t len = self->length;
5608     PyObject *str;
5609     register const Py_UNICODE *buf = self->str;
5610
5611     for (i = j = 0; i < len; ) {
5612         /* find a token */
5613         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5614             i++;
5615         j = i;
5616         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5617             i++;
5618         if (j < i) {
5619             if (maxcount-- <= 0)
5620                 break;
5621             SPLIT_APPEND(buf, j, i);
5622             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5623                 i++;
5624             j = i;
5625         }
5626     }
5627     if (j < len) {
5628         SPLIT_APPEND(buf, j, len);
5629     }
5630     return list;
5631
5632  onError:
5633     Py_DECREF(list);
5634     return NULL;
5635 }
5636
5637 PyObject *PyUnicode_Splitlines(PyObject *string,
5638                                int keepends)
5639 {
5640     register Py_ssize_t i;
5641     register Py_ssize_t j;
5642     Py_ssize_t len;
5643     PyObject *list;
5644     PyObject *str;
5645     Py_UNICODE *data;
5646
5647     string = PyUnicode_FromObject(string);
5648     if (string == NULL)
5649         return NULL;
5650     data = PyUnicode_AS_UNICODE(string);
5651     len = PyUnicode_GET_SIZE(string);
5652
5653     list = PyList_New(0);
5654     if (!list)
5655         goto onError;
5656
5657     for (i = j = 0; i < len; ) {
5658         Py_ssize_t eol;
5659
5660         /* Find a line and append it */
5661         while (i < len && !BLOOM_LINEBREAK(data[i]))
5662             i++;
5663
5664         /* Skip the line break reading CRLF as one line break */
5665         eol = i;
5666         if (i < len) {
5667             if (data[i] == '\r' && i + 1 < len &&
5668                 data[i+1] == '\n')
5669                 i += 2;
5670             else
5671                 i++;
5672             if (keepends)
5673                 eol = i;
5674         }
5675         SPLIT_APPEND(data, j, eol);
5676         j = i;
5677     }
5678     if (j < len) {
5679         SPLIT_APPEND(data, j, len);
5680     }
5681
5682     Py_DECREF(string);
5683     return list;
5684
5685  onError:
5686     Py_XDECREF(list);
5687     Py_DECREF(string);
5688     return NULL;
5689 }
5690
5691 static
5692 PyObject *split_char(PyUnicodeObject *self,
5693                      PyObject *list,
5694                      Py_UNICODE ch,
5695                      Py_ssize_t maxcount)
5696 {
5697     register Py_ssize_t i;
5698     register Py_ssize_t j;
5699     Py_ssize_t len = self->length;
5700     PyObject *str;
5701     register const Py_UNICODE *buf = self->str;
5702
5703     for (i = j = 0; i < len; ) {
5704         if (buf[i] == ch) {
5705             if (maxcount-- <= 0)
5706                 break;
5707             SPLIT_APPEND(buf, j, i);
5708             i = j = i + 1;
5709         } else
5710             i++;
5711     }
5712     if (j <= len) {
5713         SPLIT_APPEND(buf, j, len);
5714     }
5715     return list;
5716
5717  onError:
5718     Py_DECREF(list);
5719     return NULL;
5720 }
5721
5722 static
5723 PyObject *split_substring(PyUnicodeObject *self,
5724                           PyObject *list,
5725                           PyUnicodeObject *substring,
5726                           Py_ssize_t maxcount)
5727 {
5728     register Py_ssize_t i;
5729     register Py_ssize_t j;
5730     Py_ssize_t len = self->length;
5731     Py_ssize_t sublen = substring->length;
5732     PyObject *str;
5733
5734     for (i = j = 0; i <= len - sublen; ) {
5735         if (Py_UNICODE_MATCH(self, i, substring)) {
5736             if (maxcount-- <= 0)
5737                 break;
5738             SPLIT_APPEND(self->str, j, i);
5739             i = j = i + sublen;
5740         } else
5741             i++;
5742     }
5743     if (j <= len) {
5744         SPLIT_APPEND(self->str, j, len);
5745     }
5746     return list;
5747
5748  onError:
5749     Py_DECREF(list);
5750     return NULL;
5751 }
5752
5753 static
5754 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5755                             PyObject *list,
5756                             Py_ssize_t maxcount)
5757 {
5758     register Py_ssize_t i;
5759     register Py_ssize_t j;
5760     Py_ssize_t len = self->length;
5761     PyObject *str;
5762     register const Py_UNICODE *buf = self->str;
5763
5764     for (i = j = len - 1; i >= 0; ) {
5765         /* find a token */
5766         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5767             i--;
5768         j = i;
5769         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5770             i--;
5771         if (j > i) {
5772             if (maxcount-- <= 0)
5773                 break;
5774             SPLIT_APPEND(buf, i + 1, j + 1);
5775             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5776                 i--;
5777             j = i;
5778         }
5779     }
5780     if (j >= 0) {
5781         SPLIT_APPEND(buf, 0, j + 1);
5782     }
5783     if (PyList_Reverse(list) < 0)
5784         goto onError;
5785     return list;
5786
5787  onError:
5788     Py_DECREF(list);
5789     return NULL;
5790 }
5791
5792 static
5793 PyObject *rsplit_char(PyUnicodeObject *self,
5794                       PyObject *list,
5795                       Py_UNICODE ch,
5796                       Py_ssize_t maxcount)
5797 {
5798     register Py_ssize_t i;
5799     register Py_ssize_t j;
5800     Py_ssize_t len = self->length;
5801     PyObject *str;
5802     register const Py_UNICODE *buf = self->str;
5803
5804     for (i = j = len - 1; i >= 0; ) {
5805         if (buf[i] == ch) {
5806             if (maxcount-- <= 0)
5807                 break;
5808             SPLIT_APPEND(buf, i + 1, j + 1);
5809             j = i = i - 1;
5810         } else
5811             i--;
5812     }
5813     if (j >= -1) {
5814         SPLIT_APPEND(buf, 0, j + 1);
5815     }
5816     if (PyList_Reverse(list) < 0)
5817         goto onError;
5818     return list;
5819
5820  onError:
5821     Py_DECREF(list);
5822     return NULL;
5823 }
5824
5825 static
5826 PyObject *rsplit_substring(PyUnicodeObject *self,
5827                            PyObject *list,
5828                            PyUnicodeObject *substring,
5829                            Py_ssize_t maxcount)
5830 {
5831     register Py_ssize_t i;
5832     register Py_ssize_t j;
5833     Py_ssize_t len = self->length;
5834     Py_ssize_t sublen = substring->length;
5835     PyObject *str;
5836
5837     for (i = len - sublen, j = len; i >= 0; ) {
5838         if (Py_UNICODE_MATCH(self, i, substring)) {
5839             if (maxcount-- <= 0)
5840                 break;
5841             SPLIT_APPEND(self->str, i + sublen, j);
5842             j = i;
5843             i -= sublen;
5844         } else
5845             i--;
5846     }
5847     if (j >= 0) {
5848         SPLIT_APPEND(self->str, 0, j);
5849     }
5850     if (PyList_Reverse(list) < 0)
5851         goto onError;
5852     return list;
5853
5854  onError:
5855     Py_DECREF(list);
5856     return NULL;
5857 }
5858
5859 #undef SPLIT_APPEND
5860
5861 static
5862 PyObject *split(PyUnicodeObject *self,
5863                 PyUnicodeObject *substring,
5864                 Py_ssize_t maxcount)
5865 {
5866     PyObject *list;
5867
5868     if (maxcount < 0)
5869         maxcount = PY_SSIZE_T_MAX;
5870
5871     list = PyList_New(0);
5872     if (!list)
5873         return NULL;
5874
5875     if (substring == NULL)
5876         return split_whitespace(self,list,maxcount);
5877
5878     else if (substring->length == 1)
5879         return split_char(self,list,substring->str[0],maxcount);
5880
5881     else if (substring->length == 0) {
5882         Py_DECREF(list);
5883         PyErr_SetString(PyExc_ValueError, "empty separator");
5884         return NULL;
5885     }
5886     else
5887         return split_substring(self,list,substring,maxcount);
5888 }
5889
5890 static
5891 PyObject *rsplit(PyUnicodeObject *self,
5892                  PyUnicodeObject *substring,
5893                  Py_ssize_t maxcount)
5894 {
5895     PyObject *list;
5896
5897     if (maxcount < 0)
5898         maxcount = PY_SSIZE_T_MAX;
5899
5900     list = PyList_New(0);
5901     if (!list)
5902         return NULL;
5903
5904     if (substring == NULL)
5905         return rsplit_whitespace(self,list,maxcount);
5906
5907     else if (substring->length == 1)
5908         return rsplit_char(self,list,substring->str[0],maxcount);
5909
5910     else if (substring->length == 0) {
5911         Py_DECREF(list);
5912         PyErr_SetString(PyExc_ValueError, "empty separator");
5913         return NULL;
5914     }
5915     else
5916         return rsplit_substring(self,list,substring,maxcount);
5917 }
5918
5919 static
5920 PyObject *replace(PyUnicodeObject *self,
5921                   PyUnicodeObject *str1,
5922                   PyUnicodeObject *str2,
5923                   Py_ssize_t maxcount)
5924 {
5925     PyUnicodeObject *u;
5926
5927     if (maxcount < 0)
5928         maxcount = PY_SSIZE_T_MAX;
5929
5930     if (str1->length == str2->length) {
5931         /* same length */
5932         Py_ssize_t i;
5933         if (str1->length == 1) {
5934             /* replace characters */
5935             Py_UNICODE u1, u2;
5936             if (!findchar(self->str, self->length, str1->str[0]))
5937                 goto nothing;
5938             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5939             if (!u)
5940                 return NULL;
5941             Py_UNICODE_COPY(u->str, self->str, self->length);
5942             u1 = str1->str[0];
5943             u2 = str2->str[0];
5944             for (i = 0; i < u->length; i++)
5945                 if (u->str[i] == u1) {
5946                     if (--maxcount < 0)
5947                         break;
5948                     u->str[i] = u2;
5949                 }
5950         } else {
5951             i = fastsearch(
5952                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5953                 );
5954             if (i < 0)
5955                 goto nothing;
5956             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5957             if (!u)
5958                 return NULL;
5959             Py_UNICODE_COPY(u->str, self->str, self->length);
5960             while (i <= self->length - str1->length)
5961                 if (Py_UNICODE_MATCH(self, i, str1)) {
5962                     if (--maxcount < 0)
5963                         break;
5964                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5965                     i += str1->length;
5966                 } else
5967                     i++;
5968         }
5969     } else {
5970
5971         Py_ssize_t n, i, j, e;
5972         Py_ssize_t product, new_size, delta;
5973         Py_UNICODE *p;
5974
5975         /* replace strings */
5976         n = stringlib_count(self->str, self->length, str1->str, str1->length);
5977         if (n > maxcount)
5978             n = maxcount;
5979         if (n == 0)
5980             goto nothing;
5981         /* new_size = self->length + n * (str2->length - str1->length)); */
5982         delta = (str2->length - str1->length);
5983         if (delta == 0) {
5984             new_size = self->length;
5985         } else {
5986             product = n * (str2->length - str1->length);
5987             if ((product / (str2->length - str1->length)) != n) {
5988                 PyErr_SetString(PyExc_OverflowError,
5989                                 "replace string is too long");
5990                 return NULL;
5991             }
5992             new_size = self->length + product;
5993             if (new_size < 0) {
5994                 PyErr_SetString(PyExc_OverflowError,
5995                                 "replace string is too long");
5996                 return NULL;
5997             }
5998         }
5999         u = _PyUnicode_New(new_size);
6000         if (!u)
6001             return NULL;
6002         i = 0;
6003         p = u->str;
6004         e = self->length - str1->length;
6005         if (str1->length > 0) {
6006             while (n-- > 0) {
6007                 /* look for next match */
6008                 j = i;
6009                 while (j <= e) {
6010                     if (Py_UNICODE_MATCH(self, j, str1))
6011                         break;
6012                     j++;
6013                 }
6014                 if (j > i) {
6015                     if (j > e)
6016                         break;
6017                     /* copy unchanged part [i:j] */
6018                     Py_UNICODE_COPY(p, self->str+i, j-i);
6019                     p += j - i;
6020                 }
6021                 /* copy substitution string */
6022                 if (str2->length > 0) {
6023                     Py_UNICODE_COPY(p, str2->str, str2->length);
6024                     p += str2->length;
6025                 }
6026                 i = j + str1->length;
6027             }
6028             if (i < self->length)
6029                 /* copy tail [i:] */
6030                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6031         } else {
6032             /* interleave */
6033             while (n > 0) {
6034                 Py_UNICODE_COPY(p, str2->str, str2->length);
6035                 p += str2->length;
6036                 if (--n <= 0)
6037                     break;
6038                 *p++ = self->str[i++];
6039             }
6040             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6041         }
6042     }
6043     return (PyObject *) u;
6044
6045 nothing:
6046     /* nothing to replace; return original string (when possible) */
6047     if (PyUnicode_CheckExact(self)) {
6048         Py_INCREF(self);
6049         return (PyObject *) self;
6050     }
6051     return PyUnicode_FromUnicode(self->str, self->length);
6052 }
6053
6054 /* --- Unicode Object Methods --------------------------------------------- */
6055
6056 PyDoc_STRVAR(title__doc__,
6057 "S.title() -> unicode\n\
6058 \n\
6059 Return a titlecased version of S, i.e. words start with title case\n\
6060 characters, all remaining cased characters have lower case.");
6061
6062 static PyObject*
6063 unicode_title(PyUnicodeObject *self)
6064 {
6065     return fixup(self, fixtitle);
6066 }
6067
6068 PyDoc_STRVAR(capitalize__doc__,
6069 "S.capitalize() -> unicode\n\
6070 \n\
6071 Return a capitalized version of S, i.e. make the first character\n\
6072 have upper case.");
6073
6074 static PyObject*
6075 unicode_capitalize(PyUnicodeObject *self)
6076 {
6077     return fixup(self, fixcapitalize);
6078 }
6079
#if 0
/* Disabled: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word with fixcapitalize
   and join the words again with the default separator (separator NULL
   makes PyUnicode_Join use a single blank).  Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the old word explicitly before storing its replacement */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Shared exit: when reached through the goto above, item is NULL,
       so NULL is returned. */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6117
6118 /* Argument converter.  Coerces to a single unicode character */
6119
6120 static int
6121 convert_uc(PyObject *obj, void *addr)
6122 {
6123         Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6124         PyObject *uniobj;
6125         Py_UNICODE *unistr;
6126
6127         uniobj = PyUnicode_FromObject(obj);
6128         if (uniobj == NULL) {
6129                 PyErr_SetString(PyExc_TypeError,
6130                         "The fill character cannot be converted to Unicode");
6131                 return 0;
6132         }
6133         if (PyUnicode_GET_SIZE(uniobj) != 1) {
6134                 PyErr_SetString(PyExc_TypeError,
6135                         "The fill character must be exactly one character long");
6136                 Py_DECREF(uniobj);
6137                 return 0;
6138         }
6139         unistr = PyUnicode_AS_UNICODE(uniobj);
6140         *fillcharloc = unistr[0];
6141         Py_DECREF(uniobj);
6142         return 1;
6143 }
6144
6145 PyDoc_STRVAR(center__doc__,
6146 "S.center(width[, fillchar]) -> unicode\n\
6147 \n\
6148 Return S centered in a Unicode string of length width. Padding is\n\
6149 done using the specified fill character (default is a space)");
6150
6151 static PyObject *
6152 unicode_center(PyUnicodeObject *self, PyObject *args)
6153 {
6154     Py_ssize_t marg, left;
6155     Py_ssize_t width;
6156     Py_UNICODE fillchar = ' ';
6157
6158     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6159         return NULL;
6160
6161     if (self->length >= width && PyUnicode_CheckExact(self)) {
6162         Py_INCREF(self);
6163         return (PyObject*) self;
6164     }
6165
6166     marg = width - self->length;
6167     left = marg / 2 + (marg & width & 1);
6168
6169     return (PyObject*) pad(self, left, marg - left, fillchar);
6170 }
6171
6172 #if 0
6173
/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
   basis (this is what Java does and thus JPython too). */

/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Offsets added to a code unit, indexed by the unit's value >> 11.
   Entry 27 (units 0xD800-0xDFFF) is +0x2000 and entries 28-31 (units
   0xE000-0xFFFF) are -0x800; every other entry is 0.
   NOTE(review): the 32-entry table presumes 16-bit code units -- a
   wider Py_UNICODE would index past its end; confirm before enabling. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled variant (this is the #if 0 branch): compare str1 and str2
   unit by unit after applying utf16Fixup.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 is 0xD000; only units above it are adjusted */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6221
6222 #else
6223
6224 static int
6225 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6226 {
6227     register Py_ssize_t len1, len2;
6228
6229     Py_UNICODE *s1 = str1->str;
6230     Py_UNICODE *s2 = str2->str;
6231
6232     len1 = str1->length;
6233     len2 = str2->length;
6234
6235     while (len1 > 0 && len2 > 0) {
6236         Py_UNICODE c1, c2;
6237
6238         c1 = *s1++;
6239         c2 = *s2++;
6240
6241         if (c1 != c2)
6242             return (c1 < c2) ? -1 : 1;
6243
6244         len1--; len2--;
6245     }
6246
6247     return (len1 < len2) ? -1 : (len1 != len2);
6248 }
6249
6250 #endif
6251
6252 int PyUnicode_Compare(PyObject *left,
6253                       PyObject *right)
6254 {
6255     PyUnicodeObject *u = NULL, *v = NULL;
6256     int result;
6257
6258     /* Coerce the two arguments */
6259     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6260     if (u == NULL)
6261         goto onError;
6262     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6263     if (v == NULL)
6264         goto onError;
6265
6266     /* Shortcut for empty or interned objects */
6267     if (v == u) {
6268         Py_DECREF(u);
6269         Py_DECREF(v);
6270         return 0;
6271     }
6272
6273     result = unicode_compare(u, v);
6274
6275     Py_DECREF(u);
6276     Py_DECREF(v);
6277     return result;
6278
6279 onError:
6280     Py_XDECREF(u);
6281     Py_XDECREF(v);
6282     return -1;
6283 }
6284
6285 PyObject *PyUnicode_RichCompare(PyObject *left,
6286                                 PyObject *right,
6287                                 int op)
6288 {
6289     int result;
6290
6291     result = PyUnicode_Compare(left, right);
6292     if (result == -1 && PyErr_Occurred())
6293         goto onError;
6294
6295     /* Convert the return value to a Boolean */
6296     switch (op) {
6297     case Py_EQ:
6298         result = (result == 0);
6299         break;
6300     case Py_NE:
6301         result = (result != 0);
6302         break;
6303     case Py_LE:
6304         result = (result <= 0);
6305         break;
6306     case Py_GE:
6307         result = (result >= 0);
6308         break;
6309     case Py_LT:
6310         result = (result == -1);
6311         break;
6312     case Py_GT:
6313         result = (result == 1);
6314         break;
6315     }
6316     return PyBool_FromLong(result);
6317
6318  onError:
6319
6320     /* Standard case
6321
6322        Type errors mean that PyUnicode_FromObject() could not convert
6323        one of the arguments (usually the right hand side) to Unicode,
6324        ie. we can't handle the comparison request. However, it is
6325        possible that the other object knows a comparison method, which
6326        is why we return Py_NotImplemented to give the other object a
6327        chance.
6328
6329     */
6330     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6331         PyErr_Clear();
6332         Py_INCREF(Py_NotImplemented);
6333         return Py_NotImplemented;
6334     }
6335     if (op != Py_EQ && op != Py_NE)
6336         return NULL;
6337
6338     /* Equality comparison.
6339
6340        This is a special case: we silence any PyExc_UnicodeDecodeError
6341        and instead turn it into a PyErr_UnicodeWarning.
6342
6343     */
6344     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6345         return NULL;
6346     PyErr_Clear();
6347     if (PyErr_Warn(PyExc_UnicodeWarning,
6348                    (op == Py_EQ) ?
6349                    "Unicode equal comparison "
6350                    "failed to convert both arguments to Unicode - "
6351                    "interpreting them as being unequal" :
6352                    "Unicode unequal comparison "
6353                    "failed to convert both arguments to Unicode - "
6354                    "interpreting them as being unequal"
6355                    ) < 0)
6356         return NULL;
6357     result = (op == Py_NE);
6358     return PyBool_FromLong(result);
6359 }
6360
6361 int PyUnicode_Contains(PyObject *container,
6362                        PyObject *element)
6363 {
6364     PyObject *str, *sub;
6365     int result;
6366
6367     /* Coerce the two arguments */
6368     sub = PyUnicode_FromObject(element);
6369     if (!sub) {
6370         PyErr_SetString(PyExc_TypeError,
6371             "'in <string>' requires string as left operand");
6372         return -1;
6373     }
6374
6375     str = PyUnicode_FromObject(container);
6376     if (!str) {
6377         Py_DECREF(sub);
6378         return -1;
6379     }
6380
6381     result = stringlib_contains_obj(str, sub);
6382
6383     Py_DECREF(str);
6384     Py_DECREF(sub);
6385
6386     return result;
6387 }
6388
6389 /* Concat to string or Unicode object giving a new Unicode object. */
6390
6391 PyObject *PyUnicode_Concat(PyObject *left,
6392                            PyObject *right)
6393 {
6394     PyUnicodeObject *u = NULL, *v = NULL, *w;
6395
6396     /* Coerce the two arguments */
6397     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6398     if (u == NULL)
6399         goto onError;
6400     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6401     if (v == NULL)
6402         goto onError;
6403
6404     /* Shortcuts */
6405     if (v == unicode_empty) {
6406         Py_DECREF(v);
6407         return (PyObject *)u;
6408     }
6409     if (u == unicode_empty) {
6410         Py_DECREF(u);
6411         return (PyObject *)v;
6412     }
6413
6414     /* Concat the two Unicode strings */
6415     w = _PyUnicode_New(u->length + v->length);
6416     if (w == NULL)
6417         goto onError;
6418     Py_UNICODE_COPY(w->str, u->str, u->length);
6419     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6420
6421     Py_DECREF(u);
6422     Py_DECREF(v);
6423     return (PyObject *)w;
6424
6425 onError:
6426     Py_XDECREF(u);
6427     Py_XDECREF(v);
6428     return NULL;
6429 }
6430
6431 PyDoc_STRVAR(count__doc__,
6432 "S.count(sub[, start[, end]]) -> int\n\
6433 \n\
6434 Return the number of non-overlapping occurrences of substring sub in\n\
6435 Unicode string S[start:end].  Optional arguments start and end are\n\
6436 interpreted as in slice notation.");
6437
6438 static PyObject *
6439 unicode_count(PyUnicodeObject *self, PyObject *args)
6440 {
6441     PyUnicodeObject *substring;
6442     Py_ssize_t start = 0;
6443     Py_ssize_t end = PY_SSIZE_T_MAX;
6444     PyObject *result;
6445
6446     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6447                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6448         return NULL;
6449
6450     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6451         (PyObject *)substring);
6452     if (substring == NULL)
6453         return NULL;
6454
6455     FIX_START_END(self);
6456
6457     result = PyInt_FromSsize_t(
6458         stringlib_count(self->str + start, end - start,
6459                         substring->str, substring->length)
6460         );
6461
6462     Py_DECREF(substring);
6463
6464     return result;
6465 }
6466
6467 PyDoc_STRVAR(encode__doc__,
6468 "S.encode([encoding[,errors]]) -> string or unicode\n\
6469 \n\
6470 Encodes S using the codec registered for encoding. encoding defaults\n\
6471 to the default encoding. errors may be given to set a different error\n\
6472 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6473 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6474 'xmlcharrefreplace' as well as any other name registered with\n\
6475 codecs.register_error that can handle UnicodeEncodeErrors.");
6476
6477 static PyObject *
6478 unicode_encode(PyUnicodeObject *self, PyObject *args)
6479 {
6480     char *encoding = NULL;
6481     char *errors = NULL;
6482     PyObject *v;
6483
6484     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6485         return NULL;
6486     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6487     if (v == NULL)
6488         goto onError;
6489     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6490         PyErr_Format(PyExc_TypeError,
6491                      "encoder did not return a string/unicode object "
6492                      "(type=%.400s)",
6493                      Py_TYPE(v)->tp_name);
6494         Py_DECREF(v);
6495         return NULL;
6496     }
6497     return v;
6498
6499  onError:
6500     return NULL;
6501 }
6502
6503 PyDoc_STRVAR(decode__doc__,
6504 "S.decode([encoding[,errors]]) -> string or unicode\n\
6505 \n\
6506 Decodes S using the codec registered for encoding. encoding defaults\n\
6507 to the default encoding. errors may be given to set a different error\n\
6508 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6509 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6510 as well as any other name registerd with codecs.register_error that is\n\
6511 able to handle UnicodeDecodeErrors.");
6512
6513 static PyObject *
6514 unicode_decode(PyUnicodeObject *self, PyObject *args)
6515 {
6516     char *encoding = NULL;
6517     char *errors = NULL;
6518     PyObject *v;
6519
6520     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6521         return NULL;
6522     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6523     if (v == NULL)
6524         goto onError;
6525     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6526         PyErr_Format(PyExc_TypeError,
6527                      "decoder did not return a string/unicode object "
6528                      "(type=%.400s)",
6529                      Py_TYPE(v)->tp_name);
6530         Py_DECREF(v);
6531         return NULL;
6532     }
6533     return v;
6534
6535  onError:
6536     return NULL;
6537 }
6538
6539 PyDoc_STRVAR(expandtabs__doc__,
6540 "S.expandtabs([tabsize]) -> unicode\n\
6541 \n\
6542 Return a copy of S where all tab characters are expanded using spaces.\n\
6543 If tabsize is not given, a tab size of 8 characters is assumed.");
6544
6545 static PyObject*
6546 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6547 {
6548     Py_UNICODE *e;
6549     Py_UNICODE *p;
6550     Py_UNICODE *q;
6551     Py_UNICODE *qe;
6552     Py_ssize_t i, j, incr;
6553     PyUnicodeObject *u;
6554     int tabsize = 8;
6555
6556     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6557         return NULL;
6558
6559     /* First pass: determine size of output string */
6560     i = 0; /* chars up to and including most recent \n or \r */
6561     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6562     e = self->str + self->length; /* end of input */
6563     for (p = self->str; p < e; p++)
6564         if (*p == '\t') {
6565             if (tabsize > 0) {
6566                 incr = tabsize - (j % tabsize); /* cannot overflow */
6567                 if (j > PY_SSIZE_T_MAX - incr)
6568                     goto overflow1;
6569                 j += incr;
6570             }
6571         }
6572         else {
6573             if (j > PY_SSIZE_T_MAX - 1)
6574                 goto overflow1;
6575             j++;
6576             if (*p == '\n' || *p == '\r') {
6577                 if (i > PY_SSIZE_T_MAX - j)
6578                     goto overflow1;
6579                 i += j;
6580                 j = 0;
6581             }
6582         }
6583
6584     if (i > PY_SSIZE_T_MAX - j)
6585         goto overflow1;
6586
6587     /* Second pass: create output string and fill it */
6588     u = _PyUnicode_New(i + j);
6589     if (!u)
6590         return NULL;
6591
6592     j = 0; /* same as in first pass */
6593     q = u->str; /* next output char */
6594     qe = u->str + u->length; /* end of output */
6595
6596     for (p = self->str; p < e; p++)
6597         if (*p == '\t') {
6598             if (tabsize > 0) {
6599                 i = tabsize - (j % tabsize);
6600                 j += i;
6601                 while (i--) {
6602                     if (q >= qe)
6603                         goto overflow2;
6604                     *q++ = ' ';
6605                 }
6606             }
6607         }
6608         else {
6609             if (q >= qe)
6610                 goto overflow2;
6611             *q++ = *p;
6612             j++;
6613             if (*p == '\n' || *p == '\r')
6614                 j = 0;
6615         }
6616
6617     return (PyObject*) u;
6618
6619   overflow2:
6620     Py_DECREF(u);
6621   overflow1:
6622     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6623     return NULL;
6624 }
6625
6626 PyDoc_STRVAR(find__doc__,
6627 "S.find(sub [,start [,end]]) -> int\n\
6628 \n\
6629 Return the lowest index in S where substring sub is found,\n\
6630 such that sub is contained within s[start:end].  Optional\n\
6631 arguments start and end are interpreted as in slice notation.\n\
6632 \n\
6633 Return -1 on failure.");
6634
6635 static PyObject *
6636 unicode_find(PyUnicodeObject *self, PyObject *args)
6637 {
6638     PyObject *substring;
6639     Py_ssize_t start;
6640     Py_ssize_t end;
6641     Py_ssize_t result;
6642
6643     if (!_ParseTupleFinds(args, &substring, &start, &end))
6644         return NULL;
6645
6646     result = stringlib_find_slice(
6647         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6648         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6649         start, end
6650         );
6651
6652     Py_DECREF(substring);
6653
6654     return PyInt_FromSsize_t(result);
6655 }
6656
6657 static PyObject *
6658 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6659 {
6660     if (index < 0 || index >= self->length) {
6661         PyErr_SetString(PyExc_IndexError, "string index out of range");
6662         return NULL;
6663     }
6664
6665     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6666 }
6667
6668 static long
6669 unicode_hash(PyUnicodeObject *self)
6670 {
6671     /* Since Unicode objects compare equal to their ASCII string
6672        counterparts, they should use the individual character values
6673        as basis for their hash value.  This is needed to assure that
6674        strings and Unicode objects behave in the same way as
6675        dictionary keys. */
6676
6677     register Py_ssize_t len;
6678     register Py_UNICODE *p;
6679     register long x;
6680
6681     if (self->hash != -1)
6682         return self->hash;
6683     len = PyUnicode_GET_SIZE(self);
6684     p = PyUnicode_AS_UNICODE(self);
6685     x = *p << 7;
6686     while (--len >= 0)
6687         x = (1000003*x) ^ *p++;
6688     x ^= PyUnicode_GET_SIZE(self);
6689     if (x == -1)
6690         x = -2;
6691     self->hash = x;
6692     return x;
6693 }
6694
6695 PyDoc_STRVAR(index__doc__,
6696 "S.index(sub [,start [,end]]) -> int\n\
6697 \n\
6698 Like S.find() but raise ValueError when the substring is not found.");
6699
6700 static PyObject *
6701 unicode_index(PyUnicodeObject *self, PyObject *args)
6702 {
6703     Py_ssize_t result;
6704     PyObject *substring;
6705     Py_ssize_t start;
6706     Py_ssize_t end;
6707
6708     if (!_ParseTupleFinds(args, &substring, &start, &end))
6709         return NULL;
6710
6711     result = stringlib_find_slice(
6712         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6713         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6714         start, end
6715         );
6716
6717     Py_DECREF(substring);
6718
6719     if (result < 0) {
6720         PyErr_SetString(PyExc_ValueError, "substring not found");
6721         return NULL;
6722     }
6723
6724     return PyInt_FromSsize_t(result);
6725 }
6726
6727 PyDoc_STRVAR(islower__doc__,
6728 "S.islower() -> bool\n\
6729 \n\
6730 Return True if all cased characters in S are lowercase and there is\n\
6731 at least one cased character in S, False otherwise.");
6732
6733 static PyObject*
6734 unicode_islower(PyUnicodeObject *self)
6735 {
6736     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6737     register const Py_UNICODE *e;
6738     int cased;
6739
6740     /* Shortcut for single character strings */
6741     if (PyUnicode_GET_SIZE(self) == 1)
6742         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6743
6744     /* Special case for empty strings */
6745     if (PyUnicode_GET_SIZE(self) == 0)
6746         return PyBool_FromLong(0);
6747
6748     e = p + PyUnicode_GET_SIZE(self);
6749     cased = 0;
6750     for (; p < e; p++) {
6751         register const Py_UNICODE ch = *p;
6752
6753         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6754             return PyBool_FromLong(0);
6755         else if (!cased && Py_UNICODE_ISLOWER(ch))
6756             cased = 1;
6757     }
6758     return PyBool_FromLong(cased);
6759 }
6760
6761 PyDoc_STRVAR(isupper__doc__,
6762 "S.isupper() -> bool\n\
6763 \n\
6764 Return True if all cased characters in S are uppercase and there is\n\
6765 at least one cased character in S, False otherwise.");
6766
6767 static PyObject*
6768 unicode_isupper(PyUnicodeObject *self)
6769 {
6770     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6771     register const Py_UNICODE *e;
6772     int cased;
6773
6774     /* Shortcut for single character strings */
6775     if (PyUnicode_GET_SIZE(self) == 1)
6776         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6777
6778     /* Special case for empty strings */
6779     if (PyUnicode_GET_SIZE(self) == 0)
6780         return PyBool_FromLong(0);
6781
6782     e = p + PyUnicode_GET_SIZE(self);
6783     cased = 0;
6784     for (; p < e; p++) {
6785         register const Py_UNICODE ch = *p;
6786
6787         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6788             return PyBool_FromLong(0);
6789         else if (!cased && Py_UNICODE_ISUPPER(ch))
6790             cased = 1;
6791     }
6792     return PyBool_FromLong(cased);
6793 }
6794
6795 PyDoc_STRVAR(istitle__doc__,
6796 "S.istitle() -> bool\n\
6797 \n\
6798 Return True if S is a titlecased string and there is at least one\n\
6799 character in S, i.e. upper- and titlecase characters may only\n\
6800 follow uncased characters and lowercase characters only cased ones.\n\
6801 Return False otherwise.");
6802
6803 static PyObject*
6804 unicode_istitle(PyUnicodeObject *self)
6805 {
6806     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6807     register const Py_UNICODE *e;
6808     int cased, previous_is_cased;
6809
6810     /* Shortcut for single character strings */
6811     if (PyUnicode_GET_SIZE(self) == 1)
6812         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6813                                (Py_UNICODE_ISUPPER(*p) != 0));
6814
6815     /* Special case for empty strings */
6816     if (PyUnicode_GET_SIZE(self) == 0)
6817         return PyBool_FromLong(0);
6818
6819     e = p + PyUnicode_GET_SIZE(self);
6820     cased = 0;
6821     previous_is_cased = 0;
6822     for (; p < e; p++) {
6823         register const Py_UNICODE ch = *p;
6824
6825         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6826             if (previous_is_cased)
6827                 return PyBool_FromLong(0);
6828             previous_is_cased = 1;
6829             cased = 1;
6830         }
6831         else if (Py_UNICODE_ISLOWER(ch)) {
6832             if (!previous_is_cased)
6833                 return PyBool_FromLong(0);
6834             previous_is_cased = 1;
6835             cased = 1;
6836         }
6837         else
6838             previous_is_cased = 0;
6839     }
6840     return PyBool_FromLong(cased);
6841 }
6842
6843 PyDoc_STRVAR(isspace__doc__,
6844 "S.isspace() -> bool\n\
6845 \n\
6846 Return True if all characters in S are whitespace\n\
6847 and there is at least one character in S, False otherwise.");
6848
6849 static PyObject*
6850 unicode_isspace(PyUnicodeObject *self)
6851 {
6852     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6853     register const Py_UNICODE *e;
6854
6855     /* Shortcut for single character strings */
6856     if (PyUnicode_GET_SIZE(self) == 1 &&
6857         Py_UNICODE_ISSPACE(*p))
6858         return PyBool_FromLong(1);
6859
6860     /* Special case for empty strings */
6861     if (PyUnicode_GET_SIZE(self) == 0)
6862         return PyBool_FromLong(0);
6863
6864     e = p + PyUnicode_GET_SIZE(self);
6865     for (; p < e; p++) {
6866         if (!Py_UNICODE_ISSPACE(*p))
6867             return PyBool_FromLong(0);
6868     }
6869     return PyBool_FromLong(1);
6870 }
6871
6872 PyDoc_STRVAR(isalpha__doc__,
6873 "S.isalpha() -> bool\n\
6874 \n\
6875 Return True if all characters in S are alphabetic\n\
6876 and there is at least one character in S, False otherwise.");
6877
6878 static PyObject*
6879 unicode_isalpha(PyUnicodeObject *self)
6880 {
6881     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6882     register const Py_UNICODE *e;
6883
6884     /* Shortcut for single character strings */
6885     if (PyUnicode_GET_SIZE(self) == 1 &&
6886         Py_UNICODE_ISALPHA(*p))
6887         return PyBool_FromLong(1);
6888
6889     /* Special case for empty strings */
6890     if (PyUnicode_GET_SIZE(self) == 0)
6891         return PyBool_FromLong(0);
6892
6893     e = p + PyUnicode_GET_SIZE(self);
6894     for (; p < e; p++) {
6895         if (!Py_UNICODE_ISALPHA(*p))
6896             return PyBool_FromLong(0);
6897     }
6898     return PyBool_FromLong(1);
6899 }
6900
6901 PyDoc_STRVAR(isalnum__doc__,
6902 "S.isalnum() -> bool\n\
6903 \n\
6904 Return True if all characters in S are alphanumeric\n\
6905 and there is at least one character in S, False otherwise.");
6906
6907 static PyObject*
6908 unicode_isalnum(PyUnicodeObject *self)
6909 {
6910     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6911     register const Py_UNICODE *e;
6912
6913     /* Shortcut for single character strings */
6914     if (PyUnicode_GET_SIZE(self) == 1 &&
6915         Py_UNICODE_ISALNUM(*p))
6916         return PyBool_FromLong(1);
6917
6918     /* Special case for empty strings */
6919     if (PyUnicode_GET_SIZE(self) == 0)
6920         return PyBool_FromLong(0);
6921
6922     e = p + PyUnicode_GET_SIZE(self);
6923     for (; p < e; p++) {
6924         if (!Py_UNICODE_ISALNUM(*p))
6925             return PyBool_FromLong(0);
6926     }
6927     return PyBool_FromLong(1);
6928 }
6929
6930 PyDoc_STRVAR(isdecimal__doc__,
6931 "S.isdecimal() -> bool\n\
6932 \n\
6933 Return True if there are only decimal characters in S,\n\
6934 False otherwise.");
6935
6936 static PyObject*
6937 unicode_isdecimal(PyUnicodeObject *self)
6938 {
6939     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6940     register const Py_UNICODE *e;
6941
6942     /* Shortcut for single character strings */
6943     if (PyUnicode_GET_SIZE(self) == 1 &&
6944         Py_UNICODE_ISDECIMAL(*p))
6945         return PyBool_FromLong(1);
6946
6947     /* Special case for empty strings */
6948     if (PyUnicode_GET_SIZE(self) == 0)
6949         return PyBool_FromLong(0);
6950
6951     e = p + PyUnicode_GET_SIZE(self);
6952     for (; p < e; p++) {
6953         if (!Py_UNICODE_ISDECIMAL(*p))
6954             return PyBool_FromLong(0);
6955     }
6956     return PyBool_FromLong(1);
6957 }
6958
6959 PyDoc_STRVAR(isdigit__doc__,
6960 "S.isdigit() -> bool\n\
6961 \n\
6962 Return True if all characters in S are digits\n\
6963 and there is at least one character in S, False otherwise.");
6964
6965 static PyObject*
6966 unicode_isdigit(PyUnicodeObject *self)
6967 {
6968     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6969     register const Py_UNICODE *e;
6970
6971     /* Shortcut for single character strings */
6972     if (PyUnicode_GET_SIZE(self) == 1 &&
6973         Py_UNICODE_ISDIGIT(*p))
6974         return PyBool_FromLong(1);
6975
6976     /* Special case for empty strings */
6977     if (PyUnicode_GET_SIZE(self) == 0)
6978         return PyBool_FromLong(0);
6979
6980     e = p + PyUnicode_GET_SIZE(self);
6981     for (; p < e; p++) {
6982         if (!Py_UNICODE_ISDIGIT(*p))
6983             return PyBool_FromLong(0);
6984     }
6985     return PyBool_FromLong(1);
6986 }
6987
6988 PyDoc_STRVAR(isnumeric__doc__,
6989 "S.isnumeric() -> bool\n\
6990 \n\
6991 Return True if there are only numeric characters in S,\n\
6992 False otherwise.");
6993
6994 static PyObject*
6995 unicode_isnumeric(PyUnicodeObject *self)
6996 {
6997     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6998     register const Py_UNICODE *e;
6999
7000     /* Shortcut for single character strings */
7001     if (PyUnicode_GET_SIZE(self) == 1 &&
7002         Py_UNICODE_ISNUMERIC(*p))
7003         return PyBool_FromLong(1);
7004
7005     /* Special case for empty strings */
7006     if (PyUnicode_GET_SIZE(self) == 0)
7007         return PyBool_FromLong(0);
7008
7009     e = p + PyUnicode_GET_SIZE(self);
7010     for (; p < e; p++) {
7011         if (!Py_UNICODE_ISNUMERIC(*p))
7012             return PyBool_FromLong(0);
7013     }
7014     return PyBool_FromLong(1);
7015 }
7016
7017 PyDoc_STRVAR(join__doc__,
7018 "S.join(sequence) -> unicode\n\
7019 \n\
7020 Return a string which is the concatenation of the strings in the\n\
7021 sequence.  The separator between elements is S.");
7022
7023 static PyObject*
7024 unicode_join(PyObject *self, PyObject *data)
7025 {
7026     return PyUnicode_Join(self, data);
7027 }
7028
7029 static Py_ssize_t
7030 unicode_length(PyUnicodeObject *self)
7031 {
7032     return self->length;
7033 }
7034
7035 PyDoc_STRVAR(ljust__doc__,
7036 "S.ljust(width[, fillchar]) -> int\n\
7037 \n\
7038 Return S left justified in a Unicode string of length width. Padding is\n\
7039 done using the specified fill character (default is a space).");
7040
7041 static PyObject *
7042 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7043 {
7044     Py_ssize_t width;
7045     Py_UNICODE fillchar = ' ';
7046
7047     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7048         return NULL;
7049
7050     if (self->length >= width && PyUnicode_CheckExact(self)) {
7051         Py_INCREF(self);
7052         return (PyObject*) self;
7053     }
7054
7055     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7056 }
7057
7058 PyDoc_STRVAR(lower__doc__,
7059 "S.lower() -> unicode\n\
7060 \n\
7061 Return a copy of the string S converted to lowercase.");
7062
7063 static PyObject*
7064 unicode_lower(PyUnicodeObject *self)
7065 {
7066     return fixup(self, fixlower);
7067 }
7068
7069 #define LEFTSTRIP 0
7070 #define RIGHTSTRIP 1
7071 #define BOTHSTRIP 2
7072
7073 /* Arrays indexed by above */
7074 static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7075
7076 #define STRIPNAME(i) (stripformat[i]+3)
7077
7078 /* externally visible for str.strip(unicode) */
7079 PyObject *
7080 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7081 {
7082         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7083         Py_ssize_t len = PyUnicode_GET_SIZE(self);
7084         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7085         Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7086         Py_ssize_t i, j;
7087
7088         BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7089
7090         i = 0;
7091         if (striptype != RIGHTSTRIP) {
7092             while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7093                 i++;
7094             }
7095         }
7096
7097         j = len;
7098         if (striptype != LEFTSTRIP) {
7099             do {
7100                 j--;
7101             } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7102             j++;
7103         }
7104
7105         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7106             Py_INCREF(self);
7107             return (PyObject*)self;
7108         }
7109         else
7110             return PyUnicode_FromUnicode(s+i, j-i);
7111 }
7112
7113
7114 static PyObject *
7115 do_strip(PyUnicodeObject *self, int striptype)
7116 {
7117         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7118         Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7119
7120         i = 0;
7121         if (striptype != RIGHTSTRIP) {
7122                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7123                         i++;
7124                 }
7125         }
7126
7127         j = len;
7128         if (striptype != LEFTSTRIP) {
7129                 do {
7130                         j--;
7131                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7132                 j++;
7133         }
7134
7135         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7136                 Py_INCREF(self);
7137                 return (PyObject*)self;
7138         }
7139         else
7140                 return PyUnicode_FromUnicode(s+i, j-i);
7141 }
7142
7143
7144 static PyObject *
7145 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7146 {
7147         PyObject *sep = NULL;
7148
7149         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7150                 return NULL;
7151
7152         if (sep != NULL && sep != Py_None) {
7153                 if (PyUnicode_Check(sep))
7154                         return _PyUnicode_XStrip(self, striptype, sep);
7155                 else if (PyString_Check(sep)) {
7156                         PyObject *res;
7157                         sep = PyUnicode_FromObject(sep);
7158                         if (sep==NULL)
7159                                 return NULL;
7160                         res = _PyUnicode_XStrip(self, striptype, sep);
7161                         Py_DECREF(sep);
7162                         return res;
7163                 }
7164                 else {
7165                         PyErr_Format(PyExc_TypeError,
7166                                      "%s arg must be None, unicode or str",
7167                                      STRIPNAME(striptype));
7168                         return NULL;
7169                 }
7170         }
7171
7172         return do_strip(self, striptype);
7173 }
7174
7175
7176 PyDoc_STRVAR(strip__doc__,
7177 "S.strip([chars]) -> unicode\n\
7178 \n\
7179 Return a copy of the string S with leading and trailing\n\
7180 whitespace removed.\n\
7181 If chars is given and not None, remove characters in chars instead.\n\
7182 If chars is a str, it will be converted to unicode before stripping");
7183
7184 static PyObject *
7185 unicode_strip(PyUnicodeObject *self, PyObject *args)
7186 {
7187         if (PyTuple_GET_SIZE(args) == 0)
7188                 return do_strip(self, BOTHSTRIP); /* Common case */
7189         else
7190                 return do_argstrip(self, BOTHSTRIP, args);
7191 }
7192
7193
7194 PyDoc_STRVAR(lstrip__doc__,
7195 "S.lstrip([chars]) -> unicode\n\
7196 \n\
7197 Return a copy of the string S with leading whitespace removed.\n\
7198 If chars is given and not None, remove characters in chars instead.\n\
7199 If chars is a str, it will be converted to unicode before stripping");
7200
7201 static PyObject *
7202 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7203 {
7204         if (PyTuple_GET_SIZE(args) == 0)
7205                 return do_strip(self, LEFTSTRIP); /* Common case */
7206         else
7207                 return do_argstrip(self, LEFTSTRIP, args);
7208 }
7209
7210
7211 PyDoc_STRVAR(rstrip__doc__,
7212 "S.rstrip([chars]) -> unicode\n\
7213 \n\
7214 Return a copy of the string S with trailing whitespace removed.\n\
7215 If chars is given and not None, remove characters in chars instead.\n\
7216 If chars is a str, it will be converted to unicode before stripping");
7217
7218 static PyObject *
7219 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7220 {
7221         if (PyTuple_GET_SIZE(args) == 0)
7222                 return do_strip(self, RIGHTSTRIP); /* Common case */
7223         else
7224                 return do_argstrip(self, RIGHTSTRIP, args);
7225 }
7226
7227
7228 static PyObject*
7229 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7230 {
7231     PyUnicodeObject *u;
7232     Py_UNICODE *p;
7233     Py_ssize_t nchars;
7234     size_t nbytes;
7235
7236     if (len < 0)
7237         len = 0;
7238
7239     if (len == 1 && PyUnicode_CheckExact(str)) {
7240         /* no repeat, return original string */
7241         Py_INCREF(str);
7242         return (PyObject*) str;
7243     }
7244
7245     /* ensure # of chars needed doesn't overflow int and # of bytes
7246      * needed doesn't overflow size_t
7247      */
7248     nchars = len * str->length;
7249     if (len && nchars / len != str->length) {
7250         PyErr_SetString(PyExc_OverflowError,
7251                         "repeated string is too long");
7252         return NULL;
7253     }
7254     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7255     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7256         PyErr_SetString(PyExc_OverflowError,
7257                         "repeated string is too long");
7258         return NULL;
7259     }
7260     u = _PyUnicode_New(nchars);
7261     if (!u)
7262         return NULL;
7263
7264     p = u->str;
7265
7266     if (str->length == 1 && len > 0) {
7267         Py_UNICODE_FILL(p, str->str[0], len);
7268     } else {
7269         Py_ssize_t done = 0; /* number of characters copied this far */
7270         if (done < nchars) {
7271             Py_UNICODE_COPY(p, str->str, str->length);
7272             done = str->length;
7273         }
7274         while (done < nchars) {
7275             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7276             Py_UNICODE_COPY(p+done, p, n);
7277             done += n;
7278         }
7279     }
7280
7281     return (PyObject*) u;
7282 }
7283
7284 PyObject *PyUnicode_Replace(PyObject *obj,
7285                             PyObject *subobj,
7286                             PyObject *replobj,
7287                             Py_ssize_t maxcount)
7288 {
7289     PyObject *self;
7290     PyObject *str1;
7291     PyObject *str2;
7292     PyObject *result;
7293
7294     self = PyUnicode_FromObject(obj);
7295     if (self == NULL)
7296         return NULL;
7297     str1 = PyUnicode_FromObject(subobj);
7298     if (str1 == NULL) {
7299         Py_DECREF(self);
7300         return NULL;
7301     }
7302     str2 = PyUnicode_FromObject(replobj);
7303     if (str2 == NULL) {
7304         Py_DECREF(self);
7305         Py_DECREF(str1);
7306         return NULL;
7307     }
7308     result = replace((PyUnicodeObject *)self,
7309                      (PyUnicodeObject *)str1,
7310                      (PyUnicodeObject *)str2,
7311                      maxcount);
7312     Py_DECREF(self);
7313     Py_DECREF(str1);
7314     Py_DECREF(str2);
7315     return result;
7316 }
7317
7318 PyDoc_STRVAR(replace__doc__,
7319 "S.replace (old, new[, count]) -> unicode\n\
7320 \n\
7321 Return a copy of S with all occurrences of substring\n\
7322 old replaced by new.  If the optional argument count is\n\
7323 given, only the first count occurrences are replaced.");
7324
7325 static PyObject*
7326 unicode_replace(PyUnicodeObject *self, PyObject *args)
7327 {
7328     PyUnicodeObject *str1;
7329     PyUnicodeObject *str2;
7330     Py_ssize_t maxcount = -1;
7331     PyObject *result;
7332
7333     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7334         return NULL;
7335     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7336     if (str1 == NULL)
7337         return NULL;
7338     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7339     if (str2 == NULL) {
7340         Py_DECREF(str1);
7341         return NULL;
7342     }
7343
7344     result = replace(self, str1, str2, maxcount);
7345
7346     Py_DECREF(str1);
7347     Py_DECREF(str2);
7348     return result;
7349 }
7350
7351 static
7352 PyObject *unicode_repr(PyObject *unicode)
7353 {
7354     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7355                                 PyUnicode_GET_SIZE(unicode),
7356                                 1);
7357 }
7358
7359 PyDoc_STRVAR(rfind__doc__,
7360 "S.rfind(sub [,start [,end]]) -> int\n\
7361 \n\
7362 Return the highest index in S where substring sub is found,\n\
7363 such that sub is contained within s[start:end].  Optional\n\
7364 arguments start and end are interpreted as in slice notation.\n\
7365 \n\
7366 Return -1 on failure.");
7367
7368 static PyObject *
7369 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7370 {
7371     PyObject *substring;
7372     Py_ssize_t start;
7373     Py_ssize_t end;
7374     Py_ssize_t result;
7375
7376     if (!_ParseTupleFinds(args, &substring, &start, &end))
7377             return NULL;
7378
7379     result = stringlib_rfind_slice(
7380         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7381         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7382         start, end
7383         );
7384
7385     Py_DECREF(substring);
7386
7387     return PyInt_FromSsize_t(result);
7388 }
7389
7390 PyDoc_STRVAR(rindex__doc__,
7391 "S.rindex(sub [,start [,end]]) -> int\n\
7392 \n\
7393 Like S.rfind() but raise ValueError when the substring is not found.");
7394
7395 static PyObject *
7396 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7397 {
7398     PyObject *substring;
7399     Py_ssize_t start;
7400     Py_ssize_t end;
7401     Py_ssize_t result;
7402
7403     if (!_ParseTupleFinds(args, &substring, &start, &end))
7404             return NULL;
7405
7406     result = stringlib_rfind_slice(
7407         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7408         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7409         start, end
7410         );
7411
7412     Py_DECREF(substring);
7413
7414     if (result < 0) {
7415         PyErr_SetString(PyExc_ValueError, "substring not found");
7416         return NULL;
7417     }
7418     return PyInt_FromSsize_t(result);
7419 }
7420
7421 PyDoc_STRVAR(rjust__doc__,
7422 "S.rjust(width[, fillchar]) -> unicode\n\
7423 \n\
7424 Return S right justified in a Unicode string of length width. Padding is\n\
7425 done using the specified fill character (default is a space).");
7426
7427 static PyObject *
7428 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7429 {
7430     Py_ssize_t width;
7431     Py_UNICODE fillchar = ' ';
7432
7433     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7434         return NULL;
7435
7436     if (self->length >= width && PyUnicode_CheckExact(self)) {
7437         Py_INCREF(self);
7438         return (PyObject*) self;
7439     }
7440
7441     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7442 }
7443
7444 static PyObject*
7445 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7446 {
7447     /* standard clamping */
7448     if (start < 0)
7449         start = 0;
7450     if (end < 0)
7451         end = 0;
7452     if (end > self->length)
7453         end = self->length;
7454     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7455         /* full slice, return original string */
7456         Py_INCREF(self);
7457         return (PyObject*) self;
7458     }
7459     if (start > end)
7460         start = end;
7461     /* copy slice */
7462     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7463                                              end - start);
7464 }
7465
7466 PyObject *PyUnicode_Split(PyObject *s,
7467                           PyObject *sep,
7468                           Py_ssize_t maxsplit)
7469 {
7470     PyObject *result;
7471
7472     s = PyUnicode_FromObject(s);
7473     if (s == NULL)
7474         return NULL;
7475     if (sep != NULL) {
7476         sep = PyUnicode_FromObject(sep);
7477         if (sep == NULL) {
7478             Py_DECREF(s);
7479             return NULL;
7480         }
7481     }
7482
7483     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7484
7485     Py_DECREF(s);
7486     Py_XDECREF(sep);
7487     return result;
7488 }
7489
7490 PyDoc_STRVAR(split__doc__,
7491 "S.split([sep [,maxsplit]]) -> list of strings\n\
7492 \n\
7493 Return a list of the words in S, using sep as the\n\
7494 delimiter string.  If maxsplit is given, at most maxsplit\n\
7495 splits are done. If sep is not specified or is None, any\n\
7496 whitespace string is a separator and empty strings are\n\
7497 removed from the result.");
7498
7499 static PyObject*
7500 unicode_split(PyUnicodeObject *self, PyObject *args)
7501 {
7502     PyObject *substring = Py_None;
7503     Py_ssize_t maxcount = -1;
7504
7505     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7506         return NULL;
7507
7508     if (substring == Py_None)
7509         return split(self, NULL, maxcount);
7510     else if (PyUnicode_Check(substring))
7511         return split(self, (PyUnicodeObject *)substring, maxcount);
7512     else
7513         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7514 }
7515
7516 PyObject *
7517 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7518 {
7519     PyObject* str_obj;
7520     PyObject* sep_obj;
7521     PyObject* out;
7522
7523     str_obj = PyUnicode_FromObject(str_in);
7524     if (!str_obj)
7525         return NULL;
7526     sep_obj = PyUnicode_FromObject(sep_in);
7527     if (!sep_obj) {
7528         Py_DECREF(str_obj);
7529         return NULL;
7530     }
7531
7532     out = stringlib_partition(
7533         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7534         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7535         );
7536
7537     Py_DECREF(sep_obj);
7538     Py_DECREF(str_obj);
7539
7540     return out;
7541 }
7542
7543
7544 PyObject *
7545 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7546 {
7547     PyObject* str_obj;
7548     PyObject* sep_obj;
7549     PyObject* out;
7550
7551     str_obj = PyUnicode_FromObject(str_in);
7552     if (!str_obj)
7553         return NULL;
7554     sep_obj = PyUnicode_FromObject(sep_in);
7555     if (!sep_obj) {
7556         Py_DECREF(str_obj);
7557         return NULL;
7558     }
7559
7560     out = stringlib_rpartition(
7561         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7562         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7563         );
7564
7565     Py_DECREF(sep_obj);
7566     Py_DECREF(str_obj);
7567
7568     return out;
7569 }
7570
7571 PyDoc_STRVAR(partition__doc__,
7572 "S.partition(sep) -> (head, sep, tail)\n\
7573 \n\
7574 Searches for the separator sep in S, and returns the part before it,\n\
7575 the separator itself, and the part after it.  If the separator is not\n\
7576 found, returns S and two empty strings.");
7577
7578 static PyObject*
7579 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7580 {
7581     return PyUnicode_Partition((PyObject *)self, separator);
7582 }
7583
7584 PyDoc_STRVAR(rpartition__doc__,
7585 "S.rpartition(sep) -> (tail, sep, head)\n\
7586 \n\
7587 Searches for the separator sep in S, starting at the end of S, and returns\n\
7588 the part before it, the separator itself, and the part after it.  If the\n\
7589 separator is not found, returns two empty strings and S.");
7590
7591 static PyObject*
7592 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7593 {
7594     return PyUnicode_RPartition((PyObject *)self, separator);
7595 }
7596
7597 PyObject *PyUnicode_RSplit(PyObject *s,
7598                            PyObject *sep,
7599                            Py_ssize_t maxsplit)
7600 {
7601     PyObject *result;
7602
7603     s = PyUnicode_FromObject(s);
7604     if (s == NULL)
7605         return NULL;
7606     if (sep != NULL) {
7607         sep = PyUnicode_FromObject(sep);
7608         if (sep == NULL) {
7609             Py_DECREF(s);
7610             return NULL;
7611         }
7612     }
7613
7614     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7615
7616     Py_DECREF(s);
7617     Py_XDECREF(sep);
7618     return result;
7619 }
7620
7621 PyDoc_STRVAR(rsplit__doc__,
7622 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7623 \n\
7624 Return a list of the words in S, using sep as the\n\
7625 delimiter string, starting at the end of the string and\n\
7626 working to the front.  If maxsplit is given, at most maxsplit\n\
7627 splits are done. If sep is not specified, any whitespace string\n\
7628 is a separator.");
7629
7630 static PyObject*
7631 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7632 {
7633     PyObject *substring = Py_None;
7634     Py_ssize_t maxcount = -1;
7635
7636     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7637         return NULL;
7638
7639     if (substring == Py_None)
7640         return rsplit(self, NULL, maxcount);
7641     else if (PyUnicode_Check(substring))
7642         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7643     else
7644         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7645 }
7646
7647 PyDoc_STRVAR(splitlines__doc__,
7648 "S.splitlines([keepends]]) -> list of strings\n\
7649 \n\
7650 Return a list of the lines in S, breaking at line boundaries.\n\
7651 Line breaks are not included in the resulting list unless keepends\n\
7652 is given and true.");
7653
7654 static PyObject*
7655 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7656 {
7657     int keepends = 0;
7658
7659     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7660         return NULL;
7661
7662     return PyUnicode_Splitlines((PyObject *)self, keepends);
7663 }
7664
7665 static
7666 PyObject *unicode_str(PyUnicodeObject *self)
7667 {
7668     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7669 }
7670
7671 PyDoc_STRVAR(swapcase__doc__,
7672 "S.swapcase() -> unicode\n\
7673 \n\
7674 Return a copy of S with uppercase characters converted to lowercase\n\
7675 and vice versa.");
7676
7677 static PyObject*
7678 unicode_swapcase(PyUnicodeObject *self)
7679 {
7680     return fixup(self, fixswapcase);
7681 }
7682
7683 PyDoc_STRVAR(translate__doc__,
7684 "S.translate(table) -> unicode\n\
7685 \n\
7686 Return a copy of the string S, where all characters have been mapped\n\
7687 through the given translation table, which must be a mapping of\n\
7688 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7689 Unmapped characters are left untouched. Characters mapped to None\n\
7690 are deleted.");
7691
7692 static PyObject*
7693 unicode_translate(PyUnicodeObject *self, PyObject *table)
7694 {
7695     return PyUnicode_TranslateCharmap(self->str,
7696                                       self->length,
7697                                       table,
7698                                       "ignore");
7699 }
7700
7701 PyDoc_STRVAR(upper__doc__,
7702 "S.upper() -> unicode\n\
7703 \n\
7704 Return a copy of S converted to uppercase.");
7705
7706 static PyObject*
7707 unicode_upper(PyUnicodeObject *self)
7708 {
7709     return fixup(self, fixupper);
7710 }
7711
7712 PyDoc_STRVAR(zfill__doc__,
7713 "S.zfill(width) -> unicode\n\
7714 \n\
7715 Pad a numeric string x with zeros on the left, to fill a field\n\
7716 of the specified width. The string x is never truncated.");
7717
7718 static PyObject *
7719 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7720 {
7721     Py_ssize_t fill;
7722     PyUnicodeObject *u;
7723
7724     Py_ssize_t width;
7725     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7726         return NULL;
7727
7728     if (self->length >= width) {
7729         if (PyUnicode_CheckExact(self)) {
7730             Py_INCREF(self);
7731             return (PyObject*) self;
7732         }
7733         else
7734             return PyUnicode_FromUnicode(
7735                 PyUnicode_AS_UNICODE(self),
7736                 PyUnicode_GET_SIZE(self)
7737             );
7738     }
7739
7740     fill = width - self->length;
7741
7742     u = pad(self, fill, 0, '0');
7743
7744     if (u == NULL)
7745         return NULL;
7746
7747     if (u->str[fill] == '+' || u->str[fill] == '-') {
7748         /* move sign to beginning of string */
7749         u->str[0] = u->str[fill];
7750         u->str[fill] = '0';
7751     }
7752
7753     return (PyObject*) u;
7754 }
7755
#if 0
/* Compiled out.  Returns the file-level counter `numfree` as a Python
   int; the matching "freelistsize" method-table entry is also under
   #if 0 and is marked there as a debugging aid. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7763
7764 PyDoc_STRVAR(startswith__doc__,
7765 "S.startswith(prefix[, start[, end]]) -> bool\n\
7766 \n\
7767 Return True if S starts with the specified prefix, False otherwise.\n\
7768 With optional start, test S beginning at that position.\n\
7769 With optional end, stop comparing S at that position.\n\
7770 prefix can also be a tuple of strings to try.");
7771
7772 static PyObject *
7773 unicode_startswith(PyUnicodeObject *self,
7774                    PyObject *args)
7775 {
7776     PyObject *subobj;
7777     PyUnicodeObject *substring;
7778     Py_ssize_t start = 0;
7779     Py_ssize_t end = PY_SSIZE_T_MAX;
7780     int result;
7781
7782     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7783                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7784         return NULL;
7785     if (PyTuple_Check(subobj)) {
7786         Py_ssize_t i;
7787         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7788             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7789                             PyTuple_GET_ITEM(subobj, i));
7790             if (substring == NULL)
7791                 return NULL;
7792             result = tailmatch(self, substring, start, end, -1);
7793             Py_DECREF(substring);
7794             if (result) {
7795                 Py_RETURN_TRUE;
7796             }
7797         }
7798         /* nothing matched */
7799         Py_RETURN_FALSE;
7800     }
7801     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7802     if (substring == NULL)
7803          return NULL;
7804     result = tailmatch(self, substring, start, end, -1);
7805     Py_DECREF(substring);
7806     return PyBool_FromLong(result);
7807 }
7808
7809
7810 PyDoc_STRVAR(endswith__doc__,
7811 "S.endswith(suffix[, start[, end]]) -> bool\n\
7812 \n\
7813 Return True if S ends with the specified suffix, False otherwise.\n\
7814 With optional start, test S beginning at that position.\n\
7815 With optional end, stop comparing S at that position.\n\
7816 suffix can also be a tuple of strings to try.");
7817
7818 static PyObject *
7819 unicode_endswith(PyUnicodeObject *self,
7820                  PyObject *args)
7821 {
7822     PyObject *subobj;
7823     PyUnicodeObject *substring;
7824     Py_ssize_t start = 0;
7825     Py_ssize_t end = PY_SSIZE_T_MAX;
7826     int result;
7827
7828     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7829         _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7830         return NULL;
7831     if (PyTuple_Check(subobj)) {
7832         Py_ssize_t i;
7833         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7834             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7835                             PyTuple_GET_ITEM(subobj, i));
7836             if (substring == NULL)
7837             return NULL;
7838             result = tailmatch(self, substring, start, end, +1);
7839             Py_DECREF(substring);
7840             if (result) {
7841                 Py_RETURN_TRUE;
7842             }
7843         }
7844         Py_RETURN_FALSE;
7845     }
7846     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7847     if (substring == NULL)
7848     return NULL;
7849
7850     result = tailmatch(self, substring, start, end, +1);
7851     Py_DECREF(substring);
7852     return PyBool_FromLong(result);
7853 }
7854
7855
7856 /* Implements do_string_format, which is unicode because of stringlib */
7857 #include "stringlib/string_format.h"
7858
7859 PyDoc_STRVAR(format__doc__,
7860 "S.format(*args, **kwargs) -> unicode\n\
7861 \n\
7862 ");
7863
7864 static PyObject *
7865 unicode__format__(PyObject *self, PyObject *args)
7866 {
7867     PyObject *format_spec;
7868     PyObject *result = NULL;
7869     PyObject *tmp = NULL;
7870
7871     /* If 2.x, convert format_spec to the same type as value */
7872     /* This is to allow things like u''.format('') */
7873     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7874         goto done;
7875     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7876         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7877                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7878         goto done;
7879     }
7880     tmp = PyObject_Unicode(format_spec);
7881     if (tmp == NULL)
7882         goto done;
7883     format_spec = tmp;
7884
7885     result = _PyUnicode_FormatAdvanced(self,
7886                                        PyUnicode_AS_UNICODE(format_spec),
7887                                        PyUnicode_GET_SIZE(format_spec));
7888 done:
7889     Py_XDECREF(tmp);
7890     return result;
7891 }
7892
7893 PyDoc_STRVAR(p_format__doc__,
7894 "S.__format__(format_spec) -> unicode\n\
7895 \n\
7896 ");
7897
7898 static PyObject *
7899 unicode__sizeof__(PyUnicodeObject *v)
7900 {
7901     PyObject *res = NULL, *defsize = NULL;
7902
7903     res = PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7904                             sizeof(Py_UNICODE) * (v->length + 1));
7905     if (v->defenc) {
7906         defsize = PyObject_CallMethod(v->defenc, "__sizeof__", NULL);
7907         if (defsize == NULL) {
7908             Py_DECREF(res);
7909             return NULL;
7910         }
7911         res = PyNumber_Add(res, defsize);
7912         Py_DECREF(defsize);
7913     }
7914     return res;
7915 }
7916
7917 PyDoc_STRVAR(sizeof__doc__,
7918 "S.__sizeof__() -> size of S in memory, in bytes\n\
7919 \n\
7920 ");
7921
7922 static PyObject *
7923 unicode_getnewargs(PyUnicodeObject *v)
7924 {
7925         return Py_BuildValue("(u#)", v->str, v->length);
7926 }
7927
7928
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation, the calling-convention flags and, where one
   exists, the docstring.  Entries written with only three initializers
   ("_formatter_field_name_split", "_formatter_parser", "__getnewargs__")
   leave the docstring slot zero-initialized.  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* "format" is the only entry that also accepts keyword arguments */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7991
7992 static PyObject *
7993 unicode_mod(PyObject *v, PyObject *w)
7994 {
7995        if (!PyUnicode_Check(v)) {
7996                Py_INCREF(Py_NotImplemented);
7997                return Py_NotImplemented;
7998        }
7999        return PyUnicode_Format(v, w);
8000 }
8001
/* Number protocol table for the unicode type.  Only nb_remainder (the
   `%` operator) is provided; the four slots before it are explicitly 0
   and every slot after it is left to C's implicit zero-initialization. */
static PyNumberMethods unicode_as_number = {
        0,                              /*nb_add*/
        0,                              /*nb_subtract*/
        0,                              /*nb_multiply*/
        0,                              /*nb_divide*/
        unicode_mod,                    /*nb_remainder*/
};
8009
/* Sequence protocol table for the unicode type.  The two assignment
   slots (sq_ass_item, sq_ass_slice) are 0, so no item or slice
   assignment handler is installed.  The implementations are cast to the
   generic slot signatures because they take PyUnicodeObject* rather than
   PyObject*. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
8020
8021 static PyObject*
8022 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8023 {
8024     if (PyIndex_Check(item)) {
8025         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8026         if (i == -1 && PyErr_Occurred())
8027             return NULL;
8028         if (i < 0)
8029             i += PyUnicode_GET_SIZE(self);
8030         return unicode_getitem(self, i);
8031     } else if (PySlice_Check(item)) {
8032         Py_ssize_t start, stop, step, slicelength, cur, i;
8033         Py_UNICODE* source_buf;
8034         Py_UNICODE* result_buf;
8035         PyObject* result;
8036
8037         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8038                                  &start, &stop, &step, &slicelength) < 0) {
8039             return NULL;
8040         }
8041
8042         if (slicelength <= 0) {
8043             return PyUnicode_FromUnicode(NULL, 0);
8044         } else if (start == 0 && step == 1 && slicelength == self->length &&
8045                    PyUnicode_CheckExact(self)) {
8046             Py_INCREF(self);
8047             return (PyObject *)self;
8048         } else if (step == 1) {
8049             return PyUnicode_FromUnicode(self->str + start, slicelength);
8050         } else {
8051             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8052             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8053                                                        sizeof(Py_UNICODE));
8054
8055             if (result_buf == NULL)
8056                     return PyErr_NoMemory();
8057
8058             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8059                 result_buf[i] = source_buf[cur];
8060             }
8061
8062             result = PyUnicode_FromUnicode(result_buf, slicelength);
8063             PyObject_FREE(result_buf);
8064             return result;
8065         }
8066     } else {
8067         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8068         return NULL;
8069     }
8070 }
8071
/* Mapping protocol table for the unicode type: length and subscript
   lookups are provided, while mp_ass_subscript is 0 so no subscript
   assignment handler is installed. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
8077
8078 static Py_ssize_t
8079 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8080                           Py_ssize_t index,
8081                           const void **ptr)
8082 {
8083     if (index != 0) {
8084         PyErr_SetString(PyExc_SystemError,
8085                         "accessing non-existent unicode segment");
8086         return -1;
8087     }
8088     *ptr = (void *) self->str;
8089     return PyUnicode_GET_DATA_SIZE(self);
8090 }
8091
8092 static Py_ssize_t
8093 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8094                            const void **ptr)
8095 {
8096     PyErr_SetString(PyExc_TypeError,
8097                     "cannot use unicode as modifiable buffer");
8098     return -1;
8099 }
8100
8101 static int
8102 unicode_buffer_getsegcount(PyUnicodeObject *self,
8103                            Py_ssize_t *lenp)
8104 {
8105     if (lenp)
8106         *lenp = PyUnicode_GET_DATA_SIZE(self);
8107     return 1;
8108 }
8109
8110 static Py_ssize_t
8111 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8112                           Py_ssize_t index,
8113                           const void **ptr)
8114 {
8115     PyObject *str;
8116
8117     if (index != 0) {
8118         PyErr_SetString(PyExc_SystemError,
8119                         "accessing non-existent unicode segment");
8120         return -1;
8121     }
8122     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8123     if (str == NULL)
8124         return -1;
8125     *ptr = (void *) PyString_AS_STRING(str);
8126     return PyString_GET_SIZE(str);
8127 }
8128
8129 /* Helpers for PyUnicode_Format() */
8130
8131 static PyObject *
8132 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8133 {
         /* Fetch the next argument for a conversion and advance *p_argidx.

            Two calling modes, selected by the caller through arglen:
              - arglen >= 0: args is a tuple of that length; the item at
                *p_argidx is returned via PyTuple_GetItem().
              - arglen < 0:  args is a single non-tuple value.  The caller
                starts with arglen == -1 and *p_argidx == -2, so the first
                call passes the "argidx < arglen" test and returns args
                itself; the increment to -1 makes every later call fail.

            Returns NULL with TypeError set when no argument is left. */
8134     Py_ssize_t argidx = *p_argidx;
8135     if (argidx < arglen) {
8136         (*p_argidx)++;
8137         if (arglen < 0)
8138             return args;
8139         else
8140             return PyTuple_GetItem(args, argidx);
8141     }
8142     PyErr_SetString(PyExc_TypeError,
8143                     "not enough arguments for format string");
8144     return NULL;
8145 }
8146
8147 #define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
8148 #define F_SIGN  (1<<1)  /* '+' flag */
8149 #define F_BLANK (1<<2)  /* ' ' flag */
8150 #define F_ALT   (1<<3)  /* '#' flag */
8151 #define F_ZERO  (1<<4)  /* '0' flag */
8152
8153 static Py_ssize_t
8154 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8155 {
         /* Widen the NUL-terminated char string charbuffer into buffer, one
            Py_UNICODE per char, and return the number of characters copied.
            No terminator is written to buffer.

            The copy runs from the last character to the first; the callers
            in this file pass the same memory for both arguments, and
            walking backwards keeps the not-yet-read chars from being
            overwritten by the wider elements. */
8156     register Py_ssize_t i;
8157     Py_ssize_t len = strlen(charbuffer);
8158     for (i = len - 1; i >= 0; i--)
8159         buffer[i] = (Py_UNICODE) charbuffer[i];
8160
8161     return len;
8162 }
8163
8164 static int
8165 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8166 {
         /* Format x with PyOS_ascii_formatd() according to format, using
            buffer itself as the temporary char buffer, then widen the
            result in place with strtounicode().  len is handed to
            PyOS_ascii_formatd() as the size limit.  Returns the number of
            characters produced. */
8167     Py_ssize_t result;
8168
8169     PyOS_ascii_formatd((char *)buffer, len, format, x);
8170     result = strtounicode(buffer, (char *)buffer);
8171     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8172 }
8173
8174 static int
8175 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8176 {
         /* Format x with PyOS_snprintf() according to format, using buffer
            itself as the temporary char buffer, then widen the result in
            place with strtounicode().  len is handed to PyOS_snprintf() as
            the size limit.  Returns the number of characters produced. */
8177     Py_ssize_t result;
8178
8179     PyOS_snprintf((char *)buffer, len, format, x);
8180     result = strtounicode(buffer, (char *)buffer);
8181     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8182 }
8183
8184 /* XXX To save some code duplication, formatfloat/long/int could have been
8185    shared with stringobject.c, converting from 8-bit to Unicode after the
8186    formatting is done. */
8187
8188 static int
8189 formatfloat(Py_UNICODE *buf,
8190             size_t buflen,
8191             int flags,
8192             int prec,
8193             int type,
8194             PyObject *v)
8195 {
         /* Format v as a float into buf (capacity buflen elements).

            flags: only F_ALT is consulted here (adds '#' to the C format).
            prec:  precision; a negative value selects the default of 6.
            type:  conversion character placed in the C format string.

            Returns the number of characters written, or -1 with an
            exception set if v cannot be converted to a double or the
            worst-case output would not fit in buf. */
8196     /* fmt = '%#.' + `prec` + `type`
8197        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8198     char fmt[20];
8199     double x;
8200
8201     x = PyFloat_AsDouble(v);
8202     if (x == -1.0 && PyErr_Occurred())
8203         return -1;
8204     if (prec < 0)
8205         prec = 6;
         /* The test below is |x| >= 1e50: such values are switched from
            'f' to 'g', which keeps the integer part within the "x < 50"
            digits assumed by the length estimate further down. */
8206     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8207         type = 'g';
8208     /* Worst case length calc to ensure no buffer overrun:
8209
8210        'g' formats:
8211          fmt = %#.<prec>g
8212          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8213             for any double rep.)
8214          len = 1 + prec + 1 + 2 + 5 = 9 + prec
8215
8216        'f' formats:
8217          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8218          len = 1 + 50 + 1 + prec = 52 + prec
8219
8220        If prec=0 the effective precision is 1 (the leading digit is
8221        always given), therefore increase the length by one.
8222
8223     */
         /* NOTE(review): only 'g', 'G' and 'f' are checked here; 'e' and
            'E' reach doubletounicode() without a worst-case length test.
            The write is still limited by buflen there -- confirm whether
            silent truncation is acceptable for those types. */
8224     if (((type == 'g' || type == 'G') &&
8225           buflen <= (size_t)10 + (size_t)prec) ||
8226         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8227         PyErr_SetString(PyExc_OverflowError,
8228                         "formatted float is too long (precision too large?)");
8229         return -1;
8230     }
8231     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8232                   (flags&F_ALT) ? "#" : "",
8233                   prec, type);
8234     return doubletounicode(buf, buflen, fmt, x);
8235 }
8236
8237 static PyObject*
8238 formatlong(PyObject *val, int flags, int prec, int type)
8239 {
             /* Format val by delegating to _PyString_FormatLong() and
                copying its output, character by character, into a new
                unicode object.

                buf/len receive the start and length of the formatted text
                from _PyString_FormatLong(); str is the temporary string
                object it returns and is released once the characters have
                been copied.  Returns a new unicode object, or NULL if
                either the formatting or the allocation fails. */
8240         char *buf;
8241         int i, len;
8242         PyObject *str; /* temporary string object. */
8243         PyUnicodeObject *result;
8244
8245         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8246         if (!str)
8247                 return NULL;
8248         result = _PyUnicode_New(len);
8249         if (!result) {
8250                 Py_DECREF(str);
8251                 return NULL;
8252         }
8253         for (i = 0; i < len; i++)
8254                 result->str[i] = buf[i];
8255         result->str[len] = 0;
8256         Py_DECREF(str);
8257         return (PyObject*)result;
8258 }
8259
8260 static int
8261 formatint(Py_UNICODE *buf,
8262           size_t buflen,
8263           int flags,
8264           int prec,
8265           int type,
8266           PyObject *v)
8267 {
         /* Format v as a C long into buf (capacity buflen elements).

            flags: only F_ALT is consulted here.
            prec:  minimum number of digits; a negative value selects 1.
            type:  'd', 'u', 'o', 'x' or 'X' ('u' is treated as 'd' for
                   negative values).

            For 'x', 'X' and 'o' a negative value is emitted as an explicit
            '-' followed by the formatted magnitude, because the C
            conversions themselves are unsigned.

            Returns the number of characters written, or -1 with an
            exception set if v cannot be converted to a long or the
            worst-case output would not fit in buf. */
8268     /* fmt = '%#.' + `prec` + 'l' + `type`
8269      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8270      *                     + 1 + 1
8271      *                   = 24
8272      */
8273     char fmt[64]; /* plenty big enough! */
8274     char *sign;
8275     long x;
8276
8277     x = PyInt_AsLong(v);
8278     if (x == -1 && PyErr_Occurred())
8279         return -1;
8280     if (x < 0 && type == 'u') {
8281         type = 'd';
8282     }
8283     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8284         sign = "-";
8285     else
8286         sign = "";
8287     if (prec < 0)
8288         prec = 1;
8289
8290     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8291      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8292      */
8293     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8294         PyErr_SetString(PyExc_OverflowError,
8295                 "formatted integer is too long (precision too large?)");
8296         return -1;
8297     }
8298
8299     if ((flags & F_ALT) &&
8300         (type == 'x' || type == 'X')) {
8301         /* When converting under %#x or %#X, there are a number
8302          * of issues that cause pain:
8303          * - when 0 is being converted, the C standard leaves off
8304          *   the '0x' or '0X', which is inconsistent with other
8305          *   %#x/%#X conversions and inconsistent with Python's
8306          *   hex() function
8307          * - there are platforms that violate the standard and
8308          *   convert 0 with the '0x' or '0X'
8309          *   (Metrowerks, Compaq Tru64)
8310          * - there are platforms that give '0x' when converting
8311          *   under %#X, but convert 0 in accordance with the
8312          *   standard (OS/2 EMX)
8313          *
8314          * We can achieve the desired consistency by inserting our
8315          * own '0x' or '0X' prefix, and substituting %x/%X in place
8316          * of %#x/%#X.
8317          *
8318          * Note that this is the same approach as used in
8319          * formatint() in stringobject.c
8320          */
8321         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8322                       sign, type, prec, type);
8323     }
8324     else {
8325         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8326                       sign, (flags&F_ALT) ? "#" : "",
8327                       prec, type);
8328     }
8329     if (sign[0])
             /* The '-' is already part of fmt, so pass the magnitude.  It
                is computed in unsigned arithmetic: plain -x overflows
                (undefined behaviour) when x == LONG_MIN, while the unsigned
                form yields the same bit pattern for every other value and
                a well-defined one for LONG_MIN. */
8330         return longtounicode(buf, buflen, fmt, (long)(0UL - (unsigned long)x));
8331     else
8332         return longtounicode(buf, buflen, fmt, x);
8333 }
8334
8335 static int
8336 formatchar(Py_UNICODE *buf,
8337            size_t buflen,
8338            PyObject *v)
8339 {
         /* Implement the %c conversion: store the single character
            described by v in buf[0], NUL-terminate at buf[1] and return 1.

            v may be
              - a unicode object of length 1,
              - an 8-bit string of length 1 (a non-ASCII byte is decoded
                with the default encoding, exactly as %s would do), or
              - an integer code point within the range of this build.

            Returns -1 with an exception set otherwise: TypeError for a
            wrong type or length, OverflowError for an out-of-range
            integer, or the decoding error for an undecodable byte.
            buflen is not consulted. */
8340     /* presume that the buffer is at least 2 characters long */
8341     if (PyUnicode_Check(v)) {
8342         if (PyUnicode_GET_SIZE(v) != 1)
8343             goto onError;
8344         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8345     }
8346
8347     else if (PyString_Check(v)) {
             const char *str;
             PyObject *unistr;

8348         if (PyString_GET_SIZE(v) != 1)
8349             goto onError;
             str = PyString_AS_STRING(v);
             if ((unsigned char)str[0] > 0x7F) {
                 /* Non-ASCII byte.  Widening it directly would sign-extend
                    on platforms where char is signed and would bypass the
                    default encoding, so decode it instead; a decoding
                    failure is reported as-is (not turned into the generic
                    TypeError below). */
                 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
                 if (unistr == NULL)
                     return -1;
                 if (PyUnicode_GET_SIZE(unistr) != 1) {
                     Py_DECREF(unistr);
                     goto onError;
                 }
                 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
                 Py_DECREF(unistr);
             }
             else
8350             buf[0] = (Py_UNICODE)str[0];
8351     }
8352
8353     else {
8354         /* Integer input truncated to a character */
8355         long x;
8356         x = PyInt_AsLong(v);
8357         if (x == -1 && PyErr_Occurred())
8358             goto onError;
8359 #ifdef Py_UNICODE_WIDE
8360         if (x < 0 || x > 0x10ffff) {
8361             PyErr_SetString(PyExc_OverflowError,
8362                             "%c arg not in range(0x110000) "
8363                             "(wide Python build)");
8364             return -1;
8365         }
8366 #else
8367         if (x < 0 || x > 0xffff) {
8368             PyErr_SetString(PyExc_OverflowError,
8369                             "%c arg not in range(0x10000) "
8370                             "(narrow Python build)");
8371             return -1;
8372         }
8373 #endif
8374         buf[0] = (Py_UNICODE) x;
8375     }
8376     buf[1] = '\0';
8377     return 1;
8378
8379  onError:
         /* Replaces whatever exception may already be set (e.g. from
            PyInt_AsLong) with the generic message. */
8380     PyErr_SetString(PyExc_TypeError,
8381                     "%c requires int or char");
8382     return -1;
8383 }
8385 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8386
8387    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8388    chars are formatted. XXX This is a magic number. Each formatting
8389    routine does bounds checking to ensure no overflow, but a better
8390    solution may be to malloc a buffer of appropriate size for each
8391    format. For now, the current solution is sufficient.

        The length is a count of Py_UNICODE elements, not bytes: it is used
        as the dimension of the Py_UNICODE formatbuf[] array declared inside
        PyUnicode_Format().
8392 */
8393 #define FORMATBUFLEN (size_t)120
8394
8395 PyObject *PyUnicode_Format(PyObject *format,
8396                            PyObject *args)
8397 {
         /* Build the unicode result of "format % args".

            format: any object accepted by PyUnicode_FromObject().
            args:   a tuple of arguments, a single non-tuple argument, or a
                    mapping used by "%(key)..." conversions.

            Returns a new unicode object, or NULL with an exception set.

            Bookkeeping used throughout:
              fmt / fmtcnt   next format character / characters left
              res            next free slot in the result buffer
              reslen         allocated size of the result buffer
              rescnt         free slots remaining after res
              arglen/argidx  argument cursor, see getnextarg()
              args_owned     non-zero while args is a reference obtained
                             from a mapping lookup that must be released */
8398     Py_UNICODE *fmt, *res;
8399     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8400     int args_owned = 0;
8401     PyUnicodeObject *result = NULL;
8402     PyObject *dict = NULL;
8403     PyObject *uformat;
8404
8405     if (format == NULL || args == NULL) {
8406         PyErr_BadInternalCall();
8407         return NULL;
8408     }
8409     uformat = PyUnicode_FromObject(format);
8410     if (uformat == NULL)
8411         return NULL;
8412     fmt = PyUnicode_AS_UNICODE(uformat);
8413     fmtcnt = PyUnicode_GET_SIZE(uformat);
8414
         /* Start with room for the format text plus some slack; the buffer
            is grown on demand and trimmed to its final size at the end. */
8415     reslen = rescnt = fmtcnt + 100;
8416     result = _PyUnicode_New(reslen);
8417     if (result == NULL)
8418         goto onError;
8419     res = PyUnicode_AS_UNICODE(result);
8420
8421     if (PyTuple_Check(args)) {
8422         arglen = PyTuple_Size(args);
8423         argidx = 0;
8424     }
8425     else {
             /* Single-argument mode: these sentinels let getnextarg() hand
                out args exactly once. */
8426         arglen = -1;
8427         argidx = -2;
8428     }
         /* Anything with a mapping interface that is neither a tuple nor a
            string may be used for "%(key)..." lookups. */
8429     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8430         !PyObject_TypeCheck(args, &PyBaseString_Type))
8431         dict = args;
8432
8433     while (--fmtcnt >= 0) {
8434         if (*fmt != '%') {
                 /* Literal character: copy it, growing the result first if
                    no free slot is left. */
8435             if (--rescnt < 0) {
8436                 rescnt = fmtcnt + 100;
8437                 reslen += rescnt;
8438                 if (_PyUnicode_Resize(&result, reslen) < 0)
8439                     goto onError;
8440                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8441                 --rescnt;
8442             }
8443             *res++ = *fmt++;
8444         }
8445         else {
8446             /* Got a format specifier */
8447             int flags = 0;
8448             Py_ssize_t width = -1;
8449             int prec = -1;
8450             Py_UNICODE c = '\0';
8451             Py_UNICODE fill;
8452             int isnumok;
8453             PyObject *v = NULL;
8454             PyObject *temp = NULL;
8455             Py_UNICODE *pbuf;
8456             Py_UNICODE sign;
8457             Py_ssize_t len;
8458             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8459
8460             fmt++;
8461             if (*fmt == '(') {
                     /* "%(key)...": look the key up in the mapping and use
                        the value as the sole argument of this conversion. */
8462                 Py_UNICODE *keystart;
8463                 Py_ssize_t keylen;
8464                 PyObject *key;
8465                 int pcount = 1;
8466
8467                 if (dict == NULL) {
8468                     PyErr_SetString(PyExc_TypeError,
8469                                     "format requires a mapping");
8470                     goto onError;
8471                 }
8472                 ++fmt;
8473                 --fmtcnt;
8474                 keystart = fmt;
8475                 /* Skip over balanced parentheses */
8476                 while (pcount > 0 && --fmtcnt >= 0) {
8477                     if (*fmt == ')')
8478                         --pcount;
8479                     else if (*fmt == '(')
8480                         ++pcount;
8481                     fmt++;
8482                 }
8483                 keylen = fmt - keystart - 1;
8484                 if (fmtcnt < 0 || pcount > 0) {
8485                     PyErr_SetString(PyExc_ValueError,
8486                                     "incomplete format key");
8487                     goto onError;
8488                 }
8489 #if 0
8490                 /* keys are converted to strings using UTF-8 and
8491                    then looked up since Python uses strings to hold
8492                    variables names etc. in its namespaces and we
8493                    wouldn't want to break common idioms. */
8494                 key = PyUnicode_EncodeUTF8(keystart,
8495                                            keylen,
8496                                            NULL);
8497 #else
8498                 key = PyUnicode_FromUnicode(keystart, keylen);
8499 #endif
8500                 if (key == NULL)
8501                     goto onError;
                     /* Drop the value fetched for a previous key, if any. */
8502                 if (args_owned) {
8503                     Py_DECREF(args);
8504                     args_owned = 0;
8505                 }
8506                 args = PyObject_GetItem(dict, key);
8507                 Py_DECREF(key);
8508                 if (args == NULL) {
8509                     goto onError;
8510                 }
8511                 args_owned = 1;
8512                 arglen = -1;
8513                 argidx = -2;
8514             }
                 /* Flags: consume any run of '-', '+', ' ', '#', '0'; the
                    first other character is left in c. */
8515             while (--fmtcnt >= 0) {
8516                 switch (c = *fmt++) {
8517                 case '-': flags |= F_LJUST; continue;
8518                 case '+': flags |= F_SIGN; continue;
8519                 case ' ': flags |= F_BLANK; continue;
8520                 case '#': flags |= F_ALT; continue;
8521                 case '0': flags |= F_ZERO; continue;
8522                 }
8523                 break;
8524             }
                 /* Width: either '*' (taken from the arguments; a negative
                    value means left-justify) or a decimal literal. */
8525             if (c == '*') {
8526                 v = getnextarg(args, arglen, &argidx);
8527                 if (v == NULL)
8528                     goto onError;
8529                 if (!PyInt_Check(v)) {
8530                     PyErr_SetString(PyExc_TypeError,
8531                                     "* wants int");
8532                     goto onError;
8533                 }
8534                 width = PyInt_AsLong(v);
8535                 if (width < 0) {
8536                     flags |= F_LJUST;
8537                     width = -width;
8538                 }
8539                 if (--fmtcnt >= 0)
8540                     c = *fmt++;
8541             }
8542             else if (c >= '0' && c <= '9') {
8543                 width = c - '0';
8544                 while (--fmtcnt >= 0) {
8545                     c = *fmt++;
8546                     if (c < '0' || c > '9')
8547                         break;
8548                     if ((width*10) / 10 != width) {
8549                         PyErr_SetString(PyExc_ValueError,
8550                                         "width too big");
8551                         goto onError;
8552                     }
8553                     width = width*10 + (c - '0');
8554                 }
8555             }
                 /* Precision: '.' followed by '*' or a decimal literal; a
                    bare '.' means precision 0. */
8556             if (c == '.') {
8557                 prec = 0;
8558                 if (--fmtcnt >= 0)
8559                     c = *fmt++;
8560                 if (c == '*') {
                         long lprec;

8561                     v = getnextarg(args, arglen, &argidx);
8562                     if (v == NULL)
8563                         goto onError;
8564                     if (!PyInt_Check(v)) {
8565                         PyErr_SetString(PyExc_TypeError,
8566                                         "* wants int");
8567                         goto onError;
8568                     }
                         /* Range-check in long before narrowing to int, so
                            that a value above INT_MAX is rejected instead
                            of being silently truncated to some unrelated
                            precision. */
8569                     lprec = PyInt_AsLong(v);
                         if (lprec > INT_MAX) {
                             PyErr_SetString(PyExc_ValueError,
                                             "prec too big");
                             goto onError;
                         }
8570                     if (lprec < 0)
8571                         lprec = 0;
                         prec = (int)lprec;
8572                     if (--fmtcnt >= 0)
8573                         c = *fmt++;
8574                 }
8575                 else if (c >= '0' && c <= '9') {
8576                     prec = c - '0';
8577                     while (--fmtcnt >= 0) {
                             /* Keep the full Py_UNICODE value: masking it
                                to 8 bits would make a non-Latin-1
                                character look like a digit or a valid
                                conversion character. */
8578                         c = *fmt++;
8579                         if (c < '0' || c > '9')
8580                             break;
8581                         if ((prec*10) / 10 != prec) {
8582                             PyErr_SetString(PyExc_ValueError,
8583                                             "prec too big");
8584                             goto onError;
8585                         }
8586                         prec = prec*10 + (c - '0');
8587                     }
8588                 }
8589             } /* prec */
                 /* A C-style length modifier is accepted and ignored. */
8590             if (fmtcnt >= 0) {
8591                 if (c == 'h' || c == 'l' || c == 'L') {
8592                     if (--fmtcnt >= 0)
8593                         c = *fmt++;
8594                 }
8595             }
8596             if (fmtcnt < 0) {
8597                 PyErr_SetString(PyExc_ValueError,
8598                                 "incomplete format");
8599                 goto onError;
8600             }
                 /* Every conversion except "%%" consumes one argument. */
8601             if (c != '%') {
8602                 v = getnextarg(args, arglen, &argidx);
8603                 if (v == NULL)
8604                     goto onError;
8605             }
                 /* Each case below leaves the converted text in pbuf/len.
                    sign is set to 1 by the numeric conversions to request
                    the sign handling that follows the switch. */
8606             sign = 0;
8607             fill = ' ';
8608             switch (c) {
8609
8610             case '%':
8611                 pbuf = formatbuf;
8612                 /* presume that buffer length is at least 1 */
8613                 pbuf[0] = '%';
8614                 len = 1;
8615                 break;
8616
8617             case 's':
8618             case 'r':
8619                 if (PyUnicode_Check(v) && c == 's') {
8620                     temp = v;
8621                     Py_INCREF(temp);
8622                 }
8623                 else {
8624                     PyObject *unicode;
8625                     if (c == 's')
8626                         temp = PyObject_Unicode(v);
8627                     else
8628                         temp = PyObject_Repr(v);
8629                     if (temp == NULL)
8630                         goto onError;
8631                     if (PyUnicode_Check(temp))
8632                         /* nothing to do */;
8633                     else if (PyString_Check(temp)) {
8634                         /* convert to string to Unicode */
8635                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8636                                                    PyString_GET_SIZE(temp),
8637                                                    NULL,
8638                                                    "strict");
8639                         Py_DECREF(temp);
8640                         temp = unicode;
8641                         if (temp == NULL)
8642                             goto onError;
8643                     }
8644                     else {
8645                         Py_DECREF(temp);
8646                         PyErr_SetString(PyExc_TypeError,
8647                                         "%s argument has non-string str()");
8648                         goto onError;
8649                     }
8650                 }
8651                 pbuf = PyUnicode_AS_UNICODE(temp);
8652                 len = PyUnicode_GET_SIZE(temp);
                     /* For %s/%r the precision truncates the text. */
8653                 if (prec >= 0 && len > prec)
8654                     len = prec;
8655                 break;
8656
8657             case 'i':
8658             case 'd':
8659             case 'u':
8660             case 'o':
8661             case 'x':
8662             case 'X':
8663                 if (c == 'i')
8664                     c = 'd';
8665                 isnumok = 0;
8666                 if (PyNumber_Check(v)) {
8667                         PyObject *iobj=NULL;
8668
8669                         if (PyInt_Check(v) || (PyLong_Check(v))) {
8670                                 iobj = v;
8671                                 Py_INCREF(iobj);
8672                         }
8673                         else {
8674                                 iobj = PyNumber_Int(v);
8675                                 if (iobj==NULL) iobj = PyNumber_Long(v);
8676                         }
8677                         if (iobj!=NULL) {
                                     /* Plain ints are formatted into the
                                        stack buffer; longs get their own
                                        temporary unicode object. */
8678                                 if (PyInt_Check(iobj)) {
8679                                         isnumok = 1;
8680                                         pbuf = formatbuf;
8681                                         len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8682                                                     flags, prec, c, iobj);
8683                                         Py_DECREF(iobj);
8684                                         if (len < 0)
8685                                             goto onError;
8686                                         sign = 1;
8687                                 }
8688                                 else if (PyLong_Check(iobj)) {
8689                                         isnumok = 1;
8690                                         temp = formatlong(iobj, flags, prec, c);
8691                                         Py_DECREF(iobj);
8692                                         if (!temp)
8693                                             goto onError;
8694                                         pbuf = PyUnicode_AS_UNICODE(temp);
8695                                         len = PyUnicode_GET_SIZE(temp);
8696                                         sign = 1;
8697                                 }
8698                                 else {
8699                                         Py_DECREF(iobj);
8700                                 }
8701                         }
8702                 }
8703                 if (!isnumok) {
8704                         PyErr_Format(PyExc_TypeError,
8705                             "%%%c format: a number is required, "
8706                                      "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8707                         goto onError;
8708                 }
8709                 if (flags & F_ZERO)
8710                     fill = '0';
8711                 break;
8712
8713             case 'e':
8714             case 'E':
8715             case 'f':
8716             case 'F':
8717             case 'g':
8718             case 'G':
8719                 if (c == 'F')
8720                         c = 'f';
8721                 pbuf = formatbuf;
8722                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8723                         flags, prec, c, v);
8724                 if (len < 0)
8725                     goto onError;
8726                 sign = 1;
8727                 if (flags & F_ZERO)
8728                     fill = '0';
8729                 break;
8730
8731             case 'c':
8732                 pbuf = formatbuf;
8733                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8734                 if (len < 0)
8735                     goto onError;
8736                 break;
8737
8738             default:
8739                 PyErr_Format(PyExc_ValueError,
8740                              "unsupported format character '%c' (0x%x) "
8741                              "at index %zd",
8742                              (31<=c && c<=126) ? (char)c : '?',
8743                              (int)c,
8744                              (Py_ssize_t)(fmt - 1 -
8745                                           PyUnicode_AS_UNICODE(uformat)));
8746                 goto onError;
8747             }
                 /* Numeric result: split an explicit leading sign off the
                    text, or synthesise '+' / ' ' when the flags ask for
                    one.  From here on sign holds the sign character to
                    emit, or 0 for none. */
8748             if (sign) {
8749                 if (*pbuf == '-' || *pbuf == '+') {
8750                     sign = *pbuf++;
8751                     len--;
8752                 }
8753                 else if (flags & F_SIGN)
8754                     sign = '+';
8755                 else if (flags & F_BLANK)
8756                     sign = ' ';
8757                 else
8758                     sign = 0;
8759             }
8760             if (width < len)
8761                 width = len;
                 /* Make sure the field (plus sign) fits, growing the result
                    with slack for the rest of the format. */
8762             if (rescnt - (sign != 0) < width) {
8763                 reslen -= rescnt;
8764                 rescnt = width + fmtcnt + 100;
8765                 reslen += rescnt;
8766                 if (reslen < 0) {
8767                     Py_XDECREF(temp);
8768                     PyErr_NoMemory();
8769                     goto onError;
8770                 }
8771                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8772                     Py_XDECREF(temp);
8773                     goto onError;
8774                 }
8775                 res = PyUnicode_AS_UNICODE(result)
8776                     + reslen - rescnt;
8777             }
                 /* With zero fill the sign and any "0x" prefix go before
                    the padding; with space fill they are written after it
                    (see the fill == ' ' block below). */
8778             if (sign) {
8779                 if (fill != ' ')
8780                     *res++ = sign;
8781                 rescnt--;
8782                 if (width > len)
8783                     width--;
8784             }
8785             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8786                 assert(pbuf[0] == '0');
8787                 assert(pbuf[1] == c);
8788                 if (fill != ' ') {
8789                     *res++ = *pbuf++;
8790                     *res++ = *pbuf++;
8791                 }
8792                 rescnt -= 2;
8793                 width -= 2;
8794                 if (width < 0)
8795                     width = 0;
8796                 len -= 2;
8797             }
                 /* Left padding (right-justified field). */
8798             if (width > len && !(flags & F_LJUST)) {
8799                 do {
8800                     --rescnt;
8801                     *res++ = fill;
8802                 } while (--width > len);
8803             }
8804             if (fill == ' ') {
8805                 if (sign)
8806                     *res++ = sign;
8807                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8808                     assert(pbuf[0] == '0');
8809                     assert(pbuf[1] == c);
8810                     *res++ = *pbuf++;
8811                     *res++ = *pbuf++;
8812                 }
8813             }
8814             Py_UNICODE_COPY(res, pbuf, len);
8815             res += len;
8816             rescnt -= len;
                 /* Right padding (left-justified field) is always spaces. */
8817             while (--width >= len) {
8818                 --rescnt;
8819                 *res++ = ' ';
8820             }
8821             if (dict && (argidx < arglen) && c != '%') {
8822                 PyErr_SetString(PyExc_TypeError,
8823                                 "not all arguments converted during string formatting");
8824                 Py_XDECREF(temp);
8825                 goto onError;
8826             }
8827             Py_XDECREF(temp);
8828         } /* '%' */
8829     } /* until end */
         /* Positional arguments must all have been consumed. */
8830     if (argidx < arglen && !dict) {
8831         PyErr_SetString(PyExc_TypeError,
8832                         "not all arguments converted during string formatting");
8833         goto onError;
8834     }
8835
         /* Trim the unused tail of the result buffer. */
8836     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8837         goto onError;
8838     if (args_owned) {
8839         Py_DECREF(args);
8840     }
8841     Py_DECREF(uformat);
8842     return (PyObject *)result;
8843
8844  onError:
8845     Py_XDECREF(result);
8846     Py_DECREF(uformat);
8847     if (args_owned) {
8848         Py_DECREF(args);
8849     }
8850     return NULL;
8851 }
8852
8853 static PyBufferProcs unicode_as_buffer = {
         /* Buffer interface of the unicode type (installed as tp_as_buffer
            of PyUnicode_Type): read access to the raw Py_UNICODE data, no
            write access, a single segment, and a character buffer holding
            the default-encoded form. */
8854     (readbufferproc) unicode_buffer_getreadbuf,
8855     (writebufferproc) unicode_buffer_getwritebuf,
8856     (segcountproc) unicode_buffer_getsegcount,
8857     (charbufferproc) unicode_buffer_getcharbuf,
8858 };
8859
8860 static PyObject *
8861 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8862
8863 static PyObject *
8864 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8865 {
             /* tp_new of the unicode type:
                unicode([string[, encoding[, errors]]]).

                Requests for a subtype are forwarded to
                unicode_subtype_new().  Otherwise:
                  - no argument            -> a new empty unicode object
                  - only "string" given    -> PyObject_Unicode(string)
                  - encoding and/or errors -> PyUnicode_FromEncodedObject()

                Returns a new reference, or NULL with an exception set. */
8866         PyObject *x = NULL;
8867         static char *kwlist[] = {"string", "encoding", "errors", 0};
8868         char *encoding = NULL;
8869         char *errors = NULL;
8870
8871         if (type != &PyUnicode_Type)
8872                 return unicode_subtype_new(type, args, kwds);
8873         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8874                                           kwlist, &x, &encoding, &errors))
8875             return NULL;
8876         if (x == NULL)
8877                 return (PyObject *)_PyUnicode_New(0);
8878         if (encoding == NULL && errors == NULL)
8879             return PyObject_Unicode(x);
8880         else
8881         return PyUnicode_FromEncodedObject(x, encoding, errors);
8882 }
8883
8884 static PyObject *
8885 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8886 {
             /* Construct an instance of a unicode subtype.

                The value is first built as an exact unicode object (tmp)
                by unicode_new(); an instance of the requested type is then
                allocated through type->tp_alloc and given its own copy of
                tmp's character data, length and hash.  tmp is released on
                every path.

                Returns a new reference, or NULL with an exception set. */
8887         PyUnicodeObject *tmp, *pnew;
8888         Py_ssize_t n;
8889
8890         assert(PyType_IsSubtype(type, &PyUnicode_Type));
8891         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8892         if (tmp == NULL)
8893                 return NULL;
8894         assert(PyUnicode_Check(tmp));
8895         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8896         if (pnew == NULL) {
8897                 Py_DECREF(tmp);
8898                 return NULL;
8899         }
8900         pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8901         if (pnew->str == NULL) {
                     /* The new object has no character buffer yet, so it is
                        torn down by hand rather than through a normal
                        decref. */
8902                 _Py_ForgetReference((PyObject *)pnew);
8903                 PyObject_Del(pnew);
8904                 Py_DECREF(tmp);
8905                 return PyErr_NoMemory();
8906         }
             /* n+1 elements: the characters plus the element that follows
                them (presumably the terminating NUL). */
8907         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8908         pnew->length = n;
8909         pnew->hash = tmp->hash;
8910         Py_DECREF(tmp);
8911         return (PyObject *)pnew;
8912 }
8913
8914 PyDoc_STRVAR(unicode_doc, /* docstring of the type; used as tp_doc of PyUnicode_Type */
8915 "unicode(string [, encoding[, errors]]) -> object\n\
8916 \n\
8917 Create a new Unicode object from the given encoded string.\n\
8918 encoding defaults to the current default string encoding.\n\
8919 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8920
8921 PyTypeObject PyUnicode_Type = {
         /* Type object for "unicode".  Unlisted behaviour comes from the
            base type PyBaseString_Type; construction goes through
            unicode_new(), and the type may be subclassed
            (Py_TPFLAGS_BASETYPE). */
8922     PyVarObject_HEAD_INIT(&PyType_Type, 0)
8923     "unicode",                          /* tp_name */
8924     sizeof(PyUnicodeObject),            /* tp_basicsize */
8925     0,                                  /* tp_itemsize */
8926     /* Slots */
8927     (destructor)unicode_dealloc,        /* tp_dealloc */
8928     0,                                  /* tp_print */
8929     0,                                  /* tp_getattr */
8930     0,                                  /* tp_setattr */
8931     0,                                  /* tp_compare */
8932     unicode_repr,                       /* tp_repr */
8933     &unicode_as_number,                 /* tp_as_number */
8934     &unicode_as_sequence,               /* tp_as_sequence */
8935     &unicode_as_mapping,                /* tp_as_mapping */
8936     (hashfunc) unicode_hash,            /* tp_hash*/
8937     0,                                  /* tp_call*/
8938     (reprfunc) unicode_str,             /* tp_str */
8939     PyObject_GenericGetAttr,            /* tp_getattro */
8940     0,                                  /* tp_setattro */
8941     &unicode_as_buffer,                 /* tp_as_buffer */
8942     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8943             Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
8944     unicode_doc,                        /* tp_doc */
8945     0,                                  /* tp_traverse */
8946     0,                                  /* tp_clear */
8947     PyUnicode_RichCompare,              /* tp_richcompare */
8948     0,                                  /* tp_weaklistoffset */
8949     0,                                  /* tp_iter */
8950     0,                                  /* tp_iternext */
8951     unicode_methods,                    /* tp_methods */
8952     0,                                  /* tp_members */
8953     0,                                  /* tp_getset */
8954     &PyBaseString_Type,                 /* tp_base */
8955     0,                                  /* tp_dict */
8956     0,                                  /* tp_descr_get */
8957     0,                                  /* tp_descr_set */
8958     0,                                  /* tp_dictoffset */
8959     0,                                  /* tp_init */
8960     0,                                  /* tp_alloc */
8961     unicode_new,                        /* tp_new */
8962     PyObject_Del,               /* tp_free */
8963 };
8964
8965 /* Initialize the Unicode implementation */
8966
8967 void _PyUnicode_Init(void)
8968 {
8969     int i;
8970
8971     /* XXX - move this array to unicodectype.c ? */
8972     Py_UNICODE linebreak[] = {
8973         0x000A, /* LINE FEED */
8974         0x000D, /* CARRIAGE RETURN */
8975         0x001C, /* FILE SEPARATOR */
8976         0x001D, /* GROUP SEPARATOR */
8977         0x001E, /* RECORD SEPARATOR */
8978         0x0085, /* NEXT LINE */
8979         0x2028, /* LINE SEPARATOR */
8980         0x2029, /* PARAGRAPH SEPARATOR */
8981     };
8982
8983     /* Init the implementation */
8984     free_list = NULL;
8985     numfree = 0;
8986     unicode_empty = _PyUnicode_New(0);
8987     if (!unicode_empty)
8988         return;
8989
8990     strcpy(unicode_default_encoding, "ascii");
8991     for (i = 0; i < 256; i++)
8992         unicode_latin1[i] = NULL;
8993     if (PyType_Ready(&PyUnicode_Type) < 0)
8994         Py_FatalError("Can't initialize 'unicode'");
8995
8996     /* initialize the linebreak bloom filter */
8997     bloom_linebreak = make_bloom_mask(
8998         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8999         );
9000
9001     PyType_Ready(&EncodingMapType);
9002 }
9003
9004 /* Finalize the Unicode implementation */
9005
9006 int
9007 PyUnicode_ClearFreeList(void)
9008 {
9009     int freelist_size = numfree;
9010     PyUnicodeObject *u;
9011
9012     for (u = free_list; u != NULL;) {
9013         PyUnicodeObject *v = u;
9014         u = *(PyUnicodeObject **)u;
9015         if (v->str)
9016             PyObject_DEL(v->str);
9017         Py_XDECREF(v->defenc);
9018         PyObject_Del(v);
9019         numfree--;
9020     }
9021     free_list = NULL;
9022     assert(numfree == 0);
9023     return freelist_size;
9024 }
9025
9026 void
9027 _PyUnicode_Fini(void)
9028 {
9029     int i;
9030
9031     Py_XDECREF(unicode_empty);
9032     unicode_empty = NULL;
9033
9034     for (i = 0; i < 256; i++) {
9035         if (unicode_latin1[i]) {
9036             Py_DECREF(unicode_latin1[i]);
9037             unicode_latin1[i] = NULL;
9038         }
9039     }
9040     (void)PyUnicode_ClearFreeList();
9041 }
9042
9043 #ifdef __cplusplus
9044 }
9045 #endif
9046
9047
9048 /*
9049 Local variables:
9050 c-basic-offset: 4
9051 indent-tabs-mode: nil
9052 End:
9053 */