Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * CHARACTER TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * LINE TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000B, * LINE TABULATION */
 151 /*         0x000C, * FORM FEED */
 152 /*         0x000D, * CARRIAGE RETURN */
 153     0, 0, 1, 1, 1, 1, 0, 0,
 154     0, 0, 0, 0, 0, 0, 0, 0,
 155 /*         0x001C, * FILE SEPARATOR */
 156 /*         0x001D, * GROUP SEPARATOR */
 157 /*         0x001E, * RECORD SEPARATOR */
 158     0, 0, 0, 0, 1, 1, 1, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161     0, 0, 0, 0, 0, 0, 0, 0,
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0,
 170     0, 0, 0, 0, 0, 0, 0, 0,
 171     0, 0, 0, 0, 0, 0, 0, 0
 172 };
 173
 174
 175 Py_UNICODE
 176 PyUnicode_GetMax(void)
 177 {
 178 #ifdef Py_UNICODE_WIDE
 179     return 0x10FFFF;
 180 #else
 181     /* This is actually an illegal character, so it should
 182        not be passed to unichr. */
 183     return 0xFFFF;
 184 #endif
 185 }
 186
 187 /* --- Bloom Filters ----------------------------------------------------- */
 188
 189 /* stuff to implement simple "bloom filters" for Unicode characters.
 190    to keep things simple, we use a single bitmask, using the least 5
 191    bits from each unicode characters as the bit index. */
 192
 193 /* the linebreak mask is set up by Unicode_Init below */
 194
 195 #if LONG_BIT >= 128
 196 #define BLOOM_WIDTH 128
 197 #elif LONG_BIT >= 64
 198 #define BLOOM_WIDTH 64
 199 #elif LONG_BIT >= 32
 200 #define BLOOM_WIDTH 32
 201 #else
 202 #error "LONG_BIT is smaller than 32"
 203 #endif
 204
 205 #define BLOOM_MASK unsigned long
 206
 207 static BLOOM_MASK bloom_linebreak;
 208
 209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 210 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 211
 212 #define BLOOM_LINEBREAK(ch)                                             \
 213     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 214      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 215
 216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 217 {
 218     /* calculate simple bloom-style bitmask for a given unicode string */
 219
 220     BLOOM_MASK mask;
 221     Py_ssize_t i;
 222
 223     mask = 0;
 224     for (i = 0; i < len; i++)
 225         BLOOM_ADD(mask, ptr[i]);
 226
 227     return mask;
 228 }
 229
 230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 231 {
 232     Py_ssize_t i;
 233
 234     for (i = 0; i < setlen; i++)
 235         if (set[i] == chr)
 236             return 1;
 237
 238     return 0;
 239 }
 240
 241 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 242     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 243
 244 /* --- Unicode Object ----------------------------------------------------- */
 245
 246 static
 247 int unicode_resize(register PyUnicodeObject *unicode,
 248                    Py_ssize_t length)
 249 {
 250     void *oldstr;
 251
 252     /* Shortcut if there's nothing much to do. */
 253     if (unicode->length == length)
 254         goto reset;
 255
 256     /* Resizing shared object (unicode_empty or single character
 257        objects) in-place is not allowed. Use PyUnicode_Resize()
 258        instead ! */
 259
 260     if (unicode == unicode_empty ||
 261         (unicode->length == 1 &&
 262          unicode->str[0] < 256U &&
 263          unicode_latin1[unicode->str[0]] == unicode)) {
 264         PyErr_SetString(PyExc_SystemError,
 265                         "can't resize shared unicode objects");
 266         return -1;
 267     }
 268
 269     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 270        The overallocation is also used by fastsearch, which assumes that it's
 271        safe to look at str[length] (without making any assumptions about what
 272        it contains). */
 273
 274     oldstr = unicode->str;
 275     unicode->str = PyObject_REALLOC(unicode->str,
 276                                     sizeof(Py_UNICODE) * (length + 1));
 277     if (!unicode->str) {
 278         unicode->str = (Py_UNICODE *)oldstr;
 279         PyErr_NoMemory();
 280         return -1;
 281     }
 282     unicode->str[length] = 0;
 283     unicode->length = length;
 284
 285   reset:
 286     /* Reset the object caches */
 287     if (unicode->defenc) {
 288         Py_DECREF(unicode->defenc);
 289         unicode->defenc = NULL;
 290     }
 291     unicode->hash = -1;
 292
 293     return 0;
 294 }
 295
 296 /* We allocate one more byte to make sure the string is
 297    Ux0000 terminated -- XXX is this needed ?
 298
 299    XXX This allocator could further be enhanced by assuring that the
 300    free list never reduces its size below 1.
 301
 302 */
 303
 304 static
 305 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 306 {
 307     register PyUnicodeObject *unicode;
 308
 309     /* Optimization for empty strings */
 310     if (length == 0 && unicode_empty != NULL) {
 311         Py_INCREF(unicode_empty);
 312         return unicode_empty;
 313     }
 314
 315     /* Ensure we won't overflow the size. */
 316     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 317         return (PyUnicodeObject *)PyErr_NoMemory();
 318     }
 319
 320     /* Unicode freelist & memory allocation */
 321     if (free_list) {
 322         unicode = free_list;
 323         free_list = *(PyUnicodeObject **)unicode;
 324         numfree--;
 325         if (unicode->str) {
 326             /* Keep-Alive optimization: we only upsize the buffer,
 327                never downsize it. */
 328             if ((unicode->length < length) &&
 329                 unicode_resize(unicode, length) < 0) {
 330                 PyObject_DEL(unicode->str);
 331                 unicode->str = NULL;
 332             }
 333         }
 334         else {
 335             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 336             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 337         }
 338         PyObject_INIT(unicode, &PyUnicode_Type);
 339     }
 340     else {
 341         size_t new_size;
 342         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 343         if (unicode == NULL)
 344             return NULL;
 345         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 346         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 347     }
 348
 349     if (!unicode->str) {
 350         PyErr_NoMemory();
 351         goto onError;
 352     }
 353     /* Initialize the first element to guard against cases where
 354      * the caller fails before initializing str -- unicode_resize()
 355      * reads str[0], and the Keep-Alive optimization can keep memory
 356      * allocated for str alive across a call to unicode_dealloc(unicode).
 357      * We don't want unicode_resize to read uninitialized memory in
 358      * that case.
 359      */
 360     unicode->str[0] = 0;
 361     unicode->str[length] = 0;
 362     unicode->length = length;
 363     unicode->hash = -1;
 364     unicode->defenc = NULL;
 365     return unicode;
 366
 367   onError:
 368     /* XXX UNREF/NEWREF interface should be more symmetrical */
 369     _Py_DEC_REFTOTAL;
 370     _Py_ForgetReference((PyObject *)unicode);
 371     PyObject_Del(unicode);
 372     return NULL;
 373 }
 374
 375 static
 376 void unicode_dealloc(register PyUnicodeObject *unicode)
 377 {
 378     if (PyUnicode_CheckExact(unicode) &&
 379         numfree < PyUnicode_MAXFREELIST) {
 380         /* Keep-Alive optimization */
 381         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 382             PyObject_DEL(unicode->str);
 383             unicode->str = NULL;
 384             unicode->length = 0;
 385         }
 386         if (unicode->defenc) {
 387             Py_DECREF(unicode->defenc);
 388             unicode->defenc = NULL;
 389         }
 390         /* Add to free list */
 391         *(PyUnicodeObject **)unicode = free_list;
 392         free_list = unicode;
 393         numfree++;
 394     }
 395     else {
 396         PyObject_DEL(unicode->str);
 397         Py_XDECREF(unicode->defenc);
 398         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 399     }
 400 }
 401
 402 static
 403 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 404 {
 405     register PyUnicodeObject *v;
 406
 407     /* Argument checks */
 408     if (unicode == NULL) {
 409         PyErr_BadInternalCall();
 410         return -1;
 411     }
 412     v = *unicode;
 413     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 414         PyErr_BadInternalCall();
 415         return -1;
 416     }
 417
 418     /* Resizing unicode_empty and single character objects is not
 419        possible since these are being shared. We simply return a fresh
 420        copy with the same Unicode content. */
 421     if (v->length != length &&
 422         (v == unicode_empty || v->length == 1)) {
 423         PyUnicodeObject *w = _PyUnicode_New(length);
 424         if (w == NULL)
 425             return -1;
 426         Py_UNICODE_COPY(w->str, v->str,
 427                         length < v->length ? length : v->length);
 428         Py_DECREF(*unicode);
 429         *unicode = w;
 430         return 0;
 431     }
 432
 433     /* Note that we don't have to modify *unicode for unshared Unicode
 434        objects, since we can modify them in-place. */
 435     return unicode_resize(v, length);
 436 }
 437
 438 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 439 {
 440     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 441 }
 442
 443 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 444                                 Py_ssize_t size)
 445 {
 446     PyUnicodeObject *unicode;
 447
 448     /* If the Unicode data is known at construction time, we can apply
 449        some optimizations which share commonly used objects. */
 450     if (u != NULL) {
 451
 452         /* Optimization for empty strings */
 453         if (size == 0 && unicode_empty != NULL) {
 454             Py_INCREF(unicode_empty);
 455             return (PyObject *)unicode_empty;
 456         }
 457
 458         /* Single character Unicode objects in the Latin-1 range are
 459            shared when using this constructor */
 460         if (size == 1 && *u < 256) {
 461             unicode = unicode_latin1[*u];
 462             if (!unicode) {
 463                 unicode = _PyUnicode_New(1);
 464                 if (!unicode)
 465                     return NULL;
 466                 unicode->str[0] = *u;
 467                 unicode_latin1[*u] = unicode;
 468             }
 469             Py_INCREF(unicode);
 470             return (PyObject *)unicode;
 471         }
 472     }
 473
 474     unicode = _PyUnicode_New(size);
 475     if (!unicode)
 476         return NULL;
 477
 478     /* Copy the Unicode data into the new object */
 479     if (u != NULL)
 480         Py_UNICODE_COPY(unicode->str, u, size);
 481
 482     return (PyObject *)unicode;
 483 }
 484
 485 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 486 {
 487     PyUnicodeObject *unicode;
 488
 489     if (size < 0) {
 490         PyErr_SetString(PyExc_SystemError,
 491                         "Negative size passed to PyUnicode_FromStringAndSize");
 492         return NULL;
 493     }
 494
 495     /* If the Unicode data is known at construction time, we can apply
 496        some optimizations which share commonly used objects.
 497        Also, this means the input must be UTF-8, so fall back to the
 498        UTF-8 decoder at the end. */
 499     if (u != NULL) {
 500
 501         /* Optimization for empty strings */
 502         if (size == 0 && unicode_empty != NULL) {
 503             Py_INCREF(unicode_empty);
 504             return (PyObject *)unicode_empty;
 505         }
 506
 507         /* Single characters are shared when using this constructor.
 508            Restrict to ASCII, since the input must be UTF-8. */
 509         if (size == 1 && Py_CHARMASK(*u) < 128) {
 510             unicode = unicode_latin1[Py_CHARMASK(*u)];
 511             if (!unicode) {
 512                 unicode = _PyUnicode_New(1);
 513                 if (!unicode)
 514                     return NULL;
 515                 unicode->str[0] = Py_CHARMASK(*u);
 516                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 517             }
 518             Py_INCREF(unicode);
 519             return (PyObject *)unicode;
 520         }
 521
 522         return PyUnicode_DecodeUTF8(u, size, NULL);
 523     }
 524
 525     unicode = _PyUnicode_New(size);
 526     if (!unicode)
 527         return NULL;
 528
 529     return (PyObject *)unicode;
 530 }
 531
 532 PyObject *PyUnicode_FromString(const char *u)
 533 {
 534     size_t size = strlen(u);
 535     if (size > PY_SSIZE_T_MAX) {
 536         PyErr_SetString(PyExc_OverflowError, "input too long");
 537         return NULL;
 538     }
 539
 540     return PyUnicode_FromStringAndSize(u, size);
 541 }
 542
 543 #ifdef HAVE_WCHAR_H
 544
 545 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 546 # define CONVERT_WCHAR_TO_SURROGATES
 547 #endif
 548
 549 #ifdef CONVERT_WCHAR_TO_SURROGATES
 550
 551 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 552    to convert from UTF32 to UTF16. */
 553
 554 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 555                                  Py_ssize_t size)
 556 {
 557     PyUnicodeObject *unicode;
 558     register Py_ssize_t i;
 559     Py_ssize_t alloc;
 560     const wchar_t *orig_w;
 561
 562     if (w == NULL) {
 563         PyErr_BadInternalCall();
 564         return NULL;
 565     }
 566
 567     alloc = size;
 568     orig_w = w;
 569     for (i = size; i > 0; i--) {
 570         if (*w > 0xFFFF)
 571             alloc++;
 572         w++;
 573     }
 574     w = orig_w;
 575     unicode = _PyUnicode_New(alloc);
 576     if (!unicode)
 577         return NULL;
 578
 579     /* Copy the wchar_t data into the new object */
 580     {
 581         register Py_UNICODE *u;
 582         u = PyUnicode_AS_UNICODE(unicode);
 583         for (i = size; i > 0; i--) {
 584             if (*w > 0xFFFF) {
 585                 wchar_t ordinal = *w++;
 586                 ordinal -= 0x10000;
 587                 *u++ = 0xD800 | (ordinal >> 10);
 588                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 589             }
 590             else
 591                 *u++ = *w++;
 592         }
 593     }
 594     return (PyObject *)unicode;
 595 }
 596
 597 #else
 598
 599 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 600                                  Py_ssize_t size)
 601 {
 602     PyUnicodeObject *unicode;
 603
 604     if (w == NULL) {
 605         PyErr_BadInternalCall();
 606         return NULL;
 607     }
 608
 609     unicode = _PyUnicode_New(size);
 610     if (!unicode)
 611         return NULL;
 612
 613     /* Copy the wchar_t data into the new object */
 614 #ifdef HAVE_USABLE_WCHAR_T
 615     memcpy(unicode->str, w, size * sizeof(wchar_t));
 616 #else
 617     {
 618         register Py_UNICODE *u;
 619         register Py_ssize_t i;
 620         u = PyUnicode_AS_UNICODE(unicode);
 621         for (i = size; i > 0; i--)
 622             *u++ = *w++;
 623     }
 624 #endif
 625
 626     return (PyObject *)unicode;
 627 }
 628
 629 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 630
 631 #undef CONVERT_WCHAR_TO_SURROGATES
 632
 633 static void
 634 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 635 {
 636     *fmt++ = '%';
 637     if (width) {
 638         if (zeropad)
 639             *fmt++ = '0';
 640         fmt += sprintf(fmt, "%d", width);
 641     }
 642     if (precision)
 643         fmt += sprintf(fmt, ".%d", precision);
 644     if (longflag)
 645         *fmt++ = 'l';
 646     else if (size_tflag) {
 647         char *f = PY_FORMAT_SIZE_T;
 648         while (*f)
 649             *fmt++ = *f++;
 650     }
 651     *fmt++ = c;
 652     *fmt = '\0';
 653 }
 654
 655 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 656
 657 PyObject *
 658 PyUnicode_FromFormatV(const char *format, va_list vargs)
 659 {
 660     va_list count;
 661     Py_ssize_t callcount = 0;
 662     PyObject **callresults = NULL;
 663     PyObject **callresult = NULL;
 664     Py_ssize_t n = 0;
 665     int width = 0;
 666     int precision = 0;
 667     int zeropad;
 668     const char* f;
 669     Py_UNICODE *s;
 670     PyObject *string;
 671     /* used by sprintf */
 672     char buffer[21];
 673     /* use abuffer instead of buffer, if we need more space
 674      * (which can happen if there's a format specifier with width). */
 675     char *abuffer = NULL;
 676     char *realbuffer;
 677     Py_ssize_t abuffersize = 0;
 678     char fmt[60]; /* should be enough for %0width.precisionld */
 679     const char *copy;
 680
 681 #ifdef VA_LIST_IS_ARRAY
 682     Py_MEMCPY(count, vargs, sizeof(va_list));
 683 #else
 684 #ifdef  __va_copy
 685     __va_copy(count, vargs);
 686 #else
 687     count = vargs;
 688 #endif
 689 #endif
 690      /* step 1: count the number of %S/%R/%s format specifications
 691       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 692       * objects once during step 3 and put the result in an array) */
 693     for (f = format; *f; f++) {
 694          if (*f == '%') {
 695              if (*(f+1)=='%')
 696                  continue;
 697              if (*(f+1)=='S' || *(f+1)=='R')
 698                  ++callcount;
 699              while (isdigit((unsigned)*f))
 700                  width = (width*10) + *f++ - '0';
 701              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 702                  ;
 703              if (*f == 's')
 704                  ++callcount;
 705          }
 706     }
 707     /* step 2: allocate memory for the results of
 708      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 709     if (callcount) {
 710         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 711         if (!callresults) {
 712             PyErr_NoMemory();
 713             return NULL;
 714         }
 715         callresult = callresults;
 716     }
 717     /* step 3: figure out how large a buffer we need */
 718     for (f = format; *f; f++) {
 719         if (*f == '%') {
 720             const char* p = f;
 721             width = 0;
 722             while (isdigit((unsigned)*f))
 723                 width = (width*10) + *f++ - '0';
 724             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 725                 ;
 726
 727             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 728              * they don't affect the amount of space we reserve.
 729              */
 730             if ((*f == 'l' || *f == 'z') &&
 731                 (f[1] == 'd' || f[1] == 'u'))
 732                 ++f;
 733
 734             switch (*f) {
 735             case 'c':
 736                 (void)va_arg(count, int);
 737                 /* fall through... */
 738             case '%':
 739                 n++;
 740                 break;
 741             case 'd': case 'u': case 'i': case 'x':
 742                 (void) va_arg(count, int);
 743                 /* 20 bytes is enough to hold a 64-bit
 744                    integer.  Decimal takes the most space.
 745                    This isn't enough for octal.
 746                    If a width is specified we need more
 747                    (which we allocate later). */
 748                 if (width < 20)
 749                     width = 20;
 750                 n += width;
 751                 if (abuffersize < width)
 752                     abuffersize = width;
 753                 break;
 754             case 's':
 755             {
 756                 /* UTF-8 */
 757                 const char *s = va_arg(count, const char*);
 758                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 759                 if (!str)
 760                     goto fail;
 761                 n += PyUnicode_GET_SIZE(str);
 762                 /* Remember the str and switch to the next slot */
 763                 *callresult++ = str;
 764                 break;
 765             }
 766             case 'U':
 767             {
 768                 PyObject *obj = va_arg(count, PyObject *);
 769                 assert(obj && PyUnicode_Check(obj));
 770                 n += PyUnicode_GET_SIZE(obj);
 771                 break;
 772             }
 773             case 'V':
 774             {
 775                 PyObject *obj = va_arg(count, PyObject *);
 776                 const char *str = va_arg(count, const char *);
 777                 assert(obj || str);
 778                 assert(!obj || PyUnicode_Check(obj));
 779                 if (obj)
 780                     n += PyUnicode_GET_SIZE(obj);
 781                 else
 782                     n += strlen(str);
 783                 break;
 784             }
 785             case 'S':
 786             {
 787                 PyObject *obj = va_arg(count, PyObject *);
 788                 PyObject *str;
 789                 assert(obj);
 790                 str = PyObject_Str(obj);
 791                 if (!str)
 792                     goto fail;
 793                 n += PyUnicode_GET_SIZE(str);
 794                 /* Remember the str and switch to the next slot */
 795                 *callresult++ = str;
 796                 break;
 797             }
 798             case 'R':
 799             {
 800                 PyObject *obj = va_arg(count, PyObject *);
 801                 PyObject *repr;
 802                 assert(obj);
 803                 repr = PyObject_Repr(obj);
 804                 if (!repr)
 805                     goto fail;
 806                 n += PyUnicode_GET_SIZE(repr);
 807                 /* Remember the repr and switch to the next slot */
 808                 *callresult++ = repr;
 809                 break;
 810             }
 811             case 'p':
 812                 (void) va_arg(count, int);
 813                 /* maximum 64-bit pointer representation:
 814                  * 0xffffffffffffffff
 815                  * so 19 characters is enough.
 816                  * XXX I count 18 -- what's the extra for?
 817                  */
 818                 n += 19;
 819                 break;
 820             default:
 821                 /* if we stumble upon an unknown
 822                    formatting code, copy the rest of
 823                    the format string to the output
 824                    string. (we cannot just skip the
 825                    code, since there's no way to know
 826                    what's in the argument list) */
 827                 n += strlen(p);
 828                 goto expand;
 829             }
 830         } else
 831             n++;
 832     }
 833   expand:
 834     if (abuffersize > 20) {
 835         abuffer = PyObject_Malloc(abuffersize);
 836         if (!abuffer) {
 837             PyErr_NoMemory();
 838             goto fail;
 839         }
 840         realbuffer = abuffer;
 841     }
 842     else
 843         realbuffer = buffer;
 844     /* step 4: fill the buffer */
 845     /* Since we've analyzed how much space we need for the worst case,
 846        we don't have to resize the string.
 847        There can be no errors beyond this point. */
 848     string = PyUnicode_FromUnicode(NULL, n);
 849     if (!string)
 850         goto fail;
 851
 852     s = PyUnicode_AS_UNICODE(string);
 853     callresult = callresults;
 854
 855     for (f = format; *f; f++) {
 856         if (*f == '%') {
 857             const char* p = f++;
 858             int longflag = 0;
 859             int size_tflag = 0;
 860             zeropad = (*f == '0');
 861             /* parse the width.precision part */
 862             width = 0;
 863             while (isdigit((unsigned)*f))
 864                 width = (width*10) + *f++ - '0';
 865             precision = 0;
 866             if (*f == '.') {
 867                 f++;
 868                 while (isdigit((unsigned)*f))
 869                     precision = (precision*10) + *f++ - '0';
 870             }
 871             /* handle the long flag, but only for %ld and %lu.
 872                others can be added when necessary. */
 873             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 874                 longflag = 1;
 875                 ++f;
 876             }
 877             /* handle the size_t flag. */
 878             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 879                 size_tflag = 1;
 880                 ++f;
 881             }
 882
 883             switch (*f) {
 884             case 'c':
 885                 *s++ = va_arg(vargs, int);
 886                 break;
 887             case 'd':
 888                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 889                 if (longflag)
 890                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 891                 else if (size_tflag)
 892                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 893                 else
 894                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 895                 appendstring(realbuffer);
 896                 break;
 897             case 'u':
 898                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 899                 if (longflag)
 900                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 901                 else if (size_tflag)
 902                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 903                 else
 904                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 905                 appendstring(realbuffer);
 906                 break;
 907             case 'i':
 908                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 909                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 910                 appendstring(realbuffer);
 911                 break;
 912             case 'x':
 913                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 914                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 915                 appendstring(realbuffer);
 916                 break;
 917             case 's':
 918             {
 919                 /* unused, since we already have the result */
 920                 (void) va_arg(vargs, char *);
 921                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 922                                 PyUnicode_GET_SIZE(*callresult));
 923                 s += PyUnicode_GET_SIZE(*callresult);
 924                 /* We're done with the unicode()/repr() => forget it */
 925                 Py_DECREF(*callresult);
 926                 /* switch to next unicode()/repr() result */
 927                 ++callresult;
 928                 break;
 929             }
 930             case 'U':
 931             {
 932                 PyObject *obj = va_arg(vargs, PyObject *);
 933                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 934                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 935                 s += size;
 936                 break;
 937             }
 938             case 'V':
 939             {
 940                 PyObject *obj = va_arg(vargs, PyObject *);
 941                 const char *str = va_arg(vargs, const char *);
 942                 if (obj) {
 943                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 944                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 945                     s += size;
 946                 } else {
 947                     appendstring(str);
 948                 }
 949                 break;
 950             }
 951             case 'S':
 952             case 'R':
 953             {
 954                 Py_UNICODE *ucopy;
 955                 Py_ssize_t usize;
 956                 Py_ssize_t upos;
 957                 /* unused, since we already have the result */
 958                 (void) va_arg(vargs, PyObject *);
 959                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 960                 usize = PyUnicode_GET_SIZE(*callresult);
 961                 for (upos = 0; upos<usize;)
 962                     *s++ = ucopy[upos++];
 963                 /* We're done with the unicode()/repr() => forget it */
 964                 Py_DECREF(*callresult);
 965                 /* switch to next unicode()/repr() result */
 966                 ++callresult;
 967                 break;
 968             }
 969             case 'p':
 970                 sprintf(buffer, "%p", va_arg(vargs, void*));
 971                 /* %p is ill-defined:  ensure leading 0x. */
 972                 if (buffer[1] == 'X')
 973                     buffer[1] = 'x';
 974                 else if (buffer[1] != 'x') {
 975                     memmove(buffer+2, buffer, strlen(buffer)+1);
 976                     buffer[0] = '0';
 977                     buffer[1] = 'x';
 978                 }
 979                 appendstring(buffer);
 980                 break;
 981             case '%':
 982                 *s++ = '%';
 983                 break;
 984             default:
 985                 appendstring(p);
 986                 goto end;
 987             }
 988         } else
 989             *s++ = *f;
 990     }
 991
 992   end:
 993     if (callresults)
 994         PyObject_Free(callresults);
 995     if (abuffer)
 996         PyObject_Free(abuffer);
 997     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 998     return string;
 999   fail:
1000     if (callresults) {
1001         PyObject **callresult2 = callresults;
1002         while (callresult2 < callresult) {
1003             Py_DECREF(*callresult2);
1004             ++callresult2;
1005         }
1006         PyObject_Free(callresults);
1007     }
1008     if (abuffer)
1009         PyObject_Free(abuffer);
1010     return NULL;
1011 }
1012
1013 #undef appendstring
1014
1015 PyObject *
1016 PyUnicode_FromFormat(const char *format, ...)
1017 {
1018     PyObject* ret;
1019     va_list vargs;
1020
1021 #ifdef HAVE_STDARG_PROTOTYPES
1022     va_start(vargs, format);
1023 #else
1024     va_start(vargs);
1025 #endif
1026     ret = PyUnicode_FromFormatV(format, vargs);
1027     va_end(vargs);
1028     return ret;
1029 }
1030
1031 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1032                                 wchar_t *w,
1033                                 Py_ssize_t size)
1034 {
1035     if (unicode == NULL) {
1036         PyErr_BadInternalCall();
1037         return -1;
1038     }
1039
1040     /* If possible, try to copy the 0-termination as well */
1041     if (size > PyUnicode_GET_SIZE(unicode))
1042         size = PyUnicode_GET_SIZE(unicode) + 1;
1043
1044 #ifdef HAVE_USABLE_WCHAR_T
1045     memcpy(w, unicode->str, size * sizeof(wchar_t));
1046 #else
1047     {
1048         register Py_UNICODE *u;
1049         register Py_ssize_t i;
1050         u = PyUnicode_AS_UNICODE(unicode);
1051         for (i = size; i > 0; i--)
1052             *w++ = *u++;
1053     }
1054 #endif
1055
1056     if (size > PyUnicode_GET_SIZE(unicode))
1057         return PyUnicode_GET_SIZE(unicode);
1058     else
1059         return size;
1060 }
1061
1062 #endif
1063
1064 PyObject *PyUnicode_FromOrdinal(int ordinal)
1065 {
1066     Py_UNICODE s[1];
1067
1068 #ifdef Py_UNICODE_WIDE
1069     if (ordinal < 0 || ordinal > 0x10ffff) {
1070         PyErr_SetString(PyExc_ValueError,
1071                         "unichr() arg not in range(0x110000) "
1072                         "(wide Python build)");
1073         return NULL;
1074     }
1075 #else
1076     if (ordinal < 0 || ordinal > 0xffff) {
1077         PyErr_SetString(PyExc_ValueError,
1078                         "unichr() arg not in range(0x10000) "
1079                         "(narrow Python build)");
1080         return NULL;
1081     }
1082 #endif
1083
1084     s[0] = (Py_UNICODE)ordinal;
1085     return PyUnicode_FromUnicode(s, 1);
1086 }
1087
1088 PyObject *PyUnicode_FromObject(register PyObject *obj)
1089 {
1090     /* XXX Perhaps we should make this API an alias of
1091        PyObject_Unicode() instead ?! */
1092     if (PyUnicode_CheckExact(obj)) {
1093         Py_INCREF(obj);
1094         return obj;
1095     }
1096     if (PyUnicode_Check(obj)) {
1097         /* For a Unicode subtype that's not a Unicode object,
1098            return a true Unicode object with the same data. */
1099         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100                                      PyUnicode_GET_SIZE(obj));
1101     }
1102     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103 }
1104
1105 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1106                                       const char *encoding,
1107                                       const char *errors)
1108 {
1109     const char *s = NULL;
1110     Py_ssize_t len;
1111     PyObject *v;
1112
1113     if (obj == NULL) {
1114         PyErr_BadInternalCall();
1115         return NULL;
1116     }
1117
1118 #if 0
1119     /* For b/w compatibility we also accept Unicode objects provided
1120        that no encodings is given and then redirect to
1121        PyObject_Unicode() which then applies the additional logic for
1122        Unicode subclasses.
1123
1124        NOTE: This API should really only be used for object which
1125        represent *encoded* Unicode !
1126
1127     */
1128     if (PyUnicode_Check(obj)) {
1129         if (encoding) {
1130             PyErr_SetString(PyExc_TypeError,
1131                             "decoding Unicode is not supported");
1132             return NULL;
1133         }
1134         return PyObject_Unicode(obj);
1135     }
1136 #else
1137     if (PyUnicode_Check(obj)) {
1138         PyErr_SetString(PyExc_TypeError,
1139                         "decoding Unicode is not supported");
1140         return NULL;
1141     }
1142 #endif
1143
1144     /* Coerce object */
1145     if (PyString_Check(obj)) {
1146         s = PyString_AS_STRING(obj);
1147         len = PyString_GET_SIZE(obj);
1148     }
1149     else if (PyByteArray_Check(obj)) {
1150         /* Python 2.x specific */
1151         PyErr_Format(PyExc_TypeError,
1152                      "decoding bytearray is not supported");
1153         return NULL;
1154     }
1155     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1156         /* Overwrite the error message with something more useful in
1157            case of a TypeError. */
1158         if (PyErr_ExceptionMatches(PyExc_TypeError))
1159             PyErr_Format(PyExc_TypeError,
1160                          "coercing to Unicode: need string or buffer, "
1161                          "%.80s found",
1162                          Py_TYPE(obj)->tp_name);
1163         goto onError;
1164     }
1165
1166     /* Convert to Unicode */
1167     if (len == 0) {
1168         Py_INCREF(unicode_empty);
1169         v = (PyObject *)unicode_empty;
1170     }
1171     else
1172         v = PyUnicode_Decode(s, len, encoding, errors);
1173
1174     return v;
1175
1176   onError:
1177     return NULL;
1178 }
1179
1180 PyObject *PyUnicode_Decode(const char *s,
1181                            Py_ssize_t size,
1182                            const char *encoding,
1183                            const char *errors)
1184 {
1185     PyObject *buffer = NULL, *unicode;
1186
1187     if (encoding == NULL)
1188         encoding = PyUnicode_GetDefaultEncoding();
1189
1190     /* Shortcuts for common default encodings */
1191     if (strcmp(encoding, "utf-8") == 0)
1192         return PyUnicode_DecodeUTF8(s, size, errors);
1193     else if (strcmp(encoding, "latin-1") == 0)
1194         return PyUnicode_DecodeLatin1(s, size, errors);
1195 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196     else if (strcmp(encoding, "mbcs") == 0)
1197         return PyUnicode_DecodeMBCS(s, size, errors);
1198 #endif
1199     else if (strcmp(encoding, "ascii") == 0)
1200         return PyUnicode_DecodeASCII(s, size, errors);
1201
1202     /* Decode via the codec registry */
1203     buffer = PyBuffer_FromMemory((void *)s, size);
1204     if (buffer == NULL)
1205         goto onError;
1206     unicode = PyCodec_Decode(buffer, encoding, errors);
1207     if (unicode == NULL)
1208         goto onError;
1209     if (!PyUnicode_Check(unicode)) {
1210         PyErr_Format(PyExc_TypeError,
1211                      "decoder did not return an unicode object (type=%.400s)",
1212                      Py_TYPE(unicode)->tp_name);
1213         Py_DECREF(unicode);
1214         goto onError;
1215     }
1216     Py_DECREF(buffer);
1217     return unicode;
1218
1219   onError:
1220     Py_XDECREF(buffer);
1221     return NULL;
1222 }
1223
1224 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225                                     const char *encoding,
1226                                     const char *errors)
1227 {
1228     PyObject *v;
1229
1230     if (!PyUnicode_Check(unicode)) {
1231         PyErr_BadArgument();
1232         goto onError;
1233     }
1234
1235     if (encoding == NULL)
1236         encoding = PyUnicode_GetDefaultEncoding();
1237
1238     /* Decode via the codec registry */
1239     v = PyCodec_Decode(unicode, encoding, errors);
1240     if (v == NULL)
1241         goto onError;
1242     return v;
1243
1244   onError:
1245     return NULL;
1246 }
1247
1248 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1249                            Py_ssize_t size,
1250                            const char *encoding,
1251                            const char *errors)
1252 {
1253     PyObject *v, *unicode;
1254
1255     unicode = PyUnicode_FromUnicode(s, size);
1256     if (unicode == NULL)
1257         return NULL;
1258     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259     Py_DECREF(unicode);
1260     return v;
1261 }
1262
1263 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264                                     const char *encoding,
1265                                     const char *errors)
1266 {
1267     PyObject *v;
1268
1269     if (!PyUnicode_Check(unicode)) {
1270         PyErr_BadArgument();
1271         goto onError;
1272     }
1273
1274     if (encoding == NULL)
1275         encoding = PyUnicode_GetDefaultEncoding();
1276
1277     /* Encode via the codec registry */
1278     v = PyCodec_Encode(unicode, encoding, errors);
1279     if (v == NULL)
1280         goto onError;
1281     return v;
1282
1283   onError:
1284     return NULL;
1285 }
1286
1287 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288                                     const char *encoding,
1289                                     const char *errors)
1290 {
1291     PyObject *v;
1292
1293     if (!PyUnicode_Check(unicode)) {
1294         PyErr_BadArgument();
1295         goto onError;
1296     }
1297
1298     if (encoding == NULL)
1299         encoding = PyUnicode_GetDefaultEncoding();
1300
1301     /* Shortcuts for common default encodings */
1302     if (errors == NULL) {
1303         if (strcmp(encoding, "utf-8") == 0)
1304             return PyUnicode_AsUTF8String(unicode);
1305         else if (strcmp(encoding, "latin-1") == 0)
1306             return PyUnicode_AsLatin1String(unicode);
1307 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1308         else if (strcmp(encoding, "mbcs") == 0)
1309             return PyUnicode_AsMBCSString(unicode);
1310 #endif
1311         else if (strcmp(encoding, "ascii") == 0)
1312             return PyUnicode_AsASCIIString(unicode);
1313     }
1314
1315     /* Encode via the codec registry */
1316     v = PyCodec_Encode(unicode, encoding, errors);
1317     if (v == NULL)
1318         goto onError;
1319     if (!PyString_Check(v)) {
1320         PyErr_Format(PyExc_TypeError,
1321                      "encoder did not return a string object (type=%.400s)",
1322                      Py_TYPE(v)->tp_name);
1323         Py_DECREF(v);
1324         goto onError;
1325     }
1326     return v;
1327
1328   onError:
1329     return NULL;
1330 }
1331
1332 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1333                                             const char *errors)
1334 {
1335     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1336
1337     if (v)
1338         return v;
1339     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340     if (v && errors == NULL)
1341         ((PyUnicodeObject *)unicode)->defenc = v;
1342     return v;
1343 }
1344
1345 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1346 {
1347     if (!PyUnicode_Check(unicode)) {
1348         PyErr_BadArgument();
1349         goto onError;
1350     }
1351     return PyUnicode_AS_UNICODE(unicode);
1352
1353   onError:
1354     return NULL;
1355 }
1356
1357 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1358 {
1359     if (!PyUnicode_Check(unicode)) {
1360         PyErr_BadArgument();
1361         goto onError;
1362     }
1363     return PyUnicode_GET_SIZE(unicode);
1364
1365   onError:
1366     return -1;
1367 }
1368
1369 const char *PyUnicode_GetDefaultEncoding(void)
1370 {
1371     return unicode_default_encoding;
1372 }
1373
1374 int PyUnicode_SetDefaultEncoding(const char *encoding)
1375 {
1376     PyObject *v;
1377
1378     /* Make sure the encoding is valid. As side effect, this also
1379        loads the encoding into the codec registry cache. */
1380     v = _PyCodec_Lookup(encoding);
1381     if (v == NULL)
1382         goto onError;
1383     Py_DECREF(v);
1384     strncpy(unicode_default_encoding,
1385             encoding,
1386             sizeof(unicode_default_encoding));
1387     return 0;
1388
1389   onError:
1390     return -1;
1391 }
1392
1393 /* error handling callback helper:
1394    build arguments, call the callback and check the arguments,
1395    if no exception occurred, copy the replacement to the output
1396    and adjust various state variables.
1397    return 0 on success, -1 on error
1398 */
1399
1400 static
1401 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1402                                      const char *encoding, const char *reason,
1403                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1406 {
1407     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1408
1409     PyObject *restuple = NULL;
1410     PyObject *repunicode = NULL;
1411     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412     Py_ssize_t requiredsize;
1413     Py_ssize_t newpos;
1414     Py_UNICODE *repptr;
1415     Py_ssize_t repsize;
1416     int res = -1;
1417
1418     if (*errorHandler == NULL) {
1419         *errorHandler = PyCodec_LookupError(errors);
1420         if (*errorHandler == NULL)
1421             goto onError;
1422     }
1423
1424     if (*exceptionObject == NULL) {
1425         *exceptionObject = PyUnicodeDecodeError_Create(
1426             encoding, input, insize, *startinpos, *endinpos, reason);
1427         if (*exceptionObject == NULL)
1428             goto onError;
1429     }
1430     else {
1431         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432             goto onError;
1433         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434             goto onError;
1435         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436             goto onError;
1437     }
1438
1439     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440     if (restuple == NULL)
1441         goto onError;
1442     if (!PyTuple_Check(restuple)) {
1443         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1444         goto onError;
1445     }
1446     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1447         goto onError;
1448     if (newpos<0)
1449         newpos = insize+newpos;
1450     if (newpos<0 || newpos>insize) {
1451         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452         goto onError;
1453     }
1454
1455     /* need more space? (at least enough for what we
1456        have+the replacement+the rest of the string (starting
1457        at the new input position), so we won't have to check space
1458        when there are no errors in the rest of the string) */
1459     repptr = PyUnicode_AS_UNICODE(repunicode);
1460     repsize = PyUnicode_GET_SIZE(repunicode);
1461     requiredsize = *outpos + repsize + insize-newpos;
1462     if (requiredsize > outsize) {
1463         if (requiredsize<2*outsize)
1464             requiredsize = 2*outsize;
1465         if (_PyUnicode_Resize(output, requiredsize) < 0)
1466             goto onError;
1467         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1468     }
1469     *endinpos = newpos;
1470     *inptr = input + newpos;
1471     Py_UNICODE_COPY(*outptr, repptr, repsize);
1472     *outptr += repsize;
1473     *outpos += repsize;
1474     /* we made it! */
1475     res = 0;
1476
1477   onError:
1478     Py_XDECREF(restuple);
1479     return res;
1480 }
1481
1482 /* --- UTF-7 Codec -------------------------------------------------------- */
1483
1484 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1485
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * Only the 64 ASCII characters A-Z, a-z, 0-9, '+' and '/' qualify.  The
 * test is written with explicit ranges instead of isalnum() so that the
 * answer does not depend on the current C locale (isalnum() may accept
 * bytes >= 0x80 in some locales, which FROM_BASE64 would then silently
 * map to 63).  The argument is evaluated more than once. */

#define IS_BASE64(c)                                                    \
    (((c) >= 'A' && (c) <= 'Z') ||                                      \
     ((c) >= 'a' && (c) <= 'z') ||                                      \
     ((c) >= '0' && (c) <= '9') ||                                      \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * (any character that is not a letter, digit or '+' yields 63, so the
 * caller must check IS_BASE64 first) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1513
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index,
 * so any character value may be passed.  NUL is never encoded directly.
 * The argument c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
1559
1560 PyObject *PyUnicode_DecodeUTF7(const char *s,
1561                                Py_ssize_t size,
1562                                const char *errors)
1563 {
1564     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565 }
1566
1567 /* The decoder.  The only state we preserve is our read position,
1568  * i.e. how many characters we have consumed.  So if we end in the
1569  * middle of a shift sequence we have to back off the read position
1570  * and the output to the beginning of the sequence, otherwise we lose
1571  * all the shift state (seen bits, number of bits seen, high
1572  * surrogate). */
1573
1574 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1575                                        Py_ssize_t size,
1576                                        const char *errors,
1577                                        Py_ssize_t *consumed)
1578 {
1579     const char *starts = s;
1580     Py_ssize_t startinpos;
1581     Py_ssize_t endinpos;
1582     Py_ssize_t outpos;
1583     const char *e;
1584     PyUnicodeObject *unicode;
1585     Py_UNICODE *p;
1586     const char *errmsg = "";
1587     int inShift = 0;
1588     Py_UNICODE *shiftOutStart;
1589     unsigned int base64bits = 0;
1590     unsigned long base64buffer = 0;
1591     Py_UNICODE surrogate = 0;
1592     PyObject *errorHandler = NULL;
1593     PyObject *exc = NULL;
1594
1595     unicode = _PyUnicode_New(size);
1596     if (!unicode)
1597         return NULL;
1598     if (size == 0) {
1599         if (consumed)
1600             *consumed = 0;
1601         return (PyObject *)unicode;
1602     }
1603
1604     p = unicode->str;
1605     shiftOutStart = p;
1606     e = s + size;
1607
1608     while (s < e) {
1609         Py_UNICODE ch = (unsigned char) *s;
1610
1611         if (inShift) { /* in a base-64 section */
1612             if (IS_BASE64(ch)) { /* consume a base-64 character */
1613                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614                 base64bits += 6;
1615                 s++;
1616                 if (base64bits >= 16) {
1617                     /* we have enough bits for a UTF-16 value */
1618                     Py_UNICODE outCh = (Py_UNICODE)
1619                                        (base64buffer >> (base64bits-16));
1620                     base64bits -= 16;
1621                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622                     if (surrogate) {
1623                         /* expecting a second surrogate */
1624                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1625 #ifdef Py_UNICODE_WIDE
1626                             *p++ = (((surrogate & 0x3FF)<<10)
1627                                     | (outCh & 0x3FF)) + 0x10000;
1628 #else
1629                             *p++ = surrogate;
1630                             *p++ = outCh;
1631 #endif
1632                             surrogate = 0;
1633                         }
1634                         else {
1635                             surrogate = 0;
1636                             errmsg = "second surrogate missing";
1637                             goto utf7Error;
1638                         }
1639                     }
1640                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641                         /* first surrogate */
1642                         surrogate = outCh;
1643                     }
1644                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645                         errmsg = "unexpected second surrogate";
1646                         goto utf7Error;
1647                     }
1648                     else {
1649                         *p++ = outCh;
1650                     }
1651                 }
1652             }
1653             else { /* now leaving a base-64 section */
1654                 inShift = 0;
1655                 s++;
1656                 if (surrogate) {
1657                     errmsg = "second surrogate missing at end of shift sequence";
1658                     goto utf7Error;
1659                 }
1660                 if (base64bits > 0) { /* left-over bits */
1661                     if (base64bits >= 6) {
1662                         /* We've seen at least one base-64 character */
1663                         errmsg = "partial character in shift sequence";
1664                         goto utf7Error;
1665                     }
1666                     else {
1667                         /* Some bits remain; they should be zero */
1668                         if (base64buffer != 0) {
1669                             errmsg = "non-zero padding bits in shift sequence";
1670                             goto utf7Error;
1671                         }
1672                     }
1673                 }
1674                 if (ch != '-') {
1675                     /* '-' is absorbed; other terminating
1676                        characters are preserved */
1677                     *p++ = ch;
1678                 }
1679             }
1680         }
1681         else if ( ch == '+' ) {
1682             startinpos = s-starts;
1683             s++; /* consume '+' */
1684             if (s < e && *s == '-') { /* '+-' encodes '+' */
1685                 s++;
1686                 *p++ = '+';
1687             }
1688             else { /* begin base64-encoded section */
1689                 inShift = 1;
1690                 shiftOutStart = p;
1691                 base64bits = 0;
1692             }
1693         }
1694         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1695             *p++ = ch;
1696             s++;
1697         }
1698         else {
1699             startinpos = s-starts;
1700             s++;
1701             errmsg = "unexpected special character";
1702             goto utf7Error;
1703         }
1704         continue;
1705 utf7Error:
1706         outpos = p-PyUnicode_AS_UNICODE(unicode);
1707         endinpos = s-starts;
1708         if (unicode_decode_call_errorhandler(
1709                 errors, &errorHandler,
1710                 "utf7", errmsg,
1711                 starts, size, &startinpos, &endinpos, &exc, &s,
1712                 &unicode, &outpos, &p))
1713             goto onError;
1714     }
1715
1716     /* end of string */
1717
1718     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719         /* if we're in an inconsistent state, that's an error */
1720         if (surrogate ||
1721                 (base64bits >= 6) ||
1722                 (base64bits > 0 && base64buffer != 0)) {
1723             outpos = p-PyUnicode_AS_UNICODE(unicode);
1724             endinpos = size;
1725             if (unicode_decode_call_errorhandler(
1726                     errors, &errorHandler,
1727                     "utf7", "unterminated shift sequence",
1728                     starts, size, &startinpos, &endinpos, &exc, &s,
1729                     &unicode, &outpos, &p))
1730                 goto onError;
1731         }
1732     }
1733
1734     /* return state */
1735     if (consumed) {
1736         if (inShift) {
1737             p = shiftOutStart; /* back off output */
1738             *consumed = startinpos;
1739         }
1740         else {
1741             *consumed = s-starts;
1742         }
1743     }
1744
1745     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1746         goto onError;
1747
1748     Py_XDECREF(errorHandler);
1749     Py_XDECREF(exc);
1750     return (PyObject *)unicode;
1751
1752   onError:
1753     Py_XDECREF(errorHandler);
1754     Py_XDECREF(exc);
1755     Py_DECREF(unicode);
1756     return NULL;
1757 }
1758
1759
1760 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1761                                Py_ssize_t size,
1762                                int base64SetO,
1763                                int base64WhiteSpace,
1764                                const char *errors)
1765 {
1766     PyObject *v;
1767     /* It might be possible to tighten this worst case */
1768     Py_ssize_t allocated = 8 * size;
1769     int inShift = 0;
1770     Py_ssize_t i = 0;
1771     unsigned int base64bits = 0;
1772     unsigned long base64buffer = 0;
1773     char * out;
1774     char * start;
1775
1776     if (allocated / 8 != size)
1777         return PyErr_NoMemory();
1778
1779     if (size == 0)
1780         return PyString_FromStringAndSize(NULL, 0);
1781
1782     v = PyString_FromStringAndSize(NULL, allocated);
1783     if (v == NULL)
1784         return NULL;
1785
1786     start = out = PyString_AS_STRING(v);
1787     for (;i < size; ++i) {
1788         Py_UNICODE ch = s[i];
1789
1790         if (inShift) {
1791             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792                 /* shifting out */
1793                 if (base64bits) { /* output remaining bits */
1794                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795                     base64buffer = 0;
1796                     base64bits = 0;
1797                 }
1798                 inShift = 0;
1799                 /* Characters not in the BASE64 set implicitly unshift the sequence
1800                    so no '-' is required, except if the character is itself a '-' */
1801                 if (IS_BASE64(ch) || ch == '-') {
1802                     *out++ = '-';
1803                 }
1804                 *out++ = (char) ch;
1805             }
1806             else {
1807                 goto encode_char;
1808             }
1809         }
1810         else { /* not in a shift sequence */
1811             if (ch == '+') {
1812                 *out++ = '+';
1813                         *out++ = '-';
1814             }
1815             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816                 *out++ = (char) ch;
1817             }
1818             else {
1819                 *out++ = '+';
1820                 inShift = 1;
1821                 goto encode_char;
1822             }
1823         }
1824         continue;
1825 encode_char:
1826 #ifdef Py_UNICODE_WIDE
1827         if (ch >= 0x10000) {
1828             /* code first surrogate */
1829             base64bits += 16;
1830             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831             while (base64bits >= 6) {
1832                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833                 base64bits -= 6;
1834             }
1835             /* prepare second surrogate */
1836             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1837         }
1838 #endif
1839         base64bits += 16;
1840         base64buffer = (base64buffer << 16) | ch;
1841         while (base64bits >= 6) {
1842             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843             base64bits -= 6;
1844         }
1845     }
1846     if (base64bits)
1847         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848     if (inShift)
1849         *out++ = '-';
1850
1851     if (_PyString_Resize(&v, out - start))
1852         return NULL;
1853     return v;
1854 }
1855
1856 #undef IS_BASE64
1857 #undef FROM_BASE64
1858 #undef TO_BASE64
1859 #undef DECODE_DIRECT
1860 #undef ENCODE_DIRECT
1861
1862 /* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry
   is the total length in bytes of the sequence that byte introduces.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1885
1886 PyObject *PyUnicode_DecodeUTF8(const char *s,
1887                                Py_ssize_t size,
1888                                const char *errors)
1889 {
1890     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891 }
1892
1893 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1894                                        Py_ssize_t size,
1895                                        const char *errors,
1896                                        Py_ssize_t *consumed)
1897 {
1898     const char *starts = s;
1899     int n;
1900     int k;
1901     Py_ssize_t startinpos;
1902     Py_ssize_t endinpos;
1903     Py_ssize_t outpos;
1904     const char *e;
1905     PyUnicodeObject *unicode;
1906     Py_UNICODE *p;
1907     const char *errmsg = "";
1908     PyObject *errorHandler = NULL;
1909     PyObject *exc = NULL;
1910
1911     /* Note: size will always be longer than the resulting Unicode
1912        character count */
1913     unicode = _PyUnicode_New(size);
1914     if (!unicode)
1915         return NULL;
1916     if (size == 0) {
1917         if (consumed)
1918             *consumed = 0;
1919         return (PyObject *)unicode;
1920     }
1921
1922     /* Unpack UTF-8 encoded data */
1923     p = unicode->str;
1924     e = s + size;
1925
1926     while (s < e) {
1927         Py_UCS4 ch = (unsigned char)*s;
1928
1929         if (ch < 0x80) {
1930             *p++ = (Py_UNICODE)ch;
1931             s++;
1932             continue;
1933         }
1934
1935         n = utf8_code_length[ch];
1936
1937         if (s + n > e) {
1938             if (consumed)
1939                 break;
1940             else {
1941                 errmsg = "unexpected end of data";
1942                 startinpos = s-starts;
1943                 endinpos = startinpos+1;
1944                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945                     endinpos++;
1946                 goto utf8Error;
1947             }
1948         }
1949
1950         switch (n) {
1951
1952         case 0:
1953             errmsg = "invalid start byte";
1954             startinpos = s-starts;
1955             endinpos = startinpos+1;
1956             goto utf8Error;
1957
1958         case 1:
1959             errmsg = "internal error";
1960             startinpos = s-starts;
1961             endinpos = startinpos+1;
1962             goto utf8Error;
1963
1964         case 2:
1965             if ((s[1] & 0xc0) != 0x80) {
1966                 errmsg = "invalid continuation byte";
1967                 startinpos = s-starts;
1968                 endinpos = startinpos + 1;
1969                 goto utf8Error;
1970             }
1971             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1972             assert ((ch > 0x007F) && (ch <= 0x07FF));
1973             *p++ = (Py_UNICODE)ch;
1974             break;
1975
1976         case 3:
1977             /* XXX: surrogates shouldn't be valid UTF-8!
1978                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980                Uncomment the 2 lines below to make them invalid,
1981                codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
1982             if ((s[1] & 0xc0) != 0x80 ||
1983                 (s[2] & 0xc0) != 0x80 ||
1984                 ((unsigned char)s[0] == 0xE0 &&
1985                  (unsigned char)s[1] < 0xA0)/* ||
1986                 ((unsigned char)s[0] == 0xED &&
1987                  (unsigned char)s[1] > 0x9F)*/) {
1988                 errmsg = "invalid continuation byte";
1989                 startinpos = s-starts;
1990                 endinpos = startinpos + 1;
1991
1992                 /* if s[1] first two bits are 1 and 0, then the invalid
1993                    continuation byte is s[2], so increment endinpos by 1,
1994                    if not, s[1] is invalid and endinpos doesn't need to
1995                    be incremented. */
1996                 if ((s[1] & 0xC0) == 0x80)
1997                     endinpos++;
1998                 goto utf8Error;
1999             }
2000             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2001             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002             *p++ = (Py_UNICODE)ch;
2003             break;
2004
2005         case 4:
2006             if ((s[1] & 0xc0) != 0x80 ||
2007                 (s[2] & 0xc0) != 0x80 ||
2008                 (s[3] & 0xc0) != 0x80 ||
2009                 ((unsigned char)s[0] == 0xF0 &&
2010                  (unsigned char)s[1] < 0x90) ||
2011                 ((unsigned char)s[0] == 0xF4 &&
2012                  (unsigned char)s[1] > 0x8F)) {
2013                 errmsg = "invalid continuation byte";
2014                 startinpos = s-starts;
2015                 endinpos = startinpos + 1;
2016                 if ((s[1] & 0xC0) == 0x80) {
2017                     endinpos++;
2018                     if ((s[2] & 0xC0) == 0x80)
2019                         endinpos++;
2020                 }
2021                 goto utf8Error;
2022             }
2023             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2024                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026
2027 #ifdef Py_UNICODE_WIDE
2028             *p++ = (Py_UNICODE)ch;
2029 #else
2030             /*  compute and append the two surrogates: */
2031
2032             /*  translate from 10000..10FFFF to 0..FFFF */
2033             ch -= 0x10000;
2034
2035             /*  high surrogate = top 10 bits added to D800 */
2036             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2037
2038             /*  low surrogate = bottom 10 bits added to DC00 */
2039             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2040 #endif
2041             break;
2042         }
2043         s += n;
2044         continue;
2045
2046       utf8Error:
2047         outpos = p-PyUnicode_AS_UNICODE(unicode);
2048         if (unicode_decode_call_errorhandler(
2049                 errors, &errorHandler,
2050                 "utf8", errmsg,
2051                 starts, size, &startinpos, &endinpos, &exc, &s,
2052                 &unicode, &outpos, &p))
2053             goto onError;
2054     }
2055     if (consumed)
2056         *consumed = s-starts;
2057
2058     /* Adjust length */
2059     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2060         goto onError;
2061
2062     Py_XDECREF(errorHandler);
2063     Py_XDECREF(exc);
2064     return (PyObject *)unicode;
2065
2066   onError:
2067     Py_XDECREF(errorHandler);
2068     Py_XDECREF(exc);
2069     Py_DECREF(unicode);
2070     return NULL;
2071 }
2072
2073 /* Allocation strategy:  if the string is short, convert into a stack buffer
2074    and allocate exactly as much space needed at the end.  Else allocate the
2075    maximum possible needed (4 result bytes per Unicode character), and return
2076    the excess memory at the end.
2077 */
2078 PyObject *
2079 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2080                      Py_ssize_t size,
2081                      const char *errors)
2082 {
2083 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2084
2085     Py_ssize_t i;           /* index into s of next input byte */
2086     PyObject *v;        /* result string object */
2087     char *p;            /* next free byte in output buffer */
2088     Py_ssize_t nallocated;  /* number of result bytes allocated */
2089     Py_ssize_t nneeded;        /* number of result bytes needed */
2090     char stackbuf[MAX_SHORT_UNICHARS * 4];
2091
2092     assert(s != NULL);
2093     assert(size >= 0);
2094
2095     if (size <= MAX_SHORT_UNICHARS) {
2096         /* Write into the stack buffer; nallocated can't overflow.
2097          * At the end, we'll allocate exactly as much heap space as it
2098          * turns out we need.
2099          */
2100         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101         v = NULL;   /* will allocate after we're done */
2102         p = stackbuf;
2103     }
2104     else {
2105         /* Overallocate on the heap, and give the excess back at the end. */
2106         nallocated = size * 4;
2107         if (nallocated / 4 != size)  /* overflow! */
2108             return PyErr_NoMemory();
2109         v = PyString_FromStringAndSize(NULL, nallocated);
2110         if (v == NULL)
2111             return NULL;
2112         p = PyString_AS_STRING(v);
2113     }
2114
2115     for (i = 0; i < size;) {
2116         Py_UCS4 ch = s[i++];
2117
2118         if (ch < 0x80)
2119             /* Encode ASCII */
2120             *p++ = (char) ch;
2121
2122         else if (ch < 0x0800) {
2123             /* Encode Latin-1 */
2124             *p++ = (char)(0xc0 | (ch >> 6));
2125             *p++ = (char)(0x80 | (ch & 0x3f));
2126         }
2127         else {
2128             /* Encode UCS2 Unicode ordinals */
2129             if (ch < 0x10000) {
2130                 /* Special case: check for high surrogate */
2131                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132                     Py_UCS4 ch2 = s[i];
2133                     /* Check for low surrogate and combine the two to
2134                        form a UCS4 value */
2135                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2136                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2137                         i++;
2138                         goto encodeUCS4;
2139                     }
2140                     /* Fall through: handles isolated high surrogates */
2141                 }
2142                 *p++ = (char)(0xe0 | (ch >> 12));
2143                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144                 *p++ = (char)(0x80 | (ch & 0x3f));
2145                 continue;
2146             }
2147           encodeUCS4:
2148             /* Encode UCS4 Unicode ordinals */
2149             *p++ = (char)(0xf0 | (ch >> 18));
2150             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152             *p++ = (char)(0x80 | (ch & 0x3f));
2153         }
2154     }
2155
2156     if (v == NULL) {
2157         /* This was stack allocated. */
2158         nneeded = p - stackbuf;
2159         assert(nneeded <= nallocated);
2160         v = PyString_FromStringAndSize(stackbuf, nneeded);
2161     }
2162     else {
2163         /* Cut back to size actually needed. */
2164         nneeded = p - PyString_AS_STRING(v);
2165         assert(nneeded <= nallocated);
2166         if (_PyString_Resize(&v, nneeded))
2167             return NULL;
2168     }
2169     return v;
2170
2171 #undef MAX_SHORT_UNICHARS
2172 }
2173
2174 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2175 {
2176     if (!PyUnicode_Check(unicode)) {
2177         PyErr_BadArgument();
2178         return NULL;
2179     }
2180     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2181                                 PyUnicode_GET_SIZE(unicode),
2182                                 NULL);
2183 }
2184
2185 /* --- UTF-32 Codec ------------------------------------------------------- */
2186
2187 PyObject *
2188 PyUnicode_DecodeUTF32(const char *s,
2189                       Py_ssize_t size,
2190                       const char *errors,
2191                       int *byteorder)
2192 {
2193     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2194 }
2195
2196 PyObject *
2197 PyUnicode_DecodeUTF32Stateful(const char *s,
2198                               Py_ssize_t size,
2199                               const char *errors,
2200                               int *byteorder,
2201                               Py_ssize_t *consumed)
2202 {
2203     const char *starts = s;
2204     Py_ssize_t startinpos;
2205     Py_ssize_t endinpos;
2206     Py_ssize_t outpos;
2207     PyUnicodeObject *unicode;
2208     Py_UNICODE *p;
2209 #ifndef Py_UNICODE_WIDE
2210     int pairs = 0;
2211     const unsigned char *qq;
2212 #else
2213     const int pairs = 0;
2214 #endif
2215     const unsigned char *q, *e;
2216     int bo = 0;       /* assume native ordering by default */
2217     const char *errmsg = "";
2218     /* Offsets from q for retrieving bytes in the right order. */
2219 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2220     int iorder[] = {0, 1, 2, 3};
2221 #else
2222     int iorder[] = {3, 2, 1, 0};
2223 #endif
2224     PyObject *errorHandler = NULL;
2225     PyObject *exc = NULL;
2226
2227     q = (unsigned char *)s;
2228     e = q + size;
2229
2230     if (byteorder)
2231         bo = *byteorder;
2232
2233     /* Check for BOM marks (U+FEFF) in the input and adjust current
2234        byte order setting accordingly. In native mode, the leading BOM
2235        mark is skipped, in all other modes, it is copied to the output
2236        stream as-is (giving a ZWNBSP character). */
2237     if (bo == 0) {
2238         if (size >= 4) {
2239             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2240                 (q[iorder[1]] << 8) | q[iorder[0]];
2241 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2242             if (bom == 0x0000FEFF) {
2243                 q += 4;
2244                 bo = -1;
2245             }
2246             else if (bom == 0xFFFE0000) {
2247                 q += 4;
2248                 bo = 1;
2249             }
2250 #else
2251             if (bom == 0x0000FEFF) {
2252                 q += 4;
2253                 bo = 1;
2254             }
2255             else if (bom == 0xFFFE0000) {
2256                 q += 4;
2257                 bo = -1;
2258             }
2259 #endif
2260         }
2261     }
2262
2263     if (bo == -1) {
2264         /* force LE */
2265         iorder[0] = 0;
2266         iorder[1] = 1;
2267         iorder[2] = 2;
2268         iorder[3] = 3;
2269     }
2270     else if (bo == 1) {
2271         /* force BE */
2272         iorder[0] = 3;
2273         iorder[1] = 2;
2274         iorder[2] = 1;
2275         iorder[3] = 0;
2276     }
2277
2278     /* On narrow builds we split characters outside the BMP into two
2279        codepoints => count how much extra space we need. */
2280 #ifndef Py_UNICODE_WIDE
2281     for (qq = q; qq < e; qq += 4)
2282         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2283             pairs++;
2284 #endif
2285
2286     /* This might be one to much, because of a BOM */
2287     unicode = _PyUnicode_New((size+3)/4+pairs);
2288     if (!unicode)
2289         return NULL;
2290     if (size == 0)
2291         return (PyObject *)unicode;
2292
2293     /* Unpack UTF-32 encoded data */
2294     p = unicode->str;
2295
2296     while (q < e) {
2297         Py_UCS4 ch;
2298         /* remaining bytes at the end? (size should be divisible by 4) */
2299         if (e-q<4) {
2300             if (consumed)
2301                 break;
2302             errmsg = "truncated data";
2303             startinpos = ((const char *)q)-starts;
2304             endinpos = ((const char *)e)-starts;
2305             goto utf32Error;
2306             /* The remaining input chars are ignored if the callback
2307                chooses to skip the input */
2308         }
2309         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310             (q[iorder[1]] << 8) | q[iorder[0]];
2311
2312         if (ch >= 0x110000)
2313         {
2314             errmsg = "codepoint not in range(0x110000)";
2315             startinpos = ((const char *)q)-starts;
2316             endinpos = startinpos+4;
2317             goto utf32Error;
2318         }
2319 #ifndef Py_UNICODE_WIDE
2320         if (ch >= 0x10000)
2321         {
2322             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2324         }
2325         else
2326 #endif
2327             *p++ = ch;
2328         q += 4;
2329         continue;
2330       utf32Error:
2331         outpos = p-PyUnicode_AS_UNICODE(unicode);
2332         if (unicode_decode_call_errorhandler(
2333                 errors, &errorHandler,
2334                 "utf32", errmsg,
2335                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2336                 &unicode, &outpos, &p))
2337             goto onError;
2338     }
2339
2340     if (byteorder)
2341         *byteorder = bo;
2342
2343     if (consumed)
2344         *consumed = (const char *)q-starts;
2345
2346     /* Adjust length */
2347     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348         goto onError;
2349
2350     Py_XDECREF(errorHandler);
2351     Py_XDECREF(exc);
2352     return (PyObject *)unicode;
2353
2354   onError:
2355     Py_DECREF(unicode);
2356     Py_XDECREF(errorHandler);
2357     Py_XDECREF(exc);
2358     return NULL;
2359 }
2360
2361 PyObject *
2362 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2363                       Py_ssize_t size,
2364                       const char *errors,
2365                       int byteorder)
2366 {
2367     PyObject *v;
2368     unsigned char *p;
2369     Py_ssize_t nsize, bytesize;
2370 #ifndef Py_UNICODE_WIDE
2371     Py_ssize_t i, pairs;
2372 #else
2373     const int pairs = 0;
2374 #endif
2375     /* Offsets from p for storing byte pairs in the right order. */
2376 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377     int iorder[] = {0, 1, 2, 3};
2378 #else
2379     int iorder[] = {3, 2, 1, 0};
2380 #endif
2381
2382 #define STORECHAR(CH)                           \
2383     do {                                        \
2384         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2385         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2386         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2387         p[iorder[0]] = (CH) & 0xff;             \
2388         p += 4;                                 \
2389     } while(0)
2390
2391     /* In narrow builds we can output surrogate pairs as one codepoint,
2392        so we need less space. */
2393 #ifndef Py_UNICODE_WIDE
2394     for (i = pairs = 0; i < size-1; i++)
2395         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397             pairs++;
2398 #endif
2399     nsize = (size - pairs + (byteorder == 0));
2400     bytesize = nsize * 4;
2401     if (bytesize / 4 != nsize)
2402         return PyErr_NoMemory();
2403     v = PyString_FromStringAndSize(NULL, bytesize);
2404     if (v == NULL)
2405         return NULL;
2406
2407     p = (unsigned char *)PyString_AS_STRING(v);
2408     if (byteorder == 0)
2409         STORECHAR(0xFEFF);
2410     if (size == 0)
2411         return v;
2412
2413     if (byteorder == -1) {
2414         /* force LE */
2415         iorder[0] = 0;
2416         iorder[1] = 1;
2417         iorder[2] = 2;
2418         iorder[3] = 3;
2419     }
2420     else if (byteorder == 1) {
2421         /* force BE */
2422         iorder[0] = 3;
2423         iorder[1] = 2;
2424         iorder[2] = 1;
2425         iorder[3] = 0;
2426     }
2427
2428     while (size-- > 0) {
2429         Py_UCS4 ch = *s++;
2430 #ifndef Py_UNICODE_WIDE
2431         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432             Py_UCS4 ch2 = *s;
2433             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435                 s++;
2436                 size--;
2437             }
2438         }
2439 #endif
2440         STORECHAR(ch);
2441     }
2442     return v;
2443 #undef STORECHAR
2444 }
2445
2446 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2447 {
2448     if (!PyUnicode_Check(unicode)) {
2449         PyErr_BadArgument();
2450         return NULL;
2451     }
2452     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2453                                  PyUnicode_GET_SIZE(unicode),
2454                                  NULL,
2455                                  0);
2456 }
2457
2458 /* --- UTF-16 Codec ------------------------------------------------------- */
2459
2460 PyObject *
2461 PyUnicode_DecodeUTF16(const char *s,
2462                       Py_ssize_t size,
2463                       const char *errors,
2464                       int *byteorder)
2465 {
2466     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2467 }
2468
2469 PyObject *
2470 PyUnicode_DecodeUTF16Stateful(const char *s,
2471                               Py_ssize_t size,
2472                               const char *errors,
2473                               int *byteorder,
2474                               Py_ssize_t *consumed)
2475 {
2476     const char *starts = s;
2477     Py_ssize_t startinpos;
2478     Py_ssize_t endinpos;
2479     Py_ssize_t outpos;
2480     PyUnicodeObject *unicode;
2481     Py_UNICODE *p;
2482     const unsigned char *q, *e;
2483     int bo = 0;       /* assume native ordering by default */
2484     const char *errmsg = "";
2485     /* Offsets from q for retrieving byte pairs in the right order. */
2486 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487     int ihi = 1, ilo = 0;
2488 #else
2489     int ihi = 0, ilo = 1;
2490 #endif
2491     PyObject *errorHandler = NULL;
2492     PyObject *exc = NULL;
2493
2494     /* Note: size will always be longer than the resulting Unicode
2495        character count */
2496     unicode = _PyUnicode_New(size);
2497     if (!unicode)
2498         return NULL;
2499     if (size == 0)
2500         return (PyObject *)unicode;
2501
2502     /* Unpack UTF-16 encoded data */
2503     p = unicode->str;
2504     q = (unsigned char *)s;
2505     e = q + size;
2506
2507     if (byteorder)
2508         bo = *byteorder;
2509
2510     /* Check for BOM marks (U+FEFF) in the input and adjust current
2511        byte order setting accordingly. In native mode, the leading BOM
2512        mark is skipped, in all other modes, it is copied to the output
2513        stream as-is (giving a ZWNBSP character). */
2514     if (bo == 0) {
2515         if (size >= 2) {
2516             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2517 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2518             if (bom == 0xFEFF) {
2519                 q += 2;
2520                 bo = -1;
2521             }
2522             else if (bom == 0xFFFE) {
2523                 q += 2;
2524                 bo = 1;
2525             }
2526 #else
2527             if (bom == 0xFEFF) {
2528                 q += 2;
2529                 bo = 1;
2530             }
2531             else if (bom == 0xFFFE) {
2532                 q += 2;
2533                 bo = -1;
2534             }
2535 #endif
2536         }
2537     }
2538
2539     if (bo == -1) {
2540         /* force LE */
2541         ihi = 1;
2542         ilo = 0;
2543     }
2544     else if (bo == 1) {
2545         /* force BE */
2546         ihi = 0;
2547         ilo = 1;
2548     }
2549
2550     while (q < e) {
2551         Py_UNICODE ch;
2552         /* remaining bytes at the end? (size should be even) */
2553         if (e-q<2) {
2554             if (consumed)
2555                 break;
2556             errmsg = "truncated data";
2557             startinpos = ((const char *)q)-starts;
2558             endinpos = ((const char *)e)-starts;
2559             goto utf16Error;
2560             /* The remaining input chars are ignored if the callback
2561                chooses to skip the input */
2562         }
2563         ch = (q[ihi] << 8) | q[ilo];
2564
2565         q += 2;
2566
2567         if (ch < 0xD800 || ch > 0xDFFF) {
2568             *p++ = ch;
2569             continue;
2570         }
2571
2572         /* UTF-16 code pair: */
2573         if (q >= e) {
2574             errmsg = "unexpected end of data";
2575             startinpos = (((const char *)q)-2)-starts;
2576             endinpos = ((const char *)e)-starts;
2577             goto utf16Error;
2578         }
2579         if (0xD800 <= ch && ch <= 0xDBFF) {
2580             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2581             q += 2;
2582             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2583 #ifndef Py_UNICODE_WIDE
2584                 *p++ = ch;
2585                 *p++ = ch2;
2586 #else
2587                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2588 #endif
2589                 continue;
2590             }
2591             else {
2592                 errmsg = "illegal UTF-16 surrogate";
2593                 startinpos = (((const char *)q)-4)-starts;
2594                 endinpos = startinpos+2;
2595                 goto utf16Error;
2596             }
2597
2598         }
2599         errmsg = "illegal encoding";
2600         startinpos = (((const char *)q)-2)-starts;
2601         endinpos = startinpos+2;
2602         /* Fall through to report the error */
2603
2604       utf16Error:
2605         outpos = p-PyUnicode_AS_UNICODE(unicode);
2606         if (unicode_decode_call_errorhandler(
2607                 errors, &errorHandler,
2608                 "utf16", errmsg,
2609                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2610                 &unicode, &outpos, &p))
2611             goto onError;
2612     }
2613
2614     if (byteorder)
2615         *byteorder = bo;
2616
2617     if (consumed)
2618         *consumed = (const char *)q-starts;
2619
2620     /* Adjust length */
2621     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2622         goto onError;
2623
2624     Py_XDECREF(errorHandler);
2625     Py_XDECREF(exc);
2626     return (PyObject *)unicode;
2627
2628   onError:
2629     Py_DECREF(unicode);
2630     Py_XDECREF(errorHandler);
2631     Py_XDECREF(exc);
2632     return NULL;
2633 }
2634
2635 PyObject *
2636 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2637                       Py_ssize_t size,
2638                       const char *errors,
2639                       int byteorder)
2640 {
2641     PyObject *v;
2642     unsigned char *p;
2643     Py_ssize_t nsize, bytesize;
2644 #ifdef Py_UNICODE_WIDE
2645     Py_ssize_t i, pairs;
2646 #else
2647     const int pairs = 0;
2648 #endif
2649     /* Offsets from p for storing byte pairs in the right order. */
2650 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651     int ihi = 1, ilo = 0;
2652 #else
2653     int ihi = 0, ilo = 1;
2654 #endif
2655
2656 #define STORECHAR(CH)                           \
2657     do {                                        \
2658         p[ihi] = ((CH) >> 8) & 0xff;            \
2659         p[ilo] = (CH) & 0xff;                   \
2660         p += 2;                                 \
2661     } while(0)
2662
2663 #ifdef Py_UNICODE_WIDE
2664     for (i = pairs = 0; i < size; i++)
2665         if (s[i] >= 0x10000)
2666             pairs++;
2667 #endif
2668     /* 2 * (size + pairs + (byteorder == 0)) */
2669     if (size > PY_SSIZE_T_MAX ||
2670         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2671         return PyErr_NoMemory();
2672     nsize = size + pairs + (byteorder == 0);
2673     bytesize = nsize * 2;
2674     if (bytesize / 2 != nsize)
2675         return PyErr_NoMemory();
2676     v = PyString_FromStringAndSize(NULL, bytesize);
2677     if (v == NULL)
2678         return NULL;
2679
2680     p = (unsigned char *)PyString_AS_STRING(v);
2681     if (byteorder == 0)
2682         STORECHAR(0xFEFF);
2683     if (size == 0)
2684         return v;
2685
2686     if (byteorder == -1) {
2687         /* force LE */
2688         ihi = 1;
2689         ilo = 0;
2690     }
2691     else if (byteorder == 1) {
2692         /* force BE */
2693         ihi = 0;
2694         ilo = 1;
2695     }
2696
2697     while (size-- > 0) {
2698         Py_UNICODE ch = *s++;
2699         Py_UNICODE ch2 = 0;
2700 #ifdef Py_UNICODE_WIDE
2701         if (ch >= 0x10000) {
2702             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2703             ch  = 0xD800 | ((ch-0x10000) >> 10);
2704         }
2705 #endif
2706         STORECHAR(ch);
2707         if (ch2)
2708             STORECHAR(ch2);
2709     }
2710     return v;
2711 #undef STORECHAR
2712 }
2713
2714 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2715 {
2716     if (!PyUnicode_Check(unicode)) {
2717         PyErr_BadArgument();
2718         return NULL;
2719     }
2720     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2721                                  PyUnicode_GET_SIZE(unicode),
2722                                  NULL,
2723                                  0);
2724 }
2725
2726 /* --- Unicode Escape Codec ----------------------------------------------- */
2727
2728 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2729
2730 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2731                                         Py_ssize_t size,
2732                                         const char *errors)
2733 {
2734     const char *starts = s;
2735     Py_ssize_t startinpos;
2736     Py_ssize_t endinpos;
2737     Py_ssize_t outpos;
2738     int i;
2739     PyUnicodeObject *v;
2740     Py_UNICODE *p;
2741     const char *end;
2742     char* message;
2743     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2744     PyObject *errorHandler = NULL;
2745     PyObject *exc = NULL;
2746
2747     /* Escaped strings will always be longer than the resulting
2748        Unicode string, so we start with size here and then reduce the
2749        length after conversion to the true value.
2750        (but if the error callback returns a long replacement string
2751        we'll have to allocate more space) */
2752     v = _PyUnicode_New(size);
2753     if (v == NULL)
2754         goto onError;
2755     if (size == 0)
2756         return (PyObject *)v;
2757
2758     p = PyUnicode_AS_UNICODE(v);
2759     end = s + size;
2760
2761     while (s < end) {
2762         unsigned char c;
2763         Py_UNICODE x;
2764         int digits;
2765
2766         /* Non-escape characters are interpreted as Unicode ordinals */
2767         if (*s != '\\') {
2768             *p++ = (unsigned char) *s++;
2769             continue;
2770         }
2771
2772         startinpos = s-starts;
2773         /* \ - Escapes */
2774         s++;
2775         c = *s++;
2776         if (s > end)
2777             c = '\0'; /* Invalid after \ */
2778         switch (c) {
2779
2780             /* \x escapes */
2781         case '\n': break;
2782         case '\\': *p++ = '\\'; break;
2783         case '\'': *p++ = '\''; break;
2784         case '\"': *p++ = '\"'; break;
2785         case 'b': *p++ = '\b'; break;
2786         case 'f': *p++ = '\014'; break; /* FF */
2787         case 't': *p++ = '\t'; break;
2788         case 'n': *p++ = '\n'; break;
2789         case 'r': *p++ = '\r'; break;
2790         case 'v': *p++ = '\013'; break; /* VT */
2791         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2792
2793             /* \OOO (octal) escapes */
2794         case '0': case '1': case '2': case '3':
2795         case '4': case '5': case '6': case '7':
2796             x = s[-1] - '0';
2797             if (s < end && '0' <= *s && *s <= '7') {
2798                 x = (x<<3) + *s++ - '0';
2799                 if (s < end && '0' <= *s && *s <= '7')
2800                     x = (x<<3) + *s++ - '0';
2801             }
2802             *p++ = x;
2803             break;
2804
2805             /* hex escapes */
2806             /* \xXX */
2807         case 'x':
2808             digits = 2;
2809             message = "truncated \\xXX escape";
2810             goto hexescape;
2811
2812             /* \uXXXX */
2813         case 'u':
2814             digits = 4;
2815             message = "truncated \\uXXXX escape";
2816             goto hexescape;
2817
2818             /* \UXXXXXXXX */
2819         case 'U':
2820             digits = 8;
2821             message = "truncated \\UXXXXXXXX escape";
2822         hexescape:
2823             chr = 0;
2824             outpos = p-PyUnicode_AS_UNICODE(v);
2825             if (s+digits>end) {
2826                 endinpos = size;
2827                 if (unicode_decode_call_errorhandler(
2828                         errors, &errorHandler,
2829                         "unicodeescape", "end of string in escape sequence",
2830                         starts, size, &startinpos, &endinpos, &exc, &s,
2831                         &v, &outpos, &p))
2832                     goto onError;
2833                 goto nextByte;
2834             }
2835             for (i = 0; i < digits; ++i) {
2836                 c = (unsigned char) s[i];
2837                 if (!isxdigit(c)) {
2838                     endinpos = (s+i+1)-starts;
2839                     if (unicode_decode_call_errorhandler(
2840                             errors, &errorHandler,
2841                             "unicodeescape", message,
2842                             starts, size, &startinpos, &endinpos, &exc, &s,
2843                             &v, &outpos, &p))
2844                         goto onError;
2845                     goto nextByte;
2846                 }
2847                 chr = (chr<<4) & ~0xF;
2848                 if (c >= '0' && c <= '9')
2849                     chr += c - '0';
2850                 else if (c >= 'a' && c <= 'f')
2851                     chr += 10 + c - 'a';
2852                 else
2853                     chr += 10 + c - 'A';
2854             }
2855             s += i;
2856             if (chr == 0xffffffff && PyErr_Occurred())
2857                 /* _decoding_error will have already written into the
2858                    target buffer. */
2859                 break;
2860         store:
2861             /* when we get here, chr is a 32-bit unicode character */
2862             if (chr <= 0xffff)
2863                 /* UCS-2 character */
2864                 *p++ = (Py_UNICODE) chr;
2865             else if (chr <= 0x10ffff) {
2866                 /* UCS-4 character. Either store directly, or as
2867                    surrogate pair. */
2868 #ifdef Py_UNICODE_WIDE
2869                 *p++ = chr;
2870 #else
2871                 chr -= 0x10000L;
2872                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2873                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2874 #endif
2875             } else {
2876                 endinpos = s-starts;
2877                 outpos = p-PyUnicode_AS_UNICODE(v);
2878                 if (unicode_decode_call_errorhandler(
2879                         errors, &errorHandler,
2880                         "unicodeescape", "illegal Unicode character",
2881                         starts, size, &startinpos, &endinpos, &exc, &s,
2882                         &v, &outpos, &p))
2883                     goto onError;
2884             }
2885             break;
2886
2887             /* \N{name} */
2888         case 'N':
2889             message = "malformed \\N character escape";
2890             if (ucnhash_CAPI == NULL) {
2891                 /* load the unicode data module */
2892                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2893                 if (ucnhash_CAPI == NULL)
2894                     goto ucnhashError;
2895             }
2896             if (*s == '{') {
2897                 const char *start = s+1;
2898                 /* look for the closing brace */
2899                 while (*s != '}' && s < end)
2900                     s++;
2901                 if (s > start && s < end && *s == '}') {
2902                     /* found a name.  look it up in the unicode database */
2903                     message = "unknown Unicode character name";
2904                     s++;
2905                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2906                         goto store;
2907                 }
2908             }
2909             endinpos = s-starts;
2910             outpos = p-PyUnicode_AS_UNICODE(v);
2911             if (unicode_decode_call_errorhandler(
2912                     errors, &errorHandler,
2913                     "unicodeescape", message,
2914                     starts, size, &startinpos, &endinpos, &exc, &s,
2915                     &v, &outpos, &p))
2916                 goto onError;
2917             break;
2918
2919         default:
2920             if (s > end) {
2921                 message = "\\ at end of string";
2922                 s--;
2923                 endinpos = s-starts;
2924                 outpos = p-PyUnicode_AS_UNICODE(v);
2925                 if (unicode_decode_call_errorhandler(
2926                         errors, &errorHandler,
2927                         "unicodeescape", message,
2928                         starts, size, &startinpos, &endinpos, &exc, &s,
2929                         &v, &outpos, &p))
2930                     goto onError;
2931             }
2932             else {
2933                 *p++ = '\\';
2934                 *p++ = (unsigned char)s[-1];
2935             }
2936             break;
2937         }
2938       nextByte:
2939         ;
2940     }
2941     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2942         goto onError;
2943     Py_XDECREF(errorHandler);
2944     Py_XDECREF(exc);
2945     return (PyObject *)v;
2946
2947   ucnhashError:
2948     PyErr_SetString(
2949         PyExc_UnicodeError,
2950         "\\N escapes not supported (can't load unicodedata module)"
2951         );
2952     Py_XDECREF(v);
2953     Py_XDECREF(errorHandler);
2954     Py_XDECREF(exc);
2955     return NULL;
2956
2957   onError:
2958     Py_XDECREF(v);
2959     Py_XDECREF(errorHandler);
2960     Py_XDECREF(exc);
2961     return NULL;
2962 }
2963
2964 /* Return a Unicode-Escape string version of the Unicode object.
2965
2966    If quotes is true, the string is enclosed in u"" or u'' quotes as
2967    appropriate.
2968
2969 */
2970
2971 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2972                                              Py_ssize_t size,
2973                                              Py_UNICODE ch)
2974 {
2975     /* like wcschr, but doesn't stop at NULL characters */
2976
2977     while (size-- > 0) {
2978         if (*s == ch)
2979             return s;
2980         s++;
2981     }
2982
2983     return NULL;
2984 }
2985
2986 static
2987 PyObject *unicodeescape_string(const Py_UNICODE *s,
2988                                Py_ssize_t size,
2989                                int quotes)
2990 {
2991     PyObject *repr;
2992     char *p;
2993
2994     static const char *hexdigit = "0123456789abcdef";
2995 #ifdef Py_UNICODE_WIDE
2996     const Py_ssize_t expandsize = 10;
2997 #else
2998     const Py_ssize_t expandsize = 6;
2999 #endif
3000
3001     /* XXX(nnorwitz): rather than over-allocating, it would be
3002        better to choose a different scheme.  Perhaps scan the
3003        first N-chars of the string and allocate based on that size.
3004     */
3005     /* Initial allocation is based on the longest-possible unichr
3006        escape.
3007
3008        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3009        unichr, so in this case it's the longest unichr escape. In
3010        narrow (UTF-16) builds this is five chars per source unichr
3011        since there are two unichrs in the surrogate pair, so in narrow
3012        (UTF-16) builds it's not the longest unichr escape.
3013
3014        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3015        so in the narrow (UTF-16) build case it's the longest unichr
3016        escape.
3017     */
3018
3019     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3020         return PyErr_NoMemory();
3021
3022     repr = PyString_FromStringAndSize(NULL,
3023                                       2
3024                                       + expandsize*size
3025                                       + 1);
3026     if (repr == NULL)
3027         return NULL;
3028
3029     p = PyString_AS_STRING(repr);
3030
3031     if (quotes) {
3032         *p++ = 'u';
3033         *p++ = (findchar(s, size, '\'') &&
3034                 !findchar(s, size, '"')) ? '"' : '\'';
3035     }
3036     while (size-- > 0) {
3037         Py_UNICODE ch = *s++;
3038
3039         /* Escape quotes and backslashes */
3040         if ((quotes &&
3041              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3042             *p++ = '\\';
3043             *p++ = (char) ch;
3044             continue;
3045         }
3046
3047 #ifdef Py_UNICODE_WIDE
3048         /* Map 21-bit characters to '\U00xxxxxx' */
3049         else if (ch >= 0x10000) {
3050             *p++ = '\\';
3051             *p++ = 'U';
3052             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3053             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3054             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3055             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3056             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3057             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3058             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3059             *p++ = hexdigit[ch & 0x0000000F];
3060             continue;
3061         }
3062 #else
3063         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3064         else if (ch >= 0xD800 && ch < 0xDC00) {
3065             Py_UNICODE ch2;
3066             Py_UCS4 ucs;
3067
3068             ch2 = *s++;
3069             size--;
3070             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3071                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3072                 *p++ = '\\';
3073                 *p++ = 'U';
3074                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3075                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3076                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3077                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3078                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3079                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3080                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3081                 *p++ = hexdigit[ucs & 0x0000000F];
3082                 continue;
3083             }
3084             /* Fall through: isolated surrogates are copied as-is */
3085             s--;
3086             size++;
3087         }
3088 #endif
3089
3090         /* Map 16-bit characters to '\uxxxx' */
3091         if (ch >= 256) {
3092             *p++ = '\\';
3093             *p++ = 'u';
3094             *p++ = hexdigit[(ch >> 12) & 0x000F];
3095             *p++ = hexdigit[(ch >> 8) & 0x000F];
3096             *p++ = hexdigit[(ch >> 4) & 0x000F];
3097             *p++ = hexdigit[ch & 0x000F];
3098         }
3099
3100         /* Map special whitespace to '\t', \n', '\r' */
3101         else if (ch == '\t') {
3102             *p++ = '\\';
3103             *p++ = 't';
3104         }
3105         else if (ch == '\n') {
3106             *p++ = '\\';
3107             *p++ = 'n';
3108         }
3109         else if (ch == '\r') {
3110             *p++ = '\\';
3111             *p++ = 'r';
3112         }
3113
3114         /* Map non-printable US ASCII to '\xhh' */
3115         else if (ch < ' ' || ch >= 0x7F) {
3116             *p++ = '\\';
3117             *p++ = 'x';
3118             *p++ = hexdigit[(ch >> 4) & 0x000F];
3119             *p++ = hexdigit[ch & 0x000F];
3120         }
3121
3122         /* Copy everything else as-is */
3123         else
3124             *p++ = (char) ch;
3125     }
3126     if (quotes)
3127         *p++ = PyString_AS_STRING(repr)[1];
3128
3129     *p = '\0';
3130     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3131         return NULL;
3132     return repr;
3133 }
3134
3135 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3136                                         Py_ssize_t size)
3137 {
3138     return unicodeescape_string(s, size, 0);
3139 }
3140
3141 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3142 {
3143     if (!PyUnicode_Check(unicode)) {
3144         PyErr_BadArgument();
3145         return NULL;
3146     }
3147     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3148                                          PyUnicode_GET_SIZE(unicode));
3149 }
3150
3151 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3152
3153 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3154                                            Py_ssize_t size,
3155                                            const char *errors)
3156 {
3157     const char *starts = s;
3158     Py_ssize_t startinpos;
3159     Py_ssize_t endinpos;
3160     Py_ssize_t outpos;
3161     PyUnicodeObject *v;
3162     Py_UNICODE *p;
3163     const char *end;
3164     const char *bs;
3165     PyObject *errorHandler = NULL;
3166     PyObject *exc = NULL;
3167
3168     /* Escaped strings will always be longer than the resulting
3169        Unicode string, so we start with size here and then reduce the
3170        length after conversion to the true value. (But decoding error
3171        handler might have to resize the string) */
3172     v = _PyUnicode_New(size);
3173     if (v == NULL)
3174         goto onError;
3175     if (size == 0)
3176         return (PyObject *)v;
3177     p = PyUnicode_AS_UNICODE(v);
3178     end = s + size;
3179     while (s < end) {
3180         unsigned char c;
3181         Py_UCS4 x;
3182         int i;
3183         int count;
3184
3185         /* Non-escape characters are interpreted as Unicode ordinals */
3186         if (*s != '\\') {
3187             *p++ = (unsigned char)*s++;
3188             continue;
3189         }
3190         startinpos = s-starts;
3191
3192         /* \u-escapes are only interpreted iff the number of leading
3193            backslashes if odd */
3194         bs = s;
3195         for (;s < end;) {
3196             if (*s != '\\')
3197                 break;
3198             *p++ = (unsigned char)*s++;
3199         }
3200         if (((s - bs) & 1) == 0 ||
3201             s >= end ||
3202             (*s != 'u' && *s != 'U')) {
3203             continue;
3204         }
3205         p--;
3206         count = *s=='u' ? 4 : 8;
3207         s++;
3208
3209         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3210         outpos = p-PyUnicode_AS_UNICODE(v);
3211         for (x = 0, i = 0; i < count; ++i, ++s) {
3212             c = (unsigned char)*s;
3213             if (!isxdigit(c)) {
3214                 endinpos = s-starts;
3215                 if (unicode_decode_call_errorhandler(
3216                         errors, &errorHandler,
3217                         "rawunicodeescape", "truncated \\uXXXX",
3218                         starts, size, &startinpos, &endinpos, &exc, &s,
3219                         &v, &outpos, &p))
3220                     goto onError;
3221                 goto nextByte;
3222             }
3223             x = (x<<4) & ~0xF;
3224             if (c >= '0' && c <= '9')
3225                 x += c - '0';
3226             else if (c >= 'a' && c <= 'f')
3227                 x += 10 + c - 'a';
3228             else
3229                 x += 10 + c - 'A';
3230         }
3231         if (x <= 0xffff)
3232             /* UCS-2 character */
3233             *p++ = (Py_UNICODE) x;
3234         else if (x <= 0x10ffff) {
3235             /* UCS-4 character. Either store directly, or as
3236                surrogate pair. */
3237 #ifdef Py_UNICODE_WIDE
3238             *p++ = (Py_UNICODE) x;
3239 #else
3240             x -= 0x10000L;
3241             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3242             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3243 #endif
3244         } else {
3245             endinpos = s-starts;
3246             outpos = p-PyUnicode_AS_UNICODE(v);
3247             if (unicode_decode_call_errorhandler(
3248                     errors, &errorHandler,
3249                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3250                     starts, size, &startinpos, &endinpos, &exc, &s,
3251                     &v, &outpos, &p))
3252                 goto onError;
3253         }
3254       nextByte:
3255         ;
3256     }
3257     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3258         goto onError;
3259     Py_XDECREF(errorHandler);
3260     Py_XDECREF(exc);
3261     return (PyObject *)v;
3262
3263   onError:
3264     Py_XDECREF(v);
3265     Py_XDECREF(errorHandler);
3266     Py_XDECREF(exc);
3267     return NULL;
3268 }
3269
3270 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3271                                            Py_ssize_t size)
3272 {
3273     PyObject *repr;
3274     char *p;
3275     char *q;
3276
3277     static const char *hexdigit = "0123456789abcdef";
3278 #ifdef Py_UNICODE_WIDE
3279     const Py_ssize_t expandsize = 10;
3280 #else
3281     const Py_ssize_t expandsize = 6;
3282 #endif
3283
3284     if (size > PY_SSIZE_T_MAX / expandsize)
3285         return PyErr_NoMemory();
3286
3287     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3288     if (repr == NULL)
3289         return NULL;
3290     if (size == 0)
3291         return repr;
3292
3293     p = q = PyString_AS_STRING(repr);
3294     while (size-- > 0) {
3295         Py_UNICODE ch = *s++;
3296 #ifdef Py_UNICODE_WIDE
3297         /* Map 32-bit characters to '\Uxxxxxxxx' */
3298         if (ch >= 0x10000) {
3299             *p++ = '\\';
3300             *p++ = 'U';
3301             *p++ = hexdigit[(ch >> 28) & 0xf];
3302             *p++ = hexdigit[(ch >> 24) & 0xf];
3303             *p++ = hexdigit[(ch >> 20) & 0xf];
3304             *p++ = hexdigit[(ch >> 16) & 0xf];
3305             *p++ = hexdigit[(ch >> 12) & 0xf];
3306             *p++ = hexdigit[(ch >> 8) & 0xf];
3307             *p++ = hexdigit[(ch >> 4) & 0xf];
3308             *p++ = hexdigit[ch & 15];
3309         }
3310         else
3311 #else
3312             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3313             if (ch >= 0xD800 && ch < 0xDC00) {
3314                 Py_UNICODE ch2;
3315                 Py_UCS4 ucs;
3316
3317                 ch2 = *s++;
3318                 size--;
3319                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3320                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3321                     *p++ = '\\';
3322                     *p++ = 'U';
3323                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3324                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3325                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3326                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3327                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3328                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3329                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3330                     *p++ = hexdigit[ucs & 0xf];
3331                     continue;
3332                 }
3333                 /* Fall through: isolated surrogates are copied as-is */
3334                 s--;
3335                 size++;
3336             }
3337 #endif
3338         /* Map 16-bit characters to '\uxxxx' */
3339         if (ch >= 256) {
3340             *p++ = '\\';
3341             *p++ = 'u';
3342             *p++ = hexdigit[(ch >> 12) & 0xf];
3343             *p++ = hexdigit[(ch >> 8) & 0xf];
3344             *p++ = hexdigit[(ch >> 4) & 0xf];
3345             *p++ = hexdigit[ch & 15];
3346         }
3347         /* Copy everything else as-is */
3348         else
3349             *p++ = (char) ch;
3350     }
3351     *p = '\0';
3352     if (_PyString_Resize(&repr, p - q))
3353         return NULL;
3354     return repr;
3355 }
3356
3357 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3358 {
3359     if (!PyUnicode_Check(unicode)) {
3360         PyErr_BadArgument();
3361         return NULL;
3362     }
3363     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3364                                             PyUnicode_GET_SIZE(unicode));
3365 }
3366
3367 /* --- Unicode Internal Codec ------------------------------------------- */
3368
3369 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3370                                            Py_ssize_t size,
3371                                            const char *errors)
3372 {
3373     const char *starts = s;
3374     Py_ssize_t startinpos;
3375     Py_ssize_t endinpos;
3376     Py_ssize_t outpos;
3377     PyUnicodeObject *v;
3378     Py_UNICODE *p;
3379     const char *end;
3380     const char *reason;
3381     PyObject *errorHandler = NULL;
3382     PyObject *exc = NULL;
3383
3384 #ifdef Py_UNICODE_WIDE
3385     Py_UNICODE unimax = PyUnicode_GetMax();
3386 #endif
3387
3388     /* XXX overflow detection missing */
3389     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3390     if (v == NULL)
3391         goto onError;
3392     if (PyUnicode_GetSize((PyObject *)v) == 0)
3393         return (PyObject *)v;
3394     p = PyUnicode_AS_UNICODE(v);
3395     end = s + size;
3396
3397     while (s < end) {
3398         memcpy(p, s, sizeof(Py_UNICODE));
3399         /* We have to sanity check the raw data, otherwise doom looms for
3400            some malformed UCS-4 data. */
3401         if (
3402 #ifdef Py_UNICODE_WIDE
3403             *p > unimax || *p < 0 ||
3404 #endif
3405             end-s < Py_UNICODE_SIZE
3406             )
3407         {
3408             startinpos = s - starts;
3409             if (end-s < Py_UNICODE_SIZE) {
3410                 endinpos = end-starts;
3411                 reason = "truncated input";
3412             }
3413             else {
3414                 endinpos = s - starts + Py_UNICODE_SIZE;
3415                 reason = "illegal code point (> 0x10FFFF)";
3416             }
3417             outpos = p - PyUnicode_AS_UNICODE(v);
3418             if (unicode_decode_call_errorhandler(
3419                     errors, &errorHandler,
3420                     "unicode_internal", reason,
3421                     starts, size, &startinpos, &endinpos, &exc, &s,
3422                     &v, &outpos, &p)) {
3423                 goto onError;
3424             }
3425         }
3426         else {
3427             p++;
3428             s += Py_UNICODE_SIZE;
3429         }
3430     }
3431
3432     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3433         goto onError;
3434     Py_XDECREF(errorHandler);
3435     Py_XDECREF(exc);
3436     return (PyObject *)v;
3437
3438   onError:
3439     Py_XDECREF(v);
3440     Py_XDECREF(errorHandler);
3441     Py_XDECREF(exc);
3442     return NULL;
3443 }
3444
3445 /* --- Latin-1 Codec ------------------------------------------------------ */
3446
3447 PyObject *PyUnicode_DecodeLatin1(const char *s,
3448                                  Py_ssize_t size,
3449                                  const char *errors)
3450 {
3451     PyUnicodeObject *v;
3452     Py_UNICODE *p;
3453
3454     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3455     if (size == 1) {
3456         Py_UNICODE r = *(unsigned char*)s;
3457         return PyUnicode_FromUnicode(&r, 1);
3458     }
3459
3460     v = _PyUnicode_New(size);
3461     if (v == NULL)
3462         goto onError;
3463     if (size == 0)
3464         return (PyObject *)v;
3465     p = PyUnicode_AS_UNICODE(v);
3466     while (size-- > 0)
3467         *p++ = (unsigned char)*s++;
3468     return (PyObject *)v;
3469
3470   onError:
3471     Py_XDECREF(v);
3472     return NULL;
3473 }
3474
3475 /* create or adjust a UnicodeEncodeError */
3476 static void make_encode_exception(PyObject **exceptionObject,
3477                                   const char *encoding,
3478                                   const Py_UNICODE *unicode, Py_ssize_t size,
3479                                   Py_ssize_t startpos, Py_ssize_t endpos,
3480                                   const char *reason)
3481 {
3482     if (*exceptionObject == NULL) {
3483         *exceptionObject = PyUnicodeEncodeError_Create(
3484             encoding, unicode, size, startpos, endpos, reason);
3485     }
3486     else {
3487         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3488             goto onError;
3489         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3490             goto onError;
3491         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3492             goto onError;
3493         return;
3494       onError:
3495         Py_DECREF(*exceptionObject);
3496         *exceptionObject = NULL;
3497     }
3498 }
3499
3500 /* raises a UnicodeEncodeError */
3501 static void raise_encode_exception(PyObject **exceptionObject,
3502                                    const char *encoding,
3503                                    const Py_UNICODE *unicode, Py_ssize_t size,
3504                                    Py_ssize_t startpos, Py_ssize_t endpos,
3505                                    const char *reason)
3506 {
3507     make_encode_exception(exceptionObject,
3508                           encoding, unicode, size, startpos, endpos, reason);
3509     if (*exceptionObject != NULL)
3510         PyCodec_StrictErrors(*exceptionObject);
3511 }
3512
3513 /* error handling callback helper:
3514    build arguments, call the callback and check the arguments,
3515    put the result into newpos and return the replacement string, which
3516    has to be freed by the caller */
3517 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3518                                                   PyObject **errorHandler,
3519                                                   const char *encoding, const char *reason,
3520                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3521                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3522                                                   Py_ssize_t *newpos)
3523 {
3524     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3525
3526     PyObject *restuple;
3527     PyObject *resunicode;
3528
3529     if (*errorHandler == NULL) {
3530         *errorHandler = PyCodec_LookupError(errors);
3531         if (*errorHandler == NULL)
3532             return NULL;
3533     }
3534
3535     make_encode_exception(exceptionObject,
3536                           encoding, unicode, size, startpos, endpos, reason);
3537     if (*exceptionObject == NULL)
3538         return NULL;
3539
3540     restuple = PyObject_CallFunctionObjArgs(
3541         *errorHandler, *exceptionObject, NULL);
3542     if (restuple == NULL)
3543         return NULL;
3544     if (!PyTuple_Check(restuple)) {
3545         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3546         Py_DECREF(restuple);
3547         return NULL;
3548     }
3549     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3550                           &resunicode, newpos)) {
3551         Py_DECREF(restuple);
3552         return NULL;
3553     }
3554     if (*newpos<0)
3555         *newpos = size+*newpos;
3556     if (*newpos<0 || *newpos>size) {
3557         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3558         Py_DECREF(restuple);
3559         return NULL;
3560     }
3561     Py_INCREF(resunicode);
3562     Py_DECREF(restuple);
3563     return resunicode;
3564 }
3565
3566 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3567                                      Py_ssize_t size,
3568                                      const char *errors,
3569                                      int limit)
3570 {
3571     /* output object */
3572     PyObject *res;
3573     /* pointers to the beginning and end+1 of input */
3574     const Py_UNICODE *startp = p;
3575     const Py_UNICODE *endp = p + size;
3576     /* pointer to the beginning of the unencodable characters */
3577     /* const Py_UNICODE *badp = NULL; */
3578     /* pointer into the output */
3579     char *str;
3580     /* current output position */
3581     Py_ssize_t respos = 0;
3582     Py_ssize_t ressize;
3583     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3584     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3585     PyObject *errorHandler = NULL;
3586     PyObject *exc = NULL;
3587     /* the following variable is used for caching string comparisons
3588      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3589     int known_errorHandler = -1;
3590
3591     /* allocate enough for a simple encoding without
3592        replacements, if we need more, we'll resize */
3593     res = PyString_FromStringAndSize(NULL, size);
3594     if (res == NULL)
3595         goto onError;
3596     if (size == 0)
3597         return res;
3598     str = PyString_AS_STRING(res);
3599     ressize = size;
3600
3601     while (p<endp) {
3602         Py_UNICODE c = *p;
3603
3604         /* can we encode this? */
3605         if (c<limit) {
3606             /* no overflow check, because we know that the space is enough */
3607             *str++ = (char)c;
3608             ++p;
3609         }
3610         else {
3611             Py_ssize_t unicodepos = p-startp;
3612             Py_ssize_t requiredsize;
3613             PyObject *repunicode;
3614             Py_ssize_t repsize;
3615             Py_ssize_t newpos;
3616             Py_ssize_t respos;
3617             Py_UNICODE *uni2;
3618             /* startpos for collecting unencodable chars */
3619             const Py_UNICODE *collstart = p;
3620             const Py_UNICODE *collend = p;
3621             /* find all unecodable characters */
3622             while ((collend < endp) && ((*collend)>=limit))
3623                 ++collend;
3624             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3625             if (known_errorHandler==-1) {
3626                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3627                     known_errorHandler = 1;
3628                 else if (!strcmp(errors, "replace"))
3629                     known_errorHandler = 2;
3630                 else if (!strcmp(errors, "ignore"))
3631                     known_errorHandler = 3;
3632                 else if (!strcmp(errors, "xmlcharrefreplace"))
3633                     known_errorHandler = 4;
3634                 else
3635                     known_errorHandler = 0;
3636             }
3637             switch (known_errorHandler) {
3638             case 1: /* strict */
3639                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3640                 goto onError;
3641             case 2: /* replace */
3642                 while (collstart++<collend)
3643                     *str++ = '?'; /* fall through */
3644             case 3: /* ignore */
3645                 p = collend;
3646                 break;
3647             case 4: /* xmlcharrefreplace */
3648                 respos = str-PyString_AS_STRING(res);
3649                 /* determine replacement size (temporarily (mis)uses p) */
3650                 for (p = collstart, repsize = 0; p < collend; ++p) {
3651                     if (*p<10)
3652                         repsize += 2+1+1;
3653                     else if (*p<100)
3654                         repsize += 2+2+1;
3655                     else if (*p<1000)
3656                         repsize += 2+3+1;
3657                     else if (*p<10000)
3658                         repsize += 2+4+1;
3659 #ifndef Py_UNICODE_WIDE
3660                     else
3661                         repsize += 2+5+1;
3662 #else
3663                     else if (*p<100000)
3664                         repsize += 2+5+1;
3665                     else if (*p<1000000)
3666                         repsize += 2+6+1;
3667                     else
3668                         repsize += 2+7+1;
3669 #endif
3670                 }
3671                 requiredsize = respos+repsize+(endp-collend);
3672                 if (requiredsize > ressize) {
3673                     if (requiredsize<2*ressize)
3674                         requiredsize = 2*ressize;
3675                     if (_PyString_Resize(&res, requiredsize))
3676                         goto onError;
3677                     str = PyString_AS_STRING(res) + respos;
3678                     ressize = requiredsize;
3679                 }
3680                 /* generate replacement (temporarily (mis)uses p) */
3681                 for (p = collstart; p < collend; ++p) {
3682                     str += sprintf(str, "&#%d;", (int)*p);
3683                 }
3684                 p = collend;
3685                 break;
3686             default:
3687                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3688                                                               encoding, reason, startp, size, &exc,
3689                                                               collstart-startp, collend-startp, &newpos);
3690                 if (repunicode == NULL)
3691                     goto onError;
3692                 /* need more space? (at least enough for what we have+the
3693                    replacement+the rest of the string, so we won't have to
3694                    check space for encodable characters) */
3695                 respos = str-PyString_AS_STRING(res);
3696                 repsize = PyUnicode_GET_SIZE(repunicode);
3697                 requiredsize = respos+repsize+(endp-collend);
3698                 if (requiredsize > ressize) {
3699                     if (requiredsize<2*ressize)
3700                         requiredsize = 2*ressize;
3701                     if (_PyString_Resize(&res, requiredsize)) {
3702                         Py_DECREF(repunicode);
3703                         goto onError;
3704                     }
3705                     str = PyString_AS_STRING(res) + respos;
3706                     ressize = requiredsize;
3707                 }
3708                 /* check if there is anything unencodable in the replacement
3709                    and copy it to the output */
3710                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3711                     c = *uni2;
3712                     if (c >= limit) {
3713                         raise_encode_exception(&exc, encoding, startp, size,
3714                                                unicodepos, unicodepos+1, reason);
3715                         Py_DECREF(repunicode);
3716                         goto onError;
3717                     }
3718                     *str = (char)c;
3719                 }
3720                 p = startp + newpos;
3721                 Py_DECREF(repunicode);
3722             }
3723         }
3724     }
3725     /* Resize if we allocated to much */
3726     respos = str-PyString_AS_STRING(res);
3727     if (respos<ressize)
3728         /* If this falls res will be NULL */
3729         _PyString_Resize(&res, respos);
3730     Py_XDECREF(errorHandler);
3731     Py_XDECREF(exc);
3732     return res;
3733
3734   onError:
3735     Py_XDECREF(res);
3736     Py_XDECREF(errorHandler);
3737     Py_XDECREF(exc);
3738     return NULL;
3739 }
3740
3741 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3742                                  Py_ssize_t size,
3743                                  const char *errors)
3744 {
3745     return unicode_encode_ucs1(p, size, errors, 256);
3746 }
3747
3748 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3749 {
3750     if (!PyUnicode_Check(unicode)) {
3751         PyErr_BadArgument();
3752         return NULL;
3753     }
3754     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3755                                   PyUnicode_GET_SIZE(unicode),
3756                                   NULL);
3757 }
3758
3759 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3760
3761 PyObject *PyUnicode_DecodeASCII(const char *s,
3762                                 Py_ssize_t size,
3763                                 const char *errors)
3764 {
3765     const char *starts = s;
3766     PyUnicodeObject *v;
3767     Py_UNICODE *p;
3768     Py_ssize_t startinpos;
3769     Py_ssize_t endinpos;
3770     Py_ssize_t outpos;
3771     const char *e;
3772     PyObject *errorHandler = NULL;
3773     PyObject *exc = NULL;
3774
3775     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3776     if (size == 1 && *(unsigned char*)s < 128) {
3777         Py_UNICODE r = *(unsigned char*)s;
3778         return PyUnicode_FromUnicode(&r, 1);
3779     }
3780
3781     v = _PyUnicode_New(size);
3782     if (v == NULL)
3783         goto onError;
3784     if (size == 0)
3785         return (PyObject *)v;
3786     p = PyUnicode_AS_UNICODE(v);
3787     e = s + size;
3788     while (s < e) {
3789         register unsigned char c = (unsigned char)*s;
3790         if (c < 128) {
3791             *p++ = c;
3792             ++s;
3793         }
3794         else {
3795             startinpos = s-starts;
3796             endinpos = startinpos + 1;
3797             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3798             if (unicode_decode_call_errorhandler(
3799                     errors, &errorHandler,
3800                     "ascii", "ordinal not in range(128)",
3801                     starts, size, &startinpos, &endinpos, &exc, &s,
3802                     &v, &outpos, &p))
3803                 goto onError;
3804         }
3805     }
3806     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3807         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3808             goto onError;
3809     Py_XDECREF(errorHandler);
3810     Py_XDECREF(exc);
3811     return (PyObject *)v;
3812
3813   onError:
3814     Py_XDECREF(v);
3815     Py_XDECREF(errorHandler);
3816     Py_XDECREF(exc);
3817     return NULL;
3818 }
3819
3820 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3821                                 Py_ssize_t size,
3822                                 const char *errors)
3823 {
3824     return unicode_encode_ucs1(p, size, errors, 128);
3825 }
3826
3827 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3828 {
3829     if (!PyUnicode_Check(unicode)) {
3830         PyErr_BadArgument();
3831         return NULL;
3832     }
3833     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3834                                  PyUnicode_GET_SIZE(unicode),
3835                                  NULL);
3836 }
3837
3838 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3839
3840 /* --- MBCS codecs for Windows -------------------------------------------- */
3841
3842 #if SIZEOF_INT < SIZEOF_SIZE_T
3843 #define NEED_RETRY
3844 #endif
3845
3846 /* XXX This code is limited to "true" double-byte encodings, as
3847    a) it assumes an incomplete character consists of a single byte, and
3848    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3849    encodings, see IsDBCSLeadByteEx documentation. */
3850
/* Return 1 if s[offset] should be treated as a DBCS lead byte, else 0.
   The byte must satisfy IsDBCSLeadByte(); it is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3861
3862 /*
3863  * Decode MBCS string into unicode object. If 'final' is set, converts
3864  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3865  */
3866 static int decode_mbcs(PyUnicodeObject **v,
3867                        const char *s, /* MBCS string */
3868                        int size, /* sizeof MBCS string */
3869                        int final)
3870 {
3871     Py_UNICODE *p;
3872     Py_ssize_t n = 0;
3873     int usize = 0;
3874
3875     assert(size >= 0);
3876
3877     /* Skip trailing lead-byte unless 'final' is set */
3878     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3879         --size;
3880
3881     /* First get the size of the result */
3882     if (size > 0) {
3883         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3884         if (usize == 0) {
3885             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3886             return -1;
3887         }
3888     }
3889
3890     if (*v == NULL) {
3891         /* Create unicode object */
3892         *v = _PyUnicode_New(usize);
3893         if (*v == NULL)
3894             return -1;
3895     }
3896     else {
3897         /* Extend unicode object */
3898         n = PyUnicode_GET_SIZE(*v);
3899         if (_PyUnicode_Resize(v, n + usize) < 0)
3900             return -1;
3901     }
3902
3903     /* Do the conversion */
3904     if (size > 0) {
3905         p = PyUnicode_AS_UNICODE(*v) + n;
3906         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3907             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3908             return -1;
3909         }
3910     }
3911
3912     return size;
3913 }
3914
3915 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3916                                        Py_ssize_t size,
3917                                        const char *errors,
3918                                        Py_ssize_t *consumed)
3919 {
3920     PyUnicodeObject *v = NULL;
3921     int done;
3922
3923     if (consumed)
3924         *consumed = 0;
3925
3926 #ifdef NEED_RETRY
3927   retry:
3928     if (size > INT_MAX)
3929         done = decode_mbcs(&v, s, INT_MAX, 0);
3930     else
3931 #endif
3932         done = decode_mbcs(&v, s, (int)size, !consumed);
3933
3934     if (done < 0) {
3935         Py_XDECREF(v);
3936         return NULL;
3937     }
3938
3939     if (consumed)
3940         *consumed += done;
3941
3942 #ifdef NEED_RETRY
3943     if (size > INT_MAX) {
3944         s += done;
3945         size -= done;
3946         goto retry;
3947     }
3948 #endif
3949
3950     return (PyObject *)v;
3951 }
3952
3953 PyObject *PyUnicode_DecodeMBCS(const char *s,
3954                                Py_ssize_t size,
3955                                const char *errors)
3956 {
3957     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3958 }
3959
3960 /*
3961  * Convert unicode into string object (MBCS).
3962  * Returns 0 if succeed, -1 otherwise.
3963  */
3964 static int encode_mbcs(PyObject **repr,
3965                        const Py_UNICODE *p, /* unicode */
3966                        int size) /* size of unicode */
3967 {
3968     int mbcssize = 0;
3969     Py_ssize_t n = 0;
3970
3971     assert(size >= 0);
3972
3973     /* First get the size of the result */
3974     if (size > 0) {
3975         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3976         if (mbcssize == 0) {
3977             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3978             return -1;
3979         }
3980     }
3981
3982     if (*repr == NULL) {
3983         /* Create string object */
3984         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3985         if (*repr == NULL)
3986             return -1;
3987     }
3988     else {
3989         /* Extend string object */
3990         n = PyString_Size(*repr);
3991         if (_PyString_Resize(repr, n + mbcssize) < 0)
3992             return -1;
3993     }
3994
3995     /* Do the conversion */
3996     if (size > 0) {
3997         char *s = PyString_AS_STRING(*repr) + n;
3998         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3999             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4000             return -1;
4001         }
4002     }
4003
4004     return 0;
4005 }
4006
4007 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4008                                Py_ssize_t size,
4009                                const char *errors)
4010 {
4011     PyObject *repr = NULL;
4012     int ret;
4013
4014 #ifdef NEED_RETRY
4015   retry:
4016     if (size > INT_MAX)
4017         ret = encode_mbcs(&repr, p, INT_MAX);
4018     else
4019 #endif
4020         ret = encode_mbcs(&repr, p, (int)size);
4021
4022     if (ret < 0) {
4023         Py_XDECREF(repr);
4024         return NULL;
4025     }
4026
4027 #ifdef NEED_RETRY
4028     if (size > INT_MAX) {
4029         p += INT_MAX;
4030         size -= INT_MAX;
4031         goto retry;
4032     }
4033 #endif
4034
4035     return repr;
4036 }
4037
4038 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4039 {
4040     if (!PyUnicode_Check(unicode)) {
4041         PyErr_BadArgument();
4042         return NULL;
4043     }
4044     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4045                                 PyUnicode_GET_SIZE(unicode),
4046                                 NULL);
4047 }
4048
4049 #undef NEED_RETRY
4050
4051 #endif /* MS_WINDOWS */
4052
4053 /* --- Character Mapping Codec -------------------------------------------- */
4054
4055 PyObject *PyUnicode_DecodeCharmap(const char *s,
4056                                   Py_ssize_t size,
4057                                   PyObject *mapping,
4058                                   const char *errors)
4059 {
4060     const char *starts = s;
4061     Py_ssize_t startinpos;
4062     Py_ssize_t endinpos;
4063     Py_ssize_t outpos;
4064     const char *e;
4065     PyUnicodeObject *v;
4066     Py_UNICODE *p;
4067     Py_ssize_t extrachars = 0;
4068     PyObject *errorHandler = NULL;
4069     PyObject *exc = NULL;
4070     Py_UNICODE *mapstring = NULL;
4071     Py_ssize_t maplen = 0;
4072
4073     /* Default to Latin-1 */
4074     if (mapping == NULL)
4075         return PyUnicode_DecodeLatin1(s, size, errors);
4076
4077     v = _PyUnicode_New(size);
4078     if (v == NULL)
4079         goto onError;
4080     if (size == 0)
4081         return (PyObject *)v;
4082     p = PyUnicode_AS_UNICODE(v);
4083     e = s + size;
4084     if (PyUnicode_CheckExact(mapping)) {
4085         mapstring = PyUnicode_AS_UNICODE(mapping);
4086         maplen = PyUnicode_GET_SIZE(mapping);
4087         while (s < e) {
4088             unsigned char ch = *s;
4089             Py_UNICODE x = 0xfffe; /* illegal value */
4090
4091             if (ch < maplen)
4092                 x = mapstring[ch];
4093
4094             if (x == 0xfffe) {
4095                 /* undefined mapping */
4096                 outpos = p-PyUnicode_AS_UNICODE(v);
4097                 startinpos = s-starts;
4098                 endinpos = startinpos+1;
4099                 if (unicode_decode_call_errorhandler(
4100                         errors, &errorHandler,
4101                         "charmap", "character maps to <undefined>",
4102                         starts, size, &startinpos, &endinpos, &exc, &s,
4103                         &v, &outpos, &p)) {
4104                     goto onError;
4105                 }
4106                 continue;
4107             }
4108             *p++ = x;
4109             ++s;
4110         }
4111     }
4112     else {
4113         while (s < e) {
4114             unsigned char ch = *s;
4115             PyObject *w, *x;
4116
4117             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4118             w = PyInt_FromLong((long)ch);
4119             if (w == NULL)
4120                 goto onError;
4121             x = PyObject_GetItem(mapping, w);
4122             Py_DECREF(w);
4123             if (x == NULL) {
4124                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4125                     /* No mapping found means: mapping is undefined. */
4126                     PyErr_Clear();
4127                     x = Py_None;
4128                     Py_INCREF(x);
4129                 } else
4130                     goto onError;
4131             }
4132
4133             /* Apply mapping */
4134             if (PyInt_Check(x)) {
4135                 long value = PyInt_AS_LONG(x);
4136                 if (value < 0 || value > 65535) {
4137                     PyErr_SetString(PyExc_TypeError,
4138                                     "character mapping must be in range(65536)");
4139                     Py_DECREF(x);
4140                     goto onError;
4141                 }
4142                 *p++ = (Py_UNICODE)value;
4143             }
4144             else if (x == Py_None) {
4145                 /* undefined mapping */
4146                 outpos = p-PyUnicode_AS_UNICODE(v);
4147                 startinpos = s-starts;
4148                 endinpos = startinpos+1;
4149                 if (unicode_decode_call_errorhandler(
4150                         errors, &errorHandler,
4151                         "charmap", "character maps to <undefined>",
4152                         starts, size, &startinpos, &endinpos, &exc, &s,
4153                         &v, &outpos, &p)) {
4154                     Py_DECREF(x);
4155                     goto onError;
4156                 }
4157                 Py_DECREF(x);
4158                 continue;
4159             }
4160             else if (PyUnicode_Check(x)) {
4161                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4162
4163                 if (targetsize == 1)
4164                     /* 1-1 mapping */
4165                     *p++ = *PyUnicode_AS_UNICODE(x);
4166
4167                 else if (targetsize > 1) {
4168                     /* 1-n mapping */
4169                     if (targetsize > extrachars) {
4170                         /* resize first */
4171                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4172                         Py_ssize_t needed = (targetsize - extrachars) + \
4173                             (targetsize << 2);
4174                         extrachars += needed;
4175                         /* XXX overflow detection missing */
4176                         if (_PyUnicode_Resize(&v,
4177                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4178                             Py_DECREF(x);
4179                             goto onError;
4180                         }
4181                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4182                     }
4183                     Py_UNICODE_COPY(p,
4184                                     PyUnicode_AS_UNICODE(x),
4185                                     targetsize);
4186                     p += targetsize;
4187                     extrachars -= targetsize;
4188                 }
4189                 /* 1-0 mapping: skip the character */
4190             }
4191             else {
4192                 /* wrong return value */
4193                 PyErr_SetString(PyExc_TypeError,
4194                                 "character mapping must return integer, None or unicode");
4195                 Py_DECREF(x);
4196                 goto onError;
4197             }
4198             Py_DECREF(x);
4199             ++s;
4200         }
4201     }
4202     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4203         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4204             goto onError;
4205     Py_XDECREF(errorHandler);
4206     Py_XDECREF(exc);
4207     return (PyObject *)v;
4208
4209   onError:
4210     Py_XDECREF(errorHandler);
4211     Py_XDECREF(exc);
4212     Py_XDECREF(v);
4213     return NULL;
4214 }
4215
4216 /* Charmap encoding: the lookup table */
4217
4218 struct encoding_map{
4219     PyObject_HEAD
4220     unsigned char level1[32];
4221     int count2, count3;
4222     unsigned char level23[1];
4223 };
4224
4225 static PyObject*
4226 encoding_map_size(PyObject *obj, PyObject* args)
4227 {
4228     struct encoding_map *map = (struct encoding_map*)obj;
4229     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4230                           128*map->count3);
4231 }
4232
4233 static PyMethodDef encoding_map_methods[] = {
4234     {"size", encoding_map_size, METH_NOARGS,
4235      PyDoc_STR("Return the size (in bytes) of this object") },
4236     { 0 }
4237 };
4238
4239 static void
4240 encoding_map_dealloc(PyObject* o)
4241 {
4242     PyObject_FREE(o);
4243 }
4244
4245 static PyTypeObject EncodingMapType = {
4246     PyVarObject_HEAD_INIT(NULL, 0)
4247     "EncodingMap",          /*tp_name*/
4248     sizeof(struct encoding_map),   /*tp_basicsize*/
4249     0,                      /*tp_itemsize*/
4250     /* methods */
4251     encoding_map_dealloc,   /*tp_dealloc*/
4252     0,                      /*tp_print*/
4253     0,                      /*tp_getattr*/
4254     0,                      /*tp_setattr*/
4255     0,                      /*tp_compare*/
4256     0,                      /*tp_repr*/
4257     0,                      /*tp_as_number*/
4258     0,                      /*tp_as_sequence*/
4259     0,                      /*tp_as_mapping*/
4260     0,                      /*tp_hash*/
4261     0,                      /*tp_call*/
4262     0,                      /*tp_str*/
4263     0,                      /*tp_getattro*/
4264     0,                      /*tp_setattro*/
4265     0,                      /*tp_as_buffer*/
4266     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4267     0,                      /*tp_doc*/
4268     0,                      /*tp_traverse*/
4269     0,                      /*tp_clear*/
4270     0,                      /*tp_richcompare*/
4271     0,                      /*tp_weaklistoffset*/
4272     0,                      /*tp_iter*/
4273     0,                      /*tp_iternext*/
4274     encoding_map_methods,   /*tp_methods*/
4275     0,                      /*tp_members*/
4276     0,                      /*tp_getset*/
4277     0,                      /*tp_base*/
4278     0,                      /*tp_dict*/
4279     0,                      /*tp_descr_get*/
4280     0,                      /*tp_descr_set*/
4281     0,                      /*tp_dictoffset*/
4282     0,                      /*tp_init*/
4283     0,                      /*tp_alloc*/
4284     0,                      /*tp_new*/
4285     0,                      /*tp_free*/
4286     0,                      /*tp_is_gc*/
4287 };
4288
4289 PyObject*
4290 PyUnicode_BuildEncodingMap(PyObject* string)
4291 {
4292     Py_UNICODE *decode;
4293     PyObject *result;
4294     struct encoding_map *mresult;
4295     int i;
4296     int need_dict = 0;
4297     unsigned char level1[32];
4298     unsigned char level2[512];
4299     unsigned char *mlevel1, *mlevel2, *mlevel3;
4300     int count2 = 0, count3 = 0;
4301
4302     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4303         PyErr_BadArgument();
4304         return NULL;
4305     }
4306     decode = PyUnicode_AS_UNICODE(string);
4307     memset(level1, 0xFF, sizeof level1);
4308     memset(level2, 0xFF, sizeof level2);
4309
4310     /* If there isn't a one-to-one mapping of NULL to \0,
4311        or if there are non-BMP characters, we need to use
4312        a mapping dictionary. */
4313     if (decode[0] != 0)
4314         need_dict = 1;
4315     for (i = 1; i < 256; i++) {
4316         int l1, l2;
4317         if (decode[i] == 0
4318 #ifdef Py_UNICODE_WIDE
4319             || decode[i] > 0xFFFF
4320 #endif
4321             ) {
4322             need_dict = 1;
4323             break;
4324         }
4325         if (decode[i] == 0xFFFE)
4326             /* unmapped character */
4327             continue;
4328         l1 = decode[i] >> 11;
4329         l2 = decode[i] >> 7;
4330         if (level1[l1] == 0xFF)
4331             level1[l1] = count2++;
4332         if (level2[l2] == 0xFF)
4333             level2[l2] = count3++;
4334     }
4335
4336     if (count2 >= 0xFF || count3 >= 0xFF)
4337         need_dict = 1;
4338
4339     if (need_dict) {
4340         PyObject *result = PyDict_New();
4341         PyObject *key, *value;
4342         if (!result)
4343             return NULL;
4344         for (i = 0; i < 256; i++) {
4345             value = NULL;
4346             key = PyInt_FromLong(decode[i]);
4347             value = PyInt_FromLong(i);
4348             if (!key || !value)
4349                 goto failed1;
4350             if (PyDict_SetItem(result, key, value) == -1)
4351                 goto failed1;
4352             Py_DECREF(key);
4353             Py_DECREF(value);
4354         }
4355         return result;
4356       failed1:
4357         Py_XDECREF(key);
4358         Py_XDECREF(value);
4359         Py_DECREF(result);
4360         return NULL;
4361     }
4362
4363     /* Create a three-level trie */
4364     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4365                              16*count2 + 128*count3 - 1);
4366     if (!result)
4367         return PyErr_NoMemory();
4368     PyObject_Init(result, &EncodingMapType);
4369     mresult = (struct encoding_map*)result;
4370     mresult->count2 = count2;
4371     mresult->count3 = count3;
4372     mlevel1 = mresult->level1;
4373     mlevel2 = mresult->level23;
4374     mlevel3 = mresult->level23 + 16*count2;
4375     memcpy(mlevel1, level1, 32);
4376     memset(mlevel2, 0xFF, 16*count2);
4377     memset(mlevel3, 0, 128*count3);
4378     count3 = 0;
4379     for (i = 1; i < 256; i++) {
4380         int o1, o2, o3, i2, i3;
4381         if (decode[i] == 0xFFFE)
4382             /* unmapped character */
4383             continue;
4384         o1 = decode[i]>>11;
4385         o2 = (decode[i]>>7) & 0xF;
4386         i2 = 16*mlevel1[o1] + o2;
4387         if (mlevel2[i2] == 0xFF)
4388             mlevel2[i2] = count3++;
4389         o3 = decode[i] & 0x7F;
4390         i3 = 128*mlevel2[i2] + o3;
4391         mlevel3[i3] = i;
4392     }
4393     return result;
4394 }
4395
4396 static int
4397 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4398 {
4399     struct encoding_map *map = (struct encoding_map*)mapping;
4400     int l1 = c>>11;
4401     int l2 = (c>>7) & 0xF;
4402     int l3 = c & 0x7F;
4403     int i;
4404
4405 #ifdef Py_UNICODE_WIDE
4406     if (c > 0xFFFF) {
4407         return -1;
4408     }
4409 #endif
4410     if (c == 0)
4411         return 0;
4412     /* level 1*/
4413     i = map->level1[l1];
4414     if (i == 0xFF) {
4415         return -1;
4416     }
4417     /* level 2*/
4418     i = map->level23[16*i+l2];
4419     if (i == 0xFF) {
4420         return -1;
4421     }
4422     /* level 3 */
4423     i = map->level23[16*map->count2 + 128*i + l3];
4424     if (i == 0) {
4425         return -1;
4426     }
4427     return i;
4428 }
4429
4430 /* Lookup the character ch in the mapping. If the character
4431    can't be found, Py_None is returned (or NULL, if another
4432    error occurred). */
4433 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4434 {
4435     PyObject *w = PyInt_FromLong((long)c);
4436     PyObject *x;
4437
4438     if (w == NULL)
4439         return NULL;
4440     x = PyObject_GetItem(mapping, w);
4441     Py_DECREF(w);
4442     if (x == NULL) {
4443         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4444             /* No mapping found means: mapping is undefined. */
4445             PyErr_Clear();
4446             x = Py_None;
4447             Py_INCREF(x);
4448             return x;
4449         } else
4450             return NULL;
4451     }
4452     else if (x == Py_None)
4453         return x;
4454     else if (PyInt_Check(x)) {
4455         long value = PyInt_AS_LONG(x);
4456         if (value < 0 || value > 255) {
4457             PyErr_SetString(PyExc_TypeError,
4458                             "character mapping must be in range(256)");
4459             Py_DECREF(x);
4460             return NULL;
4461         }
4462         return x;
4463     }
4464     else if (PyString_Check(x))
4465         return x;
4466     else {
4467         /* wrong return value */
4468         PyErr_SetString(PyExc_TypeError,
4469                         "character mapping must return integer, None or str");
4470         Py_DECREF(x);
4471         return NULL;
4472     }
4473 }
4474
4475 static int
4476 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4477 {
4478     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4479     /* exponentially overallocate to minimize reallocations */
4480     if (requiredsize < 2*outsize)
4481         requiredsize = 2*outsize;
4482     if (_PyString_Resize(outobj, requiredsize)) {
4483         return 0;
4484     }
4485     return 1;
4486 }
4487
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
4491 /* lookup the character, put the result in the output string and adjust
4492    various state variables. Reallocate the output string if not enough
4493    space is available. Return a new reference to the object that
4494    was put in the output buffer, or Py_None, if the mapping was undefined
4495    (in which case no character was written) or NULL, if a
4496    reallocation error occurred. The caller must decref the result */
4497 static
4498 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4499                                           PyObject **outobj, Py_ssize_t *outpos)
4500 {
4501     PyObject *rep;
4502     char *outstart;
4503     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4504
4505     if (Py_TYPE(mapping) == &EncodingMapType) {
4506         int res = encoding_map_lookup(c, mapping);
4507         Py_ssize_t requiredsize = *outpos+1;
4508         if (res == -1)
4509             return enc_FAILED;
4510         if (outsize<requiredsize)
4511             if (!charmapencode_resize(outobj, outpos, requiredsize))
4512                 return enc_EXCEPTION;
4513         outstart = PyString_AS_STRING(*outobj);
4514         outstart[(*outpos)++] = (char)res;
4515         return enc_SUCCESS;
4516     }
4517
4518     rep = charmapencode_lookup(c, mapping);
4519     if (rep==NULL)
4520         return enc_EXCEPTION;
4521     else if (rep==Py_None) {
4522         Py_DECREF(rep);
4523         return enc_FAILED;
4524     } else {
4525         if (PyInt_Check(rep)) {
4526             Py_ssize_t requiredsize = *outpos+1;
4527             if (outsize<requiredsize)
4528                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4529                     Py_DECREF(rep);
4530                     return enc_EXCEPTION;
4531                 }
4532             outstart = PyString_AS_STRING(*outobj);
4533             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4534         }
4535         else {
4536             const char *repchars = PyString_AS_STRING(rep);
4537             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4538             Py_ssize_t requiredsize = *outpos+repsize;
4539             if (outsize<requiredsize)
4540                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4541                     Py_DECREF(rep);
4542                     return enc_EXCEPTION;
4543                 }
4544             outstart = PyString_AS_STRING(*outobj);
4545             memcpy(outstart + *outpos, repchars, repsize);
4546             *outpos += repsize;
4547         }
4548     }
4549     Py_DECREF(rep);
4550     return enc_SUCCESS;
4551 }
4552
4553 /* handle an error in PyUnicode_EncodeCharmap
4554    Return 0 on success, -1 on error */
4555 static
4556 int charmap_encoding_error(
4557     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4558     PyObject **exceptionObject,
4559     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4560     PyObject **res, Py_ssize_t *respos)
4561 {
4562     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4563     Py_ssize_t repsize;
4564     Py_ssize_t newpos;
4565     Py_UNICODE *uni2;
4566     /* startpos for collecting unencodable chars */
4567     Py_ssize_t collstartpos = *inpos;
4568     Py_ssize_t collendpos = *inpos+1;
4569     Py_ssize_t collpos;
4570     char *encoding = "charmap";
4571     char *reason = "character maps to <undefined>";
4572     charmapencode_result x;
4573
4574     /* find all unencodable characters */
4575     while (collendpos < size) {
4576         PyObject *rep;
4577         if (Py_TYPE(mapping) == &EncodingMapType) {
4578             int res = encoding_map_lookup(p[collendpos], mapping);
4579             if (res != -1)
4580                 break;
4581             ++collendpos;
4582             continue;
4583         }
4584
4585         rep = charmapencode_lookup(p[collendpos], mapping);
4586         if (rep==NULL)
4587             return -1;
4588         else if (rep!=Py_None) {
4589             Py_DECREF(rep);
4590             break;
4591         }
4592         Py_DECREF(rep);
4593         ++collendpos;
4594     }
4595     /* cache callback name lookup
4596      * (if not done yet, i.e. it's the first error) */
4597     if (*known_errorHandler==-1) {
4598         if ((errors==NULL) || (!strcmp(errors, "strict")))
4599             *known_errorHandler = 1;
4600         else if (!strcmp(errors, "replace"))
4601             *known_errorHandler = 2;
4602         else if (!strcmp(errors, "ignore"))
4603             *known_errorHandler = 3;
4604         else if (!strcmp(errors, "xmlcharrefreplace"))
4605             *known_errorHandler = 4;
4606         else
4607             *known_errorHandler = 0;
4608     }
4609     switch (*known_errorHandler) {
4610     case 1: /* strict */
4611         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4612         return -1;
4613     case 2: /* replace */
4614         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4615             x = charmapencode_output('?', mapping, res, respos);
4616             if (x==enc_EXCEPTION) {
4617                 return -1;
4618             }
4619             else if (x==enc_FAILED) {
4620                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4621                 return -1;
4622             }
4623         }
4624         /* fall through */
4625     case 3: /* ignore */
4626         *inpos = collendpos;
4627         break;
4628     case 4: /* xmlcharrefreplace */
4629         /* generate replacement (temporarily (mis)uses p) */
4630         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4631             char buffer[2+29+1+1];
4632             char *cp;
4633             sprintf(buffer, "&#%d;", (int)p[collpos]);
4634             for (cp = buffer; *cp; ++cp) {
4635                 x = charmapencode_output(*cp, mapping, res, respos);
4636                 if (x==enc_EXCEPTION)
4637                     return -1;
4638                 else if (x==enc_FAILED) {
4639                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4640                     return -1;
4641                 }
4642             }
4643         }
4644         *inpos = collendpos;
4645         break;
4646     default:
4647         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4648                                                       encoding, reason, p, size, exceptionObject,
4649                                                       collstartpos, collendpos, &newpos);
4650         if (repunicode == NULL)
4651             return -1;
4652         /* generate replacement  */
4653         repsize = PyUnicode_GET_SIZE(repunicode);
4654         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4655             x = charmapencode_output(*uni2, mapping, res, respos);
4656             if (x==enc_EXCEPTION) {
4657                 return -1;
4658             }
4659             else if (x==enc_FAILED) {
4660                 Py_DECREF(repunicode);
4661                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4662                 return -1;
4663             }
4664         }
4665         *inpos = newpos;
4666         Py_DECREF(repunicode);
4667     }
4668     return 0;
4669 }
4670
4671 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4672                                   Py_ssize_t size,
4673                                   PyObject *mapping,
4674                                   const char *errors)
4675 {
4676     /* output object */
4677     PyObject *res = NULL;
4678     /* current input position */
4679     Py_ssize_t inpos = 0;
4680     /* current output position */
4681     Py_ssize_t respos = 0;
4682     PyObject *errorHandler = NULL;
4683     PyObject *exc = NULL;
4684     /* the following variable is used for caching string comparisons
4685      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4686      * 3=ignore, 4=xmlcharrefreplace */
4687     int known_errorHandler = -1;
4688
4689     /* Default to Latin-1 */
4690     if (mapping == NULL)
4691         return PyUnicode_EncodeLatin1(p, size, errors);
4692
4693     /* allocate enough for a simple encoding without
4694        replacements, if we need more, we'll resize */
4695     res = PyString_FromStringAndSize(NULL, size);
4696     if (res == NULL)
4697         goto onError;
4698     if (size == 0)
4699         return res;
4700
4701     while (inpos<size) {
4702         /* try to encode it */
4703         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4704         if (x==enc_EXCEPTION) /* error */
4705             goto onError;
4706         if (x==enc_FAILED) { /* unencodable character */
4707             if (charmap_encoding_error(p, size, &inpos, mapping,
4708                                        &exc,
4709                                        &known_errorHandler, &errorHandler, errors,
4710                                        &res, &respos)) {
4711                 goto onError;
4712             }
4713         }
4714         else
4715             /* done with this character => adjust input position */
4716             ++inpos;
4717     }
4718
4719     /* Resize if we allocated to much */
4720     if (respos<PyString_GET_SIZE(res)) {
4721         if (_PyString_Resize(&res, respos))
4722             goto onError;
4723     }
4724     Py_XDECREF(exc);
4725     Py_XDECREF(errorHandler);
4726     return res;
4727
4728   onError:
4729     Py_XDECREF(res);
4730     Py_XDECREF(exc);
4731     Py_XDECREF(errorHandler);
4732     return NULL;
4733 }
4734
4735 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4736                                     PyObject *mapping)
4737 {
4738     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4739         PyErr_BadArgument();
4740         return NULL;
4741     }
4742     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4743                                    PyUnicode_GET_SIZE(unicode),
4744                                    mapping,
4745                                    NULL);
4746 }
4747
4748 /* create or adjust a UnicodeTranslateError */
4749 static void make_translate_exception(PyObject **exceptionObject,
4750                                      const Py_UNICODE *unicode, Py_ssize_t size,
4751                                      Py_ssize_t startpos, Py_ssize_t endpos,
4752                                      const char *reason)
4753 {
4754     if (*exceptionObject == NULL) {
4755         *exceptionObject = PyUnicodeTranslateError_Create(
4756             unicode, size, startpos, endpos, reason);
4757     }
4758     else {
4759         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4760             goto onError;
4761         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4762             goto onError;
4763         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4764             goto onError;
4765         return;
4766       onError:
4767         Py_DECREF(*exceptionObject);
4768         *exceptionObject = NULL;
4769     }
4770 }
4771
4772 /* raises a UnicodeTranslateError */
4773 static void raise_translate_exception(PyObject **exceptionObject,
4774                                       const Py_UNICODE *unicode, Py_ssize_t size,
4775                                       Py_ssize_t startpos, Py_ssize_t endpos,
4776                                       const char *reason)
4777 {
4778     make_translate_exception(exceptionObject,
4779                              unicode, size, startpos, endpos, reason);
4780     if (*exceptionObject != NULL)
4781         PyCodec_StrictErrors(*exceptionObject);
4782 }
4783
4784 /* error handling callback helper:
4785    build arguments, call the callback and check the arguments,
4786    put the result into newpos and return the replacement string, which
4787    has to be freed by the caller */
4788 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4789                                                      PyObject **errorHandler,
4790                                                      const char *reason,
4791                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4792                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4793                                                      Py_ssize_t *newpos)
4794 {
4795     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4796
4797     Py_ssize_t i_newpos;
4798     PyObject *restuple;
4799     PyObject *resunicode;
4800
4801     if (*errorHandler == NULL) {
4802         *errorHandler = PyCodec_LookupError(errors);
4803         if (*errorHandler == NULL)
4804             return NULL;
4805     }
4806
4807     make_translate_exception(exceptionObject,
4808                              unicode, size, startpos, endpos, reason);
4809     if (*exceptionObject == NULL)
4810         return NULL;
4811
4812     restuple = PyObject_CallFunctionObjArgs(
4813         *errorHandler, *exceptionObject, NULL);
4814     if (restuple == NULL)
4815         return NULL;
4816     if (!PyTuple_Check(restuple)) {
4817         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4818         Py_DECREF(restuple);
4819         return NULL;
4820     }
4821     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4822                           &resunicode, &i_newpos)) {
4823         Py_DECREF(restuple);
4824         return NULL;
4825     }
4826     if (i_newpos<0)
4827         *newpos = size+i_newpos;
4828     else
4829         *newpos = i_newpos;
4830     if (*newpos<0 || *newpos>size) {
4831         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4832         Py_DECREF(restuple);
4833         return NULL;
4834     }
4835     Py_INCREF(resunicode);
4836     Py_DECREF(restuple);
4837     return resunicode;
4838 }
4839
4840 /* Lookup the character ch in the mapping and put the result in result,
4841    which must be decrefed by the caller.
4842    Return 0 on success, -1 on error */
4843 static
4844 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4845 {
4846     PyObject *w = PyInt_FromLong((long)c);
4847     PyObject *x;
4848
4849     if (w == NULL)
4850         return -1;
4851     x = PyObject_GetItem(mapping, w);
4852     Py_DECREF(w);
4853     if (x == NULL) {
4854         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4855             /* No mapping found means: use 1:1 mapping. */
4856             PyErr_Clear();
4857             *result = NULL;
4858             return 0;
4859         } else
4860             return -1;
4861     }
4862     else if (x == Py_None) {
4863         *result = x;
4864         return 0;
4865     }
4866     else if (PyInt_Check(x)) {
4867         long value = PyInt_AS_LONG(x);
4868         long max = PyUnicode_GetMax();
4869         if (value < 0 || value > max) {
4870             PyErr_Format(PyExc_TypeError,
4871                          "character mapping must be in range(0x%lx)", max+1);
4872             Py_DECREF(x);
4873             return -1;
4874         }
4875         *result = x;
4876         return 0;
4877     }
4878     else if (PyUnicode_Check(x)) {
4879         *result = x;
4880         return 0;
4881     }
4882     else {
4883         /* wrong return value */
4884         PyErr_SetString(PyExc_TypeError,
4885                         "character mapping must return integer, None or unicode");
4886         Py_DECREF(x);
4887         return -1;
4888     }
4889 }
4890 /* ensure that *outobj is at least requiredsize characters long,
4891    if not reallocate and adjust various state variables.
4892    Return 0 on success, -1 on error */
4893 static
4894 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4895                                Py_ssize_t requiredsize)
4896 {
4897     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4898     if (requiredsize > oldsize) {
4899         /* remember old output position */
4900         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4901         /* exponentially overallocate to minimize reallocations */
4902         if (requiredsize < 2 * oldsize)
4903             requiredsize = 2 * oldsize;
4904         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4905             return -1;
4906         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4907     }
4908     return 0;
4909 }
4910 /* lookup the character, put the result in the output string and adjust
4911    various state variables. Return a new reference to the object that
4912    was put in the output buffer in *result, or Py_None, if the mapping was
4913    undefined (in which case no character was written).
4914    The called must decref result.
4915    Return 0 on success, -1 on error. */
4916 static
4917 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4918                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4919                             PyObject **res)
4920 {
4921     if (charmaptranslate_lookup(*curinp, mapping, res))
4922         return -1;
4923     if (*res==NULL) {
4924         /* not found => default to 1:1 mapping */
4925         *(*outp)++ = *curinp;
4926     }
4927     else if (*res==Py_None)
4928         ;
4929     else if (PyInt_Check(*res)) {
4930         /* no overflow check, because we know that the space is enough */
4931         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4932     }
4933     else if (PyUnicode_Check(*res)) {
4934         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4935         if (repsize==1) {
4936             /* no overflow check, because we know that the space is enough */
4937             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4938         }
4939         else if (repsize!=0) {
4940             /* more than one character */
4941             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4942                 (insize - (curinp-startinp)) +
4943                 repsize - 1;
4944             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4945                 return -1;
4946             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4947             *outp += repsize;
4948         }
4949     }
4950     else
4951         return -1;
4952     return 0;
4953 }
4954
4955 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4956                                      Py_ssize_t size,
4957                                      PyObject *mapping,
4958                                      const char *errors)
4959 {
4960     /* output object */
4961     PyObject *res = NULL;
4962     /* pointers to the beginning and end+1 of input */
4963     const Py_UNICODE *startp = p;
4964     const Py_UNICODE *endp = p + size;
4965     /* pointer into the output */
4966     Py_UNICODE *str;
4967     /* current output position */
4968     Py_ssize_t respos = 0;
4969     char *reason = "character maps to <undefined>";
4970     PyObject *errorHandler = NULL;
4971     PyObject *exc = NULL;
4972     /* the following variable is used for caching string comparisons
4973      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4974      * 3=ignore, 4=xmlcharrefreplace */
4975     int known_errorHandler = -1;
4976
4977     if (mapping == NULL) {
4978         PyErr_BadArgument();
4979         return NULL;
4980     }
4981
4982     /* allocate enough for a simple 1:1 translation without
4983        replacements, if we need more, we'll resize */
4984     res = PyUnicode_FromUnicode(NULL, size);
4985     if (res == NULL)
4986         goto onError;
4987     if (size == 0)
4988         return res;
4989     str = PyUnicode_AS_UNICODE(res);
4990
4991     while (p<endp) {
4992         /* try to encode it */
4993         PyObject *x = NULL;
4994         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4995             Py_XDECREF(x);
4996             goto onError;
4997         }
4998         Py_XDECREF(x);
4999         if (x!=Py_None) /* it worked => adjust input pointer */
5000             ++p;
5001         else { /* untranslatable character */
5002             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5003             Py_ssize_t repsize;
5004             Py_ssize_t newpos;
5005             Py_UNICODE *uni2;
5006             /* startpos for collecting untranslatable chars */
5007             const Py_UNICODE *collstart = p;
5008             const Py_UNICODE *collend = p+1;
5009             const Py_UNICODE *coll;
5010
5011             /* find all untranslatable characters */
5012             while (collend < endp) {
5013                 if (charmaptranslate_lookup(*collend, mapping, &x))
5014                     goto onError;
5015                 Py_XDECREF(x);
5016                 if (x!=Py_None)
5017                     break;
5018                 ++collend;
5019             }
5020             /* cache callback name lookup
5021              * (if not done yet, i.e. it's the first error) */
5022             if (known_errorHandler==-1) {
5023                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5024                     known_errorHandler = 1;
5025                 else if (!strcmp(errors, "replace"))
5026                     known_errorHandler = 2;
5027                 else if (!strcmp(errors, "ignore"))
5028                     known_errorHandler = 3;
5029                 else if (!strcmp(errors, "xmlcharrefreplace"))
5030                     known_errorHandler = 4;
5031                 else
5032                     known_errorHandler = 0;
5033             }
5034             switch (known_errorHandler) {
5035             case 1: /* strict */
5036                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5037                 goto onError;
5038             case 2: /* replace */
5039                 /* No need to check for space, this is a 1:1 replacement */
5040                 for (coll = collstart; coll<collend; ++coll)
5041                     *str++ = '?';
5042                 /* fall through */
5043             case 3: /* ignore */
5044                 p = collend;
5045                 break;
5046             case 4: /* xmlcharrefreplace */
5047                 /* generate replacement (temporarily (mis)uses p) */
5048                 for (p = collstart; p < collend; ++p) {
5049                     char buffer[2+29+1+1];
5050                     char *cp;
5051                     sprintf(buffer, "&#%d;", (int)*p);
5052                     if (charmaptranslate_makespace(&res, &str,
5053                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5054                         goto onError;
5055                     for (cp = buffer; *cp; ++cp)
5056                         *str++ = *cp;
5057                 }
5058                 p = collend;
5059                 break;
5060             default:
5061                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5062                                                                  reason, startp, size, &exc,
5063                                                                  collstart-startp, collend-startp, &newpos);
5064                 if (repunicode == NULL)
5065                     goto onError;
5066                 /* generate replacement  */
5067                 repsize = PyUnicode_GET_SIZE(repunicode);
5068                 if (charmaptranslate_makespace(&res, &str,
5069                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5070                     Py_DECREF(repunicode);
5071                     goto onError;
5072                 }
5073                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5074                     *str++ = *uni2;
5075                 p = startp + newpos;
5076                 Py_DECREF(repunicode);
5077             }
5078         }
5079     }
5080     /* Resize if we allocated to much */
5081     respos = str-PyUnicode_AS_UNICODE(res);
5082     if (respos<PyUnicode_GET_SIZE(res)) {
5083         if (PyUnicode_Resize(&res, respos) < 0)
5084             goto onError;
5085     }
5086     Py_XDECREF(exc);
5087     Py_XDECREF(errorHandler);
5088     return res;
5089
5090   onError:
5091     Py_XDECREF(res);
5092     Py_XDECREF(exc);
5093     Py_XDECREF(errorHandler);
5094     return NULL;
5095 }
5096
5097 PyObject *PyUnicode_Translate(PyObject *str,
5098                               PyObject *mapping,
5099                               const char *errors)
5100 {
5101     PyObject *result;
5102
5103     str = PyUnicode_FromObject(str);
5104     if (str == NULL)
5105         goto onError;
5106     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5107                                         PyUnicode_GET_SIZE(str),
5108                                         mapping,
5109                                         errors);
5110     Py_DECREF(str);
5111     return result;
5112
5113   onError:
5114     Py_XDECREF(str);
5115     return NULL;
5116 }
5117
5118 /* --- Decimal Encoder ---------------------------------------------------- */
5119
5120 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5121                             Py_ssize_t length,
5122                             char *output,
5123                             const char *errors)
5124 {
5125     Py_UNICODE *p, *end;
5126     PyObject *errorHandler = NULL;
5127     PyObject *exc = NULL;
5128     const char *encoding = "decimal";
5129     const char *reason = "invalid decimal Unicode string";
5130     /* the following variable is used for caching string comparisons
5131      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5132     int known_errorHandler = -1;
5133
5134     if (output == NULL) {
5135         PyErr_BadArgument();
5136         return -1;
5137     }
5138
5139     p = s;
5140     end = s + length;
5141     while (p < end) {
5142         register Py_UNICODE ch = *p;
5143         int decimal;
5144         PyObject *repunicode;
5145         Py_ssize_t repsize;
5146         Py_ssize_t newpos;
5147         Py_UNICODE *uni2;
5148         Py_UNICODE *collstart;
5149         Py_UNICODE *collend;
5150
5151         if (Py_UNICODE_ISSPACE(ch)) {
5152             *output++ = ' ';
5153             ++p;
5154             continue;
5155         }
5156         decimal = Py_UNICODE_TODECIMAL(ch);
5157         if (decimal >= 0) {
5158             *output++ = '0' + decimal;
5159             ++p;
5160             continue;
5161         }
5162         if (0 < ch && ch < 256) {
5163             *output++ = (char)ch;
5164             ++p;
5165             continue;
5166         }
5167         /* All other characters are considered unencodable */
5168         collstart = p;
5169         collend = p+1;
5170         while (collend < end) {
5171             if ((0 < *collend && *collend < 256) ||
5172                 !Py_UNICODE_ISSPACE(*collend) ||
5173                 Py_UNICODE_TODECIMAL(*collend))
5174                 break;
5175         }
5176         /* cache callback name lookup
5177          * (if not done yet, i.e. it's the first error) */
5178         if (known_errorHandler==-1) {
5179             if ((errors==NULL) || (!strcmp(errors, "strict")))
5180                 known_errorHandler = 1;
5181             else if (!strcmp(errors, "replace"))
5182                 known_errorHandler = 2;
5183             else if (!strcmp(errors, "ignore"))
5184                 known_errorHandler = 3;
5185             else if (!strcmp(errors, "xmlcharrefreplace"))
5186                 known_errorHandler = 4;
5187             else
5188                 known_errorHandler = 0;
5189         }
5190         switch (known_errorHandler) {
5191         case 1: /* strict */
5192             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5193             goto onError;
5194         case 2: /* replace */
5195             for (p = collstart; p < collend; ++p)
5196                 *output++ = '?';
5197             /* fall through */
5198         case 3: /* ignore */
5199             p = collend;
5200             break;
5201         case 4: /* xmlcharrefreplace */
5202             /* generate replacement (temporarily (mis)uses p) */
5203             for (p = collstart; p < collend; ++p)
5204                 output += sprintf(output, "&#%d;", (int)*p);
5205             p = collend;
5206             break;
5207         default:
5208             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5209                                                           encoding, reason, s, length, &exc,
5210                                                           collstart-s, collend-s, &newpos);
5211             if (repunicode == NULL)
5212                 goto onError;
5213             /* generate replacement  */
5214             repsize = PyUnicode_GET_SIZE(repunicode);
5215             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5216                 Py_UNICODE ch = *uni2;
5217                 if (Py_UNICODE_ISSPACE(ch))
5218                     *output++ = ' ';
5219                 else {
5220                     decimal = Py_UNICODE_TODECIMAL(ch);
5221                     if (decimal >= 0)
5222                         *output++ = '0' + decimal;
5223                     else if (0 < ch && ch < 256)
5224                         *output++ = (char)ch;
5225                     else {
5226                         Py_DECREF(repunicode);
5227                         raise_encode_exception(&exc, encoding,
5228                                                s, length, collstart-s, collend-s, reason);
5229                         goto onError;
5230                     }
5231                 }
5232             }
5233             p = s + newpos;
5234             Py_DECREF(repunicode);
5235         }
5236     }
5237     /* 0-terminate the output string */
5238     *output++ = '\0';
5239     Py_XDECREF(exc);
5240     Py_XDECREF(errorHandler);
5241     return 0;
5242
5243   onError:
5244     Py_XDECREF(exc);
5245     Py_XDECREF(errorHandler);
5246     return -1;
5247 }
5248
5249 /* --- Helpers ------------------------------------------------------------ */
5250
5251 #include "stringlib/unicodedefs.h"
5252 #include "stringlib/fastsearch.h"
5253
5254 #include "stringlib/count.h"
5255 #include "stringlib/find.h"
5256 #include "stringlib/partition.h"
5257 #include "stringlib/split.h"
5258
/* Helper macro to fix up start/end slice values against a sequence of
   length len: end is clamped to len, negative values of start and end are
   taken relative to len, and values that are still negative become 0.
   start and end must be modifiable lvalues; they are updated in place.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement, e.g. under an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5273
5274 Py_ssize_t PyUnicode_Count(PyObject *str,
5275                            PyObject *substr,
5276                            Py_ssize_t start,
5277                            Py_ssize_t end)
5278 {
5279     Py_ssize_t result;
5280     PyUnicodeObject* str_obj;
5281     PyUnicodeObject* sub_obj;
5282
5283     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5284     if (!str_obj)
5285         return -1;
5286     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5287     if (!sub_obj) {
5288         Py_DECREF(str_obj);
5289         return -1;
5290     }
5291
5292     ADJUST_INDICES(start, end, str_obj->length);
5293     result = stringlib_count(
5294         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5295         PY_SSIZE_T_MAX
5296         );
5297
5298     Py_DECREF(sub_obj);
5299     Py_DECREF(str_obj);
5300
5301     return result;
5302 }
5303
5304 Py_ssize_t PyUnicode_Find(PyObject *str,
5305                           PyObject *sub,
5306                           Py_ssize_t start,
5307                           Py_ssize_t end,
5308                           int direction)
5309 {
5310     Py_ssize_t result;
5311
5312     str = PyUnicode_FromObject(str);
5313     if (!str)
5314         return -2;
5315     sub = PyUnicode_FromObject(sub);
5316     if (!sub) {
5317         Py_DECREF(str);
5318         return -2;
5319     }
5320
5321     if (direction > 0)
5322         result = stringlib_find_slice(
5323             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5324             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5325             start, end
5326             );
5327     else
5328         result = stringlib_rfind_slice(
5329             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5330             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5331             start, end
5332             );
5333
5334     Py_DECREF(str);
5335     Py_DECREF(sub);
5336
5337     return result;
5338 }
5339
5340 static
5341 int tailmatch(PyUnicodeObject *self,
5342               PyUnicodeObject *substring,
5343               Py_ssize_t start,
5344               Py_ssize_t end,
5345               int direction)
5346 {
5347     if (substring->length == 0)
5348         return 1;
5349
5350     ADJUST_INDICES(start, end, self->length);
5351     end -= substring->length;
5352     if (end < start)
5353         return 0;
5354
5355     if (direction > 0) {
5356         if (Py_UNICODE_MATCH(self, end, substring))
5357             return 1;
5358     } else {
5359         if (Py_UNICODE_MATCH(self, start, substring))
5360             return 1;
5361     }
5362
5363     return 0;
5364 }
5365
5366 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5367                                PyObject *substr,
5368                                Py_ssize_t start,
5369                                Py_ssize_t end,
5370                                int direction)
5371 {
5372     Py_ssize_t result;
5373
5374     str = PyUnicode_FromObject(str);
5375     if (str == NULL)
5376         return -1;
5377     substr = PyUnicode_FromObject(substr);
5378     if (substr == NULL) {
5379         Py_DECREF(str);
5380         return -1;
5381     }
5382
5383     result = tailmatch((PyUnicodeObject *)str,
5384                        (PyUnicodeObject *)substr,
5385                        start, end, direction);
5386     Py_DECREF(str);
5387     Py_DECREF(substr);
5388     return result;
5389 }
5390
5391 /* Apply fixfct filter to the Unicode object self and return a
5392    reference to the modified object */
5393
5394 static
5395 PyObject *fixup(PyUnicodeObject *self,
5396                 int (*fixfct)(PyUnicodeObject *s))
5397 {
5398
5399     PyUnicodeObject *u;
5400
5401     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5402     if (u == NULL)
5403         return NULL;
5404
5405     Py_UNICODE_COPY(u->str, self->str, self->length);
5406
5407     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5408         /* fixfct should return TRUE if it modified the buffer. If
5409            FALSE, return a reference to the original buffer instead
5410            (to save space, not time) */
5411         Py_INCREF(self);
5412         Py_DECREF(u);
5413         return (PyObject*) self;
5414     }
5415     return (PyObject*) u;
5416 }
5417
5418 static
5419 int fixupper(PyUnicodeObject *self)
5420 {
5421     Py_ssize_t len = self->length;
5422     Py_UNICODE *s = self->str;
5423     int status = 0;
5424
5425     while (len-- > 0) {
5426         register Py_UNICODE ch;
5427
5428         ch = Py_UNICODE_TOUPPER(*s);
5429         if (ch != *s) {
5430             status = 1;
5431             *s = ch;
5432         }
5433         s++;
5434     }
5435
5436     return status;
5437 }
5438
5439 static
5440 int fixlower(PyUnicodeObject *self)
5441 {
5442     Py_ssize_t len = self->length;
5443     Py_UNICODE *s = self->str;
5444     int status = 0;
5445
5446     while (len-- > 0) {
5447         register Py_UNICODE ch;
5448
5449         ch = Py_UNICODE_TOLOWER(*s);
5450         if (ch != *s) {
5451             status = 1;
5452             *s = ch;
5453         }
5454         s++;
5455     }
5456
5457     return status;
5458 }
5459
5460 static
5461 int fixswapcase(PyUnicodeObject *self)
5462 {
5463     Py_ssize_t len = self->length;
5464     Py_UNICODE *s = self->str;
5465     int status = 0;
5466
5467     while (len-- > 0) {
5468         if (Py_UNICODE_ISUPPER(*s)) {
5469             *s = Py_UNICODE_TOLOWER(*s);
5470             status = 1;
5471         } else if (Py_UNICODE_ISLOWER(*s)) {
5472             *s = Py_UNICODE_TOUPPER(*s);
5473             status = 1;
5474         }
5475         s++;
5476     }
5477
5478     return status;
5479 }
5480
5481 static
5482 int fixcapitalize(PyUnicodeObject *self)
5483 {
5484     Py_ssize_t len = self->length;
5485     Py_UNICODE *s = self->str;
5486     int status = 0;
5487
5488     if (len == 0)
5489         return 0;
5490     if (Py_UNICODE_ISLOWER(*s)) {
5491         *s = Py_UNICODE_TOUPPER(*s);
5492         status = 1;
5493     }
5494     s++;
5495     while (--len > 0) {
5496         if (Py_UNICODE_ISUPPER(*s)) {
5497             *s = Py_UNICODE_TOLOWER(*s);
5498             status = 1;
5499         }
5500         s++;
5501     }
5502     return status;
5503 }
5504
5505 static
5506 int fixtitle(PyUnicodeObject *self)
5507 {
5508     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5509     register Py_UNICODE *e;
5510     int previous_is_cased;
5511
5512     /* Shortcut for single character strings */
5513     if (PyUnicode_GET_SIZE(self) == 1) {
5514         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5515         if (*p != ch) {
5516             *p = ch;
5517             return 1;
5518         }
5519         else
5520             return 0;
5521     }
5522
5523     e = p + PyUnicode_GET_SIZE(self);
5524     previous_is_cased = 0;
5525     for (; p < e; p++) {
5526         register const Py_UNICODE ch = *p;
5527
5528         if (previous_is_cased)
5529             *p = Py_UNICODE_TOLOWER(ch);
5530         else
5531             *p = Py_UNICODE_TOTITLE(ch);
5532
5533         if (Py_UNICODE_ISLOWER(ch) ||
5534             Py_UNICODE_ISUPPER(ch) ||
5535             Py_UNICODE_ISTITLE(ch))
5536             previous_is_cased = 1;
5537         else
5538             previous_is_cased = 0;
5539     }
5540     return 1;
5541 }
5542
5543 PyObject *
5544 PyUnicode_Join(PyObject *separator, PyObject *seq)
5545 {
5546     PyObject *internal_separator = NULL;
5547     const Py_UNICODE blank = ' ';
5548     const Py_UNICODE *sep = &blank;
5549     Py_ssize_t seplen = 1;
5550     PyUnicodeObject *res = NULL; /* the result */
5551     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5552     Py_ssize_t res_used;         /* # used bytes */
5553     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5554     PyObject *fseq;          /* PySequence_Fast(seq) */
5555     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5556     PyObject *item;
5557     Py_ssize_t i;
5558
5559     fseq = PySequence_Fast(seq, "");
5560     if (fseq == NULL) {
5561         return NULL;
5562     }
5563
5564     /* Grrrr.  A codec may be invoked to convert str objects to
5565      * Unicode, and so it's possible to call back into Python code
5566      * during PyUnicode_FromObject(), and so it's possible for a sick
5567      * codec to change the size of fseq (if seq is a list).  Therefore
5568      * we have to keep refetching the size -- can't assume seqlen
5569      * is invariant.
5570      */
5571     seqlen = PySequence_Fast_GET_SIZE(fseq);
5572     /* If empty sequence, return u"". */
5573     if (seqlen == 0) {
5574         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5575         goto Done;
5576     }
5577     /* If singleton sequence with an exact Unicode, return that. */
5578     if (seqlen == 1) {
5579         item = PySequence_Fast_GET_ITEM(fseq, 0);
5580         if (PyUnicode_CheckExact(item)) {
5581             Py_INCREF(item);
5582             res = (PyUnicodeObject *)item;
5583             goto Done;
5584         }
5585     }
5586
5587     /* At least two items to join, or one that isn't exact Unicode. */
5588     if (seqlen > 1) {
5589         /* Set up sep and seplen -- they're needed. */
5590         if (separator == NULL) {
5591             sep = &blank;
5592             seplen = 1;
5593         }
5594         else {
5595             internal_separator = PyUnicode_FromObject(separator);
5596             if (internal_separator == NULL)
5597                 goto onError;
5598             sep = PyUnicode_AS_UNICODE(internal_separator);
5599             seplen = PyUnicode_GET_SIZE(internal_separator);
5600             /* In case PyUnicode_FromObject() mutated seq. */
5601             seqlen = PySequence_Fast_GET_SIZE(fseq);
5602         }
5603     }
5604
5605     /* Get space. */
5606     res = _PyUnicode_New(res_alloc);
5607     if (res == NULL)
5608         goto onError;
5609     res_p = PyUnicode_AS_UNICODE(res);
5610     res_used = 0;
5611
5612     for (i = 0; i < seqlen; ++i) {
5613         Py_ssize_t itemlen;
5614         Py_ssize_t new_res_used;
5615
5616         item = PySequence_Fast_GET_ITEM(fseq, i);
5617         /* Convert item to Unicode. */
5618         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5619             PyErr_Format(PyExc_TypeError,
5620                          "sequence item %zd: expected string or Unicode,"
5621                          " %.80s found",
5622                          i, Py_TYPE(item)->tp_name);
5623             goto onError;
5624         }
5625         item = PyUnicode_FromObject(item);
5626         if (item == NULL)
5627             goto onError;
5628         /* We own a reference to item from here on. */
5629
5630         /* In case PyUnicode_FromObject() mutated seq. */
5631         seqlen = PySequence_Fast_GET_SIZE(fseq);
5632
5633         /* Make sure we have enough space for the separator and the item. */
5634         itemlen = PyUnicode_GET_SIZE(item);
5635         new_res_used = res_used + itemlen;
5636         if (new_res_used < 0)
5637             goto Overflow;
5638         if (i < seqlen - 1) {
5639             new_res_used += seplen;
5640             if (new_res_used < 0)
5641                 goto Overflow;
5642         }
5643         if (new_res_used > res_alloc) {
5644             /* double allocated size until it's big enough */
5645             do {
5646                 res_alloc += res_alloc;
5647                 if (res_alloc <= 0)
5648                     goto Overflow;
5649             } while (new_res_used > res_alloc);
5650             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5651                 Py_DECREF(item);
5652                 goto onError;
5653             }
5654             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5655         }
5656
5657         /* Copy item, and maybe the separator. */
5658         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5659         res_p += itemlen;
5660         if (i < seqlen - 1) {
5661             Py_UNICODE_COPY(res_p, sep, seplen);
5662             res_p += seplen;
5663         }
5664         Py_DECREF(item);
5665         res_used = new_res_used;
5666     }
5667
5668     /* Shrink res to match the used area; this probably can't fail,
5669      * but it's cheap to check.
5670      */
5671     if (_PyUnicode_Resize(&res, res_used) < 0)
5672         goto onError;
5673
5674   Done:
5675     Py_XDECREF(internal_separator);
5676     Py_DECREF(fseq);
5677     return (PyObject *)res;
5678
5679   Overflow:
5680     PyErr_SetString(PyExc_OverflowError,
5681                     "join() result is too long for a Python string");
5682     Py_DECREF(item);
5683     /* fall through */
5684
5685   onError:
5686     Py_XDECREF(internal_separator);
5687     Py_DECREF(fseq);
5688     Py_XDECREF(res);
5689     return NULL;
5690 }
5691
5692 static
5693 PyUnicodeObject *pad(PyUnicodeObject *self,
5694                      Py_ssize_t left,
5695                      Py_ssize_t right,
5696                      Py_UNICODE fill)
5697 {
5698     PyUnicodeObject *u;
5699
5700     if (left < 0)
5701         left = 0;
5702     if (right < 0)
5703         right = 0;
5704
5705     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5706         Py_INCREF(self);
5707         return self;
5708     }
5709
5710     if (left > PY_SSIZE_T_MAX - self->length ||
5711         right > PY_SSIZE_T_MAX - (left + self->length)) {
5712         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5713         return NULL;
5714     }
5715     u = _PyUnicode_New(left + self->length + right);
5716     if (u) {
5717         if (left)
5718             Py_UNICODE_FILL(u->str, fill, left);
5719         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5720         if (right)
5721             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5722     }
5723
5724     return u;
5725 }
5726
5727 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5728 {
5729     PyObject *list;
5730
5731     string = PyUnicode_FromObject(string);
5732     if (string == NULL)
5733         return NULL;
5734
5735     list = stringlib_splitlines(
5736         (PyObject*) string, PyUnicode_AS_UNICODE(string),
5737         PyUnicode_GET_SIZE(string), keepends);
5738
5739     Py_DECREF(string);
5740     return list;
5741 }
5742
5743 static
5744 PyObject *split(PyUnicodeObject *self,
5745                 PyUnicodeObject *substring,
5746                 Py_ssize_t maxcount)
5747 {
5748     if (maxcount < 0)
5749         maxcount = PY_SSIZE_T_MAX;
5750
5751     if (substring == NULL)
5752         return stringlib_split_whitespace(
5753             (PyObject*) self,  self->str, self->length, maxcount
5754             );
5755
5756     return stringlib_split(
5757         (PyObject*) self,  self->str, self->length,
5758         substring->str, substring->length,
5759         maxcount
5760         );
5761 }
5762
5763 static
5764 PyObject *rsplit(PyUnicodeObject *self,
5765                  PyUnicodeObject *substring,
5766                  Py_ssize_t maxcount)
5767 {
5768     if (maxcount < 0)
5769         maxcount = PY_SSIZE_T_MAX;
5770
5771     if (substring == NULL)
5772         return stringlib_rsplit_whitespace(
5773             (PyObject*) self,  self->str, self->length, maxcount
5774             );
5775
5776     return stringlib_rsplit(
5777         (PyObject*) self,  self->str, self->length,
5778         substring->str, substring->length,
5779         maxcount
5780         );
5781 }
5782
5783 static
5784 PyObject *replace(PyUnicodeObject *self,
5785                   PyUnicodeObject *str1,
5786                   PyUnicodeObject *str2,
5787                   Py_ssize_t maxcount)
5788 {
5789     PyUnicodeObject *u;
5790
5791     if (maxcount < 0)
5792         maxcount = PY_SSIZE_T_MAX;
5793     else if (maxcount == 0 || self->length == 0)
5794         goto nothing;
5795
5796     if (str1->length == str2->length) {
5797         Py_ssize_t i;
5798         /* same length */
5799         if (str1->length == 0)
5800             goto nothing;
5801         if (str1->length == 1) {
5802             /* replace characters */
5803             Py_UNICODE u1, u2;
5804             if (!findchar(self->str, self->length, str1->str[0]))
5805                 goto nothing;
5806             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5807             if (!u)
5808                 return NULL;
5809             Py_UNICODE_COPY(u->str, self->str, self->length);
5810             u1 = str1->str[0];
5811             u2 = str2->str[0];
5812             for (i = 0; i < u->length; i++)
5813                 if (u->str[i] == u1) {
5814                     if (--maxcount < 0)
5815                         break;
5816                     u->str[i] = u2;
5817                 }
5818         } else {
5819             i = stringlib_find(
5820                 self->str, self->length, str1->str, str1->length, 0
5821                 );
5822             if (i < 0)
5823                 goto nothing;
5824             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5825             if (!u)
5826                 return NULL;
5827             Py_UNICODE_COPY(u->str, self->str, self->length);
5828
5829             /* change everything in-place, starting with this one */
5830             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5831             i += str1->length;
5832
5833             while ( --maxcount > 0) {
5834                 i = stringlib_find(self->str+i, self->length-i,
5835                                    str1->str, str1->length,
5836                                    i);
5837                 if (i == -1)
5838                     break;
5839                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5840                 i += str1->length;
5841             }
5842         }
5843     } else {
5844
5845         Py_ssize_t n, i, j;
5846         Py_ssize_t product, new_size, delta;
5847         Py_UNICODE *p;
5848
5849         /* replace strings */
5850         n = stringlib_count(self->str, self->length, str1->str, str1->length,
5851                             maxcount);
5852         if (n == 0)
5853             goto nothing;
5854         /* new_size = self->length + n * (str2->length - str1->length)); */
5855         delta = (str2->length - str1->length);
5856         if (delta == 0) {
5857             new_size = self->length;
5858         } else {
5859             product = n * (str2->length - str1->length);
5860             if ((product / (str2->length - str1->length)) != n) {
5861                 PyErr_SetString(PyExc_OverflowError,
5862                                 "replace string is too long");
5863                 return NULL;
5864             }
5865             new_size = self->length + product;
5866             if (new_size < 0) {
5867                 PyErr_SetString(PyExc_OverflowError,
5868                                 "replace string is too long");
5869                 return NULL;
5870             }
5871         }
5872         u = _PyUnicode_New(new_size);
5873         if (!u)
5874             return NULL;
5875         i = 0;
5876         p = u->str;
5877         if (str1->length > 0) {
5878             while (n-- > 0) {
5879                 /* look for next match */
5880                 j = stringlib_find(self->str+i, self->length-i,
5881                                    str1->str, str1->length,
5882                                    i);
5883                 if (j == -1)
5884                     break;
5885                 else if (j > i) {
5886                     /* copy unchanged part [i:j] */
5887                     Py_UNICODE_COPY(p, self->str+i, j-i);
5888                     p += j - i;
5889                 }
5890                 /* copy substitution string */
5891                 if (str2->length > 0) {
5892                     Py_UNICODE_COPY(p, str2->str, str2->length);
5893                     p += str2->length;
5894                 }
5895                 i = j + str1->length;
5896             }
5897             if (i < self->length)
5898                 /* copy tail [i:] */
5899                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5900         } else {
5901             /* interleave */
5902             while (n > 0) {
5903                 Py_UNICODE_COPY(p, str2->str, str2->length);
5904                 p += str2->length;
5905                 if (--n <= 0)
5906                     break;
5907                 *p++ = self->str[i++];
5908             }
5909             Py_UNICODE_COPY(p, self->str+i, self->length-i);
5910         }
5911     }
5912     return (PyObject *) u;
5913
5914   nothing:
5915     /* nothing to replace; return original string (when possible) */
5916     if (PyUnicode_CheckExact(self)) {
5917         Py_INCREF(self);
5918         return (PyObject *) self;
5919     }
5920     return PyUnicode_FromUnicode(self->str, self->length);
5921 }
5922
5923 /* --- Unicode Object Methods --------------------------------------------- */
5924
5925 PyDoc_STRVAR(title__doc__,
5926              "S.title() -> unicode\n\
5927 \n\
5928 Return a titlecased version of S, i.e. words start with title case\n\
5929 characters, all remaining cased characters have lower case.");
5930
5931 static PyObject*
5932 unicode_title(PyUnicodeObject *self)
5933 {
5934     return fixup(self, fixtitle);
5935 }
5936
5937 PyDoc_STRVAR(capitalize__doc__,
5938              "S.capitalize() -> unicode\n\
5939 \n\
5940 Return a capitalized version of S, i.e. make the first character\n\
5941 have upper case.");
5942
5943 static PyObject*
5944 unicode_capitalize(PyUnicodeObject *self)
5945 {
5946     return fixup(self, fixcapitalize);
5947 }
5948
#if 0
/* Disabled code: this whole section is compiled out by the #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word with fixup() and
   fixcapitalize, then join the words with PyUnicode_Join() using its
   default separator.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
5986
5987 /* Argument converter.  Coerces to a single unicode character */
5988
5989 static int
5990 convert_uc(PyObject *obj, void *addr)
5991 {
5992     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5993     PyObject *uniobj;
5994     Py_UNICODE *unistr;
5995
5996     uniobj = PyUnicode_FromObject(obj);
5997     if (uniobj == NULL) {
5998         PyErr_SetString(PyExc_TypeError,
5999                         "The fill character cannot be converted to Unicode");
6000         return 0;
6001     }
6002     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6003         PyErr_SetString(PyExc_TypeError,
6004                         "The fill character must be exactly one character long");
6005         Py_DECREF(uniobj);
6006         return 0;
6007     }
6008     unistr = PyUnicode_AS_UNICODE(uniobj);
6009     *fillcharloc = unistr[0];
6010     Py_DECREF(uniobj);
6011     return 1;
6012 }
6013
6014 PyDoc_STRVAR(center__doc__,
6015              "S.center(width[, fillchar]) -> unicode\n\
6016 \n\
6017 Return S centered in a Unicode string of length width. Padding is\n\
6018 done using the specified fill character (default is a space)");
6019
6020 static PyObject *
6021 unicode_center(PyUnicodeObject *self, PyObject *args)
6022 {
6023     Py_ssize_t marg, left;
6024     Py_ssize_t width;
6025     Py_UNICODE fillchar = ' ';
6026
6027     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6028         return NULL;
6029
6030     if (self->length >= width && PyUnicode_CheckExact(self)) {
6031         Py_INCREF(self);
6032         return (PyObject*) self;
6033     }
6034
6035     marg = width - self->length;
6036     left = marg / 2 + (marg & width & 1);
6037
6038     return (PyObject*) pad(self, left, marg - left, fillchar);
6039 }
6040
6041 #if 0
6042
6043 /* This code should go into some future Unicode collation support
6044    module. The basic comparison should compare ordinals on a naive
6045    basis (this is what Java does and thus Jython too). */
6046
6047 /* speedy UTF-16 code point order comparison */
6048 /* gleaned from: */
6049 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6050
6051 static short utf16Fixup[32] =
6052 {
6053     0, 0, 0, 0, 0, 0, 0, 0,
6054     0, 0, 0, 0, 0, 0, 0, 0,
6055     0, 0, 0, 0, 0, 0, 0, 0,
6056     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6057 };
6058
6059 static int
6060 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6061 {
6062     Py_ssize_t len1, len2;
6063
6064     Py_UNICODE *s1 = str1->str;
6065     Py_UNICODE *s2 = str2->str;
6066
6067     len1 = str1->length;
6068     len2 = str2->length;
6069
6070     while (len1 > 0 && len2 > 0) {
6071         Py_UNICODE c1, c2;
6072
6073         c1 = *s1++;
6074         c2 = *s2++;
6075
6076         if (c1 > (1<<11) * 26)
6077             c1 += utf16Fixup[c1>>11];
6078         if (c2 > (1<<11) * 26)
6079             c2 += utf16Fixup[c2>>11];
6080         /* now c1 and c2 are in UTF-32-compatible order */
6081
6082         if (c1 != c2)
6083             return (c1 < c2) ? -1 : 1;
6084
6085         len1--; len2--;
6086     }
6087
6088     return (len1 < len2) ? -1 : (len1 != len2);
6089 }
6090
6091 #else
6092
6093 static int
6094 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6095 {
6096     register Py_ssize_t len1, len2;
6097
6098     Py_UNICODE *s1 = str1->str;
6099     Py_UNICODE *s2 = str2->str;
6100
6101     len1 = str1->length;
6102     len2 = str2->length;
6103
6104     while (len1 > 0 && len2 > 0) {
6105         Py_UNICODE c1, c2;
6106
6107         c1 = *s1++;
6108         c2 = *s2++;
6109
6110         if (c1 != c2)
6111             return (c1 < c2) ? -1 : 1;
6112
6113         len1--; len2--;
6114     }
6115
6116     return (len1 < len2) ? -1 : (len1 != len2);
6117 }
6118
6119 #endif
6120
6121 int PyUnicode_Compare(PyObject *left,
6122                       PyObject *right)
6123 {
6124     PyUnicodeObject *u = NULL, *v = NULL;
6125     int result;
6126
6127     /* Coerce the two arguments */
6128     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6129     if (u == NULL)
6130         goto onError;
6131     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6132     if (v == NULL)
6133         goto onError;
6134
6135     /* Shortcut for empty or interned objects */
6136     if (v == u) {
6137         Py_DECREF(u);
6138         Py_DECREF(v);
6139         return 0;
6140     }
6141
6142     result = unicode_compare(u, v);
6143
6144     Py_DECREF(u);
6145     Py_DECREF(v);
6146     return result;
6147
6148   onError:
6149     Py_XDECREF(u);
6150     Py_XDECREF(v);
6151     return -1;
6152 }
6153
6154 PyObject *PyUnicode_RichCompare(PyObject *left,
6155                                 PyObject *right,
6156                                 int op)
6157 {
6158     int result;
6159
6160     result = PyUnicode_Compare(left, right);
6161     if (result == -1 && PyErr_Occurred())
6162         goto onError;
6163
6164     /* Convert the return value to a Boolean */
6165     switch (op) {
6166     case Py_EQ:
6167         result = (result == 0);
6168         break;
6169     case Py_NE:
6170         result = (result != 0);
6171         break;
6172     case Py_LE:
6173         result = (result <= 0);
6174         break;
6175     case Py_GE:
6176         result = (result >= 0);
6177         break;
6178     case Py_LT:
6179         result = (result == -1);
6180         break;
6181     case Py_GT:
6182         result = (result == 1);
6183         break;
6184     }
6185     return PyBool_FromLong(result);
6186
6187   onError:
6188
6189     /* Standard case
6190
6191        Type errors mean that PyUnicode_FromObject() could not convert
6192        one of the arguments (usually the right hand side) to Unicode,
6193        ie. we can't handle the comparison request. However, it is
6194        possible that the other object knows a comparison method, which
6195        is why we return Py_NotImplemented to give the other object a
6196        chance.
6197
6198     */
6199     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6200         PyErr_Clear();
6201         Py_INCREF(Py_NotImplemented);
6202         return Py_NotImplemented;
6203     }
6204     if (op != Py_EQ && op != Py_NE)
6205         return NULL;
6206
6207     /* Equality comparison.
6208
6209        This is a special case: we silence any PyExc_UnicodeDecodeError
6210        and instead turn it into a PyErr_UnicodeWarning.
6211
6212     */
6213     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6214         return NULL;
6215     PyErr_Clear();
6216     if (PyErr_Warn(PyExc_UnicodeWarning,
6217                    (op == Py_EQ) ?
6218                    "Unicode equal comparison "
6219                    "failed to convert both arguments to Unicode - "
6220                    "interpreting them as being unequal" :
6221                    "Unicode unequal comparison "
6222                    "failed to convert both arguments to Unicode - "
6223                    "interpreting them as being unequal"
6224             ) < 0)
6225         return NULL;
6226     result = (op == Py_NE);
6227     return PyBool_FromLong(result);
6228 }
6229
6230 int PyUnicode_Contains(PyObject *container,
6231                        PyObject *element)
6232 {
6233     PyObject *str, *sub;
6234     int result;
6235
6236     /* Coerce the two arguments */
6237     sub = PyUnicode_FromObject(element);
6238     if (!sub) {
6239         return -1;
6240     }
6241
6242     str = PyUnicode_FromObject(container);
6243     if (!str) {
6244         Py_DECREF(sub);
6245         return -1;
6246     }
6247
6248     result = stringlib_contains_obj(str, sub);
6249
6250     Py_DECREF(str);
6251     Py_DECREF(sub);
6252
6253     return result;
6254 }
6255
6256 /* Concat to string or Unicode object giving a new Unicode object. */
6257
6258 PyObject *PyUnicode_Concat(PyObject *left,
6259                            PyObject *right)
6260 {
6261     PyUnicodeObject *u = NULL, *v = NULL, *w;
6262
6263     /* Coerce the two arguments */
6264     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6265     if (u == NULL)
6266         goto onError;
6267     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6268     if (v == NULL)
6269         goto onError;
6270
6271     /* Shortcuts */
6272     if (v == unicode_empty) {
6273         Py_DECREF(v);
6274         return (PyObject *)u;
6275     }
6276     if (u == unicode_empty) {
6277         Py_DECREF(u);
6278         return (PyObject *)v;
6279     }
6280
6281     /* Concat the two Unicode strings */
6282     w = _PyUnicode_New(u->length + v->length);
6283     if (w == NULL)
6284         goto onError;
6285     Py_UNICODE_COPY(w->str, u->str, u->length);
6286     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6287
6288     Py_DECREF(u);
6289     Py_DECREF(v);
6290     return (PyObject *)w;
6291
6292   onError:
6293     Py_XDECREF(u);
6294     Py_XDECREF(v);
6295     return NULL;
6296 }
6297
6298 PyDoc_STRVAR(count__doc__,
6299              "S.count(sub[, start[, end]]) -> int\n\
6300 \n\
6301 Return the number of non-overlapping occurrences of substring sub in\n\
6302 Unicode string S[start:end].  Optional arguments start and end are\n\
6303 interpreted as in slice notation.");
6304
6305 static PyObject *
6306 unicode_count(PyUnicodeObject *self, PyObject *args)
6307 {
6308     PyUnicodeObject *substring;
6309     Py_ssize_t start = 0;
6310     Py_ssize_t end = PY_SSIZE_T_MAX;
6311     PyObject *result;
6312
6313     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6314                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6315         return NULL;
6316
6317     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6318         (PyObject *)substring);
6319     if (substring == NULL)
6320         return NULL;
6321
6322     ADJUST_INDICES(start, end, self->length);
6323     result = PyInt_FromSsize_t(
6324         stringlib_count(self->str + start, end - start,
6325                         substring->str, substring->length,
6326                         PY_SSIZE_T_MAX)
6327         );
6328
6329     Py_DECREF(substring);
6330
6331     return result;
6332 }
6333
6334 PyDoc_STRVAR(encode__doc__,
6335              "S.encode([encoding[,errors]]) -> string or unicode\n\
6336 \n\
6337 Encodes S using the codec registered for encoding. encoding defaults\n\
6338 to the default encoding. errors may be given to set a different error\n\
6339 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6340 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6341 'xmlcharrefreplace' as well as any other name registered with\n\
6342 codecs.register_error that can handle UnicodeEncodeErrors.");
6343
6344 static PyObject *
6345 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6346 {
6347     static char *kwlist[] = {"encoding", "errors", 0};
6348     char *encoding = NULL;
6349     char *errors = NULL;
6350     PyObject *v;
6351
6352     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6353                                      kwlist, &encoding, &errors))
6354         return NULL;
6355     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6356     if (v == NULL)
6357         goto onError;
6358     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6359         PyErr_Format(PyExc_TypeError,
6360                      "encoder did not return a string/unicode object "
6361                      "(type=%.400s)",
6362                      Py_TYPE(v)->tp_name);
6363         Py_DECREF(v);
6364         return NULL;
6365     }
6366     return v;
6367
6368   onError:
6369     return NULL;
6370 }
6371
6372 PyDoc_STRVAR(decode__doc__,
6373              "S.decode([encoding[,errors]]) -> string or unicode\n\
6374 \n\
6375 Decodes S using the codec registered for encoding. encoding defaults\n\
6376 to the default encoding. errors may be given to set a different error\n\
6377 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6378 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6379 as well as any other name registerd with codecs.register_error that is\n\
6380 able to handle UnicodeDecodeErrors.");
6381
6382 static PyObject *
6383 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6384 {
6385     static char *kwlist[] = {"encoding", "errors", 0};
6386     char *encoding = NULL;
6387     char *errors = NULL;
6388     PyObject *v;
6389
6390     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6391                                      kwlist, &encoding, &errors))
6392         return NULL;
6393     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6394     if (v == NULL)
6395         goto onError;
6396     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6397         PyErr_Format(PyExc_TypeError,
6398                      "decoder did not return a string/unicode object "
6399                      "(type=%.400s)",
6400                      Py_TYPE(v)->tp_name);
6401         Py_DECREF(v);
6402         return NULL;
6403     }
6404     return v;
6405
6406   onError:
6407     return NULL;
6408 }
6409
PyDoc_STRVAR(expandtabs__doc__,
             "S.expandtabs([tabsize]) -> unicode\n\
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
If tabsize is not given, a tab size of 8 characters is assumed.");

/* Implementation of S.expandtabs().

   Works in two passes over the input: the first pass only computes the
   length of the result (guarding every addition against Py_ssize_t
   overflow), the second pass allocates the result and fills it in.
   The column counter j restarts after every '\n' or '\r'.  With a
   tabsize <= 0 a tab contributes nothing to the output.

   Returns a new object, or NULL on failure; OverflowError is raised
   when the expanded string would be too long. */
static PyObject*
unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
{
    Py_UNICODE *e;
    Py_UNICODE *p;
    Py_UNICODE *q;
    Py_UNICODE *qe;
    Py_ssize_t i, j, incr;
    PyUnicodeObject *u;
    int tabsize = 8;

    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
        return NULL;

    /* First pass: determine size of output string */
    i = 0; /* chars up to and including most recent \n or \r */
    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
    e = self->str + self->length; /* end of input */
    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* distance from column j to the next tab stop */
                incr = tabsize - (j % tabsize); /* cannot overflow */
                if (j > PY_SSIZE_T_MAX - incr)
                    goto overflow1;
                j += incr;
            }
        }
        else {
            if (j > PY_SSIZE_T_MAX - 1)
                goto overflow1;
            j++;
            if (*p == '\n' || *p == '\r') {
                /* line finished: fold its length into i, restart column */
                if (i > PY_SSIZE_T_MAX - j)
                    goto overflow1;
                i += j;
                j = 0;
            }
        }

    /* i + j is the total output length; make sure the sum fits */
    if (i > PY_SSIZE_T_MAX - j)
        goto overflow1;

    /* Second pass: create output string and fill it */
    u = _PyUnicode_New(i + j);
    if (!u)
        return NULL;

    j = 0; /* same as in first pass */
    q = u->str; /* next output char */
    qe = u->str + u->length; /* end of output */

    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* i is reused here as the number of spaces to emit */
                i = tabsize - (j % tabsize);
                j += i;
                while (i--) {
                    /* never write past the buffer sized in pass one */
                    if (q >= qe)
                        goto overflow2;
                    *q++ = ' ';
                }
            }
        }
        else {
            if (q >= qe)
                goto overflow2;
            *q++ = *p;
            j++;
            if (*p == '\n' || *p == '\r')
                j = 0;
        }

    return (PyObject*) u;

    /* overflow2 releases the partially filled result and then falls
       through to overflow1, which raises the error */
  overflow2:
    Py_DECREF(u);
  overflow1:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
6496
6497 PyDoc_STRVAR(find__doc__,
6498              "S.find(sub [,start [,end]]) -> int\n\
6499 \n\
6500 Return the lowest index in S where substring sub is found,\n\
6501 such that sub is contained within s[start:end].  Optional\n\
6502 arguments start and end are interpreted as in slice notation.\n\
6503 \n\
6504 Return -1 on failure.");
6505
6506 static PyObject *
6507 unicode_find(PyUnicodeObject *self, PyObject *args)
6508 {
6509     PyObject *substring;
6510     Py_ssize_t start;
6511     Py_ssize_t end;
6512     Py_ssize_t result;
6513
6514     if (!_ParseTupleFinds(args, &substring, &start, &end))
6515         return NULL;
6516
6517     result = stringlib_find_slice(
6518         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6519         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6520         start, end
6521         );
6522
6523     Py_DECREF(substring);
6524
6525     return PyInt_FromSsize_t(result);
6526 }
6527
6528 static PyObject *
6529 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6530 {
6531     if (index < 0 || index >= self->length) {
6532         PyErr_SetString(PyExc_IndexError, "string index out of range");
6533         return NULL;
6534     }
6535
6536     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6537 }
6538
6539 static long
6540 unicode_hash(PyUnicodeObject *self)
6541 {
6542     /* Since Unicode objects compare equal to their ASCII string
6543        counterparts, they should use the individual character values
6544        as basis for their hash value.  This is needed to assure that
6545        strings and Unicode objects behave in the same way as
6546        dictionary keys. */
6547
6548     register Py_ssize_t len;
6549     register Py_UNICODE *p;
6550     register long x;
6551
6552     if (self->hash != -1)
6553         return self->hash;
6554     len = PyUnicode_GET_SIZE(self);
6555     p = PyUnicode_AS_UNICODE(self);
6556     x = *p << 7;
6557     while (--len >= 0)
6558         x = (1000003*x) ^ *p++;
6559     x ^= PyUnicode_GET_SIZE(self);
6560     if (x == -1)
6561         x = -2;
6562     self->hash = x;
6563     return x;
6564 }
6565
6566 PyDoc_STRVAR(index__doc__,
6567              "S.index(sub [,start [,end]]) -> int\n\
6568 \n\
6569 Like S.find() but raise ValueError when the substring is not found.");
6570
6571 static PyObject *
6572 unicode_index(PyUnicodeObject *self, PyObject *args)
6573 {
6574     Py_ssize_t result;
6575     PyObject *substring;
6576     Py_ssize_t start;
6577     Py_ssize_t end;
6578
6579     if (!_ParseTupleFinds(args, &substring, &start, &end))
6580         return NULL;
6581
6582     result = stringlib_find_slice(
6583         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6584         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6585         start, end
6586         );
6587
6588     Py_DECREF(substring);
6589
6590     if (result < 0) {
6591         PyErr_SetString(PyExc_ValueError, "substring not found");
6592         return NULL;
6593     }
6594
6595     return PyInt_FromSsize_t(result);
6596 }
6597
6598 PyDoc_STRVAR(islower__doc__,
6599              "S.islower() -> bool\n\
6600 \n\
6601 Return True if all cased characters in S are lowercase and there is\n\
6602 at least one cased character in S, False otherwise.");
6603
6604 static PyObject*
6605 unicode_islower(PyUnicodeObject *self)
6606 {
6607     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6608     register const Py_UNICODE *e;
6609     int cased;
6610
6611     /* Shortcut for single character strings */
6612     if (PyUnicode_GET_SIZE(self) == 1)
6613         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6614
6615     /* Special case for empty strings */
6616     if (PyUnicode_GET_SIZE(self) == 0)
6617         return PyBool_FromLong(0);
6618
6619     e = p + PyUnicode_GET_SIZE(self);
6620     cased = 0;
6621     for (; p < e; p++) {
6622         register const Py_UNICODE ch = *p;
6623
6624         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6625             return PyBool_FromLong(0);
6626         else if (!cased && Py_UNICODE_ISLOWER(ch))
6627             cased = 1;
6628     }
6629     return PyBool_FromLong(cased);
6630 }
6631
6632 PyDoc_STRVAR(isupper__doc__,
6633              "S.isupper() -> bool\n\
6634 \n\
6635 Return True if all cased characters in S are uppercase and there is\n\
6636 at least one cased character in S, False otherwise.");
6637
6638 static PyObject*
6639 unicode_isupper(PyUnicodeObject *self)
6640 {
6641     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6642     register const Py_UNICODE *e;
6643     int cased;
6644
6645     /* Shortcut for single character strings */
6646     if (PyUnicode_GET_SIZE(self) == 1)
6647         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6648
6649     /* Special case for empty strings */
6650     if (PyUnicode_GET_SIZE(self) == 0)
6651         return PyBool_FromLong(0);
6652
6653     e = p + PyUnicode_GET_SIZE(self);
6654     cased = 0;
6655     for (; p < e; p++) {
6656         register const Py_UNICODE ch = *p;
6657
6658         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6659             return PyBool_FromLong(0);
6660         else if (!cased && Py_UNICODE_ISUPPER(ch))
6661             cased = 1;
6662     }
6663     return PyBool_FromLong(cased);
6664 }
6665
6666 PyDoc_STRVAR(istitle__doc__,
6667              "S.istitle() -> bool\n\
6668 \n\
6669 Return True if S is a titlecased string and there is at least one\n\
6670 character in S, i.e. upper- and titlecase characters may only\n\
6671 follow uncased characters and lowercase characters only cased ones.\n\
6672 Return False otherwise.");
6673
6674 static PyObject*
6675 unicode_istitle(PyUnicodeObject *self)
6676 {
6677     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6678     register const Py_UNICODE *e;
6679     int cased, previous_is_cased;
6680
6681     /* Shortcut for single character strings */
6682     if (PyUnicode_GET_SIZE(self) == 1)
6683         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6684                                (Py_UNICODE_ISUPPER(*p) != 0));
6685
6686     /* Special case for empty strings */
6687     if (PyUnicode_GET_SIZE(self) == 0)
6688         return PyBool_FromLong(0);
6689
6690     e = p + PyUnicode_GET_SIZE(self);
6691     cased = 0;
6692     previous_is_cased = 0;
6693     for (; p < e; p++) {
6694         register const Py_UNICODE ch = *p;
6695
6696         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6697             if (previous_is_cased)
6698                 return PyBool_FromLong(0);
6699             previous_is_cased = 1;
6700             cased = 1;
6701         }
6702         else if (Py_UNICODE_ISLOWER(ch)) {
6703             if (!previous_is_cased)
6704                 return PyBool_FromLong(0);
6705             previous_is_cased = 1;
6706             cased = 1;
6707         }
6708         else
6709             previous_is_cased = 0;
6710     }
6711     return PyBool_FromLong(cased);
6712 }
6713
6714 PyDoc_STRVAR(isspace__doc__,
6715              "S.isspace() -> bool\n\
6716 \n\
6717 Return True if all characters in S are whitespace\n\
6718 and there is at least one character in S, False otherwise.");
6719
6720 static PyObject*
6721 unicode_isspace(PyUnicodeObject *self)
6722 {
6723     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6724     register const Py_UNICODE *e;
6725
6726     /* Shortcut for single character strings */
6727     if (PyUnicode_GET_SIZE(self) == 1 &&
6728         Py_UNICODE_ISSPACE(*p))
6729         return PyBool_FromLong(1);
6730
6731     /* Special case for empty strings */
6732     if (PyUnicode_GET_SIZE(self) == 0)
6733         return PyBool_FromLong(0);
6734
6735     e = p + PyUnicode_GET_SIZE(self);
6736     for (; p < e; p++) {
6737         if (!Py_UNICODE_ISSPACE(*p))
6738             return PyBool_FromLong(0);
6739     }
6740     return PyBool_FromLong(1);
6741 }
6742
6743 PyDoc_STRVAR(isalpha__doc__,
6744              "S.isalpha() -> bool\n\
6745 \n\
6746 Return True if all characters in S are alphabetic\n\
6747 and there is at least one character in S, False otherwise.");
6748
6749 static PyObject*
6750 unicode_isalpha(PyUnicodeObject *self)
6751 {
6752     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753     register const Py_UNICODE *e;
6754
6755     /* Shortcut for single character strings */
6756     if (PyUnicode_GET_SIZE(self) == 1 &&
6757         Py_UNICODE_ISALPHA(*p))
6758         return PyBool_FromLong(1);
6759
6760     /* Special case for empty strings */
6761     if (PyUnicode_GET_SIZE(self) == 0)
6762         return PyBool_FromLong(0);
6763
6764     e = p + PyUnicode_GET_SIZE(self);
6765     for (; p < e; p++) {
6766         if (!Py_UNICODE_ISALPHA(*p))
6767             return PyBool_FromLong(0);
6768     }
6769     return PyBool_FromLong(1);
6770 }
6771
6772 PyDoc_STRVAR(isalnum__doc__,
6773              "S.isalnum() -> bool\n\
6774 \n\
6775 Return True if all characters in S are alphanumeric\n\
6776 and there is at least one character in S, False otherwise.");
6777
6778 static PyObject*
6779 unicode_isalnum(PyUnicodeObject *self)
6780 {
6781     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6782     register const Py_UNICODE *e;
6783
6784     /* Shortcut for single character strings */
6785     if (PyUnicode_GET_SIZE(self) == 1 &&
6786         Py_UNICODE_ISALNUM(*p))
6787         return PyBool_FromLong(1);
6788
6789     /* Special case for empty strings */
6790     if (PyUnicode_GET_SIZE(self) == 0)
6791         return PyBool_FromLong(0);
6792
6793     e = p + PyUnicode_GET_SIZE(self);
6794     for (; p < e; p++) {
6795         if (!Py_UNICODE_ISALNUM(*p))
6796             return PyBool_FromLong(0);
6797     }
6798     return PyBool_FromLong(1);
6799 }
6800
6801 PyDoc_STRVAR(isdecimal__doc__,
6802              "S.isdecimal() -> bool\n\
6803 \n\
6804 Return True if there are only decimal characters in S,\n\
6805 False otherwise.");
6806
6807 static PyObject*
6808 unicode_isdecimal(PyUnicodeObject *self)
6809 {
6810     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6811     register const Py_UNICODE *e;
6812
6813     /* Shortcut for single character strings */
6814     if (PyUnicode_GET_SIZE(self) == 1 &&
6815         Py_UNICODE_ISDECIMAL(*p))
6816         return PyBool_FromLong(1);
6817
6818     /* Special case for empty strings */
6819     if (PyUnicode_GET_SIZE(self) == 0)
6820         return PyBool_FromLong(0);
6821
6822     e = p + PyUnicode_GET_SIZE(self);
6823     for (; p < e; p++) {
6824         if (!Py_UNICODE_ISDECIMAL(*p))
6825             return PyBool_FromLong(0);
6826     }
6827     return PyBool_FromLong(1);
6828 }
6829
6830 PyDoc_STRVAR(isdigit__doc__,
6831              "S.isdigit() -> bool\n\
6832 \n\
6833 Return True if all characters in S are digits\n\
6834 and there is at least one character in S, False otherwise.");
6835
6836 static PyObject*
6837 unicode_isdigit(PyUnicodeObject *self)
6838 {
6839     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6840     register const Py_UNICODE *e;
6841
6842     /* Shortcut for single character strings */
6843     if (PyUnicode_GET_SIZE(self) == 1 &&
6844         Py_UNICODE_ISDIGIT(*p))
6845         return PyBool_FromLong(1);
6846
6847     /* Special case for empty strings */
6848     if (PyUnicode_GET_SIZE(self) == 0)
6849         return PyBool_FromLong(0);
6850
6851     e = p + PyUnicode_GET_SIZE(self);
6852     for (; p < e; p++) {
6853         if (!Py_UNICODE_ISDIGIT(*p))
6854             return PyBool_FromLong(0);
6855     }
6856     return PyBool_FromLong(1);
6857 }
6858
6859 PyDoc_STRVAR(isnumeric__doc__,
6860              "S.isnumeric() -> bool\n\
6861 \n\
6862 Return True if there are only numeric characters in S,\n\
6863 False otherwise.");
6864
6865 static PyObject*
6866 unicode_isnumeric(PyUnicodeObject *self)
6867 {
6868     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6869     register const Py_UNICODE *e;
6870
6871     /* Shortcut for single character strings */
6872     if (PyUnicode_GET_SIZE(self) == 1 &&
6873         Py_UNICODE_ISNUMERIC(*p))
6874         return PyBool_FromLong(1);
6875
6876     /* Special case for empty strings */
6877     if (PyUnicode_GET_SIZE(self) == 0)
6878         return PyBool_FromLong(0);
6879
6880     e = p + PyUnicode_GET_SIZE(self);
6881     for (; p < e; p++) {
6882         if (!Py_UNICODE_ISNUMERIC(*p))
6883             return PyBool_FromLong(0);
6884     }
6885     return PyBool_FromLong(1);
6886 }
6887
6888 PyDoc_STRVAR(join__doc__,
6889              "S.join(iterable) -> unicode\n\
6890 \n\
6891 Return a string which is the concatenation of the strings in the\n\
6892 iterable.  The separator between elements is S.");
6893
6894 static PyObject*
6895 unicode_join(PyObject *self, PyObject *data)
6896 {
6897     return PyUnicode_Join(self, data);
6898 }
6899
6900 static Py_ssize_t
6901 unicode_length(PyUnicodeObject *self)
6902 {
6903     return self->length;
6904 }
6905
6906 PyDoc_STRVAR(ljust__doc__,
6907              "S.ljust(width[, fillchar]) -> int\n\
6908 \n\
6909 Return S left-justified in a Unicode string of length width. Padding is\n\
6910 done using the specified fill character (default is a space).");
6911
6912 static PyObject *
6913 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6914 {
6915     Py_ssize_t width;
6916     Py_UNICODE fillchar = ' ';
6917
6918     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6919         return NULL;
6920
6921     if (self->length >= width && PyUnicode_CheckExact(self)) {
6922         Py_INCREF(self);
6923         return (PyObject*) self;
6924     }
6925
6926     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6927 }
6928
6929 PyDoc_STRVAR(lower__doc__,
6930              "S.lower() -> unicode\n\
6931 \n\
6932 Return a copy of the string S converted to lowercase.");
6933
6934 static PyObject*
6935 unicode_lower(PyUnicodeObject *self)
6936 {
6937     return fixup(self, fixlower);
6938 }
6939
/* Strip modes.  The values double as indices into stripformat[] below,
   so the two must be kept in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is the argument-parsing format string for one strip mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6948
6949 /* externally visible for str.strip(unicode) */
6950 PyObject *
6951 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6952 {
6953     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6954     Py_ssize_t len = PyUnicode_GET_SIZE(self);
6955     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6956     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6957     Py_ssize_t i, j;
6958
6959     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6960
6961     i = 0;
6962     if (striptype != RIGHTSTRIP) {
6963         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6964             i++;
6965         }
6966     }
6967
6968     j = len;
6969     if (striptype != LEFTSTRIP) {
6970         do {
6971             j--;
6972         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6973         j++;
6974     }
6975
6976     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6977         Py_INCREF(self);
6978         return (PyObject*)self;
6979     }
6980     else
6981         return PyUnicode_FromUnicode(s+i, j-i);
6982 }
6983
6984
6985 static PyObject *
6986 do_strip(PyUnicodeObject *self, int striptype)
6987 {
6988     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6989     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6990
6991     i = 0;
6992     if (striptype != RIGHTSTRIP) {
6993         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6994             i++;
6995         }
6996     }
6997
6998     j = len;
6999     if (striptype != LEFTSTRIP) {
7000         do {
7001             j--;
7002         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7003         j++;
7004     }
7005
7006     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7007         Py_INCREF(self);
7008         return (PyObject*)self;
7009     }
7010     else
7011         return PyUnicode_FromUnicode(s+i, j-i);
7012 }
7013
7014
7015 static PyObject *
7016 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7017 {
7018     PyObject *sep = NULL;
7019
7020     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7021         return NULL;
7022
7023     if (sep != NULL && sep != Py_None) {
7024         if (PyUnicode_Check(sep))
7025             return _PyUnicode_XStrip(self, striptype, sep);
7026         else if (PyString_Check(sep)) {
7027             PyObject *res;
7028             sep = PyUnicode_FromObject(sep);
7029             if (sep==NULL)
7030                 return NULL;
7031             res = _PyUnicode_XStrip(self, striptype, sep);
7032             Py_DECREF(sep);
7033             return res;
7034         }
7035         else {
7036             PyErr_Format(PyExc_TypeError,
7037                          "%s arg must be None, unicode or str",
7038                          STRIPNAME(striptype));
7039             return NULL;
7040         }
7041     }
7042
7043     return do_strip(self, striptype);
7044 }
7045
7046
7047 PyDoc_STRVAR(strip__doc__,
7048              "S.strip([chars]) -> unicode\n\
7049 \n\
7050 Return a copy of the string S with leading and trailing\n\
7051 whitespace removed.\n\
7052 If chars is given and not None, remove characters in chars instead.\n\
7053 If chars is a str, it will be converted to unicode before stripping");
7054
7055 static PyObject *
7056 unicode_strip(PyUnicodeObject *self, PyObject *args)
7057 {
7058     if (PyTuple_GET_SIZE(args) == 0)
7059         return do_strip(self, BOTHSTRIP); /* Common case */
7060     else
7061         return do_argstrip(self, BOTHSTRIP, args);
7062 }
7063
7064
7065 PyDoc_STRVAR(lstrip__doc__,
7066              "S.lstrip([chars]) -> unicode\n\
7067 \n\
7068 Return a copy of the string S with leading whitespace removed.\n\
7069 If chars is given and not None, remove characters in chars instead.\n\
7070 If chars is a str, it will be converted to unicode before stripping");
7071
7072 static PyObject *
7073 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7074 {
7075     if (PyTuple_GET_SIZE(args) == 0)
7076         return do_strip(self, LEFTSTRIP); /* Common case */
7077     else
7078         return do_argstrip(self, LEFTSTRIP, args);
7079 }
7080
7081
7082 PyDoc_STRVAR(rstrip__doc__,
7083              "S.rstrip([chars]) -> unicode\n\
7084 \n\
7085 Return a copy of the string S with trailing whitespace removed.\n\
7086 If chars is given and not None, remove characters in chars instead.\n\
7087 If chars is a str, it will be converted to unicode before stripping");
7088
7089 static PyObject *
7090 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7091 {
7092     if (PyTuple_GET_SIZE(args) == 0)
7093         return do_strip(self, RIGHTSTRIP); /* Common case */
7094     else
7095         return do_argstrip(self, RIGHTSTRIP, args);
7096 }
7097
7098
7099 static PyObject*
7100 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7101 {
7102     PyUnicodeObject *u;
7103     Py_UNICODE *p;
7104     Py_ssize_t nchars;
7105     size_t nbytes;
7106
7107     if (len < 0)
7108         len = 0;
7109
7110     if (len == 1 && PyUnicode_CheckExact(str)) {
7111         /* no repeat, return original string */
7112         Py_INCREF(str);
7113         return (PyObject*) str;
7114     }
7115
7116     /* ensure # of chars needed doesn't overflow int and # of bytes
7117      * needed doesn't overflow size_t
7118      */
7119     nchars = len * str->length;
7120     if (len && nchars / len != str->length) {
7121         PyErr_SetString(PyExc_OverflowError,
7122                         "repeated string is too long");
7123         return NULL;
7124     }
7125     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7126     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7127         PyErr_SetString(PyExc_OverflowError,
7128                         "repeated string is too long");
7129         return NULL;
7130     }
7131     u = _PyUnicode_New(nchars);
7132     if (!u)
7133         return NULL;
7134
7135     p = u->str;
7136
7137     if (str->length == 1 && len > 0) {
7138         Py_UNICODE_FILL(p, str->str[0], len);
7139     } else {
7140         Py_ssize_t done = 0; /* number of characters copied this far */
7141         if (done < nchars) {
7142             Py_UNICODE_COPY(p, str->str, str->length);
7143             done = str->length;
7144         }
7145         while (done < nchars) {
7146             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7147             Py_UNICODE_COPY(p+done, p, n);
7148             done += n;
7149         }
7150     }
7151
7152     return (PyObject*) u;
7153 }
7154
7155 PyObject *PyUnicode_Replace(PyObject *obj,
7156                             PyObject *subobj,
7157                             PyObject *replobj,
7158                             Py_ssize_t maxcount)
7159 {
7160     PyObject *self;
7161     PyObject *str1;
7162     PyObject *str2;
7163     PyObject *result;
7164
7165     self = PyUnicode_FromObject(obj);
7166     if (self == NULL)
7167         return NULL;
7168     str1 = PyUnicode_FromObject(subobj);
7169     if (str1 == NULL) {
7170         Py_DECREF(self);
7171         return NULL;
7172     }
7173     str2 = PyUnicode_FromObject(replobj);
7174     if (str2 == NULL) {
7175         Py_DECREF(self);
7176         Py_DECREF(str1);
7177         return NULL;
7178     }
7179     result = replace((PyUnicodeObject *)self,
7180                      (PyUnicodeObject *)str1,
7181                      (PyUnicodeObject *)str2,
7182                      maxcount);
7183     Py_DECREF(self);
7184     Py_DECREF(str1);
7185     Py_DECREF(str2);
7186     return result;
7187 }
7188
7189 PyDoc_STRVAR(replace__doc__,
7190              "S.replace(old, new[, count]) -> unicode\n\
7191 \n\
7192 Return a copy of S with all occurrences of substring\n\
7193 old replaced by new.  If the optional argument count is\n\
7194 given, only the first count occurrences are replaced.");
7195
7196 static PyObject*
7197 unicode_replace(PyUnicodeObject *self, PyObject *args)
7198 {
7199     PyUnicodeObject *str1;
7200     PyUnicodeObject *str2;
7201     Py_ssize_t maxcount = -1;
7202     PyObject *result;
7203
7204     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7205         return NULL;
7206     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7207     if (str1 == NULL)
7208         return NULL;
7209     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7210     if (str2 == NULL) {
7211         Py_DECREF(str1);
7212         return NULL;
7213     }
7214
7215     result = replace(self, str1, str2, maxcount);
7216
7217     Py_DECREF(str1);
7218     Py_DECREF(str2);
7219     return result;
7220 }
7221
7222 static
7223 PyObject *unicode_repr(PyObject *unicode)
7224 {
7225     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7226                                 PyUnicode_GET_SIZE(unicode),
7227                                 1);
7228 }
7229
7230 PyDoc_STRVAR(rfind__doc__,
7231              "S.rfind(sub [,start [,end]]) -> int\n\
7232 \n\
7233 Return the highest index in S where substring sub is found,\n\
7234 such that sub is contained within s[start:end].  Optional\n\
7235 arguments start and end are interpreted as in slice notation.\n\
7236 \n\
7237 Return -1 on failure.");
7238
7239 static PyObject *
7240 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7241 {
7242     PyObject *substring;
7243     Py_ssize_t start;
7244     Py_ssize_t end;
7245     Py_ssize_t result;
7246
7247     if (!_ParseTupleFinds(args, &substring, &start, &end))
7248         return NULL;
7249
7250     result = stringlib_rfind_slice(
7251         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7252         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7253         start, end
7254         );
7255
7256     Py_DECREF(substring);
7257
7258     return PyInt_FromSsize_t(result);
7259 }
7260
7261 PyDoc_STRVAR(rindex__doc__,
7262              "S.rindex(sub [,start [,end]]) -> int\n\
7263 \n\
7264 Like S.rfind() but raise ValueError when the substring is not found.");
7265
7266 static PyObject *
7267 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7268 {
7269     PyObject *substring;
7270     Py_ssize_t start;
7271     Py_ssize_t end;
7272     Py_ssize_t result;
7273
7274     if (!_ParseTupleFinds(args, &substring, &start, &end))
7275         return NULL;
7276
7277     result = stringlib_rfind_slice(
7278         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7279         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7280         start, end
7281         );
7282
7283     Py_DECREF(substring);
7284
7285     if (result < 0) {
7286         PyErr_SetString(PyExc_ValueError, "substring not found");
7287         return NULL;
7288     }
7289     return PyInt_FromSsize_t(result);
7290 }
7291
7292 PyDoc_STRVAR(rjust__doc__,
7293              "S.rjust(width[, fillchar]) -> unicode\n\
7294 \n\
7295 Return S right-justified in a Unicode string of length width. Padding is\n\
7296 done using the specified fill character (default is a space).");
7297
7298 static PyObject *
7299 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7300 {
7301     Py_ssize_t width;
7302     Py_UNICODE fillchar = ' ';
7303
7304     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7305         return NULL;
7306
7307     if (self->length >= width && PyUnicode_CheckExact(self)) {
7308         Py_INCREF(self);
7309         return (PyObject*) self;
7310     }
7311
7312     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7313 }
7314
7315 static PyObject*
7316 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7317 {
7318     /* standard clamping */
7319     if (start < 0)
7320         start = 0;
7321     if (end < 0)
7322         end = 0;
7323     if (end > self->length)
7324         end = self->length;
7325     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7326         /* full slice, return original string */
7327         Py_INCREF(self);
7328         return (PyObject*) self;
7329     }
7330     if (start > end)
7331         start = end;
7332     /* copy slice */
7333     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7334                                              end - start);
7335 }
7336
7337 PyObject *PyUnicode_Split(PyObject *s,
7338                           PyObject *sep,
7339                           Py_ssize_t maxsplit)
7340 {
7341     PyObject *result;
7342
7343     s = PyUnicode_FromObject(s);
7344     if (s == NULL)
7345         return NULL;
7346     if (sep != NULL) {
7347         sep = PyUnicode_FromObject(sep);
7348         if (sep == NULL) {
7349             Py_DECREF(s);
7350             return NULL;
7351         }
7352     }
7353
7354     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7355
7356     Py_DECREF(s);
7357     Py_XDECREF(sep);
7358     return result;
7359 }
7360
7361 PyDoc_STRVAR(split__doc__,
7362              "S.split([sep [,maxsplit]]) -> list of strings\n\
7363 \n\
7364 Return a list of the words in S, using sep as the\n\
7365 delimiter string.  If maxsplit is given, at most maxsplit\n\
7366 splits are done. If sep is not specified or is None, any\n\
7367 whitespace string is a separator and empty strings are\n\
7368 removed from the result.");
7369
7370 static PyObject*
7371 unicode_split(PyUnicodeObject *self, PyObject *args)
7372 {
7373     PyObject *substring = Py_None;
7374     Py_ssize_t maxcount = -1;
7375
7376     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7377         return NULL;
7378
7379     if (substring == Py_None)
7380         return split(self, NULL, maxcount);
7381     else if (PyUnicode_Check(substring))
7382         return split(self, (PyUnicodeObject *)substring, maxcount);
7383     else
7384         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7385 }
7386
7387 PyObject *
7388 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7389 {
7390     PyObject* str_obj;
7391     PyObject* sep_obj;
7392     PyObject* out;
7393
7394     str_obj = PyUnicode_FromObject(str_in);
7395     if (!str_obj)
7396         return NULL;
7397     sep_obj = PyUnicode_FromObject(sep_in);
7398     if (!sep_obj) {
7399         Py_DECREF(str_obj);
7400         return NULL;
7401     }
7402
7403     out = stringlib_partition(
7404         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7405         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7406         );
7407
7408     Py_DECREF(sep_obj);
7409     Py_DECREF(str_obj);
7410
7411     return out;
7412 }
7413
7414
7415 PyObject *
7416 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7417 {
7418     PyObject* str_obj;
7419     PyObject* sep_obj;
7420     PyObject* out;
7421
7422     str_obj = PyUnicode_FromObject(str_in);
7423     if (!str_obj)
7424         return NULL;
7425     sep_obj = PyUnicode_FromObject(sep_in);
7426     if (!sep_obj) {
7427         Py_DECREF(str_obj);
7428         return NULL;
7429     }
7430
7431     out = stringlib_rpartition(
7432         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7433         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7434         );
7435
7436     Py_DECREF(sep_obj);
7437     Py_DECREF(str_obj);
7438
7439     return out;
7440 }
7441
7442 PyDoc_STRVAR(partition__doc__,
7443              "S.partition(sep) -> (head, sep, tail)\n\
7444 \n\
7445 Search for the separator sep in S, and return the part before it,\n\
7446 the separator itself, and the part after it.  If the separator is not\n\
7447 found, return S and two empty strings.");
7448
7449 static PyObject*
7450 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7451 {
7452     return PyUnicode_Partition((PyObject *)self, separator);
7453 }
7454
7455 PyDoc_STRVAR(rpartition__doc__,
7456              "S.rpartition(sep) -> (head, sep, tail)\n\
7457 \n\
7458 Search for the separator sep in S, starting at the end of S, and return\n\
7459 the part before it, the separator itself, and the part after it.  If the\n\
7460 separator is not found, return two empty strings and S.");
7461
7462 static PyObject*
7463 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7464 {
7465     return PyUnicode_RPartition((PyObject *)self, separator);
7466 }
7467
7468 PyObject *PyUnicode_RSplit(PyObject *s,
7469                            PyObject *sep,
7470                            Py_ssize_t maxsplit)
7471 {
7472     PyObject *result;
7473
7474     s = PyUnicode_FromObject(s);
7475     if (s == NULL)
7476         return NULL;
7477     if (sep != NULL) {
7478         sep = PyUnicode_FromObject(sep);
7479         if (sep == NULL) {
7480             Py_DECREF(s);
7481             return NULL;
7482         }
7483     }
7484
7485     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7486
7487     Py_DECREF(s);
7488     Py_XDECREF(sep);
7489     return result;
7490 }
7491
7492 PyDoc_STRVAR(rsplit__doc__,
7493              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7494 \n\
7495 Return a list of the words in S, using sep as the\n\
7496 delimiter string, starting at the end of the string and\n\
7497 working to the front.  If maxsplit is given, at most maxsplit\n\
7498 splits are done. If sep is not specified, any whitespace string\n\
7499 is a separator.");
7500
7501 static PyObject*
7502 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7503 {
7504     PyObject *substring = Py_None;
7505     Py_ssize_t maxcount = -1;
7506
7507     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7508         return NULL;
7509
7510     if (substring == Py_None)
7511         return rsplit(self, NULL, maxcount);
7512     else if (PyUnicode_Check(substring))
7513         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7514     else
7515         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7516 }
7517
7518 PyDoc_STRVAR(splitlines__doc__,
7519              "S.splitlines([keepends]) -> list of strings\n\
7520 \n\
7521 Return a list of the lines in S, breaking at line boundaries.\n\
7522 Line breaks are not included in the resulting list unless keepends\n\
7523 is given and true.");
7524
7525 static PyObject*
7526 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7527 {
7528     int keepends = 0;
7529
7530     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7531         return NULL;
7532
7533     return PyUnicode_Splitlines((PyObject *)self, keepends);
7534 }
7535
7536 static
7537 PyObject *unicode_str(PyUnicodeObject *self)
7538 {
7539     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7540 }
7541
7542 PyDoc_STRVAR(swapcase__doc__,
7543              "S.swapcase() -> unicode\n\
7544 \n\
7545 Return a copy of S with uppercase characters converted to lowercase\n\
7546 and vice versa.");
7547
7548 static PyObject*
7549 unicode_swapcase(PyUnicodeObject *self)
7550 {
7551     return fixup(self, fixswapcase);
7552 }
7553
7554 PyDoc_STRVAR(translate__doc__,
7555              "S.translate(table) -> unicode\n\
7556 \n\
7557 Return a copy of the string S, where all characters have been mapped\n\
7558 through the given translation table, which must be a mapping of\n\
7559 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7560 Unmapped characters are left untouched. Characters mapped to None\n\
7561 are deleted.");
7562
7563 static PyObject*
7564 unicode_translate(PyUnicodeObject *self, PyObject *table)
7565 {
7566     return PyUnicode_TranslateCharmap(self->str,
7567                                       self->length,
7568                                       table,
7569                                       "ignore");
7570 }
7571
7572 PyDoc_STRVAR(upper__doc__,
7573              "S.upper() -> unicode\n\
7574 \n\
7575 Return a copy of S converted to uppercase.");
7576
7577 static PyObject*
7578 unicode_upper(PyUnicodeObject *self)
7579 {
7580     return fixup(self, fixupper);
7581 }
7582
7583 PyDoc_STRVAR(zfill__doc__,
7584              "S.zfill(width) -> unicode\n\
7585 \n\
7586 Pad a numeric string S with zeros on the left, to fill a field\n\
7587 of the specified width. The string S is never truncated.");
7588
7589 static PyObject *
7590 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7591 {
7592     Py_ssize_t fill;
7593     PyUnicodeObject *u;
7594
7595     Py_ssize_t width;
7596     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7597         return NULL;
7598
7599     if (self->length >= width) {
7600         if (PyUnicode_CheckExact(self)) {
7601             Py_INCREF(self);
7602             return (PyObject*) self;
7603         }
7604         else
7605             return PyUnicode_FromUnicode(
7606                 PyUnicode_AS_UNICODE(self),
7607                 PyUnicode_GET_SIZE(self)
7608                 );
7609     }
7610
7611     fill = width - self->length;
7612
7613     u = pad(self, fill, 0, '0');
7614
7615     if (u == NULL)
7616         return NULL;
7617
7618     if (u->str[fill] == '+' || u->str[fill] == '-') {
7619         /* move sign to beginning of string */
7620         u->str[0] = u->str[fill];
7621         u->str[fill] = '0';
7622     }
7623
7624     return (PyObject*) u;
7625 }
7626
#if 0
/* Compiled out by the enclosing "#if 0".  When enabled, returns the
   file-level counter `numfree` as a Python int; its matching method-table
   entry ("freelistsize") is likewise disabled. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7634
7635 PyDoc_STRVAR(startswith__doc__,
7636              "S.startswith(prefix[, start[, end]]) -> bool\n\
7637 \n\
7638 Return True if S starts with the specified prefix, False otherwise.\n\
7639 With optional start, test S beginning at that position.\n\
7640 With optional end, stop comparing S at that position.\n\
7641 prefix can also be a tuple of strings to try.");
7642
7643 static PyObject *
7644 unicode_startswith(PyUnicodeObject *self,
7645                    PyObject *args)
7646 {
7647     PyObject *subobj;
7648     PyUnicodeObject *substring;
7649     Py_ssize_t start = 0;
7650     Py_ssize_t end = PY_SSIZE_T_MAX;
7651     int result;
7652
7653     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7654                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7655         return NULL;
7656     if (PyTuple_Check(subobj)) {
7657         Py_ssize_t i;
7658         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7659             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7660                 PyTuple_GET_ITEM(subobj, i));
7661             if (substring == NULL)
7662                 return NULL;
7663             result = tailmatch(self, substring, start, end, -1);
7664             Py_DECREF(substring);
7665             if (result) {
7666                 Py_RETURN_TRUE;
7667             }
7668         }
7669         /* nothing matched */
7670         Py_RETURN_FALSE;
7671     }
7672     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7673     if (substring == NULL)
7674         return NULL;
7675     result = tailmatch(self, substring, start, end, -1);
7676     Py_DECREF(substring);
7677     return PyBool_FromLong(result);
7678 }
7679
7680
7681 PyDoc_STRVAR(endswith__doc__,
7682              "S.endswith(suffix[, start[, end]]) -> bool\n\
7683 \n\
7684 Return True if S ends with the specified suffix, False otherwise.\n\
7685 With optional start, test S beginning at that position.\n\
7686 With optional end, stop comparing S at that position.\n\
7687 suffix can also be a tuple of strings to try.");
7688
7689 static PyObject *
7690 unicode_endswith(PyUnicodeObject *self,
7691                  PyObject *args)
7692 {
7693     PyObject *subobj;
7694     PyUnicodeObject *substring;
7695     Py_ssize_t start = 0;
7696     Py_ssize_t end = PY_SSIZE_T_MAX;
7697     int result;
7698
7699     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7700                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7701         return NULL;
7702     if (PyTuple_Check(subobj)) {
7703         Py_ssize_t i;
7704         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7705             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7706                 PyTuple_GET_ITEM(subobj, i));
7707             if (substring == NULL)
7708                 return NULL;
7709             result = tailmatch(self, substring, start, end, +1);
7710             Py_DECREF(substring);
7711             if (result) {
7712                 Py_RETURN_TRUE;
7713             }
7714         }
7715         Py_RETURN_FALSE;
7716     }
7717     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7718     if (substring == NULL)
7719         return NULL;
7720
7721     result = tailmatch(self, substring, start, end, +1);
7722     Py_DECREF(substring);
7723     return PyBool_FromLong(result);
7724 }
7725
7726
7727 /* Implements do_string_format, which is unicode because of stringlib */
7728 #include "stringlib/string_format.h"
7729
7730 PyDoc_STRVAR(format__doc__,
7731              "S.format(*args, **kwargs) -> unicode\n\
7732 \n\
7733 ");
7734
7735 static PyObject *
7736 unicode__format__(PyObject *self, PyObject *args)
7737 {
7738     PyObject *format_spec;
7739     PyObject *result = NULL;
7740     PyObject *tmp = NULL;
7741
7742     /* If 2.x, convert format_spec to the same type as value */
7743     /* This is to allow things like u''.format('') */
7744     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7745         goto done;
7746     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7747         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7748                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7749         goto done;
7750     }
7751     tmp = PyObject_Unicode(format_spec);
7752     if (tmp == NULL)
7753         goto done;
7754     format_spec = tmp;
7755
7756     result = _PyUnicode_FormatAdvanced(self,
7757                                        PyUnicode_AS_UNICODE(format_spec),
7758                                        PyUnicode_GET_SIZE(format_spec));
7759   done:
7760     Py_XDECREF(tmp);
7761     return result;
7762 }
7763
7764 PyDoc_STRVAR(p_format__doc__,
7765              "S.__format__(format_spec) -> unicode\n\
7766 \n\
7767 ");
7768
7769 static PyObject *
7770 unicode__sizeof__(PyUnicodeObject *v)
7771 {
7772     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7773                              sizeof(Py_UNICODE) * (v->length + 1));
7774 }
7775
7776 PyDoc_STRVAR(sizeof__doc__,
7777              "S.__sizeof__() -> size of S in memory, in bytes\n\
7778 \n\
7779 ");
7780
7781 static PyObject *
7782 unicode_getnewargs(PyUnicodeObject *v)
7783 {
7784     return Py_BuildValue("(u#)", v->str, v->length);
7785 }
7786
7787
/* Method table for the unicode type.  Each entry gives the Python-visible
   method name, the C function implementing it, the calling-convention
   flags (METH_NOARGS, METH_O, METH_VARARGS, optionally with
   METH_KEYWORDS) and the docstring.  Entries that omit the docstring
   leave that member zero-initialized.  The table ends with the
   {NULL, NULL} sentinel.
   NOTE(review): presumably installed as the type's tp_methods -- the type
   object is outside this view. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* do_string_format, formatter_field_name_split and formatter_parser
       come from the included stringlib/string_format.h. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7850
7851 static PyObject *
7852 unicode_mod(PyObject *v, PyObject *w)
7853 {
7854     if (!PyUnicode_Check(v)) {
7855         Py_INCREF(Py_NotImplemented);
7856         return Py_NotImplemented;
7857     }
7858     return PyUnicode_Format(v, w);
7859 }
7860
/* Number protocol slots for unicode.  Only nb_remainder is populated, so
   that the % operator reaches unicode_mod(); the slots listed before it
   are explicitly 0 and every slot after it is left out and therefore
   zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
7868
/* Sequence protocol slots for unicode.  The two assignment slots
   (sq_ass_item, sq_ass_slice) are 0: no item or slice assignment handler
   is provided.  Slots after sq_contains are left out and therefore
   zero-initialized. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
7879
7880 static PyObject*
7881 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7882 {
7883     if (PyIndex_Check(item)) {
7884         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7885         if (i == -1 && PyErr_Occurred())
7886             return NULL;
7887         if (i < 0)
7888             i += PyUnicode_GET_SIZE(self);
7889         return unicode_getitem(self, i);
7890     } else if (PySlice_Check(item)) {
7891         Py_ssize_t start, stop, step, slicelength, cur, i;
7892         Py_UNICODE* source_buf;
7893         Py_UNICODE* result_buf;
7894         PyObject* result;
7895
7896         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7897                                  &start, &stop, &step, &slicelength) < 0) {
7898             return NULL;
7899         }
7900
7901         if (slicelength <= 0) {
7902             return PyUnicode_FromUnicode(NULL, 0);
7903         } else if (start == 0 && step == 1 && slicelength == self->length &&
7904                    PyUnicode_CheckExact(self)) {
7905             Py_INCREF(self);
7906             return (PyObject *)self;
7907         } else if (step == 1) {
7908             return PyUnicode_FromUnicode(self->str + start, slicelength);
7909         } else {
7910             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7911             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7912                                                        sizeof(Py_UNICODE));
7913
7914             if (result_buf == NULL)
7915                 return PyErr_NoMemory();
7916
7917             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7918                 result_buf[i] = source_buf[cur];
7919             }
7920
7921             result = PyUnicode_FromUnicode(result_buf, slicelength);
7922             PyObject_FREE(result_buf);
7923             return result;
7924         }
7925     } else {
7926         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7927         return NULL;
7928     }
7929 }
7930
/* Mapping protocol slots for unicode: length and subscript lookup are
   provided; mp_ass_subscript is 0, so no subscript-assignment handler is
   installed. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
7936
7937 static Py_ssize_t
7938 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7939                           Py_ssize_t index,
7940                           const void **ptr)
7941 {
7942     if (index != 0) {
7943         PyErr_SetString(PyExc_SystemError,
7944                         "accessing non-existent unicode segment");
7945         return -1;
7946     }
7947     *ptr = (void *) self->str;
7948     return PyUnicode_GET_DATA_SIZE(self);
7949 }
7950
/* Write-buffer accessor.  Always fails: raises TypeError and returns -1
   without looking at any of its arguments, so unicode objects cannot be
   obtained as a modifiable buffer through this slot. */
static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
                           const void **ptr)
{
    PyErr_SetString(PyExc_TypeError,
                    "cannot use unicode as modifiable buffer");
    return -1;
}
7959
7960 static int
7961 unicode_buffer_getsegcount(PyUnicodeObject *self,
7962                            Py_ssize_t *lenp)
7963 {
7964     if (lenp)
7965         *lenp = PyUnicode_GET_DATA_SIZE(self);
7966     return 1;
7967 }
7968
7969 static Py_ssize_t
7970 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7971                           Py_ssize_t index,
7972                           const void **ptr)
7973 {
7974     PyObject *str;
7975
7976     if (index != 0) {
7977         PyErr_SetString(PyExc_SystemError,
7978                         "accessing non-existent unicode segment");
7979         return -1;
7980     }
7981     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7982     if (str == NULL)
7983         return -1;
7984     *ptr = (void *) PyString_AS_STRING(str);
7985     return PyString_GET_SIZE(str);
7986 }
7987
7988 /* Helpers for PyUnicode_Format() */
7989
7990 static PyObject *
7991 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7992 {
7993     Py_ssize_t argidx = *p_argidx;
7994     if (argidx < arglen) {
7995         (*p_argidx)++;
7996         if (arglen < 0)
7997             return args;
7998         else
7999             return PyTuple_GetItem(args, argidx);
8000     }
8001     PyErr_SetString(PyExc_TypeError,
8002                     "not enough arguments for format string");
8003     return NULL;
8004 }
8005
/* Bit flags passed as the `flags` argument of the format helpers.  In the
   code visible here only F_ALT is tested (formatfloat and formatint use
   it to select the alternate form).
   NOTE(review): the names suggest the '-', '+', ' ', '#' and '0'
   conversion flags respectively; the code that sets them is outside this
   view -- confirm there. */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
8011
8012 static Py_ssize_t
8013 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8014 {
8015     register Py_ssize_t i;
8016     Py_ssize_t len = strlen(charbuffer);
8017     for (i = len - 1; i >= 0; i--)
8018         buffer[i] = (Py_UNICODE) charbuffer[i];
8019
8020     return len;
8021 }
8022
8023 static int
8024 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8025 {
8026     Py_ssize_t result;
8027
8028     PyOS_snprintf((char *)buffer, len, format, x);
8029     result = strtounicode(buffer, (char *)buffer);
8030     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8031 }
8032
8033 /* XXX To save some code duplication, formatfloat/long/int could have been
8034    shared with stringobject.c, converting from 8-bit to Unicode after the
8035    formatting is done. */
8036
8037 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8038
8039 static PyObject *
8040 formatfloat(PyObject *v, int flags, int prec, int type)
8041 {
8042     char *p;
8043     PyObject *result;
8044     double x;
8045
8046     x = PyFloat_AsDouble(v);
8047     if (x == -1.0 && PyErr_Occurred())
8048         return NULL;
8049
8050     if (prec < 0)
8051         prec = 6;
8052
8053     p = PyOS_double_to_string(x, type, prec,
8054                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8055     if (p == NULL)
8056         return NULL;
8057     result = PyUnicode_FromStringAndSize(p, strlen(p));
8058     PyMem_Free(p);
8059     return result;
8060 }
8061
8062 static PyObject*
8063 formatlong(PyObject *val, int flags, int prec, int type)
8064 {
8065     char *buf;
8066     int i, len;
8067     PyObject *str; /* temporary string object. */
8068     PyUnicodeObject *result;
8069
8070     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8071     if (!str)
8072         return NULL;
8073     result = _PyUnicode_New(len);
8074     if (!result) {
8075         Py_DECREF(str);
8076         return NULL;
8077     }
8078     for (i = 0; i < len; i++)
8079         result->str[i] = buf[i];
8080     result->str[len] = 0;
8081     Py_DECREF(str);
8082     return (PyObject*)result;
8083 }
8084
8085 static int
8086 formatint(Py_UNICODE *buf,
8087           size_t buflen,
8088           int flags,
8089           int prec,
8090           int type,
8091           PyObject *v)
8092 {
8093     /* fmt = '%#.' + `prec` + 'l' + `type`
8094      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8095      *                     + 1 + 1
8096      *                   = 24
8097      */
8098     char fmt[64]; /* plenty big enough! */
8099     char *sign;
8100     long x;
8101
8102     x = PyInt_AsLong(v);
8103     if (x == -1 && PyErr_Occurred())
8104         return -1;
8105     if (x < 0 && type == 'u') {
8106         type = 'd';
8107     }
8108     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8109         sign = "-";
8110     else
8111         sign = "";
8112     if (prec < 0)
8113         prec = 1;
8114
8115     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8116      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8117      */
8118     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8119         PyErr_SetString(PyExc_OverflowError,
8120                         "formatted integer is too long (precision too large?)");
8121         return -1;
8122     }
8123
8124     if ((flags & F_ALT) &&
8125         (type == 'x' || type == 'X')) {
8126         /* When converting under %#x or %#X, there are a number
8127          * of issues that cause pain:
8128          * - when 0 is being converted, the C standard leaves off
8129          *   the '0x' or '0X', which is inconsistent with other
8130          *   %#x/%#X conversions and inconsistent with Python's
8131          *   hex() function
8132          * - there are platforms that violate the standard and
8133          *   convert 0 with the '0x' or '0X'
8134          *   (Metrowerks, Compaq Tru64)
8135          * - there are platforms that give '0x' when converting
8136          *   under %#X, but convert 0 in accordance with the
8137          *   standard (OS/2 EMX)
8138          *
8139          * We can achieve the desired consistency by inserting our
8140          * own '0x' or '0X' prefix, and substituting %x/%X in place
8141          * of %#x/%#X.
8142          *
8143          * Note that this is the same approach as used in
8144          * formatint() in stringobject.c
8145          */
8146         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8147                       sign, type, prec, type);
8148     }
8149     else {
8150         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8151                       sign, (flags&F_ALT) ? "#" : "",
8152                       prec, type);
8153     }
8154     if (sign[0])
8155         return longtounicode(buf, buflen, fmt, -x);
8156     else
8157         return longtounicode(buf, buflen, fmt, x);
8158 }
8159
8160 static int
8161 formatchar(Py_UNICODE *buf,
8162            size_t buflen,
8163            PyObject *v)
8164 {
8165     PyObject *unistr;
8166     char *str;
8167     /* presume that the buffer is at least 2 characters long */
8168     if (PyUnicode_Check(v)) {
8169         if (PyUnicode_GET_SIZE(v) != 1)
8170             goto onError;
8171         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8172     }
8173
8174     else if (PyString_Check(v)) {
8175         if (PyString_GET_SIZE(v) != 1)
8176             goto onError;
8177         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8178            with a UnicodeDecodeError if 'char' is not decodable with the
8179            default encoding (usually ASCII, but it might be something else) */
8180         str = PyString_AS_STRING(v);
8181         if ((unsigned char)str[0] > 0x7F) {
8182             /* the char is not ASCII; try to decode the string using the
8183                default encoding and return -1 to let the UnicodeDecodeError
8184                be raised if the string can't be decoded */
8185             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8186             if (unistr == NULL)
8187                 return -1;
8188             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8189             Py_DECREF(unistr);
8190         }
8191         else
8192             buf[0] = (Py_UNICODE)str[0];
8193     }
8194
8195     else {
8196         /* Integer input truncated to a character */
8197         long x;
8198         x = PyInt_AsLong(v);
8199         if (x == -1 && PyErr_Occurred())
8200             goto onError;
8201 #ifdef Py_UNICODE_WIDE
8202         if (x < 0 || x > 0x10ffff) {
8203             PyErr_SetString(PyExc_OverflowError,
8204                             "%c arg not in range(0x110000) "
8205                             "(wide Python build)");
8206             return -1;
8207         }
8208 #else
8209         if (x < 0 || x > 0xffff) {
8210             PyErr_SetString(PyExc_OverflowError,
8211                             "%c arg not in range(0x10000) "
8212                             "(narrow Python build)");
8213             return -1;
8214         }
8215 #endif
8216         buf[0] = (Py_UNICODE) x;
8217     }
8218     buf[1] = '\0';
8219     return 1;
8220
8221   onError:
8222     PyErr_SetString(PyExc_TypeError,
8223                     "%c requires int or char");
8224     return -1;
8225 }
8226
8227 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8228
8229    FORMATBUFLEN is the length of the buffer in which the ints &
8230    chars are formatted. XXX This is a magic number. Each formatting
8231    routine does bounds checking to ensure no overflow, but a better
8232    solution may be to malloc a buffer of appropriate size for each
8233    format. For now, the current solution is sufficient.
8234 */
8235 #define FORMATBUFLEN (size_t)120
8236
8237 PyObject *PyUnicode_Format(PyObject *format,
8238                            PyObject *args)
8239 {
8240     Py_UNICODE *fmt, *res;
8241     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8242     int args_owned = 0;
8243     PyUnicodeObject *result = NULL;
8244     PyObject *dict = NULL;
8245     PyObject *uformat;
8246
8247     if (format == NULL || args == NULL) {
8248         PyErr_BadInternalCall();
8249         return NULL;
8250     }
8251     uformat = PyUnicode_FromObject(format);
8252     if (uformat == NULL)
8253         return NULL;
8254     fmt = PyUnicode_AS_UNICODE(uformat);
8255     fmtcnt = PyUnicode_GET_SIZE(uformat);
8256
8257     reslen = rescnt = fmtcnt + 100;
8258     result = _PyUnicode_New(reslen);
8259     if (result == NULL)
8260         goto onError;
8261     res = PyUnicode_AS_UNICODE(result);
8262
8263     if (PyTuple_Check(args)) {
8264         arglen = PyTuple_Size(args);
8265         argidx = 0;
8266     }
8267     else {
8268         arglen = -1;
8269         argidx = -2;
8270     }
8271     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8272         !PyObject_TypeCheck(args, &PyBaseString_Type))
8273         dict = args;
8274
8275     while (--fmtcnt >= 0) {
8276         if (*fmt != '%') {
8277             if (--rescnt < 0) {
8278                 rescnt = fmtcnt + 100;
8279                 reslen += rescnt;
8280                 if (_PyUnicode_Resize(&result, reslen) < 0)
8281                     goto onError;
8282                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8283                 --rescnt;
8284             }
8285             *res++ = *fmt++;
8286         }
8287         else {
8288             /* Got a format specifier */
8289             int flags = 0;
8290             Py_ssize_t width = -1;
8291             int prec = -1;
8292             Py_UNICODE c = '\0';
8293             Py_UNICODE fill;
8294             int isnumok;
8295             PyObject *v = NULL;
8296             PyObject *temp = NULL;
8297             Py_UNICODE *pbuf;
8298             Py_UNICODE sign;
8299             Py_ssize_t len;
8300             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8301
8302             fmt++;
8303             if (*fmt == '(') {
8304                 Py_UNICODE *keystart;
8305                 Py_ssize_t keylen;
8306                 PyObject *key;
8307                 int pcount = 1;
8308
8309                 if (dict == NULL) {
8310                     PyErr_SetString(PyExc_TypeError,
8311                                     "format requires a mapping");
8312                     goto onError;
8313                 }
8314                 ++fmt;
8315                 --fmtcnt;
8316                 keystart = fmt;
8317                 /* Skip over balanced parentheses */
8318                 while (pcount > 0 && --fmtcnt >= 0) {
8319                     if (*fmt == ')')
8320                         --pcount;
8321                     else if (*fmt == '(')
8322                         ++pcount;
8323                     fmt++;
8324                 }
8325                 keylen = fmt - keystart - 1;
8326                 if (fmtcnt < 0 || pcount > 0) {
8327                     PyErr_SetString(PyExc_ValueError,
8328                                     "incomplete format key");
8329                     goto onError;
8330                 }
8331 #if 0
8332                 /* keys are converted to strings using UTF-8 and
8333                    then looked up since Python uses strings to hold
8334                    variables names etc. in its namespaces and we
8335                    wouldn't want to break common idioms. */
8336                 key = PyUnicode_EncodeUTF8(keystart,
8337                                            keylen,
8338                                            NULL);
8339 #else
8340                 key = PyUnicode_FromUnicode(keystart, keylen);
8341 #endif
8342                 if (key == NULL)
8343                     goto onError;
8344                 if (args_owned) {
8345                     Py_DECREF(args);
8346                     args_owned = 0;
8347                 }
8348                 args = PyObject_GetItem(dict, key);
8349                 Py_DECREF(key);
8350                 if (args == NULL) {
8351                     goto onError;
8352                 }
8353                 args_owned = 1;
8354                 arglen = -1;
8355                 argidx = -2;
8356             }
8357             while (--fmtcnt >= 0) {
8358                 switch (c = *fmt++) {
8359                 case '-': flags |= F_LJUST; continue;
8360                 case '+': flags |= F_SIGN; continue;
8361                 case ' ': flags |= F_BLANK; continue;
8362                 case '#': flags |= F_ALT; continue;
8363                 case '0': flags |= F_ZERO; continue;
8364                 }
8365                 break;
8366             }
8367             if (c == '*') {
8368                 v = getnextarg(args, arglen, &argidx);
8369                 if (v == NULL)
8370                     goto onError;
8371                 if (!PyInt_Check(v)) {
8372                     PyErr_SetString(PyExc_TypeError,
8373                                     "* wants int");
8374                     goto onError;
8375                 }
8376                 width = PyInt_AsLong(v);
8377                 if (width < 0) {
8378                     flags |= F_LJUST;
8379                     width = -width;
8380                 }
8381                 if (--fmtcnt >= 0)
8382                     c = *fmt++;
8383             }
8384             else if (c >= '0' && c <= '9') {
8385                 width = c - '0';
8386                 while (--fmtcnt >= 0) {
8387                     c = *fmt++;
8388                     if (c < '0' || c > '9')
8389                         break;
8390                     if ((width*10) / 10 != width) {
8391                         PyErr_SetString(PyExc_ValueError,
8392                                         "width too big");
8393                         goto onError;
8394                     }
8395                     width = width*10 + (c - '0');
8396                 }
8397             }
8398             if (c == '.') {
8399                 prec = 0;
8400                 if (--fmtcnt >= 0)
8401                     c = *fmt++;
8402                 if (c == '*') {
8403                     v = getnextarg(args, arglen, &argidx);
8404                     if (v == NULL)
8405                         goto onError;
8406                     if (!PyInt_Check(v)) {
8407                         PyErr_SetString(PyExc_TypeError,
8408                                         "* wants int");
8409                         goto onError;
8410                     }
8411                     prec = PyInt_AsLong(v);
8412                     if (prec < 0)
8413                         prec = 0;
8414                     if (--fmtcnt >= 0)
8415                         c = *fmt++;
8416                 }
8417                 else if (c >= '0' && c <= '9') {
8418                     prec = c - '0';
8419                     while (--fmtcnt >= 0) {
8420                         c = Py_CHARMASK(*fmt++);
8421                         if (c < '0' || c > '9')
8422                             break;
8423                         if ((prec*10) / 10 != prec) {
8424                             PyErr_SetString(PyExc_ValueError,
8425                                             "prec too big");
8426                             goto onError;
8427                         }
8428                         prec = prec*10 + (c - '0');
8429                     }
8430                 }
8431             } /* prec */
8432             if (fmtcnt >= 0) {
8433                 if (c == 'h' || c == 'l' || c == 'L') {
8434                     if (--fmtcnt >= 0)
8435                         c = *fmt++;
8436                 }
8437             }
8438             if (fmtcnt < 0) {
8439                 PyErr_SetString(PyExc_ValueError,
8440                                 "incomplete format");
8441                 goto onError;
8442             }
8443             if (c != '%') {
8444                 v = getnextarg(args, arglen, &argidx);
8445                 if (v == NULL)
8446                     goto onError;
8447             }
8448             sign = 0;
8449             fill = ' ';
8450             switch (c) {
8451
8452             case '%':
8453                 pbuf = formatbuf;
8454                 /* presume that buffer length is at least 1 */
8455                 pbuf[0] = '%';
8456                 len = 1;
8457                 break;
8458
8459             case 's':
8460             case 'r':
8461                 if (PyUnicode_CheckExact(v) && c == 's') {
8462                     temp = v;
8463                     Py_INCREF(temp);
8464                 }
8465                 else {
8466                     PyObject *unicode;
8467                     if (c == 's')
8468                         temp = PyObject_Unicode(v);
8469                     else
8470                         temp = PyObject_Repr(v);
8471                     if (temp == NULL)
8472                         goto onError;
8473                     if (PyUnicode_Check(temp))
8474                         /* nothing to do */;
8475                     else if (PyString_Check(temp)) {
8476                         /* convert to string to Unicode */
8477                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8478                                                    PyString_GET_SIZE(temp),
8479                                                    NULL,
8480                                                    "strict");
8481                         Py_DECREF(temp);
8482                         temp = unicode;
8483                         if (temp == NULL)
8484                             goto onError;
8485                     }
8486                     else {
8487                         Py_DECREF(temp);
8488                         PyErr_SetString(PyExc_TypeError,
8489                                         "%s argument has non-string str()");
8490                         goto onError;
8491                     }
8492                 }
8493                 pbuf = PyUnicode_AS_UNICODE(temp);
8494                 len = PyUnicode_GET_SIZE(temp);
8495                 if (prec >= 0 && len > prec)
8496                     len = prec;
8497                 break;
8498
8499             case 'i':
8500             case 'd':
8501             case 'u':
8502             case 'o':
8503             case 'x':
8504             case 'X':
8505                 if (c == 'i')
8506                     c = 'd';
8507                 isnumok = 0;
8508                 if (PyNumber_Check(v)) {
8509                     PyObject *iobj=NULL;
8510
8511                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8512                         iobj = v;
8513                         Py_INCREF(iobj);
8514                     }
8515                     else {
8516                         iobj = PyNumber_Int(v);
8517                         if (iobj==NULL) iobj = PyNumber_Long(v);
8518                     }
8519                     if (iobj!=NULL) {
8520                         if (PyInt_Check(iobj)) {
8521                             isnumok = 1;
8522                             pbuf = formatbuf;
8523                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8524                                             flags, prec, c, iobj);
8525                             Py_DECREF(iobj);
8526                             if (len < 0)
8527                                 goto onError;
8528                             sign = 1;
8529                         }
8530                         else if (PyLong_Check(iobj)) {
8531                             isnumok = 1;
8532                             temp = formatlong(iobj, flags, prec, c);
8533                             Py_DECREF(iobj);
8534                             if (!temp)
8535                                 goto onError;
8536                             pbuf = PyUnicode_AS_UNICODE(temp);
8537                             len = PyUnicode_GET_SIZE(temp);
8538                             sign = 1;
8539                         }
8540                         else {
8541                             Py_DECREF(iobj);
8542                         }
8543                     }
8544                 }
8545                 if (!isnumok) {
8546                     PyErr_Format(PyExc_TypeError,
8547                                  "%%%c format: a number is required, "
8548                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8549                     goto onError;
8550                 }
8551                 if (flags & F_ZERO)
8552                     fill = '0';
8553                 break;
8554
8555             case 'e':
8556             case 'E':
8557             case 'f':
8558             case 'F':
8559             case 'g':
8560             case 'G':
8561                 temp = formatfloat(v, flags, prec, c);
8562                 if (temp == NULL)
8563                     goto onError;
8564                 pbuf = PyUnicode_AS_UNICODE(temp);
8565                 len = PyUnicode_GET_SIZE(temp);
8566                 sign = 1;
8567                 if (flags & F_ZERO)
8568                     fill = '0';
8569                 break;
8570
8571             case 'c':
8572                 pbuf = formatbuf;
8573                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8574                 if (len < 0)
8575                     goto onError;
8576                 break;
8577
8578             default:
8579                 PyErr_Format(PyExc_ValueError,
8580                              "unsupported format character '%c' (0x%x) "
8581                              "at index %zd",
8582                              (31<=c && c<=126) ? (char)c : '?',
8583                              (int)c,
8584                              (Py_ssize_t)(fmt - 1 -
8585                                           PyUnicode_AS_UNICODE(uformat)));
8586                 goto onError;
8587             }
8588             if (sign) {
8589                 if (*pbuf == '-' || *pbuf == '+') {
8590                     sign = *pbuf++;
8591                     len--;
8592                 }
8593                 else if (flags & F_SIGN)
8594                     sign = '+';
8595                 else if (flags & F_BLANK)
8596                     sign = ' ';
8597                 else
8598                     sign = 0;
8599             }
8600             if (width < len)
8601                 width = len;
8602             if (rescnt - (sign != 0) < width) {
8603                 reslen -= rescnt;
8604                 rescnt = width + fmtcnt + 100;
8605                 reslen += rescnt;
8606                 if (reslen < 0) {
8607                     Py_XDECREF(temp);
8608                     PyErr_NoMemory();
8609                     goto onError;
8610                 }
8611                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8612                     Py_XDECREF(temp);
8613                     goto onError;
8614                 }
8615                 res = PyUnicode_AS_UNICODE(result)
8616                     + reslen - rescnt;
8617             }
8618             if (sign) {
8619                 if (fill != ' ')
8620                     *res++ = sign;
8621                 rescnt--;
8622                 if (width > len)
8623                     width--;
8624             }
8625             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8626                 assert(pbuf[0] == '0');
8627                 assert(pbuf[1] == c);
8628                 if (fill != ' ') {
8629                     *res++ = *pbuf++;
8630                     *res++ = *pbuf++;
8631                 }
8632                 rescnt -= 2;
8633                 width -= 2;
8634                 if (width < 0)
8635                     width = 0;
8636                 len -= 2;
8637             }
8638             if (width > len && !(flags & F_LJUST)) {
8639                 do {
8640                     --rescnt;
8641                     *res++ = fill;
8642                 } while (--width > len);
8643             }
8644             if (fill == ' ') {
8645                 if (sign)
8646                     *res++ = sign;
8647                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8648                     assert(pbuf[0] == '0');
8649                     assert(pbuf[1] == c);
8650                     *res++ = *pbuf++;
8651                     *res++ = *pbuf++;
8652                 }
8653             }
8654             Py_UNICODE_COPY(res, pbuf, len);
8655             res += len;
8656             rescnt -= len;
8657             while (--width >= len) {
8658                 --rescnt;
8659                 *res++ = ' ';
8660             }
8661             if (dict && (argidx < arglen) && c != '%') {
8662                 PyErr_SetString(PyExc_TypeError,
8663                                 "not all arguments converted during string formatting");
8664                 Py_XDECREF(temp);
8665                 goto onError;
8666             }
8667             Py_XDECREF(temp);
8668         } /* '%' */
8669     } /* until end */
8670     if (argidx < arglen && !dict) {
8671         PyErr_SetString(PyExc_TypeError,
8672                         "not all arguments converted during string formatting");
8673         goto onError;
8674     }
8675
8676     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8677         goto onError;
8678     if (args_owned) {
8679         Py_DECREF(args);
8680     }
8681     Py_DECREF(uformat);
8682     return (PyObject *)result;
8683
8684   onError:
8685     Py_XDECREF(result);
8686     Py_DECREF(uformat);
8687     if (args_owned) {
8688         Py_DECREF(args);
8689     }
8690     return NULL;
8691 }
8692
/* Buffer-interface slot table for unicode objects; installed as
   tp_as_buffer in PyUnicode_Type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* bf_getreadbuffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* bf_getwritebuffer */
    (segcountproc) unicode_buffer_getsegcount,      /* bf_getsegcount */
    (charbufferproc) unicode_buffer_getcharbuf,     /* bf_getcharbuffer */
};
8699
8700 static PyObject *
8701 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8702
8703 static PyObject *
8704 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8705 {
8706     PyObject *x = NULL;
8707     static char *kwlist[] = {"string", "encoding", "errors", 0};
8708     char *encoding = NULL;
8709     char *errors = NULL;
8710
8711     if (type != &PyUnicode_Type)
8712         return unicode_subtype_new(type, args, kwds);
8713     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8714                                      kwlist, &x, &encoding, &errors))
8715         return NULL;
8716     if (x == NULL)
8717         return (PyObject *)_PyUnicode_New(0);
8718     if (encoding == NULL && errors == NULL)
8719         return PyObject_Unicode(x);
8720     else
8721         return PyUnicode_FromEncodedObject(x, encoding, errors);
8722 }
8723
8724 static PyObject *
8725 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8726 {
8727     PyUnicodeObject *tmp, *pnew;
8728     Py_ssize_t n;
8729
8730     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8731     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8732     if (tmp == NULL)
8733         return NULL;
8734     assert(PyUnicode_Check(tmp));
8735     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8736     if (pnew == NULL) {
8737         Py_DECREF(tmp);
8738         return NULL;
8739     }
8740     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8741     if (pnew->str == NULL) {
8742         _Py_ForgetReference((PyObject *)pnew);
8743         PyObject_Del(pnew);
8744         Py_DECREF(tmp);
8745         return PyErr_NoMemory();
8746     }
8747     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8748     pnew->length = n;
8749     pnew->hash = tmp->hash;
8750     Py_DECREF(tmp);
8751     return (PyObject *)pnew;
8752 }
8753
/* Docstring of the unicode type; referenced as tp_doc in
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8760
/* Type object for unicode.  Slots are filled positionally, so the
   order of the entries below must follow the PyTypeObject layout; a
   zero entry leaves the slot unset in this initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8804
8805 /* Initialize the Unicode implementation */
8806
8807 void _PyUnicode_Init(void)
8808 {
8809     int i;
8810
8811     /* XXX - move this array to unicodectype.c ? */
8812     Py_UNICODE linebreak[] = {
8813         0x000A, /* LINE FEED */
8814         0x000D, /* CARRIAGE RETURN */
8815         0x001C, /* FILE SEPARATOR */
8816         0x001D, /* GROUP SEPARATOR */
8817         0x001E, /* RECORD SEPARATOR */
8818         0x0085, /* NEXT LINE */
8819         0x2028, /* LINE SEPARATOR */
8820         0x2029, /* PARAGRAPH SEPARATOR */
8821     };
8822
8823     /* Init the implementation */
8824     free_list = NULL;
8825     numfree = 0;
8826     unicode_empty = _PyUnicode_New(0);
8827     if (!unicode_empty)
8828         return;
8829
8830     strcpy(unicode_default_encoding, "ascii");
8831     for (i = 0; i < 256; i++)
8832         unicode_latin1[i] = NULL;
8833     if (PyType_Ready(&PyUnicode_Type) < 0)
8834         Py_FatalError("Can't initialize 'unicode'");
8835
8836     /* initialize the linebreak bloom filter */
8837     bloom_linebreak = make_bloom_mask(
8838         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8839         );
8840
8841     PyType_Ready(&EncodingMapType);
8842 }
8843
8844 /* Finalize the Unicode implementation */
8845
8846 int
8847 PyUnicode_ClearFreeList(void)
8848 {
8849     int freelist_size = numfree;
8850     PyUnicodeObject *u;
8851
8852     for (u = free_list; u != NULL;) {
8853         PyUnicodeObject *v = u;
8854         u = *(PyUnicodeObject **)u;
8855         if (v->str)
8856             PyObject_DEL(v->str);
8857         Py_XDECREF(v->defenc);
8858         PyObject_Del(v);
8859         numfree--;
8860     }
8861     free_list = NULL;
8862     assert(numfree == 0);
8863     return freelist_size;
8864 }
8865
8866 void
8867 _PyUnicode_Fini(void)
8868 {
8869     int i;
8870
8871     Py_XDECREF(unicode_empty);
8872     unicode_empty = NULL;
8873
8874     for (i = 0; i < 256; i++) {
8875         if (unicode_latin1[i]) {
8876             Py_DECREF(unicode_latin1[i]);
8877             unicode_latin1[i] = NULL;
8878         }
8879     }
8880     (void)PyUnicode_ClearFreeList();
8881 }
8882
8883 #ifdef __cplusplus
8884 }
8885 #endif