Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
/* Limit for the Unicode object free list: the maximum number of
   deallocated Unicode objects that are kept around for reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   The limit is compared against the length of the object, so it is
   expressed in Py_UNICODE units rather than in bytes.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off, because
   every buffer is then released when its object is deallocated.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN and BYTEORDER_IS_LITTLE_ENDIAN is defined,
   depending on whether WORDS_BIGENDIAN is set. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
/* Free list for Unicode objects.  Deallocated objects are chained
   together through their first pointer-sized slot (see
   unicode_dealloc() and _PyUnicode_New()); numfree is the number of
   objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance.  It is
   NULL until it has been created, so users must check for that. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point; an entry stays
   NULL until a constructor creates the corresponding string. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * HORIZONTAL TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * VERTICAL TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000D, * CARRIAGE RETURN */
 151     0, 0, 1, 0, 0, 1, 0, 0,
 152     0, 0, 0, 0, 0, 0, 0, 0,
 153 /*         0x001C, * FILE SEPARATOR */
 154 /*         0x001D, * GROUP SEPARATOR */
 155 /*         0x001E, * RECORD SEPARATOR */
 156     0, 0, 0, 0, 1, 1, 1, 0,
 157     0, 0, 0, 0, 0, 0, 0, 0,
 158     0, 0, 0, 0, 0, 0, 0, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163     0, 0, 0, 0, 0, 0, 0, 0,
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177     return 0x10FFFF;
 178 #else
 179     /* This is actually an illegal character, so it should
 180        not be passed to unichr. */
 181     return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character, (ch & (BLOOM_WIDTH - 1)), as the
   bit index.  That is 5, 6 or 7 bits depending on BLOOM_WIDTH. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of bits used in a mask: the largest of
   128, 64 and 32 that does not exceed LONG_BIT.  Platforms whose long
   is narrower than 32 bits are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding one bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask consulted by BLOOM_LINEBREAK() for characters outside ASCII. */
static BLOOM_MASK bloom_linebreak;
 206
 207 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 208 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 209
/* Nonzero if ch is a line break.  Code points below 128 are looked up
   directly in ascii_linebreak[].  For all others the bloom_linebreak
   bit test is tried first, and Py_UNICODE_ISLINEBREAK() is consulted
   only when that test does not already rule the character out.
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 213
 214 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 215 {
 216     /* calculate simple bloom-style bitmask for a given unicode string */
 217
 218     BLOOM_MASK mask;
 219     Py_ssize_t i;
 220
 221     mask = 0;
 222     for (i = 0; i < len; i++)
 223         BLOOM_ADD(mask, ptr[i]);
 224
 225     return mask;
 226 }
 227
 228 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 229 {
 230     Py_ssize_t i;
 231
 232     for (i = 0; i < setlen; i++)
 233         if (set[i] == chr)
 234             return 1;
 235
 236     return 0;
 237 }
 238
 239 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 240     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 241
 242 /* --- Unicode Object ----------------------------------------------------- */
 243
 244 static
 245 int unicode_resize(register PyUnicodeObject *unicode,
 246                    Py_ssize_t length)
 247 {
 248     void *oldstr;
 249
 250     /* Shortcut if there's nothing much to do. */
 251     if (unicode->length == length)
 252         goto reset;
 253
 254     /* Resizing shared object (unicode_empty or single character
 255        objects) in-place is not allowed. Use PyUnicode_Resize()
 256        instead ! */
 257
 258     if (unicode == unicode_empty ||
 259         (unicode->length == 1 &&
 260          unicode->str[0] < 256U &&
 261          unicode_latin1[unicode->str[0]] == unicode)) {
 262         PyErr_SetString(PyExc_SystemError,
 263                         "can't resize shared unicode objects");
 264         return -1;
 265     }
 266
 267     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 268        The overallocation is also used by fastsearch, which assumes that it's
 269        safe to look at str[length] (without making any assumptions about what
 270        it contains). */
 271
 272     oldstr = unicode->str;
 273     unicode->str = PyObject_REALLOC(unicode->str,
 274                                     sizeof(Py_UNICODE) * (length + 1));
 275     if (!unicode->str) {
 276         unicode->str = (Py_UNICODE *)oldstr;
 277         PyErr_NoMemory();
 278         return -1;
 279     }
 280     unicode->str[length] = 0;
 281     unicode->length = length;
 282
 283   reset:
 284     /* Reset the object caches */
 285     if (unicode->defenc) {
 286         Py_DECREF(unicode->defenc);
 287         unicode->defenc = NULL;
 288     }
 289     unicode->hash = -1;
 290
 291     return 0;
 292 }
 293
 294 /* We allocate one more byte to make sure the string is
 295    Ux0000 terminated -- XXX is this needed ?
 296
 297    XXX This allocator could further be enhanced by assuring that the
 298    free list never reduces its size below 1.
 299
 300 */
 301
 302 static
 303 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 304 {
 305     register PyUnicodeObject *unicode;
 306
 307     /* Optimization for empty strings */
 308     if (length == 0 && unicode_empty != NULL) {
 309         Py_INCREF(unicode_empty);
 310         return unicode_empty;
 311     }
 312
 313     /* Ensure we won't overflow the size. */
 314     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 315         return (PyUnicodeObject *)PyErr_NoMemory();
 316     }
 317
 318     /* Unicode freelist & memory allocation */
 319     if (free_list) {
 320         unicode = free_list;
 321         free_list = *(PyUnicodeObject **)unicode;
 322         numfree--;
 323         if (unicode->str) {
 324             /* Keep-Alive optimization: we only upsize the buffer,
 325                never downsize it. */
 326             if ((unicode->length < length) &&
 327                 unicode_resize(unicode, length) < 0) {
 328                 PyObject_DEL(unicode->str);
 329                 unicode->str = NULL;
 330             }
 331         }
 332         else {
 333             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 334             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 335         }
 336         PyObject_INIT(unicode, &PyUnicode_Type);
 337     }
 338     else {
 339         size_t new_size;
 340         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 341         if (unicode == NULL)
 342             return NULL;
 343         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 344         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 345     }
 346
 347     if (!unicode->str) {
 348         PyErr_NoMemory();
 349         goto onError;
 350     }
 351     /* Initialize the first element to guard against cases where
 352      * the caller fails before initializing str -- unicode_resize()
 353      * reads str[0], and the Keep-Alive optimization can keep memory
 354      * allocated for str alive across a call to unicode_dealloc(unicode).
 355      * We don't want unicode_resize to read uninitialized memory in
 356      * that case.
 357      */
 358     unicode->str[0] = 0;
 359     unicode->str[length] = 0;
 360     unicode->length = length;
 361     unicode->hash = -1;
 362     unicode->defenc = NULL;
 363     return unicode;
 364
 365   onError:
 366     /* XXX UNREF/NEWREF interface should be more symmetrical */
 367     _Py_DEC_REFTOTAL;
 368     _Py_ForgetReference((PyObject *)unicode);
 369     PyObject_Del(unicode);
 370     return NULL;
 371 }
 372
 373 static
 374 void unicode_dealloc(register PyUnicodeObject *unicode)
 375 {
 376     if (PyUnicode_CheckExact(unicode) &&
 377         numfree < PyUnicode_MAXFREELIST) {
 378         /* Keep-Alive optimization */
 379         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 380             PyObject_DEL(unicode->str);
 381             unicode->str = NULL;
 382             unicode->length = 0;
 383         }
 384         if (unicode->defenc) {
 385             Py_DECREF(unicode->defenc);
 386             unicode->defenc = NULL;
 387         }
 388         /* Add to free list */
 389         *(PyUnicodeObject **)unicode = free_list;
 390         free_list = unicode;
 391         numfree++;
 392     }
 393     else {
 394         PyObject_DEL(unicode->str);
 395         Py_XDECREF(unicode->defenc);
 396         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 397     }
 398 }
 399
 400 static
 401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 402 {
 403     register PyUnicodeObject *v;
 404
 405     /* Argument checks */
 406     if (unicode == NULL) {
 407         PyErr_BadInternalCall();
 408         return -1;
 409     }
 410     v = *unicode;
 411     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 412         PyErr_BadInternalCall();
 413         return -1;
 414     }
 415
 416     /* Resizing unicode_empty and single character objects is not
 417        possible since these are being shared. We simply return a fresh
 418        copy with the same Unicode content. */
 419     if (v->length != length &&
 420         (v == unicode_empty || v->length == 1)) {
 421         PyUnicodeObject *w = _PyUnicode_New(length);
 422         if (w == NULL)
 423             return -1;
 424         Py_UNICODE_COPY(w->str, v->str,
 425                         length < v->length ? length : v->length);
 426         Py_DECREF(*unicode);
 427         *unicode = w;
 428         return 0;
 429     }
 430
 431     /* Note that we don't have to modify *unicode for unshared Unicode
 432        objects, since we can modify them in-place. */
 433     return unicode_resize(v, length);
 434 }
 435
 436 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 437 {
 438     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 439 }
 440
 441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 442                                 Py_ssize_t size)
 443 {
 444     PyUnicodeObject *unicode;
 445
 446     /* If the Unicode data is known at construction time, we can apply
 447        some optimizations which share commonly used objects. */
 448     if (u != NULL) {
 449
 450         /* Optimization for empty strings */
 451         if (size == 0 && unicode_empty != NULL) {
 452             Py_INCREF(unicode_empty);
 453             return (PyObject *)unicode_empty;
 454         }
 455
 456         /* Single character Unicode objects in the Latin-1 range are
 457            shared when using this constructor */
 458         if (size == 1 && *u < 256) {
 459             unicode = unicode_latin1[*u];
 460             if (!unicode) {
 461                 unicode = _PyUnicode_New(1);
 462                 if (!unicode)
 463                     return NULL;
 464                 unicode->str[0] = *u;
 465                 unicode_latin1[*u] = unicode;
 466             }
 467             Py_INCREF(unicode);
 468             return (PyObject *)unicode;
 469         }
 470     }
 471
 472     unicode = _PyUnicode_New(size);
 473     if (!unicode)
 474         return NULL;
 475
 476     /* Copy the Unicode data into the new object */
 477     if (u != NULL)
 478         Py_UNICODE_COPY(unicode->str, u, size);
 479
 480     return (PyObject *)unicode;
 481 }
 482
 483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 484 {
 485     PyUnicodeObject *unicode;
 486
 487     if (size < 0) {
 488         PyErr_SetString(PyExc_SystemError,
 489                         "Negative size passed to PyUnicode_FromStringAndSize");
 490         return NULL;
 491     }
 492
 493     /* If the Unicode data is known at construction time, we can apply
 494        some optimizations which share commonly used objects.
 495        Also, this means the input must be UTF-8, so fall back to the
 496        UTF-8 decoder at the end. */
 497     if (u != NULL) {
 498
 499         /* Optimization for empty strings */
 500         if (size == 0 && unicode_empty != NULL) {
 501             Py_INCREF(unicode_empty);
 502             return (PyObject *)unicode_empty;
 503         }
 504
 505         /* Single characters are shared when using this constructor.
 506            Restrict to ASCII, since the input must be UTF-8. */
 507         if (size == 1 && Py_CHARMASK(*u) < 128) {
 508             unicode = unicode_latin1[Py_CHARMASK(*u)];
 509             if (!unicode) {
 510                 unicode = _PyUnicode_New(1);
 511                 if (!unicode)
 512                     return NULL;
 513                 unicode->str[0] = Py_CHARMASK(*u);
 514                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 515             }
 516             Py_INCREF(unicode);
 517             return (PyObject *)unicode;
 518         }
 519
 520         return PyUnicode_DecodeUTF8(u, size, NULL);
 521     }
 522
 523     unicode = _PyUnicode_New(size);
 524     if (!unicode)
 525         return NULL;
 526
 527     return (PyObject *)unicode;
 528 }
 529
 530 PyObject *PyUnicode_FromString(const char *u)
 531 {
 532     size_t size = strlen(u);
 533     if (size > PY_SSIZE_T_MAX) {
 534         PyErr_SetString(PyExc_OverflowError, "input too long");
 535         return NULL;
 536     }
 537
 538     return PyUnicode_FromStringAndSize(u, size);
 539 }
 540
 541 #ifdef HAVE_WCHAR_H
 542
/* CONVERT_WCHAR_TO_SURROGATES is defined when Py_UNICODE is 2 bytes
   wide while wchar_t is known to be 4 bytes wide.  It selects the
   PyUnicode_FromWideChar() implementation that splits large wchar_t
   values into surrogate pairs. */
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
# define CONVERT_WCHAR_TO_SURROGATES
#endif
 546
 547 #ifdef CONVERT_WCHAR_TO_SURROGATES
 548
 549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 550    to convert from UTF32 to UTF16. */
 551
 552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 553                                  Py_ssize_t size)
 554 {
 555     PyUnicodeObject *unicode;
 556     register Py_ssize_t i;
 557     Py_ssize_t alloc;
 558     const wchar_t *orig_w;
 559
 560     if (w == NULL) {
 561         PyErr_BadInternalCall();
 562         return NULL;
 563     }
 564
 565     alloc = size;
 566     orig_w = w;
 567     for (i = size; i > 0; i--) {
 568         if (*w > 0xFFFF)
 569             alloc++;
 570         w++;
 571     }
 572     w = orig_w;
 573     unicode = _PyUnicode_New(alloc);
 574     if (!unicode)
 575         return NULL;
 576
 577     /* Copy the wchar_t data into the new object */
 578     {
 579         register Py_UNICODE *u;
 580         u = PyUnicode_AS_UNICODE(unicode);
 581         for (i = size; i > 0; i--) {
 582             if (*w > 0xFFFF) {
 583                 wchar_t ordinal = *w++;
 584                 ordinal -= 0x10000;
 585                 *u++ = 0xD800 | (ordinal >> 10);
 586                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 587             }
 588             else
 589                 *u++ = *w++;
 590         }
 591     }
 592     return (PyObject *)unicode;
 593 }
 594
 595 #else
 596
 597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 598                                  Py_ssize_t size)
 599 {
 600     PyUnicodeObject *unicode;
 601
 602     if (w == NULL) {
 603         PyErr_BadInternalCall();
 604         return NULL;
 605     }
 606
 607     unicode = _PyUnicode_New(size);
 608     if (!unicode)
 609         return NULL;
 610
 611     /* Copy the wchar_t data into the new object */
 612 #ifdef HAVE_USABLE_WCHAR_T
 613     memcpy(unicode->str, w, size * sizeof(wchar_t));
 614 #else
 615     {
 616         register Py_UNICODE *u;
 617         register Py_ssize_t i;
 618         u = PyUnicode_AS_UNICODE(unicode);
 619         for (i = size; i > 0; i--)
 620             *u++ = *w++;
 621     }
 622 #endif
 623
 624     return (PyObject *)unicode;
 625 }
 626
 627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 628
 629 #undef CONVERT_WCHAR_TO_SURROGATES
 630
 631 static void
 632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 633 {
 634     *fmt++ = '%';
 635     if (width) {
 636         if (zeropad)
 637             *fmt++ = '0';
 638         fmt += sprintf(fmt, "%d", width);
 639     }
 640     if (precision)
 641         fmt += sprintf(fmt, ".%d", precision);
 642     if (longflag)
 643         *fmt++ = 'l';
 644     else if (size_tflag) {
 645         char *f = PY_FORMAT_SIZE_T;
 646         while (*f)
 647             *fmt++ = *f++;
 648     }
 649     *fmt++ = c;
 650     *fmt = '\0';
 651 }
 652
 653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 654
 655 PyObject *
 656 PyUnicode_FromFormatV(const char *format, va_list vargs)
 657 {
 658     va_list count;
 659     Py_ssize_t callcount = 0;
 660     PyObject **callresults = NULL;
 661     PyObject **callresult = NULL;
 662     Py_ssize_t n = 0;
 663     int width = 0;
 664     int precision = 0;
 665     int zeropad;
 666     const char* f;
 667     Py_UNICODE *s;
 668     PyObject *string;
 669     /* used by sprintf */
 670     char buffer[21];
 671     /* use abuffer instead of buffer, if we need more space
 672      * (which can happen if there's a format specifier with width). */
 673     char *abuffer = NULL;
 674     char *realbuffer;
 675     Py_ssize_t abuffersize = 0;
 676     char fmt[60]; /* should be enough for %0width.precisionld */
 677     const char *copy;
 678
 679 #ifdef VA_LIST_IS_ARRAY
 680     Py_MEMCPY(count, vargs, sizeof(va_list));
 681 #else
 682 #ifdef  __va_copy
 683     __va_copy(count, vargs);
 684 #else
 685     count = vargs;
 686 #endif
 687 #endif
 688      /* step 1: count the number of %S/%R/%s format specifications
 689       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 690       * objects once during step 3 and put the result in an array) */
 691     for (f = format; *f; f++) {
 692          if (*f == '%') {
 693              if (*(f+1)=='%')
 694                  continue;
 695              if (*(f+1)=='S' || *(f+1)=='R')
 696                  ++callcount;
 697              while (isdigit((unsigned)*f))
 698                  width = (width*10) + *f++ - '0';
 699              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 700                  ;
 701              if (*f == 's')
 702                  ++callcount;
 703          }
 704     }
 705     /* step 2: allocate memory for the results of
 706      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 707     if (callcount) {
 708         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 709         if (!callresults) {
 710             PyErr_NoMemory();
 711             return NULL;
 712         }
 713         callresult = callresults;
 714     }
 715     /* step 3: figure out how large a buffer we need */
 716     for (f = format; *f; f++) {
 717         if (*f == '%') {
 718             const char* p = f;
 719             width = 0;
 720             while (isdigit((unsigned)*f))
 721                 width = (width*10) + *f++ - '0';
 722             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 723                 ;
 724
 725             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 726              * they don't affect the amount of space we reserve.
 727              */
 728             if ((*f == 'l' || *f == 'z') &&
 729                 (f[1] == 'd' || f[1] == 'u'))
 730                 ++f;
 731
 732             switch (*f) {
 733             case 'c':
 734                 (void)va_arg(count, int);
 735                 /* fall through... */
 736             case '%':
 737                 n++;
 738                 break;
 739             case 'd': case 'u': case 'i': case 'x':
 740                 (void) va_arg(count, int);
 741                 /* 20 bytes is enough to hold a 64-bit
 742                    integer.  Decimal takes the most space.
 743                    This isn't enough for octal.
 744                    If a width is specified we need more
 745                    (which we allocate later). */
 746                 if (width < 20)
 747                     width = 20;
 748                 n += width;
 749                 if (abuffersize < width)
 750                     abuffersize = width;
 751                 break;
 752             case 's':
 753             {
 754                 /* UTF-8 */
 755                 const char *s = va_arg(count, const char*);
 756                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 757                 if (!str)
 758                     goto fail;
 759                 n += PyUnicode_GET_SIZE(str);
 760                 /* Remember the str and switch to the next slot */
 761                 *callresult++ = str;
 762                 break;
 763             }
 764             case 'U':
 765             {
 766                 PyObject *obj = va_arg(count, PyObject *);
 767                 assert(obj && PyUnicode_Check(obj));
 768                 n += PyUnicode_GET_SIZE(obj);
 769                 break;
 770             }
 771             case 'V':
 772             {
 773                 PyObject *obj = va_arg(count, PyObject *);
 774                 const char *str = va_arg(count, const char *);
 775                 assert(obj || str);
 776                 assert(!obj || PyUnicode_Check(obj));
 777                 if (obj)
 778                     n += PyUnicode_GET_SIZE(obj);
 779                 else
 780                     n += strlen(str);
 781                 break;
 782             }
 783             case 'S':
 784             {
 785                 PyObject *obj = va_arg(count, PyObject *);
 786                 PyObject *str;
 787                 assert(obj);
 788                 str = PyObject_Str(obj);
 789                 if (!str)
 790                     goto fail;
 791                 n += PyUnicode_GET_SIZE(str);
 792                 /* Remember the str and switch to the next slot */
 793                 *callresult++ = str;
 794                 break;
 795             }
 796             case 'R':
 797             {
 798                 PyObject *obj = va_arg(count, PyObject *);
 799                 PyObject *repr;
 800                 assert(obj);
 801                 repr = PyObject_Repr(obj);
 802                 if (!repr)
 803                     goto fail;
 804                 n += PyUnicode_GET_SIZE(repr);
 805                 /* Remember the repr and switch to the next slot */
 806                 *callresult++ = repr;
 807                 break;
 808             }
 809             case 'p':
 810                 (void) va_arg(count, int);
 811                 /* maximum 64-bit pointer representation:
 812                  * 0xffffffffffffffff
 813                  * so 19 characters is enough.
 814                  * XXX I count 18 -- what's the extra for?
 815                  */
 816                 n += 19;
 817                 break;
 818             default:
 819                 /* if we stumble upon an unknown
 820                    formatting code, copy the rest of
 821                    the format string to the output
 822                    string. (we cannot just skip the
 823                    code, since there's no way to know
 824                    what's in the argument list) */
 825                 n += strlen(p);
 826                 goto expand;
 827             }
 828         } else
 829             n++;
 830     }
 831   expand:
 832     if (abuffersize > 20) {
 833         abuffer = PyObject_Malloc(abuffersize);
 834         if (!abuffer) {
 835             PyErr_NoMemory();
 836             goto fail;
 837         }
 838         realbuffer = abuffer;
 839     }
 840     else
 841         realbuffer = buffer;
 842     /* step 4: fill the buffer */
 843     /* Since we've analyzed how much space we need for the worst case,
 844        we don't have to resize the string.
 845        There can be no errors beyond this point. */
 846     string = PyUnicode_FromUnicode(NULL, n);
 847     if (!string)
 848         goto fail;
 849
 850     s = PyUnicode_AS_UNICODE(string);
 851     callresult = callresults;
 852
 853     for (f = format; *f; f++) {
 854         if (*f == '%') {
 855             const char* p = f++;
 856             int longflag = 0;
 857             int size_tflag = 0;
 858             zeropad = (*f == '0');
 859             /* parse the width.precision part */
 860             width = 0;
 861             while (isdigit((unsigned)*f))
 862                 width = (width*10) + *f++ - '0';
 863             precision = 0;
 864             if (*f == '.') {
 865                 f++;
 866                 while (isdigit((unsigned)*f))
 867                     precision = (precision*10) + *f++ - '0';
 868             }
 869             /* handle the long flag, but only for %ld and %lu.
 870                others can be added when necessary. */
 871             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 872                 longflag = 1;
 873                 ++f;
 874             }
 875             /* handle the size_t flag. */
 876             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 877                 size_tflag = 1;
 878                 ++f;
 879             }
 880
 881             switch (*f) {
 882             case 'c':
 883                 *s++ = va_arg(vargs, int);
 884                 break;
 885             case 'd':
 886                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 887                 if (longflag)
 888                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 889                 else if (size_tflag)
 890                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 891                 else
 892                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 893                 appendstring(realbuffer);
 894                 break;
 895             case 'u':
 896                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 897                 if (longflag)
 898                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 899                 else if (size_tflag)
 900                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 901                 else
 902                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 903                 appendstring(realbuffer);
 904                 break;
 905             case 'i':
 906                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 907                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 908                 appendstring(realbuffer);
 909                 break;
 910             case 'x':
 911                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 912                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 913                 appendstring(realbuffer);
 914                 break;
 915             case 's':
 916             {
 917                 /* unused, since we already have the result */
 918                 (void) va_arg(vargs, char *);
 919                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 920                                 PyUnicode_GET_SIZE(*callresult));
 921                 s += PyUnicode_GET_SIZE(*callresult);
 922                 /* We're done with the unicode()/repr() => forget it */
 923                 Py_DECREF(*callresult);
 924                 /* switch to next unicode()/repr() result */
 925                 ++callresult;
 926                 break;
 927             }
 928             case 'U':
 929             {
 930                 PyObject *obj = va_arg(vargs, PyObject *);
 931                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 932                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 933                 s += size;
 934                 break;
 935             }
 936             case 'V':
 937             {
 938                 PyObject *obj = va_arg(vargs, PyObject *);
 939                 const char *str = va_arg(vargs, const char *);
 940                 if (obj) {
 941                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 942                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 943                     s += size;
 944                 } else {
 945                     appendstring(str);
 946                 }
 947                 break;
 948             }
 949             case 'S':
 950             case 'R':
 951             {
 952                 Py_UNICODE *ucopy;
 953                 Py_ssize_t usize;
 954                 Py_ssize_t upos;
 955                 /* unused, since we already have the result */
 956                 (void) va_arg(vargs, PyObject *);
 957                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 958                 usize = PyUnicode_GET_SIZE(*callresult);
 959                 for (upos = 0; upos<usize;)
 960                     *s++ = ucopy[upos++];
 961                 /* We're done with the unicode()/repr() => forget it */
 962                 Py_DECREF(*callresult);
 963                 /* switch to next unicode()/repr() result */
 964                 ++callresult;
 965                 break;
 966             }
 967             case 'p':
 968                 sprintf(buffer, "%p", va_arg(vargs, void*));
 969                 /* %p is ill-defined:  ensure leading 0x. */
 970                 if (buffer[1] == 'X')
 971                     buffer[1] = 'x';
 972                 else if (buffer[1] != 'x') {
 973                     memmove(buffer+2, buffer, strlen(buffer)+1);
 974                     buffer[0] = '0';
 975                     buffer[1] = 'x';
 976                 }
 977                 appendstring(buffer);
 978                 break;
 979             case '%':
 980                 *s++ = '%';
 981                 break;
 982             default:
 983                 appendstring(p);
 984                 goto end;
 985             }
 986         } else
 987             *s++ = *f;
 988     }
 989
 990   end:
 991     if (callresults)
 992         PyObject_Free(callresults);
 993     if (abuffer)
 994         PyObject_Free(abuffer);
 995     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 996     return string;
 997   fail:
 998     if (callresults) {
 999         PyObject **callresult2 = callresults;
1000         while (callresult2 < callresult) {
1001             Py_DECREF(*callresult2);
1002             ++callresult2;
1003         }
1004         PyObject_Free(callresults);
1005     }
1006     if (abuffer)
1007         PyObject_Free(abuffer);
1008     return NULL;
1009 }
1010
1011 #undef appendstring
1012
1013 PyObject *
1014 PyUnicode_FromFormat(const char *format, ...)
1015 {
1016     PyObject* ret;
1017     va_list vargs;
1018
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020     va_start(vargs, format);
1021 #else
1022     va_start(vargs);
1023 #endif
1024     ret = PyUnicode_FromFormatV(format, vargs);
1025     va_end(vargs);
1026     return ret;
1027 }
1028
1029 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1030                                 wchar_t *w,
1031                                 Py_ssize_t size)
1032 {
1033     if (unicode == NULL) {
1034         PyErr_BadInternalCall();
1035         return -1;
1036     }
1037
1038     /* If possible, try to copy the 0-termination as well */
1039     if (size > PyUnicode_GET_SIZE(unicode))
1040         size = PyUnicode_GET_SIZE(unicode) + 1;
1041
1042 #ifdef HAVE_USABLE_WCHAR_T
1043     memcpy(w, unicode->str, size * sizeof(wchar_t));
1044 #else
1045     {
1046         register Py_UNICODE *u;
1047         register Py_ssize_t i;
1048         u = PyUnicode_AS_UNICODE(unicode);
1049         for (i = size; i > 0; i--)
1050             *w++ = *u++;
1051     }
1052 #endif
1053
1054     if (size > PyUnicode_GET_SIZE(unicode))
1055         return PyUnicode_GET_SIZE(unicode);
1056     else
1057         return size;
1058 }
1059
1060 #endif
1061
1062 PyObject *PyUnicode_FromOrdinal(int ordinal)
1063 {
1064     Py_UNICODE s[1];
1065
1066 #ifdef Py_UNICODE_WIDE
1067     if (ordinal < 0 || ordinal > 0x10ffff) {
1068         PyErr_SetString(PyExc_ValueError,
1069                         "unichr() arg not in range(0x110000) "
1070                         "(wide Python build)");
1071         return NULL;
1072     }
1073 #else
1074     if (ordinal < 0 || ordinal > 0xffff) {
1075         PyErr_SetString(PyExc_ValueError,
1076                         "unichr() arg not in range(0x10000) "
1077                         "(narrow Python build)");
1078         return NULL;
1079     }
1080 #endif
1081
1082     s[0] = (Py_UNICODE)ordinal;
1083     return PyUnicode_FromUnicode(s, 1);
1084 }
1085
1086 PyObject *PyUnicode_FromObject(register PyObject *obj)
1087 {
1088     /* XXX Perhaps we should make this API an alias of
1089        PyObject_Unicode() instead ?! */
1090     if (PyUnicode_CheckExact(obj)) {
1091         Py_INCREF(obj);
1092         return obj;
1093     }
1094     if (PyUnicode_Check(obj)) {
1095         /* For a Unicode subtype that's not a Unicode object,
1096            return a true Unicode object with the same data. */
1097         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098                                      PyUnicode_GET_SIZE(obj));
1099     }
1100     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101 }
1102
1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104                                       const char *encoding,
1105                                       const char *errors)
1106 {
1107     const char *s = NULL;
1108     Py_ssize_t len;
1109     PyObject *v;
1110
1111     if (obj == NULL) {
1112         PyErr_BadInternalCall();
1113         return NULL;
1114     }
1115
1116 #if 0
1117     /* For b/w compatibility we also accept Unicode objects provided
1118        that no encodings is given and then redirect to
1119        PyObject_Unicode() which then applies the additional logic for
1120        Unicode subclasses.
1121
1122        NOTE: This API should really only be used for object which
1123        represent *encoded* Unicode !
1124
1125     */
1126     if (PyUnicode_Check(obj)) {
1127         if (encoding) {
1128             PyErr_SetString(PyExc_TypeError,
1129                             "decoding Unicode is not supported");
1130             return NULL;
1131         }
1132         return PyObject_Unicode(obj);
1133     }
1134 #else
1135     if (PyUnicode_Check(obj)) {
1136         PyErr_SetString(PyExc_TypeError,
1137                         "decoding Unicode is not supported");
1138         return NULL;
1139     }
1140 #endif
1141
1142     /* Coerce object */
1143     if (PyString_Check(obj)) {
1144         s = PyString_AS_STRING(obj);
1145         len = PyString_GET_SIZE(obj);
1146     }
1147     else if (PyByteArray_Check(obj)) {
1148         /* Python 2.x specific */
1149         PyErr_Format(PyExc_TypeError,
1150                      "decoding bytearray is not supported");
1151         return NULL;
1152     }
1153     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1154         /* Overwrite the error message with something more useful in
1155            case of a TypeError. */
1156         if (PyErr_ExceptionMatches(PyExc_TypeError))
1157             PyErr_Format(PyExc_TypeError,
1158                          "coercing to Unicode: need string or buffer, "
1159                          "%.80s found",
1160                          Py_TYPE(obj)->tp_name);
1161         goto onError;
1162     }
1163
1164     /* Convert to Unicode */
1165     if (len == 0) {
1166         Py_INCREF(unicode_empty);
1167         v = (PyObject *)unicode_empty;
1168     }
1169     else
1170         v = PyUnicode_Decode(s, len, encoding, errors);
1171
1172     return v;
1173
1174   onError:
1175     return NULL;
1176 }
1177
1178 PyObject *PyUnicode_Decode(const char *s,
1179                            Py_ssize_t size,
1180                            const char *encoding,
1181                            const char *errors)
1182 {
1183     PyObject *buffer = NULL, *unicode;
1184
1185     if (encoding == NULL)
1186         encoding = PyUnicode_GetDefaultEncoding();
1187
1188     /* Shortcuts for common default encodings */
1189     if (strcmp(encoding, "utf-8") == 0)
1190         return PyUnicode_DecodeUTF8(s, size, errors);
1191     else if (strcmp(encoding, "latin-1") == 0)
1192         return PyUnicode_DecodeLatin1(s, size, errors);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194     else if (strcmp(encoding, "mbcs") == 0)
1195         return PyUnicode_DecodeMBCS(s, size, errors);
1196 #endif
1197     else if (strcmp(encoding, "ascii") == 0)
1198         return PyUnicode_DecodeASCII(s, size, errors);
1199
1200     /* Decode via the codec registry */
1201     buffer = PyBuffer_FromMemory((void *)s, size);
1202     if (buffer == NULL)
1203         goto onError;
1204     unicode = PyCodec_Decode(buffer, encoding, errors);
1205     if (unicode == NULL)
1206         goto onError;
1207     if (!PyUnicode_Check(unicode)) {
1208         PyErr_Format(PyExc_TypeError,
1209                      "decoder did not return an unicode object (type=%.400s)",
1210                      Py_TYPE(unicode)->tp_name);
1211         Py_DECREF(unicode);
1212         goto onError;
1213     }
1214     Py_DECREF(buffer);
1215     return unicode;
1216
1217   onError:
1218     Py_XDECREF(buffer);
1219     return NULL;
1220 }
1221
1222 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223                                     const char *encoding,
1224                                     const char *errors)
1225 {
1226     PyObject *v;
1227
1228     if (!PyUnicode_Check(unicode)) {
1229         PyErr_BadArgument();
1230         goto onError;
1231     }
1232
1233     if (encoding == NULL)
1234         encoding = PyUnicode_GetDefaultEncoding();
1235
1236     /* Decode via the codec registry */
1237     v = PyCodec_Decode(unicode, encoding, errors);
1238     if (v == NULL)
1239         goto onError;
1240     return v;
1241
1242   onError:
1243     return NULL;
1244 }
1245
1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247                            Py_ssize_t size,
1248                            const char *encoding,
1249                            const char *errors)
1250 {
1251     PyObject *v, *unicode;
1252
1253     unicode = PyUnicode_FromUnicode(s, size);
1254     if (unicode == NULL)
1255         return NULL;
1256     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257     Py_DECREF(unicode);
1258     return v;
1259 }
1260
1261 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262                                     const char *encoding,
1263                                     const char *errors)
1264 {
1265     PyObject *v;
1266
1267     if (!PyUnicode_Check(unicode)) {
1268         PyErr_BadArgument();
1269         goto onError;
1270     }
1271
1272     if (encoding == NULL)
1273         encoding = PyUnicode_GetDefaultEncoding();
1274
1275     /* Encode via the codec registry */
1276     v = PyCodec_Encode(unicode, encoding, errors);
1277     if (v == NULL)
1278         goto onError;
1279     return v;
1280
1281   onError:
1282     return NULL;
1283 }
1284
1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286                                     const char *encoding,
1287                                     const char *errors)
1288 {
1289     PyObject *v;
1290
1291     if (!PyUnicode_Check(unicode)) {
1292         PyErr_BadArgument();
1293         goto onError;
1294     }
1295
1296     if (encoding == NULL)
1297         encoding = PyUnicode_GetDefaultEncoding();
1298
1299     /* Shortcuts for common default encodings */
1300     if (errors == NULL) {
1301         if (strcmp(encoding, "utf-8") == 0)
1302             return PyUnicode_AsUTF8String(unicode);
1303         else if (strcmp(encoding, "latin-1") == 0)
1304             return PyUnicode_AsLatin1String(unicode);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306         else if (strcmp(encoding, "mbcs") == 0)
1307             return PyUnicode_AsMBCSString(unicode);
1308 #endif
1309         else if (strcmp(encoding, "ascii") == 0)
1310             return PyUnicode_AsASCIIString(unicode);
1311     }
1312
1313     /* Encode via the codec registry */
1314     v = PyCodec_Encode(unicode, encoding, errors);
1315     if (v == NULL)
1316         goto onError;
1317     if (!PyString_Check(v)) {
1318         PyErr_Format(PyExc_TypeError,
1319                      "encoder did not return a string object (type=%.400s)",
1320                      Py_TYPE(v)->tp_name);
1321         Py_DECREF(v);
1322         goto onError;
1323     }
1324     return v;
1325
1326   onError:
1327     return NULL;
1328 }
1329
1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1331                                             const char *errors)
1332 {
1333     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335     if (v)
1336         return v;
1337     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338     if (v && errors == NULL)
1339         ((PyUnicodeObject *)unicode)->defenc = v;
1340     return v;
1341 }
1342
1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344 {
1345     if (!PyUnicode_Check(unicode)) {
1346         PyErr_BadArgument();
1347         goto onError;
1348     }
1349     return PyUnicode_AS_UNICODE(unicode);
1350
1351   onError:
1352     return NULL;
1353 }
1354
1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1356 {
1357     if (!PyUnicode_Check(unicode)) {
1358         PyErr_BadArgument();
1359         goto onError;
1360     }
1361     return PyUnicode_GET_SIZE(unicode);
1362
1363   onError:
1364     return -1;
1365 }
1366
1367 const char *PyUnicode_GetDefaultEncoding(void)
1368 {
1369     return unicode_default_encoding;
1370 }
1371
1372 int PyUnicode_SetDefaultEncoding(const char *encoding)
1373 {
1374     PyObject *v;
1375
1376     /* Make sure the encoding is valid. As side effect, this also
1377        loads the encoding into the codec registry cache. */
1378     v = _PyCodec_Lookup(encoding);
1379     if (v == NULL)
1380         goto onError;
1381     Py_DECREF(v);
1382     strncpy(unicode_default_encoding,
1383             encoding,
1384             sizeof(unicode_default_encoding));
1385     return 0;
1386
1387   onError:
1388     return -1;
1389 }
1390
1391 /* error handling callback helper:
1392    build arguments, call the callback and check the arguments,
1393    if no exception occurred, copy the replacement to the output
1394    and adjust various state variables.
1395    return 0 on success, -1 on error
1396 */
1397
1398 static
1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1400                                      const char *encoding, const char *reason,
1401                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1404 {
1405     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1406
1407     PyObject *restuple = NULL;
1408     PyObject *repunicode = NULL;
1409     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410     Py_ssize_t requiredsize;
1411     Py_ssize_t newpos;
1412     Py_UNICODE *repptr;
1413     Py_ssize_t repsize;
1414     int res = -1;
1415
1416     if (*errorHandler == NULL) {
1417         *errorHandler = PyCodec_LookupError(errors);
1418         if (*errorHandler == NULL)
1419             goto onError;
1420     }
1421
1422     if (*exceptionObject == NULL) {
1423         *exceptionObject = PyUnicodeDecodeError_Create(
1424             encoding, input, insize, *startinpos, *endinpos, reason);
1425         if (*exceptionObject == NULL)
1426             goto onError;
1427     }
1428     else {
1429         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430             goto onError;
1431         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432             goto onError;
1433         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434             goto onError;
1435     }
1436
1437     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438     if (restuple == NULL)
1439         goto onError;
1440     if (!PyTuple_Check(restuple)) {
1441         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442         goto onError;
1443     }
1444     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1445         goto onError;
1446     if (newpos<0)
1447         newpos = insize+newpos;
1448     if (newpos<0 || newpos>insize) {
1449         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450         goto onError;
1451     }
1452
1453     /* need more space? (at least enough for what we
1454        have+the replacement+the rest of the string (starting
1455        at the new input position), so we won't have to check space
1456        when there are no errors in the rest of the string) */
1457     repptr = PyUnicode_AS_UNICODE(repunicode);
1458     repsize = PyUnicode_GET_SIZE(repunicode);
1459     requiredsize = *outpos + repsize + insize-newpos;
1460     if (requiredsize > outsize) {
1461         if (requiredsize<2*outsize)
1462             requiredsize = 2*outsize;
1463         if (_PyUnicode_Resize(output, requiredsize) < 0)
1464             goto onError;
1465         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1466     }
1467     *endinpos = newpos;
1468     *inptr = input + newpos;
1469     Py_UNICODE_COPY(*outptr, repptr, repsize);
1470     *outptr += repsize;
1471     *outpos += repsize;
1472     /* we made it! */
1473     res = 0;
1474
1475   onError:
1476     Py_XDECREF(restuple);
1477     return res;
1478 }
1479
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1481
1482 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1483
1484 /* Three simple macros defining base-64. */
1485
/* Is c a base-64 character?
 *
 * The test is spelled out with explicit ASCII ranges rather than
 * isalnum(): the result of isalnum() depends on the current locale
 * (so bytes above 127 could be accepted as base-64 digits), and its
 * argument must be representable as unsigned char.  Note that c is
 * evaluated more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
1490
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and anything else
   (i.e. '/', given that c is a base-64 character) -> 63. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     63)
1498
/* Return the base-64 character for the low 6 bits of n; higher bits
   of n are masked off before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
1503
/* DECODE_DIRECT: true when a byte found in a UTF-7 string outside a
 * base-64 section stands for itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c)                                \
    ((c) < 128 && (c) != '+')
1511
/* Per-character class table used by the UTF-7 encoder (see RFC2152).
 * Each of the 128 ASCII codes is assigned one of:
 *
 *   0  "Set D"       alphanumeric and '(),-./:?
 *   1  "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2  "whitespace"  ht nl cr sp
 *   3  special       everything else, i.e. + backslash ~ and the
 *                    non-printing codes 0-8 11-12 14-31 127
 *                    (must be base64 encoded)
 *
 * The table is indexed by character code, 16 entries per row.
 */

static char utf7_category[128] = {
    /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  P   Q   R   S   T   U   V   W   X   Y   Z   [  bsl  ]   ^   _  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
1545
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code; only 1..127 can ever qualify
 *   directO  - non-zero: characters of class 1 (Set O) are direct
 *   directWS - non-zero: characters of class 2 (whitespace) are direct
 *
 * Characters of class 0 (Set D) are always direct.  The flag
 * arguments are parenthesized in the expansion so that expressions
 * containing low-precedence operators (e.g. `a || b`) keep their
 * meaning.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1557
1558 PyObject *PyUnicode_DecodeUTF7(const char *s,
1559                                Py_ssize_t size,
1560                                const char *errors)
1561 {
1562     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563 }
1564
1565 /* The decoder.  The only state we preserve is our read position,
1566  * i.e. how many characters we have consumed.  So if we end in the
1567  * middle of a shift sequence we have to back off the read position
1568  * and the output to the beginning of the sequence, otherwise we lose
1569  * all the shift state (seen bits, number of bits seen, high
1570  * surrogate). */
1571
1572 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1573                                        Py_ssize_t size,
1574                                        const char *errors,
1575                                        Py_ssize_t *consumed)
1576 {
1577     const char *starts = s;
1578     Py_ssize_t startinpos;
1579     Py_ssize_t endinpos;
1580     Py_ssize_t outpos;
1581     const char *e;
1582     PyUnicodeObject *unicode;
1583     Py_UNICODE *p;
1584     const char *errmsg = "";
1585     int inShift = 0;
1586     Py_UNICODE *shiftOutStart;
1587     unsigned int base64bits = 0;
1588     unsigned long base64buffer = 0;
1589     Py_UNICODE surrogate = 0;
1590     PyObject *errorHandler = NULL;
1591     PyObject *exc = NULL;
1592
1593     unicode = _PyUnicode_New(size);
1594     if (!unicode)
1595         return NULL;
1596     if (size == 0) {
1597         if (consumed)
1598             *consumed = 0;
1599         return (PyObject *)unicode;
1600     }
1601
1602     p = unicode->str;
1603     shiftOutStart = p;
1604     e = s + size;
1605
1606     while (s < e) {
1607         Py_UNICODE ch = (unsigned char) *s;
1608
1609         if (inShift) { /* in a base-64 section */
1610             if (IS_BASE64(ch)) { /* consume a base-64 character */
1611                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612                 base64bits += 6;
1613                 s++;
1614                 if (base64bits >= 16) {
1615                     /* we have enough bits for a UTF-16 value */
1616                     Py_UNICODE outCh = (Py_UNICODE)
1617                                        (base64buffer >> (base64bits-16));
1618                     base64bits -= 16;
1619                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620                     if (surrogate) {
1621                         /* expecting a second surrogate */
1622                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623 #ifdef Py_UNICODE_WIDE
1624                             *p++ = (((surrogate & 0x3FF)<<10)
1625                                     | (outCh & 0x3FF)) + 0x10000;
1626 #else
1627                             *p++ = surrogate;
1628                             *p++ = outCh;
1629 #endif
1630                             surrogate = 0;
1631                         }
1632                         else {
1633                             surrogate = 0;
1634                             errmsg = "second surrogate missing";
1635                             goto utf7Error;
1636                         }
1637                     }
1638                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639                         /* first surrogate */
1640                         surrogate = outCh;
1641                     }
1642                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1643                         errmsg = "unexpected second surrogate";
1644                         goto utf7Error;
1645                     }
1646                     else {
1647                         *p++ = outCh;
1648                     }
1649                 }
1650             }
1651             else { /* now leaving a base-64 section */
1652                 inShift = 0;
1653                 s++;
1654                 if (surrogate) {
1655                     errmsg = "second surrogate missing at end of shift sequence";
1656                     goto utf7Error;
1657                 }
1658                 if (base64bits > 0) { /* left-over bits */
1659                     if (base64bits >= 6) {
1660                         /* We've seen at least one base-64 character */
1661                         errmsg = "partial character in shift sequence";
1662                         goto utf7Error;
1663                     }
1664                     else {
1665                         /* Some bits remain; they should be zero */
1666                         if (base64buffer != 0) {
1667                             errmsg = "non-zero padding bits in shift sequence";
1668                             goto utf7Error;
1669                         }
1670                     }
1671                 }
1672                 if (ch != '-') {
1673                     /* '-' is absorbed; other terminating
1674                        characters are preserved */
1675                     *p++ = ch;
1676                 }
1677             }
1678         }
1679         else if ( ch == '+' ) {
1680             startinpos = s-starts;
1681             s++; /* consume '+' */
1682             if (s < e && *s == '-') { /* '+-' encodes '+' */
1683                 s++;
1684                 *p++ = '+';
1685             }
1686             else { /* begin base64-encoded section */
1687                 inShift = 1;
1688                 shiftOutStart = p;
1689                 base64bits = 0;
1690             }
1691         }
1692         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1693             *p++ = ch;
1694             s++;
1695         }
1696         else {
1697             startinpos = s-starts;
1698             s++;
1699             errmsg = "unexpected special character";
1700             goto utf7Error;
1701         }
1702         continue;
1703 utf7Error:
1704         outpos = p-PyUnicode_AS_UNICODE(unicode);
1705         endinpos = s-starts;
1706         if (unicode_decode_call_errorhandler(
1707                 errors, &errorHandler,
1708                 "utf7", errmsg,
1709                 starts, size, &startinpos, &endinpos, &exc, &s,
1710                 &unicode, &outpos, &p))
1711             goto onError;
1712     }
1713
1714     /* end of string */
1715
1716     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717         /* if we're in an inconsistent state, that's an error */
1718         if (surrogate ||
1719                 (base64bits >= 6) ||
1720                 (base64bits > 0 && base64buffer != 0)) {
1721             outpos = p-PyUnicode_AS_UNICODE(unicode);
1722             endinpos = size;
1723             if (unicode_decode_call_errorhandler(
1724                     errors, &errorHandler,
1725                     "utf7", "unterminated shift sequence",
1726                     starts, size, &startinpos, &endinpos, &exc, &s,
1727                     &unicode, &outpos, &p))
1728                 goto onError;
1729         }
1730     }
1731
1732     /* return state */
1733     if (consumed) {
1734         if (inShift) {
1735             p = shiftOutStart; /* back off output */
1736             *consumed = startinpos;
1737         }
1738         else {
1739             *consumed = s-starts;
1740         }
1741     }
1742
1743     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1744         goto onError;
1745
1746     Py_XDECREF(errorHandler);
1747     Py_XDECREF(exc);
1748     return (PyObject *)unicode;
1749
1750   onError:
1751     Py_XDECREF(errorHandler);
1752     Py_XDECREF(exc);
1753     Py_DECREF(unicode);
1754     return NULL;
1755 }
1756
1757
1758 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1759                                Py_ssize_t size,
1760                                int base64SetO,
1761                                int base64WhiteSpace,
1762                                const char *errors)
1763 {
1764     PyObject *v;
1765     /* It might be possible to tighten this worst case */
1766     Py_ssize_t allocated = 8 * size;
1767     int inShift = 0;
1768     Py_ssize_t i = 0;
1769     unsigned int base64bits = 0;
1770     unsigned long base64buffer = 0;
1771     char * out;
1772     char * start;
1773
1774     if (allocated / 8 != size)
1775         return PyErr_NoMemory();
1776
1777     if (size == 0)
1778         return PyString_FromStringAndSize(NULL, 0);
1779
1780     v = PyString_FromStringAndSize(NULL, allocated);
1781     if (v == NULL)
1782         return NULL;
1783
1784     start = out = PyString_AS_STRING(v);
1785     for (;i < size; ++i) {
1786         Py_UNICODE ch = s[i];
1787
1788         if (inShift) {
1789             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790                 /* shifting out */
1791                 if (base64bits) { /* output remaining bits */
1792                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793                     base64buffer = 0;
1794                     base64bits = 0;
1795                 }
1796                 inShift = 0;
1797                 /* Characters not in the BASE64 set implicitly unshift the sequence
1798                    so no '-' is required, except if the character is itself a '-' */
1799                 if (IS_BASE64(ch) || ch == '-') {
1800                     *out++ = '-';
1801                 }
1802                 *out++ = (char) ch;
1803             }
1804             else {
1805                 goto encode_char;
1806             }
1807         }
1808         else { /* not in a shift sequence */
1809             if (ch == '+') {
1810                 *out++ = '+';
1811                         *out++ = '-';
1812             }
1813             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1814                 *out++ = (char) ch;
1815             }
1816             else {
1817                 *out++ = '+';
1818                 inShift = 1;
1819                 goto encode_char;
1820             }
1821         }
1822         continue;
1823 encode_char:
1824 #ifdef Py_UNICODE_WIDE
1825         if (ch >= 0x10000) {
1826             /* code first surrogate */
1827             base64bits += 16;
1828             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1829             while (base64bits >= 6) {
1830                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831                 base64bits -= 6;
1832             }
1833             /* prepare second surrogate */
1834             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1835         }
1836 #endif
1837         base64bits += 16;
1838         base64buffer = (base64buffer << 16) | ch;
1839         while (base64bits >= 6) {
1840             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841             base64bits -= 6;
1842         }
1843     }
1844     if (base64bits)
1845         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846     if (inShift)
1847         *out++ = '-';
1848
1849     _PyString_Resize(&v, out - start);
1850     return v;
1851 }
1852
1853 #undef IS_BASE64
1854 #undef FROM_BASE64
1855 #undef TO_BASE64
1856 #undef DECODE_DIRECT
1857 #undef ENCODE_DIRECT
1858
1859 /* --- UTF-8 Codec -------------------------------------------------------- */
1860
/* Length of a UTF-8 sequence, indexed by its first (prefix) byte.
   An entry of zero marks a byte that is illegal as a prefix.
   See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1882
1883 PyObject *PyUnicode_DecodeUTF8(const char *s,
1884                                Py_ssize_t size,
1885                                const char *errors)
1886 {
1887     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1888 }
1889
1890 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1891                                        Py_ssize_t size,
1892                                        const char *errors,
1893                                        Py_ssize_t *consumed)
1894 {
1895     const char *starts = s;
1896     int n;
1897     Py_ssize_t startinpos;
1898     Py_ssize_t endinpos;
1899     Py_ssize_t outpos;
1900     const char *e;
1901     PyUnicodeObject *unicode;
1902     Py_UNICODE *p;
1903     const char *errmsg = "";
1904     PyObject *errorHandler = NULL;
1905     PyObject *exc = NULL;
1906
1907     /* Note: size will always be longer than the resulting Unicode
1908        character count */
1909     unicode = _PyUnicode_New(size);
1910     if (!unicode)
1911         return NULL;
1912     if (size == 0) {
1913         if (consumed)
1914             *consumed = 0;
1915         return (PyObject *)unicode;
1916     }
1917
1918     /* Unpack UTF-8 encoded data */
1919     p = unicode->str;
1920     e = s + size;
1921
1922     while (s < e) {
1923         Py_UCS4 ch = (unsigned char)*s;
1924
1925         if (ch < 0x80) {
1926             *p++ = (Py_UNICODE)ch;
1927             s++;
1928             continue;
1929         }
1930
1931         n = utf8_code_length[ch];
1932
1933         if (s + n > e) {
1934             if (consumed)
1935                 break;
1936             else {
1937                 errmsg = "unexpected end of data";
1938                 startinpos = s-starts;
1939                 endinpos = size;
1940                 goto utf8Error;
1941             }
1942         }
1943
1944         switch (n) {
1945
1946         case 0:
1947             errmsg = "unexpected code byte";
1948             startinpos = s-starts;
1949             endinpos = startinpos+1;
1950             goto utf8Error;
1951
1952         case 1:
1953             errmsg = "internal error";
1954             startinpos = s-starts;
1955             endinpos = startinpos+1;
1956             goto utf8Error;
1957
1958         case 2:
1959             if ((s[1] & 0xc0) != 0x80) {
1960                 errmsg = "invalid data";
1961                 startinpos = s-starts;
1962                 endinpos = startinpos+2;
1963                 goto utf8Error;
1964             }
1965             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1966             if (ch < 0x80) {
1967                 startinpos = s-starts;
1968                 endinpos = startinpos+2;
1969                 errmsg = "illegal encoding";
1970                 goto utf8Error;
1971             }
1972             else
1973                 *p++ = (Py_UNICODE)ch;
1974             break;
1975
1976         case 3:
1977             if ((s[1] & 0xc0) != 0x80 ||
1978                 (s[2] & 0xc0) != 0x80) {
1979                 errmsg = "invalid data";
1980                 startinpos = s-starts;
1981                 endinpos = startinpos+3;
1982                 goto utf8Error;
1983             }
1984             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1985             if (ch < 0x0800) {
1986                 /* Note: UTF-8 encodings of surrogates are considered
1987                    legal UTF-8 sequences;
1988
1989                    XXX For wide builds (UCS-4) we should probably try
1990                    to recombine the surrogates into a single code
1991                    unit.
1992                 */
1993                 errmsg = "illegal encoding";
1994                 startinpos = s-starts;
1995                 endinpos = startinpos+3;
1996                 goto utf8Error;
1997             }
1998             else
1999                 *p++ = (Py_UNICODE)ch;
2000             break;
2001
2002         case 4:
2003             if ((s[1] & 0xc0) != 0x80 ||
2004                 (s[2] & 0xc0) != 0x80 ||
2005                 (s[3] & 0xc0) != 0x80) {
2006                 errmsg = "invalid data";
2007                 startinpos = s-starts;
2008                 endinpos = startinpos+4;
2009                 goto utf8Error;
2010             }
2011             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2012                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2013             /* validate and convert to UTF-16 */
2014             if ((ch < 0x10000)        /* minimum value allowed for 4
2015                                          byte encoding */
2016                 || (ch > 0x10ffff))   /* maximum value allowed for
2017                                          UTF-16 */
2018             {
2019                 errmsg = "illegal encoding";
2020                 startinpos = s-starts;
2021                 endinpos = startinpos+4;
2022                 goto utf8Error;
2023             }
2024 #ifdef Py_UNICODE_WIDE
2025             *p++ = (Py_UNICODE)ch;
2026 #else
2027             /*  compute and append the two surrogates: */
2028
2029             /*  translate from 10000..10FFFF to 0..FFFF */
2030             ch -= 0x10000;
2031
2032             /*  high surrogate = top 10 bits added to D800 */
2033             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2034
2035             /*  low surrogate = bottom 10 bits added to DC00 */
2036             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2037 #endif
2038             break;
2039
2040         default:
2041             /* Other sizes are only needed for UCS-4 */
2042             errmsg = "unsupported Unicode code range";
2043             startinpos = s-starts;
2044             endinpos = startinpos+n;
2045             goto utf8Error;
2046         }
2047         s += n;
2048         continue;
2049
2050       utf8Error:
2051         outpos = p-PyUnicode_AS_UNICODE(unicode);
2052         if (unicode_decode_call_errorhandler(
2053                 errors, &errorHandler,
2054                 "utf8", errmsg,
2055                 starts, size, &startinpos, &endinpos, &exc, &s,
2056                 &unicode, &outpos, &p))
2057             goto onError;
2058     }
2059     if (consumed)
2060         *consumed = s-starts;
2061
2062     /* Adjust length */
2063     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2064         goto onError;
2065
2066     Py_XDECREF(errorHandler);
2067     Py_XDECREF(exc);
2068     return (PyObject *)unicode;
2069
2070   onError:
2071     Py_XDECREF(errorHandler);
2072     Py_XDECREF(exc);
2073     Py_DECREF(unicode);
2074     return NULL;
2075 }
2076
2077 /* Allocation strategy:  if the string is short, convert into a stack buffer
2078    and allocate exactly as much space needed at the end.  Else allocate the
2079    maximum possible needed (4 result bytes per Unicode character), and return
2080    the excess memory at the end.
2081 */
2082 PyObject *
2083 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2084                      Py_ssize_t size,
2085                      const char *errors)
2086 {
2087 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2088
2089     Py_ssize_t i;           /* index into s of next input byte */
2090     PyObject *v;        /* result string object */
2091     char *p;            /* next free byte in output buffer */
2092     Py_ssize_t nallocated;  /* number of result bytes allocated */
2093     Py_ssize_t nneeded;        /* number of result bytes needed */
2094     char stackbuf[MAX_SHORT_UNICHARS * 4];
2095
2096     assert(s != NULL);
2097     assert(size >= 0);
2098
2099     if (size <= MAX_SHORT_UNICHARS) {
2100         /* Write into the stack buffer; nallocated can't overflow.
2101          * At the end, we'll allocate exactly as much heap space as it
2102          * turns out we need.
2103          */
2104         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2105         v = NULL;   /* will allocate after we're done */
2106         p = stackbuf;
2107     }
2108     else {
2109         /* Overallocate on the heap, and give the excess back at the end. */
2110         nallocated = size * 4;
2111         if (nallocated / 4 != size)  /* overflow! */
2112             return PyErr_NoMemory();
2113         v = PyString_FromStringAndSize(NULL, nallocated);
2114         if (v == NULL)
2115             return NULL;
2116         p = PyString_AS_STRING(v);
2117     }
2118
2119     for (i = 0; i < size;) {
2120         Py_UCS4 ch = s[i++];
2121
2122         if (ch < 0x80)
2123             /* Encode ASCII */
2124             *p++ = (char) ch;
2125
2126         else if (ch < 0x0800) {
2127             /* Encode Latin-1 */
2128             *p++ = (char)(0xc0 | (ch >> 6));
2129             *p++ = (char)(0x80 | (ch & 0x3f));
2130         }
2131         else {
2132             /* Encode UCS2 Unicode ordinals */
2133             if (ch < 0x10000) {
2134                 /* Special case: check for high surrogate */
2135                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2136                     Py_UCS4 ch2 = s[i];
2137                     /* Check for low surrogate and combine the two to
2138                        form a UCS4 value */
2139                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2140                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2141                         i++;
2142                         goto encodeUCS4;
2143                     }
2144                     /* Fall through: handles isolated high surrogates */
2145                 }
2146                 *p++ = (char)(0xe0 | (ch >> 12));
2147                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2148                 *p++ = (char)(0x80 | (ch & 0x3f));
2149                 continue;
2150             }
2151           encodeUCS4:
2152             /* Encode UCS4 Unicode ordinals */
2153             *p++ = (char)(0xf0 | (ch >> 18));
2154             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2155             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2156             *p++ = (char)(0x80 | (ch & 0x3f));
2157         }
2158     }
2159
2160     if (v == NULL) {
2161         /* This was stack allocated. */
2162         nneeded = p - stackbuf;
2163         assert(nneeded <= nallocated);
2164         v = PyString_FromStringAndSize(stackbuf, nneeded);
2165     }
2166     else {
2167         /* Cut back to size actually needed. */
2168         nneeded = p - PyString_AS_STRING(v);
2169         assert(nneeded <= nallocated);
2170         _PyString_Resize(&v, nneeded);
2171     }
2172     return v;
2173
2174 #undef MAX_SHORT_UNICHARS
2175 }
2176
2177 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2178 {
2179     if (!PyUnicode_Check(unicode)) {
2180         PyErr_BadArgument();
2181         return NULL;
2182     }
2183     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2184                                 PyUnicode_GET_SIZE(unicode),
2185                                 NULL);
2186 }
2187
2188 /* --- UTF-32 Codec ------------------------------------------------------- */
2189
2190 PyObject *
2191 PyUnicode_DecodeUTF32(const char *s,
2192                       Py_ssize_t size,
2193                       const char *errors,
2194                       int *byteorder)
2195 {
2196     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2197 }
2198
2199 PyObject *
2200 PyUnicode_DecodeUTF32Stateful(const char *s,
2201                               Py_ssize_t size,
2202                               const char *errors,
2203                               int *byteorder,
2204                               Py_ssize_t *consumed)
2205 {
2206     const char *starts = s;
2207     Py_ssize_t startinpos;
2208     Py_ssize_t endinpos;
2209     Py_ssize_t outpos;
2210     PyUnicodeObject *unicode;
2211     Py_UNICODE *p;
2212 #ifndef Py_UNICODE_WIDE
2213     int i, pairs;
2214 #else
2215     const int pairs = 0;
2216 #endif
2217     const unsigned char *q, *e;
2218     int bo = 0;       /* assume native ordering by default */
2219     const char *errmsg = "";
2220     /* Offsets from q for retrieving bytes in the right order. */
2221 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2222     int iorder[] = {0, 1, 2, 3};
2223 #else
2224     int iorder[] = {3, 2, 1, 0};
2225 #endif
2226     PyObject *errorHandler = NULL;
2227     PyObject *exc = NULL;
2228     /* On narrow builds we split characters outside the BMP into two
2229        codepoints => count how much extra space we need. */
2230 #ifndef Py_UNICODE_WIDE
2231     for (i = pairs = 0; i < size/4; i++)
2232         if (((Py_UCS4 *)s)[i] >= 0x10000)
2233             pairs++;
2234 #endif
2235
2236     /* This might be one to much, because of a BOM */
2237     unicode = _PyUnicode_New((size+3)/4+pairs);
2238     if (!unicode)
2239         return NULL;
2240     if (size == 0)
2241         return (PyObject *)unicode;
2242
2243     /* Unpack UTF-32 encoded data */
2244     p = unicode->str;
2245     q = (unsigned char *)s;
2246     e = q + size;
2247
2248     if (byteorder)
2249         bo = *byteorder;
2250
2251     /* Check for BOM marks (U+FEFF) in the input and adjust current
2252        byte order setting accordingly. In native mode, the leading BOM
2253        mark is skipped, in all other modes, it is copied to the output
2254        stream as-is (giving a ZWNBSP character). */
2255     if (bo == 0) {
2256         if (size >= 4) {
2257             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2258                 (q[iorder[1]] << 8) | q[iorder[0]];
2259 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2260             if (bom == 0x0000FEFF) {
2261                 q += 4;
2262                 bo = -1;
2263             }
2264             else if (bom == 0xFFFE0000) {
2265                 q += 4;
2266                 bo = 1;
2267             }
2268 #else
2269             if (bom == 0x0000FEFF) {
2270                 q += 4;
2271                 bo = 1;
2272             }
2273             else if (bom == 0xFFFE0000) {
2274                 q += 4;
2275                 bo = -1;
2276             }
2277 #endif
2278         }
2279     }
2280
2281     if (bo == -1) {
2282         /* force LE */
2283         iorder[0] = 0;
2284         iorder[1] = 1;
2285         iorder[2] = 2;
2286         iorder[3] = 3;
2287     }
2288     else if (bo == 1) {
2289         /* force BE */
2290         iorder[0] = 3;
2291         iorder[1] = 2;
2292         iorder[2] = 1;
2293         iorder[3] = 0;
2294     }
2295
2296     while (q < e) {
2297         Py_UCS4 ch;
2298         /* remaining bytes at the end? (size should be divisible by 4) */
2299         if (e-q<4) {
2300             if (consumed)
2301                 break;
2302             errmsg = "truncated data";
2303             startinpos = ((const char *)q)-starts;
2304             endinpos = ((const char *)e)-starts;
2305             goto utf32Error;
2306             /* The remaining input chars are ignored if the callback
2307                chooses to skip the input */
2308         }
2309         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310             (q[iorder[1]] << 8) | q[iorder[0]];
2311
2312         if (ch >= 0x110000)
2313         {
2314             errmsg = "codepoint not in range(0x110000)";
2315             startinpos = ((const char *)q)-starts;
2316             endinpos = startinpos+4;
2317             goto utf32Error;
2318         }
2319 #ifndef Py_UNICODE_WIDE
2320         if (ch >= 0x10000)
2321         {
2322             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2324         }
2325         else
2326 #endif
2327             *p++ = ch;
2328         q += 4;
2329         continue;
2330       utf32Error:
2331         outpos = p-PyUnicode_AS_UNICODE(unicode);
2332         if (unicode_decode_call_errorhandler(
2333                 errors, &errorHandler,
2334                 "utf32", errmsg,
2335                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2336                 &unicode, &outpos, &p))
2337             goto onError;
2338     }
2339
2340     if (byteorder)
2341         *byteorder = bo;
2342
2343     if (consumed)
2344         *consumed = (const char *)q-starts;
2345
2346     /* Adjust length */
2347     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348         goto onError;
2349
2350     Py_XDECREF(errorHandler);
2351     Py_XDECREF(exc);
2352     return (PyObject *)unicode;
2353
2354   onError:
2355     Py_DECREF(unicode);
2356     Py_XDECREF(errorHandler);
2357     Py_XDECREF(exc);
2358     return NULL;
2359 }
2360
2361 PyObject *
2362 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2363                       Py_ssize_t size,
2364                       const char *errors,
2365                       int byteorder)
2366 {
2367     PyObject *v;
2368     unsigned char *p;
2369     Py_ssize_t nsize, bytesize;
2370 #ifndef Py_UNICODE_WIDE
2371     Py_ssize_t i, pairs;
2372 #else
2373     const int pairs = 0;
2374 #endif
2375     /* Offsets from p for storing byte pairs in the right order. */
2376 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377     int iorder[] = {0, 1, 2, 3};
2378 #else
2379     int iorder[] = {3, 2, 1, 0};
2380 #endif
2381
2382 #define STORECHAR(CH)                           \
2383     do {                                        \
2384         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2385         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2386         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2387         p[iorder[0]] = (CH) & 0xff;             \
2388         p += 4;                                 \
2389     } while(0)
2390
2391     /* In narrow builds we can output surrogate pairs as one codepoint,
2392        so we need less space. */
2393 #ifndef Py_UNICODE_WIDE
2394     for (i = pairs = 0; i < size-1; i++)
2395         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397             pairs++;
2398 #endif
2399     nsize = (size - pairs + (byteorder == 0));
2400     bytesize = nsize * 4;
2401     if (bytesize / 4 != nsize)
2402         return PyErr_NoMemory();
2403     v = PyString_FromStringAndSize(NULL, bytesize);
2404     if (v == NULL)
2405         return NULL;
2406
2407     p = (unsigned char *)PyString_AS_STRING(v);
2408     if (byteorder == 0)
2409         STORECHAR(0xFEFF);
2410     if (size == 0)
2411         return v;
2412
2413     if (byteorder == -1) {
2414         /* force LE */
2415         iorder[0] = 0;
2416         iorder[1] = 1;
2417         iorder[2] = 2;
2418         iorder[3] = 3;
2419     }
2420     else if (byteorder == 1) {
2421         /* force BE */
2422         iorder[0] = 3;
2423         iorder[1] = 2;
2424         iorder[2] = 1;
2425         iorder[3] = 0;
2426     }
2427
2428     while (size-- > 0) {
2429         Py_UCS4 ch = *s++;
2430 #ifndef Py_UNICODE_WIDE
2431         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432             Py_UCS4 ch2 = *s;
2433             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435                 s++;
2436                 size--;
2437             }
2438         }
2439 #endif
2440         STORECHAR(ch);
2441     }
2442     return v;
2443 #undef STORECHAR
2444 }
2445
2446 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2447 {
2448     if (!PyUnicode_Check(unicode)) {
2449         PyErr_BadArgument();
2450         return NULL;
2451     }
2452     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2453                                  PyUnicode_GET_SIZE(unicode),
2454                                  NULL,
2455                                  0);
2456 }
2457
2458 /* --- UTF-16 Codec ------------------------------------------------------- */
2459
2460 PyObject *
2461 PyUnicode_DecodeUTF16(const char *s,
2462                       Py_ssize_t size,
2463                       const char *errors,
2464                       int *byteorder)
2465 {
2466     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2467 }
2468
2469 PyObject *
2470 PyUnicode_DecodeUTF16Stateful(const char *s,
2471                               Py_ssize_t size,
2472                               const char *errors,
2473                               int *byteorder,
2474                               Py_ssize_t *consumed)
2475 {
2476     const char *starts = s;
2477     Py_ssize_t startinpos;
2478     Py_ssize_t endinpos;
2479     Py_ssize_t outpos;
2480     PyUnicodeObject *unicode;
2481     Py_UNICODE *p;
2482     const unsigned char *q, *e;
2483     int bo = 0;       /* assume native ordering by default */
2484     const char *errmsg = "";
2485     /* Offsets from q for retrieving byte pairs in the right order. */
2486 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487     int ihi = 1, ilo = 0;
2488 #else
2489     int ihi = 0, ilo = 1;
2490 #endif
2491     PyObject *errorHandler = NULL;
2492     PyObject *exc = NULL;
2493
2494     /* Note: size will always be longer than the resulting Unicode
2495        character count */
2496     unicode = _PyUnicode_New(size);
2497     if (!unicode)
2498         return NULL;
2499     if (size == 0)
2500         return (PyObject *)unicode;
2501
2502     /* Unpack UTF-16 encoded data */
2503     p = unicode->str;
2504     q = (unsigned char *)s;
2505     e = q + size;
2506
2507     if (byteorder)
2508         bo = *byteorder;
2509
2510     /* Check for BOM marks (U+FEFF) in the input and adjust current
2511        byte order setting accordingly. In native mode, the leading BOM
2512        mark is skipped, in all other modes, it is copied to the output
2513        stream as-is (giving a ZWNBSP character). */
2514     if (bo == 0) {
2515         if (size >= 2) {
2516             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2517 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2518             if (bom == 0xFEFF) {
2519                 q += 2;
2520                 bo = -1;
2521             }
2522             else if (bom == 0xFFFE) {
2523                 q += 2;
2524                 bo = 1;
2525             }
2526 #else
2527             if (bom == 0xFEFF) {
2528                 q += 2;
2529                 bo = 1;
2530             }
2531             else if (bom == 0xFFFE) {
2532                 q += 2;
2533                 bo = -1;
2534             }
2535 #endif
2536         }
2537     }
2538
2539     if (bo == -1) {
2540         /* force LE */
2541         ihi = 1;
2542         ilo = 0;
2543     }
2544     else if (bo == 1) {
2545         /* force BE */
2546         ihi = 0;
2547         ilo = 1;
2548     }
2549
2550     while (q < e) {
2551         Py_UNICODE ch;
2552         /* remaining bytes at the end? (size should be even) */
2553         if (e-q<2) {
2554             if (consumed)
2555                 break;
2556             errmsg = "truncated data";
2557             startinpos = ((const char *)q)-starts;
2558             endinpos = ((const char *)e)-starts;
2559             goto utf16Error;
2560             /* The remaining input chars are ignored if the callback
2561                chooses to skip the input */
2562         }
2563         ch = (q[ihi] << 8) | q[ilo];
2564
2565         q += 2;
2566
2567         if (ch < 0xD800 || ch > 0xDFFF) {
2568             *p++ = ch;
2569             continue;
2570         }
2571
2572         /* UTF-16 code pair: */
2573         if (q >= e) {
2574             errmsg = "unexpected end of data";
2575             startinpos = (((const char *)q)-2)-starts;
2576             endinpos = ((const char *)e)-starts;
2577             goto utf16Error;
2578         }
2579         if (0xD800 <= ch && ch <= 0xDBFF) {
2580             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2581             q += 2;
2582             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2583 #ifndef Py_UNICODE_WIDE
2584                 *p++ = ch;
2585                 *p++ = ch2;
2586 #else
2587                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2588 #endif
2589                 continue;
2590             }
2591             else {
2592                 errmsg = "illegal UTF-16 surrogate";
2593                 startinpos = (((const char *)q)-4)-starts;
2594                 endinpos = startinpos+2;
2595                 goto utf16Error;
2596             }
2597
2598         }
2599         errmsg = "illegal encoding";
2600         startinpos = (((const char *)q)-2)-starts;
2601         endinpos = startinpos+2;
2602         /* Fall through to report the error */
2603
2604       utf16Error:
2605         outpos = p-PyUnicode_AS_UNICODE(unicode);
2606         if (unicode_decode_call_errorhandler(
2607                 errors, &errorHandler,
2608                 "utf16", errmsg,
2609                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2610                 &unicode, &outpos, &p))
2611             goto onError;
2612     }
2613
2614     if (byteorder)
2615         *byteorder = bo;
2616
2617     if (consumed)
2618         *consumed = (const char *)q-starts;
2619
2620     /* Adjust length */
2621     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2622         goto onError;
2623
2624     Py_XDECREF(errorHandler);
2625     Py_XDECREF(exc);
2626     return (PyObject *)unicode;
2627
2628   onError:
2629     Py_DECREF(unicode);
2630     Py_XDECREF(errorHandler);
2631     Py_XDECREF(exc);
2632     return NULL;
2633 }
2634
2635 PyObject *
2636 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2637                       Py_ssize_t size,
2638                       const char *errors,
2639                       int byteorder)
2640 {
2641     PyObject *v;
2642     unsigned char *p;
2643     Py_ssize_t nsize, bytesize;
2644 #ifdef Py_UNICODE_WIDE
2645     Py_ssize_t i, pairs;
2646 #else
2647     const int pairs = 0;
2648 #endif
2649     /* Offsets from p for storing byte pairs in the right order. */
2650 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651     int ihi = 1, ilo = 0;
2652 #else
2653     int ihi = 0, ilo = 1;
2654 #endif
2655
2656 #define STORECHAR(CH)                           \
2657     do {                                        \
2658         p[ihi] = ((CH) >> 8) & 0xff;            \
2659         p[ilo] = (CH) & 0xff;                   \
2660         p += 2;                                 \
2661     } while(0)
2662
2663 #ifdef Py_UNICODE_WIDE
2664     for (i = pairs = 0; i < size; i++)
2665         if (s[i] >= 0x10000)
2666             pairs++;
2667 #endif
2668     /* 2 * (size + pairs + (byteorder == 0)) */
2669     if (size > PY_SSIZE_T_MAX ||
2670         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2671         return PyErr_NoMemory();
2672     nsize = size + pairs + (byteorder == 0);
2673     bytesize = nsize * 2;
2674     if (bytesize / 2 != nsize)
2675         return PyErr_NoMemory();
2676     v = PyString_FromStringAndSize(NULL, bytesize);
2677     if (v == NULL)
2678         return NULL;
2679
2680     p = (unsigned char *)PyString_AS_STRING(v);
2681     if (byteorder == 0)
2682         STORECHAR(0xFEFF);
2683     if (size == 0)
2684         return v;
2685
2686     if (byteorder == -1) {
2687         /* force LE */
2688         ihi = 1;
2689         ilo = 0;
2690     }
2691     else if (byteorder == 1) {
2692         /* force BE */
2693         ihi = 0;
2694         ilo = 1;
2695     }
2696
2697     while (size-- > 0) {
2698         Py_UNICODE ch = *s++;
2699         Py_UNICODE ch2 = 0;
2700 #ifdef Py_UNICODE_WIDE
2701         if (ch >= 0x10000) {
2702             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2703             ch  = 0xD800 | ((ch-0x10000) >> 10);
2704         }
2705 #endif
2706         STORECHAR(ch);
2707         if (ch2)
2708             STORECHAR(ch2);
2709     }
2710     return v;
2711 #undef STORECHAR
2712 }
2713
2714 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2715 {
2716     if (!PyUnicode_Check(unicode)) {
2717         PyErr_BadArgument();
2718         return NULL;
2719     }
2720     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2721                                  PyUnicode_GET_SIZE(unicode),
2722                                  NULL,
2723                                  0);
2724 }
2725
2726 /* --- Unicode Escape Codec ----------------------------------------------- */
2727
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on demand by the \N{...} handling of
   the unicode-escape decoder -- the code that sets it is outside this
   view, confirm there. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2729
2730 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2731                                         Py_ssize_t size,
2732                                         const char *errors)
2733 {
2734     const char *starts = s;
2735     Py_ssize_t startinpos;
2736     Py_ssize_t endinpos;
2737     Py_ssize_t outpos;
2738     int i;
2739     PyUnicodeObject *v;
2740     Py_UNICODE *p;
2741     const char *end;
2742     char* message;
2743     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2744     PyObject *errorHandler = NULL;
2745     PyObject *exc = NULL;
2746
2747     /* Escaped strings will always be longer than the resulting
2748        Unicode string, so we start with size here and then reduce the
2749        length after conversion to the true value.
2750        (but if the error callback returns a long replacement string
2751        we'll have to allocate more space) */
2752     v = _PyUnicode_New(size);
2753     if (v == NULL)
2754         goto onError;
2755     if (size == 0)
2756         return (PyObject *)v;
2757
2758     p = PyUnicode_AS_UNICODE(v);
2759     end = s + size;
2760
2761     while (s < end) {
2762         unsigned char c;
2763         Py_UNICODE x;
2764         int digits;
2765
2766         /* Non-escape characters are interpreted as Unicode ordinals */
2767         if (*s != '\\') {
2768             *p++ = (unsigned char) *s++;
2769             continue;
2770         }
2771
2772         startinpos = s-starts;
2773         /* \ - Escapes */
2774         s++;
2775         c = *s++;
2776         if (s > end)
2777             c = '\0'; /* Invalid after \ */
2778         switch (c) {
2779
2780             /* \x escapes */
2781         case '\n': break;
2782         case '\\': *p++ = '\\'; break;
2783         case '\'': *p++ = '\''; break;
2784         case '\"': *p++ = '\"'; break;
2785         case 'b': *p++ = '\b'; break;
2786         case 'f': *p++ = '\014'; break; /* FF */
2787         case 't': *p++ = '\t'; break;
2788         case 'n': *p++ = '\n'; break;
2789         case 'r': *p++ = '\r'; break;
2790         case 'v': *p++ = '\013'; break; /* VT */
2791         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2792
2793             /* \OOO (octal) escapes */
2794         case '0': case '1': case '2': case '3':
2795         case '4': case '5': case '6': case '7':
2796             x = s[-1] - '0';
2797             if (s < end && '0' <= *s && *s <= '7') {
2798                 x = (x<<3) + *s++ - '0';
2799                 if (s < end && '0' <= *s && *s <= '7')
2800                     x = (x<<3) + *s++ - '0';
2801             }
2802             *p++ = x;
2803             break;
2804
2805             /* hex escapes */
2806             /* \xXX */
2807         case 'x':
2808             digits = 2;
2809             message = "truncated \\xXX escape";
2810             goto hexescape;
2811
2812             /* \uXXXX */
2813         case 'u':
2814             digits = 4;
2815             message = "truncated \\uXXXX escape";
2816             goto hexescape;
2817
2818             /* \UXXXXXXXX */
2819         case 'U':
2820             digits = 8;
2821             message = "truncated \\UXXXXXXXX escape";
2822         hexescape:
2823             chr = 0;
2824             outpos = p-PyUnicode_AS_UNICODE(v);
2825             if (s+digits>end) {
2826                 endinpos = size;
2827                 if (unicode_decode_call_errorhandler(
2828                         errors, &errorHandler,
2829                         "unicodeescape", "end of string in escape sequence",
2830                         starts, size, &startinpos, &endinpos, &exc, &s,
2831                         &v, &outpos, &p))
2832                     goto onError;
2833                 goto nextByte;
2834             }
2835             for (i = 0; i < digits; ++i) {
2836                 c = (unsigned char) s[i];
2837                 if (!isxdigit(c)) {
2838                     endinpos = (s+i+1)-starts;
2839                     if (unicode_decode_call_errorhandler(
2840                             errors, &errorHandler,
2841                             "unicodeescape", message,
2842                             starts, size, &startinpos, &endinpos, &exc, &s,
2843                             &v, &outpos, &p))
2844                         goto onError;
2845                     goto nextByte;
2846                 }
2847                 chr = (chr<<4) & ~0xF;
2848                 if (c >= '0' && c <= '9')
2849                     chr += c - '0';
2850                 else if (c >= 'a' && c <= 'f')
2851                     chr += 10 + c - 'a';
2852                 else
2853                     chr += 10 + c - 'A';
2854             }
2855             s += i;
2856             if (chr == 0xffffffff && PyErr_Occurred())
2857                 /* _decoding_error will have already written into the
2858                    target buffer. */
2859                 break;
2860         store:
2861             /* when we get here, chr is a 32-bit unicode character */
2862             if (chr <= 0xffff)
2863                 /* UCS-2 character */
2864                 *p++ = (Py_UNICODE) chr;
2865             else if (chr <= 0x10ffff) {
2866                 /* UCS-4 character. Either store directly, or as
2867                    surrogate pair. */
2868 #ifdef Py_UNICODE_WIDE
2869                 *p++ = chr;
2870 #else
2871                 chr -= 0x10000L;
2872                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2873                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2874 #endif
2875             } else {
2876                 endinpos = s-starts;
2877                 outpos = p-PyUnicode_AS_UNICODE(v);
2878                 if (unicode_decode_call_errorhandler(
2879                         errors, &errorHandler,
2880                         "unicodeescape", "illegal Unicode character",
2881                         starts, size, &startinpos, &endinpos, &exc, &s,
2882                         &v, &outpos, &p))
2883                     goto onError;
2884             }
2885             break;
2886
2887             /* \N{name} */
2888         case 'N':
2889             message = "malformed \\N character escape";
2890             if (ucnhash_CAPI == NULL) {
2891                 /* load the unicode data module */
2892                 PyObject *m, *api;
2893                 m = PyImport_ImportModuleNoBlock("unicodedata");
2894                 if (m == NULL)
2895                     goto ucnhashError;
2896                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2897                 Py_DECREF(m);
2898                 if (api == NULL)
2899                     goto ucnhashError;
2900                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2901                 Py_DECREF(api);
2902                 if (ucnhash_CAPI == NULL)
2903                     goto ucnhashError;
2904             }
2905             if (*s == '{') {
2906                 const char *start = s+1;
2907                 /* look for the closing brace */
2908                 while (*s != '}' && s < end)
2909                     s++;
2910                 if (s > start && s < end && *s == '}') {
2911                     /* found a name.  look it up in the unicode database */
2912                     message = "unknown Unicode character name";
2913                     s++;
2914                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2915                         goto store;
2916                 }
2917             }
2918             endinpos = s-starts;
2919             outpos = p-PyUnicode_AS_UNICODE(v);
2920             if (unicode_decode_call_errorhandler(
2921                     errors, &errorHandler,
2922                     "unicodeescape", message,
2923                     starts, size, &startinpos, &endinpos, &exc, &s,
2924                     &v, &outpos, &p))
2925                 goto onError;
2926             break;
2927
2928         default:
2929             if (s > end) {
2930                 message = "\\ at end of string";
2931                 s--;
2932                 endinpos = s-starts;
2933                 outpos = p-PyUnicode_AS_UNICODE(v);
2934                 if (unicode_decode_call_errorhandler(
2935                         errors, &errorHandler,
2936                         "unicodeescape", message,
2937                         starts, size, &startinpos, &endinpos, &exc, &s,
2938                         &v, &outpos, &p))
2939                     goto onError;
2940             }
2941             else {
2942                 *p++ = '\\';
2943                 *p++ = (unsigned char)s[-1];
2944             }
2945             break;
2946         }
2947       nextByte:
2948         ;
2949     }
2950     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2951         goto onError;
2952     Py_XDECREF(errorHandler);
2953     Py_XDECREF(exc);
2954     return (PyObject *)v;
2955
2956   ucnhashError:
2957     PyErr_SetString(
2958         PyExc_UnicodeError,
2959         "\\N escapes not supported (can't load unicodedata module)"
2960         );
2961     Py_XDECREF(v);
2962     Py_XDECREF(errorHandler);
2963     Py_XDECREF(exc);
2964     return NULL;
2965
2966   onError:
2967     Py_XDECREF(v);
2968     Py_XDECREF(errorHandler);
2969     Py_XDECREF(exc);
2970     return NULL;
2971 }
2972
/* unicodeescape_string(), defined below after the findchar() helper,
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2979
2980 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2981                                              Py_ssize_t size,
2982                                              Py_UNICODE ch)
2983 {
2984     /* like wcschr, but doesn't stop at NULL characters */
2985
2986     while (size-- > 0) {
2987         if (*s == ch)
2988             return s;
2989         s++;
2990     }
2991
2992     return NULL;
2993 }
2994
2995 static
2996 PyObject *unicodeescape_string(const Py_UNICODE *s,
2997                                Py_ssize_t size,
2998                                int quotes)
2999 {
3000     PyObject *repr;
3001     char *p;
3002
3003     static const char *hexdigit = "0123456789abcdef";
3004 #ifdef Py_UNICODE_WIDE
3005     const Py_ssize_t expandsize = 10;
3006 #else
3007     const Py_ssize_t expandsize = 6;
3008 #endif
3009
3010     /* XXX(nnorwitz): rather than over-allocating, it would be
3011        better to choose a different scheme.  Perhaps scan the
3012        first N-chars of the string and allocate based on that size.
3013     */
3014     /* Initial allocation is based on the longest-possible unichr
3015        escape.
3016
3017        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3018        unichr, so in this case it's the longest unichr escape. In
3019        narrow (UTF-16) builds this is five chars per source unichr
3020        since there are two unichrs in the surrogate pair, so in narrow
3021        (UTF-16) builds it's not the longest unichr escape.
3022
3023        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3024        so in the narrow (UTF-16) build case it's the longest unichr
3025        escape.
3026     */
3027
3028     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3029         return PyErr_NoMemory();
3030
3031     repr = PyString_FromStringAndSize(NULL,
3032                                       2
3033                                       + expandsize*size
3034                                       + 1);
3035     if (repr == NULL)
3036         return NULL;
3037
3038     p = PyString_AS_STRING(repr);
3039
3040     if (quotes) {
3041         *p++ = 'u';
3042         *p++ = (findchar(s, size, '\'') &&
3043                 !findchar(s, size, '"')) ? '"' : '\'';
3044     }
3045     while (size-- > 0) {
3046         Py_UNICODE ch = *s++;
3047
3048         /* Escape quotes and backslashes */
3049         if ((quotes &&
3050              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3051             *p++ = '\\';
3052             *p++ = (char) ch;
3053             continue;
3054         }
3055
3056 #ifdef Py_UNICODE_WIDE
3057         /* Map 21-bit characters to '\U00xxxxxx' */
3058         else if (ch >= 0x10000) {
3059             *p++ = '\\';
3060             *p++ = 'U';
3061             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3062             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3063             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3064             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3065             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3066             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3067             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3068             *p++ = hexdigit[ch & 0x0000000F];
3069             continue;
3070         }
3071 #else
3072         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3073         else if (ch >= 0xD800 && ch < 0xDC00) {
3074             Py_UNICODE ch2;
3075             Py_UCS4 ucs;
3076
3077             ch2 = *s++;
3078             size--;
3079             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3080                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3081                 *p++ = '\\';
3082                 *p++ = 'U';
3083                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3084                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3085                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3086                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3087                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3088                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3089                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3090                 *p++ = hexdigit[ucs & 0x0000000F];
3091                 continue;
3092             }
3093             /* Fall through: isolated surrogates are copied as-is */
3094             s--;
3095             size++;
3096         }
3097 #endif
3098
3099         /* Map 16-bit characters to '\uxxxx' */
3100         if (ch >= 256) {
3101             *p++ = '\\';
3102             *p++ = 'u';
3103             *p++ = hexdigit[(ch >> 12) & 0x000F];
3104             *p++ = hexdigit[(ch >> 8) & 0x000F];
3105             *p++ = hexdigit[(ch >> 4) & 0x000F];
3106             *p++ = hexdigit[ch & 0x000F];
3107         }
3108
3109         /* Map special whitespace to '\t', \n', '\r' */
3110         else if (ch == '\t') {
3111             *p++ = '\\';
3112             *p++ = 't';
3113         }
3114         else if (ch == '\n') {
3115             *p++ = '\\';
3116             *p++ = 'n';
3117         }
3118         else if (ch == '\r') {
3119             *p++ = '\\';
3120             *p++ = 'r';
3121         }
3122
3123         /* Map non-printable US ASCII to '\xhh' */
3124         else if (ch < ' ' || ch >= 0x7F) {
3125             *p++ = '\\';
3126             *p++ = 'x';
3127             *p++ = hexdigit[(ch >> 4) & 0x000F];
3128             *p++ = hexdigit[ch & 0x000F];
3129         }
3130
3131         /* Copy everything else as-is */
3132         else
3133             *p++ = (char) ch;
3134     }
3135     if (quotes)
3136         *p++ = PyString_AS_STRING(repr)[1];
3137
3138     *p = '\0';
3139     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3140     return repr;
3141 }
3142
3143 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3144                                         Py_ssize_t size)
3145 {
3146     return unicodeescape_string(s, size, 0);
3147 }
3148
3149 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3150 {
3151     if (!PyUnicode_Check(unicode)) {
3152         PyErr_BadArgument();
3153         return NULL;
3154     }
3155     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3156                                          PyUnicode_GET_SIZE(unicode));
3157 }
3158
3159 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3160
3161 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3162                                            Py_ssize_t size,
3163                                            const char *errors)
3164 {
3165     const char *starts = s;
3166     Py_ssize_t startinpos;
3167     Py_ssize_t endinpos;
3168     Py_ssize_t outpos;
3169     PyUnicodeObject *v;
3170     Py_UNICODE *p;
3171     const char *end;
3172     const char *bs;
3173     PyObject *errorHandler = NULL;
3174     PyObject *exc = NULL;
3175
3176     /* Escaped strings will always be longer than the resulting
3177        Unicode string, so we start with size here and then reduce the
3178        length after conversion to the true value. (But decoding error
3179        handler might have to resize the string) */
3180     v = _PyUnicode_New(size);
3181     if (v == NULL)
3182         goto onError;
3183     if (size == 0)
3184         return (PyObject *)v;
3185     p = PyUnicode_AS_UNICODE(v);
3186     end = s + size;
3187     while (s < end) {
3188         unsigned char c;
3189         Py_UCS4 x;
3190         int i;
3191         int count;
3192
3193         /* Non-escape characters are interpreted as Unicode ordinals */
3194         if (*s != '\\') {
3195             *p++ = (unsigned char)*s++;
3196             continue;
3197         }
3198         startinpos = s-starts;
3199
3200         /* \u-escapes are only interpreted iff the number of leading
3201            backslashes if odd */
3202         bs = s;
3203         for (;s < end;) {
3204             if (*s != '\\')
3205                 break;
3206             *p++ = (unsigned char)*s++;
3207         }
3208         if (((s - bs) & 1) == 0 ||
3209             s >= end ||
3210             (*s != 'u' && *s != 'U')) {
3211             continue;
3212         }
3213         p--;
3214         count = *s=='u' ? 4 : 8;
3215         s++;
3216
3217         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3218         outpos = p-PyUnicode_AS_UNICODE(v);
3219         for (x = 0, i = 0; i < count; ++i, ++s) {
3220             c = (unsigned char)*s;
3221             if (!isxdigit(c)) {
3222                 endinpos = s-starts;
3223                 if (unicode_decode_call_errorhandler(
3224                         errors, &errorHandler,
3225                         "rawunicodeescape", "truncated \\uXXXX",
3226                         starts, size, &startinpos, &endinpos, &exc, &s,
3227                         &v, &outpos, &p))
3228                     goto onError;
3229                 goto nextByte;
3230             }
3231             x = (x<<4) & ~0xF;
3232             if (c >= '0' && c <= '9')
3233                 x += c - '0';
3234             else if (c >= 'a' && c <= 'f')
3235                 x += 10 + c - 'a';
3236             else
3237                 x += 10 + c - 'A';
3238         }
3239         if (x <= 0xffff)
3240             /* UCS-2 character */
3241             *p++ = (Py_UNICODE) x;
3242         else if (x <= 0x10ffff) {
3243             /* UCS-4 character. Either store directly, or as
3244                surrogate pair. */
3245 #ifdef Py_UNICODE_WIDE
3246             *p++ = (Py_UNICODE) x;
3247 #else
3248             x -= 0x10000L;
3249             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3250             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3251 #endif
3252         } else {
3253             endinpos = s-starts;
3254             outpos = p-PyUnicode_AS_UNICODE(v);
3255             if (unicode_decode_call_errorhandler(
3256                     errors, &errorHandler,
3257                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3258                     starts, size, &startinpos, &endinpos, &exc, &s,
3259                     &v, &outpos, &p))
3260                 goto onError;
3261         }
3262       nextByte:
3263         ;
3264     }
3265     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3266         goto onError;
3267     Py_XDECREF(errorHandler);
3268     Py_XDECREF(exc);
3269     return (PyObject *)v;
3270
3271   onError:
3272     Py_XDECREF(v);
3273     Py_XDECREF(errorHandler);
3274     Py_XDECREF(exc);
3275     return NULL;
3276 }
3277
3278 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3279                                            Py_ssize_t size)
3280 {
3281     PyObject *repr;
3282     char *p;
3283     char *q;
3284
3285     static const char *hexdigit = "0123456789abcdef";
3286 #ifdef Py_UNICODE_WIDE
3287     const Py_ssize_t expandsize = 10;
3288 #else
3289     const Py_ssize_t expandsize = 6;
3290 #endif
3291
3292     if (size > PY_SSIZE_T_MAX / expandsize)
3293         return PyErr_NoMemory();
3294
3295     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3296     if (repr == NULL)
3297         return NULL;
3298     if (size == 0)
3299         return repr;
3300
3301     p = q = PyString_AS_STRING(repr);
3302     while (size-- > 0) {
3303         Py_UNICODE ch = *s++;
3304 #ifdef Py_UNICODE_WIDE
3305         /* Map 32-bit characters to '\Uxxxxxxxx' */
3306         if (ch >= 0x10000) {
3307             *p++ = '\\';
3308             *p++ = 'U';
3309             *p++ = hexdigit[(ch >> 28) & 0xf];
3310             *p++ = hexdigit[(ch >> 24) & 0xf];
3311             *p++ = hexdigit[(ch >> 20) & 0xf];
3312             *p++ = hexdigit[(ch >> 16) & 0xf];
3313             *p++ = hexdigit[(ch >> 12) & 0xf];
3314             *p++ = hexdigit[(ch >> 8) & 0xf];
3315             *p++ = hexdigit[(ch >> 4) & 0xf];
3316             *p++ = hexdigit[ch & 15];
3317         }
3318         else
3319 #else
3320             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3321             if (ch >= 0xD800 && ch < 0xDC00) {
3322                 Py_UNICODE ch2;
3323                 Py_UCS4 ucs;
3324
3325                 ch2 = *s++;
3326                 size--;
3327                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3328                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3329                     *p++ = '\\';
3330                     *p++ = 'U';
3331                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3332                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3333                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3334                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3335                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3336                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3337                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3338                     *p++ = hexdigit[ucs & 0xf];
3339                     continue;
3340                 }
3341                 /* Fall through: isolated surrogates are copied as-is */
3342                 s--;
3343                 size++;
3344             }
3345 #endif
3346         /* Map 16-bit characters to '\uxxxx' */
3347         if (ch >= 256) {
3348             *p++ = '\\';
3349             *p++ = 'u';
3350             *p++ = hexdigit[(ch >> 12) & 0xf];
3351             *p++ = hexdigit[(ch >> 8) & 0xf];
3352             *p++ = hexdigit[(ch >> 4) & 0xf];
3353             *p++ = hexdigit[ch & 15];
3354         }
3355         /* Copy everything else as-is */
3356         else
3357             *p++ = (char) ch;
3358     }
3359     *p = '\0';
3360     _PyString_Resize(&repr, p - q);
3361     return repr;
3362 }
3363
3364 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3365 {
3366     if (!PyUnicode_Check(unicode)) {
3367         PyErr_BadArgument();
3368         return NULL;
3369     }
3370     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3371                                             PyUnicode_GET_SIZE(unicode));
3372 }
3373
3374 /* --- Unicode Internal Codec ------------------------------------------- */
3375
3376 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3377                                            Py_ssize_t size,
3378                                            const char *errors)
3379 {
3380     const char *starts = s;
3381     Py_ssize_t startinpos;
3382     Py_ssize_t endinpos;
3383     Py_ssize_t outpos;
3384     PyUnicodeObject *v;
3385     Py_UNICODE *p;
3386     const char *end;
3387     const char *reason;
3388     PyObject *errorHandler = NULL;
3389     PyObject *exc = NULL;
3390
3391 #ifdef Py_UNICODE_WIDE
3392     Py_UNICODE unimax = PyUnicode_GetMax();
3393 #endif
3394
3395     /* XXX overflow detection missing */
3396     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3397     if (v == NULL)
3398         goto onError;
3399     if (PyUnicode_GetSize((PyObject *)v) == 0)
3400         return (PyObject *)v;
3401     p = PyUnicode_AS_UNICODE(v);
3402     end = s + size;
3403
3404     while (s < end) {
3405         memcpy(p, s, sizeof(Py_UNICODE));
3406         /* We have to sanity check the raw data, otherwise doom looms for
3407            some malformed UCS-4 data. */
3408         if (
3409 #ifdef Py_UNICODE_WIDE
3410             *p > unimax || *p < 0 ||
3411 #endif
3412             end-s < Py_UNICODE_SIZE
3413             )
3414         {
3415             startinpos = s - starts;
3416             if (end-s < Py_UNICODE_SIZE) {
3417                 endinpos = end-starts;
3418                 reason = "truncated input";
3419             }
3420             else {
3421                 endinpos = s - starts + Py_UNICODE_SIZE;
3422                 reason = "illegal code point (> 0x10FFFF)";
3423             }
3424             outpos = p - PyUnicode_AS_UNICODE(v);
3425             if (unicode_decode_call_errorhandler(
3426                     errors, &errorHandler,
3427                     "unicode_internal", reason,
3428                     starts, size, &startinpos, &endinpos, &exc, &s,
3429                     &v, &outpos, &p)) {
3430                 goto onError;
3431             }
3432         }
3433         else {
3434             p++;
3435             s += Py_UNICODE_SIZE;
3436         }
3437     }
3438
3439     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3440         goto onError;
3441     Py_XDECREF(errorHandler);
3442     Py_XDECREF(exc);
3443     return (PyObject *)v;
3444
3445   onError:
3446     Py_XDECREF(v);
3447     Py_XDECREF(errorHandler);
3448     Py_XDECREF(exc);
3449     return NULL;
3450 }
3451
3452 /* --- Latin-1 Codec ------------------------------------------------------ */
3453
3454 PyObject *PyUnicode_DecodeLatin1(const char *s,
3455                                  Py_ssize_t size,
3456                                  const char *errors)
3457 {
3458     PyUnicodeObject *v;
3459     Py_UNICODE *p;
3460
3461     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3462     if (size == 1) {
3463         Py_UNICODE r = *(unsigned char*)s;
3464         return PyUnicode_FromUnicode(&r, 1);
3465     }
3466
3467     v = _PyUnicode_New(size);
3468     if (v == NULL)
3469         goto onError;
3470     if (size == 0)
3471         return (PyObject *)v;
3472     p = PyUnicode_AS_UNICODE(v);
3473     while (size-- > 0)
3474         *p++ = (unsigned char)*s++;
3475     return (PyObject *)v;
3476
3477   onError:
3478     Py_XDECREF(v);
3479     return NULL;
3480 }
3481
3482 /* create or adjust a UnicodeEncodeError */
3483 static void make_encode_exception(PyObject **exceptionObject,
3484                                   const char *encoding,
3485                                   const Py_UNICODE *unicode, Py_ssize_t size,
3486                                   Py_ssize_t startpos, Py_ssize_t endpos,
3487                                   const char *reason)
3488 {
3489     if (*exceptionObject == NULL) {
3490         *exceptionObject = PyUnicodeEncodeError_Create(
3491             encoding, unicode, size, startpos, endpos, reason);
3492     }
3493     else {
3494         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3495             goto onError;
3496         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3497             goto onError;
3498         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3499             goto onError;
3500         return;
3501       onError:
3502         Py_DECREF(*exceptionObject);
3503         *exceptionObject = NULL;
3504     }
3505 }
3506
3507 /* raises a UnicodeEncodeError */
3508 static void raise_encode_exception(PyObject **exceptionObject,
3509                                    const char *encoding,
3510                                    const Py_UNICODE *unicode, Py_ssize_t size,
3511                                    Py_ssize_t startpos, Py_ssize_t endpos,
3512                                    const char *reason)
3513 {
3514     make_encode_exception(exceptionObject,
3515                           encoding, unicode, size, startpos, endpos, reason);
3516     if (*exceptionObject != NULL)
3517         PyCodec_StrictErrors(*exceptionObject);
3518 }
3519
3520 /* error handling callback helper:
3521    build arguments, call the callback and check the arguments,
3522    put the result into newpos and return the replacement string, which
3523    has to be freed by the caller */
3524 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3525                                                   PyObject **errorHandler,
3526                                                   const char *encoding, const char *reason,
3527                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3528                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3529                                                   Py_ssize_t *newpos)
3530 {
3531     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3532
3533     PyObject *restuple;
3534     PyObject *resunicode;
3535
3536     if (*errorHandler == NULL) {
3537         *errorHandler = PyCodec_LookupError(errors);
3538         if (*errorHandler == NULL)
3539             return NULL;
3540     }
3541
3542     make_encode_exception(exceptionObject,
3543                           encoding, unicode, size, startpos, endpos, reason);
3544     if (*exceptionObject == NULL)
3545         return NULL;
3546
3547     restuple = PyObject_CallFunctionObjArgs(
3548         *errorHandler, *exceptionObject, NULL);
3549     if (restuple == NULL)
3550         return NULL;
3551     if (!PyTuple_Check(restuple)) {
3552         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3553         Py_DECREF(restuple);
3554         return NULL;
3555     }
3556     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3557                           &resunicode, newpos)) {
3558         Py_DECREF(restuple);
3559         return NULL;
3560     }
3561     if (*newpos<0)
3562         *newpos = size+*newpos;
3563     if (*newpos<0 || *newpos>size) {
3564         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3565         Py_DECREF(restuple);
3566         return NULL;
3567     }
3568     Py_INCREF(resunicode);
3569     Py_DECREF(restuple);
3570     return resunicode;
3571 }
3572
3573 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3574                                      Py_ssize_t size,
3575                                      const char *errors,
3576                                      int limit)
3577 {
3578     /* output object */
3579     PyObject *res;
3580     /* pointers to the beginning and end+1 of input */
3581     const Py_UNICODE *startp = p;
3582     const Py_UNICODE *endp = p + size;
3583     /* pointer to the beginning of the unencodable characters */
3584     /* const Py_UNICODE *badp = NULL; */
3585     /* pointer into the output */
3586     char *str;
3587     /* current output position */
3588     Py_ssize_t respos = 0;
3589     Py_ssize_t ressize;
3590     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3591     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3592     PyObject *errorHandler = NULL;
3593     PyObject *exc = NULL;
3594     /* the following variable is used for caching string comparisons
3595      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3596     int known_errorHandler = -1;
3597
3598     /* allocate enough for a simple encoding without
3599        replacements, if we need more, we'll resize */
3600     res = PyString_FromStringAndSize(NULL, size);
3601     if (res == NULL)
3602         goto onError;
3603     if (size == 0)
3604         return res;
3605     str = PyString_AS_STRING(res);
3606     ressize = size;
3607
3608     while (p<endp) {
3609         Py_UNICODE c = *p;
3610
3611         /* can we encode this? */
3612         if (c<limit) {
3613             /* no overflow check, because we know that the space is enough */
3614             *str++ = (char)c;
3615             ++p;
3616         }
3617         else {
3618             Py_ssize_t unicodepos = p-startp;
3619             Py_ssize_t requiredsize;
3620             PyObject *repunicode;
3621             Py_ssize_t repsize;
3622             Py_ssize_t newpos;
3623             Py_ssize_t respos;
3624             Py_UNICODE *uni2;
3625             /* startpos for collecting unencodable chars */
3626             const Py_UNICODE *collstart = p;
3627             const Py_UNICODE *collend = p;
3628             /* find all unecodable characters */
3629             while ((collend < endp) && ((*collend)>=limit))
3630                 ++collend;
3631             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3632             if (known_errorHandler==-1) {
3633                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3634                     known_errorHandler = 1;
3635                 else if (!strcmp(errors, "replace"))
3636                     known_errorHandler = 2;
3637                 else if (!strcmp(errors, "ignore"))
3638                     known_errorHandler = 3;
3639                 else if (!strcmp(errors, "xmlcharrefreplace"))
3640                     known_errorHandler = 4;
3641                 else
3642                     known_errorHandler = 0;
3643             }
3644             switch (known_errorHandler) {
3645             case 1: /* strict */
3646                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3647                 goto onError;
3648             case 2: /* replace */
3649                 while (collstart++<collend)
3650                     *str++ = '?'; /* fall through */
3651             case 3: /* ignore */
3652                 p = collend;
3653                 break;
3654             case 4: /* xmlcharrefreplace */
3655                 respos = str-PyString_AS_STRING(res);
3656                 /* determine replacement size (temporarily (mis)uses p) */
3657                 for (p = collstart, repsize = 0; p < collend; ++p) {
3658                     if (*p<10)
3659                         repsize += 2+1+1;
3660                     else if (*p<100)
3661                         repsize += 2+2+1;
3662                     else if (*p<1000)
3663                         repsize += 2+3+1;
3664                     else if (*p<10000)
3665                         repsize += 2+4+1;
3666 #ifndef Py_UNICODE_WIDE
3667                     else
3668                         repsize += 2+5+1;
3669 #else
3670                     else if (*p<100000)
3671                         repsize += 2+5+1;
3672                     else if (*p<1000000)
3673                         repsize += 2+6+1;
3674                     else
3675                         repsize += 2+7+1;
3676 #endif
3677                 }
3678                 requiredsize = respos+repsize+(endp-collend);
3679                 if (requiredsize > ressize) {
3680                     if (requiredsize<2*ressize)
3681                         requiredsize = 2*ressize;
3682                     if (_PyString_Resize(&res, requiredsize))
3683                         goto onError;
3684                     str = PyString_AS_STRING(res) + respos;
3685                     ressize = requiredsize;
3686                 }
3687                 /* generate replacement (temporarily (mis)uses p) */
3688                 for (p = collstart; p < collend; ++p) {
3689                     str += sprintf(str, "&#%d;", (int)*p);
3690                 }
3691                 p = collend;
3692                 break;
3693             default:
3694                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3695                                                               encoding, reason, startp, size, &exc,
3696                                                               collstart-startp, collend-startp, &newpos);
3697                 if (repunicode == NULL)
3698                     goto onError;
3699                 /* need more space? (at least enough for what we have+the
3700                    replacement+the rest of the string, so we won't have to
3701                    check space for encodable characters) */
3702                 respos = str-PyString_AS_STRING(res);
3703                 repsize = PyUnicode_GET_SIZE(repunicode);
3704                 requiredsize = respos+repsize+(endp-collend);
3705                 if (requiredsize > ressize) {
3706                     if (requiredsize<2*ressize)
3707                         requiredsize = 2*ressize;
3708                     if (_PyString_Resize(&res, requiredsize)) {
3709                         Py_DECREF(repunicode);
3710                         goto onError;
3711                     }
3712                     str = PyString_AS_STRING(res) + respos;
3713                     ressize = requiredsize;
3714                 }
3715                 /* check if there is anything unencodable in the replacement
3716                    and copy it to the output */
3717                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3718                     c = *uni2;
3719                     if (c >= limit) {
3720                         raise_encode_exception(&exc, encoding, startp, size,
3721                                                unicodepos, unicodepos+1, reason);
3722                         Py_DECREF(repunicode);
3723                         goto onError;
3724                     }
3725                     *str = (char)c;
3726                 }
3727                 p = startp + newpos;
3728                 Py_DECREF(repunicode);
3729             }
3730         }
3731     }
3732     /* Resize if we allocated to much */
3733     respos = str-PyString_AS_STRING(res);
3734     if (respos<ressize)
3735         /* If this falls res will be NULL */
3736         _PyString_Resize(&res, respos);
3737     Py_XDECREF(errorHandler);
3738     Py_XDECREF(exc);
3739     return res;
3740
3741   onError:
3742     Py_XDECREF(res);
3743     Py_XDECREF(errorHandler);
3744     Py_XDECREF(exc);
3745     return NULL;
3746 }
3747
3748 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3749                                  Py_ssize_t size,
3750                                  const char *errors)
3751 {
3752     return unicode_encode_ucs1(p, size, errors, 256);
3753 }
3754
3755 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3756 {
3757     if (!PyUnicode_Check(unicode)) {
3758         PyErr_BadArgument();
3759         return NULL;
3760     }
3761     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3762                                   PyUnicode_GET_SIZE(unicode),
3763                                   NULL);
3764 }
3765
3766 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3767
3768 PyObject *PyUnicode_DecodeASCII(const char *s,
3769                                 Py_ssize_t size,
3770                                 const char *errors)
3771 {
3772     const char *starts = s;
3773     PyUnicodeObject *v;
3774     Py_UNICODE *p;
3775     Py_ssize_t startinpos;
3776     Py_ssize_t endinpos;
3777     Py_ssize_t outpos;
3778     const char *e;
3779     PyObject *errorHandler = NULL;
3780     PyObject *exc = NULL;
3781
3782     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3783     if (size == 1 && *(unsigned char*)s < 128) {
3784         Py_UNICODE r = *(unsigned char*)s;
3785         return PyUnicode_FromUnicode(&r, 1);
3786     }
3787
3788     v = _PyUnicode_New(size);
3789     if (v == NULL)
3790         goto onError;
3791     if (size == 0)
3792         return (PyObject *)v;
3793     p = PyUnicode_AS_UNICODE(v);
3794     e = s + size;
3795     while (s < e) {
3796         register unsigned char c = (unsigned char)*s;
3797         if (c < 128) {
3798             *p++ = c;
3799             ++s;
3800         }
3801         else {
3802             startinpos = s-starts;
3803             endinpos = startinpos + 1;
3804             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3805             if (unicode_decode_call_errorhandler(
3806                     errors, &errorHandler,
3807                     "ascii", "ordinal not in range(128)",
3808                     starts, size, &startinpos, &endinpos, &exc, &s,
3809                     &v, &outpos, &p))
3810                 goto onError;
3811         }
3812     }
3813     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3814         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3815             goto onError;
3816     Py_XDECREF(errorHandler);
3817     Py_XDECREF(exc);
3818     return (PyObject *)v;
3819
3820   onError:
3821     Py_XDECREF(v);
3822     Py_XDECREF(errorHandler);
3823     Py_XDECREF(exc);
3824     return NULL;
3825 }
3826
3827 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3828                                 Py_ssize_t size,
3829                                 const char *errors)
3830 {
3831     return unicode_encode_ucs1(p, size, errors, 128);
3832 }
3833
3834 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3835 {
3836     if (!PyUnicode_Check(unicode)) {
3837         PyErr_BadArgument();
3838         return NULL;
3839     }
3840     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3841                                  PyUnicode_GET_SIZE(unicode),
3842                                  NULL);
3843 }
3844
3845 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3846
3847 /* --- MBCS codecs for Windows -------------------------------------------- */
3848
3849 #if SIZEOF_INT < SIZEOF_SIZE_T
3850 #define NEED_RETRY
3851 #endif
3852
3853 /* XXX This code is limited to "true" double-byte encodings, as
3854    a) it assumes an incomplete character consists of a single byte, and
3855    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3856    encodings, see IsDBCSLeadByteEx documentation. */
3857
/* Return 1 if the byte at s[offset] is a DBCS lead byte that actually
   starts a character, 0 otherwise.  The byte itself must satisfy
   IsDBCSLeadByte(); the preceding character (located with CharPrev) is
   then inspected to decide whether this byte opens a new character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3868
3869 /*
3870  * Decode MBCS string into unicode object. If 'final' is set, converts
3871  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3872  */
3873 static int decode_mbcs(PyUnicodeObject **v,
3874                        const char *s, /* MBCS string */
3875                        int size, /* sizeof MBCS string */
3876                        int final)
3877 {
3878     Py_UNICODE *p;
3879     Py_ssize_t n = 0;
3880     int usize = 0;
3881
3882     assert(size >= 0);
3883
3884     /* Skip trailing lead-byte unless 'final' is set */
3885     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3886         --size;
3887
3888     /* First get the size of the result */
3889     if (size > 0) {
3890         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3891         if (usize == 0) {
3892             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3893             return -1;
3894         }
3895     }
3896
3897     if (*v == NULL) {
3898         /* Create unicode object */
3899         *v = _PyUnicode_New(usize);
3900         if (*v == NULL)
3901             return -1;
3902     }
3903     else {
3904         /* Extend unicode object */
3905         n = PyUnicode_GET_SIZE(*v);
3906         if (_PyUnicode_Resize(v, n + usize) < 0)
3907             return -1;
3908     }
3909
3910     /* Do the conversion */
3911     if (size > 0) {
3912         p = PyUnicode_AS_UNICODE(*v) + n;
3913         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3914             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3915             return -1;
3916         }
3917     }
3918
3919     return size;
3920 }
3921
3922 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3923                                        Py_ssize_t size,
3924                                        const char *errors,
3925                                        Py_ssize_t *consumed)
3926 {
3927     PyUnicodeObject *v = NULL;
3928     int done;
3929
3930     if (consumed)
3931         *consumed = 0;
3932
3933 #ifdef NEED_RETRY
3934   retry:
3935     if (size > INT_MAX)
3936         done = decode_mbcs(&v, s, INT_MAX, 0);
3937     else
3938 #endif
3939         done = decode_mbcs(&v, s, (int)size, !consumed);
3940
3941     if (done < 0) {
3942         Py_XDECREF(v);
3943         return NULL;
3944     }
3945
3946     if (consumed)
3947         *consumed += done;
3948
3949 #ifdef NEED_RETRY
3950     if (size > INT_MAX) {
3951         s += done;
3952         size -= done;
3953         goto retry;
3954     }
3955 #endif
3956
3957     return (PyObject *)v;
3958 }
3959
3960 PyObject *PyUnicode_DecodeMBCS(const char *s,
3961                                Py_ssize_t size,
3962                                const char *errors)
3963 {
3964     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3965 }
3966
3967 /*
3968  * Convert unicode into string object (MBCS).
3969  * Returns 0 if succeed, -1 otherwise.
3970  */
3971 static int encode_mbcs(PyObject **repr,
3972                        const Py_UNICODE *p, /* unicode */
3973                        int size) /* size of unicode */
3974 {
3975     int mbcssize = 0;
3976     Py_ssize_t n = 0;
3977
3978     assert(size >= 0);
3979
3980     /* First get the size of the result */
3981     if (size > 0) {
3982         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3983         if (mbcssize == 0) {
3984             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3985             return -1;
3986         }
3987     }
3988
3989     if (*repr == NULL) {
3990         /* Create string object */
3991         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3992         if (*repr == NULL)
3993             return -1;
3994     }
3995     else {
3996         /* Extend string object */
3997         n = PyString_Size(*repr);
3998         if (_PyString_Resize(repr, n + mbcssize) < 0)
3999             return -1;
4000     }
4001
4002     /* Do the conversion */
4003     if (size > 0) {
4004         char *s = PyString_AS_STRING(*repr) + n;
4005         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4006             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4007             return -1;
4008         }
4009     }
4010
4011     return 0;
4012 }
4013
4014 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4015                                Py_ssize_t size,
4016                                const char *errors)
4017 {
4018     PyObject *repr = NULL;
4019     int ret;
4020
4021 #ifdef NEED_RETRY
4022   retry:
4023     if (size > INT_MAX)
4024         ret = encode_mbcs(&repr, p, INT_MAX);
4025     else
4026 #endif
4027         ret = encode_mbcs(&repr, p, (int)size);
4028
4029     if (ret < 0) {
4030         Py_XDECREF(repr);
4031         return NULL;
4032     }
4033
4034 #ifdef NEED_RETRY
4035     if (size > INT_MAX) {
4036         p += INT_MAX;
4037         size -= INT_MAX;
4038         goto retry;
4039     }
4040 #endif
4041
4042     return repr;
4043 }
4044
4045 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4046 {
4047     if (!PyUnicode_Check(unicode)) {
4048         PyErr_BadArgument();
4049         return NULL;
4050     }
4051     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4052                                 PyUnicode_GET_SIZE(unicode),
4053                                 NULL);
4054 }
4055
4056 #undef NEED_RETRY
4057
4058 #endif /* MS_WINDOWS */
4059
4060 /* --- Character Mapping Codec -------------------------------------------- */
4061
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL        -> delegates to PyUnicode_DecodeLatin1().
   exact unicode mapping  -> each byte indexes the string; a byte beyond
                             its length or an entry of 0xfffe counts as
                             undefined.
   any other object       -> looked up with PyObject_GetItem(); the value
                             may be an integer in range(65536), a unicode
                             string (of any length, including empty) or
                             None; a LookupError is treated like None.

   Undefined mappings are passed to unicode_decode_call_errorhandler()
   with the `errors` name.  Returns a new reference, or NULL on failure. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots left over from the last 1-n resize */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a plain unicode string used as a table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                /* s is not advanced here; the handler was given &s */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* generic path: query the mapping object for every byte */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s is not advanced here; the handler was given &s */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        /* remember the offset: the resize may move the buffer */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* grow by the shortfall plus 4*targetsize of headroom */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim the result down to the characters actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4222
4223 /* Charmap encoding: the lookup table */
4224
4225 struct encoding_map{
4226     PyObject_HEAD
4227     unsigned char level1[32];
4228     int count2, count3;
4229     unsigned char level23[1];
4230 };
4231
4232 static PyObject*
4233 encoding_map_size(PyObject *obj, PyObject* args)
4234 {
4235     struct encoding_map *map = (struct encoding_map*)obj;
4236     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4237                           128*map->count3);
4238 }
4239
4240 static PyMethodDef encoding_map_methods[] = {
4241     {"size", encoding_map_size, METH_NOARGS,
4242      PyDoc_STR("Return the size (in bytes) of this object") },
4243     { 0 }
4244 };
4245
4246 static void
4247 encoding_map_dealloc(PyObject* o)
4248 {
4249     PyObject_FREE(o);
4250 }
4251
/* Type object for the EncodingMap lookup table built by
   PyUnicode_BuildEncodingMap().  Only a deallocator and a method table
   (encoding_map_methods) are provided; every other slot is left empty.
   tp_new is 0 and instances are created in PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4295
4296 PyObject*
4297 PyUnicode_BuildEncodingMap(PyObject* string)
4298 {
4299     Py_UNICODE *decode;
4300     PyObject *result;
4301     struct encoding_map *mresult;
4302     int i;
4303     int need_dict = 0;
4304     unsigned char level1[32];
4305     unsigned char level2[512];
4306     unsigned char *mlevel1, *mlevel2, *mlevel3;
4307     int count2 = 0, count3 = 0;
4308
4309     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4310         PyErr_BadArgument();
4311         return NULL;
4312     }
4313     decode = PyUnicode_AS_UNICODE(string);
4314     memset(level1, 0xFF, sizeof level1);
4315     memset(level2, 0xFF, sizeof level2);
4316
4317     /* If there isn't a one-to-one mapping of NULL to \0,
4318        or if there are non-BMP characters, we need to use
4319        a mapping dictionary. */
4320     if (decode[0] != 0)
4321         need_dict = 1;
4322     for (i = 1; i < 256; i++) {
4323         int l1, l2;
4324         if (decode[i] == 0
4325 #ifdef Py_UNICODE_WIDE
4326             || decode[i] > 0xFFFF
4327 #endif
4328             ) {
4329             need_dict = 1;
4330             break;
4331         }
4332         if (decode[i] == 0xFFFE)
4333             /* unmapped character */
4334             continue;
4335         l1 = decode[i] >> 11;
4336         l2 = decode[i] >> 7;
4337         if (level1[l1] == 0xFF)
4338             level1[l1] = count2++;
4339         if (level2[l2] == 0xFF)
4340             level2[l2] = count3++;
4341     }
4342
4343     if (count2 >= 0xFF || count3 >= 0xFF)
4344         need_dict = 1;
4345
4346     if (need_dict) {
4347         PyObject *result = PyDict_New();
4348         PyObject *key, *value;
4349         if (!result)
4350             return NULL;
4351         for (i = 0; i < 256; i++) {
4352             key = value = NULL;
4353             key = PyInt_FromLong(decode[i]);
4354             value = PyInt_FromLong(i);
4355             if (!key || !value)
4356                 goto failed1;
4357             if (PyDict_SetItem(result, key, value) == -1)
4358                 goto failed1;
4359             Py_DECREF(key);
4360             Py_DECREF(value);
4361         }
4362         return result;
4363       failed1:
4364         Py_XDECREF(key);
4365         Py_XDECREF(value);
4366         Py_DECREF(result);
4367         return NULL;
4368     }
4369
4370     /* Create a three-level trie */
4371     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4372                              16*count2 + 128*count3 - 1);
4373     if (!result)
4374         return PyErr_NoMemory();
4375     PyObject_Init(result, &EncodingMapType);
4376     mresult = (struct encoding_map*)result;
4377     mresult->count2 = count2;
4378     mresult->count3 = count3;
4379     mlevel1 = mresult->level1;
4380     mlevel2 = mresult->level23;
4381     mlevel3 = mresult->level23 + 16*count2;
4382     memcpy(mlevel1, level1, 32);
4383     memset(mlevel2, 0xFF, 16*count2);
4384     memset(mlevel3, 0, 128*count3);
4385     count3 = 0;
4386     for (i = 1; i < 256; i++) {
4387         int o1, o2, o3, i2, i3;
4388         if (decode[i] == 0xFFFE)
4389             /* unmapped character */
4390             continue;
4391         o1 = decode[i]>>11;
4392         o2 = (decode[i]>>7) & 0xF;
4393         i2 = 16*mlevel1[o1] + o2;
4394         if (mlevel2[i2] == 0xFF)
4395             mlevel2[i2] = count3++;
4396         o3 = decode[i] & 0x7F;
4397         i3 = 128*mlevel2[i2] + o3;
4398         mlevel3[i3] = i;
4399     }
4400     return result;
4401 }
4402
4403 static int
4404 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4405 {
4406     struct encoding_map *map = (struct encoding_map*)mapping;
4407     int l1 = c>>11;
4408     int l2 = (c>>7) & 0xF;
4409     int l3 = c & 0x7F;
4410     int i;
4411
4412 #ifdef Py_UNICODE_WIDE
4413     if (c > 0xFFFF) {
4414         return -1;
4415     }
4416 #endif
4417     if (c == 0)
4418         return 0;
4419     /* level 1*/
4420     i = map->level1[l1];
4421     if (i == 0xFF) {
4422         return -1;
4423     }
4424     /* level 2*/
4425     i = map->level23[16*i+l2];
4426     if (i == 0xFF) {
4427         return -1;
4428     }
4429     /* level 3 */
4430     i = map->level23[16*map->count2 + 128*i + l3];
4431     if (i == 0) {
4432         return -1;
4433     }
4434     return i;
4435 }
4436
4437 /* Lookup the character ch in the mapping. If the character
4438    can't be found, Py_None is returned (or NULL, if another
4439    error occurred). */
4440 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4441 {
4442     PyObject *w = PyInt_FromLong((long)c);
4443     PyObject *x;
4444
4445     if (w == NULL)
4446         return NULL;
4447     x = PyObject_GetItem(mapping, w);
4448     Py_DECREF(w);
4449     if (x == NULL) {
4450         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4451             /* No mapping found means: mapping is undefined. */
4452             PyErr_Clear();
4453             x = Py_None;
4454             Py_INCREF(x);
4455             return x;
4456         } else
4457             return NULL;
4458     }
4459     else if (x == Py_None)
4460         return x;
4461     else if (PyInt_Check(x)) {
4462         long value = PyInt_AS_LONG(x);
4463         if (value < 0 || value > 255) {
4464             PyErr_SetString(PyExc_TypeError,
4465                             "character mapping must be in range(256)");
4466             Py_DECREF(x);
4467             return NULL;
4468         }
4469         return x;
4470     }
4471     else if (PyString_Check(x))
4472         return x;
4473     else {
4474         /* wrong return value */
4475         PyErr_SetString(PyExc_TypeError,
4476                         "character mapping must return integer, None or str");
4477         Py_DECREF(x);
4478         return NULL;
4479     }
4480 }
4481
4482 static int
4483 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4484 {
4485     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4486     /* exponentially overallocate to minimize reallocations */
4487     if (requiredsize < 2*outsize)
4488         requiredsize = 2*outsize;
4489     if (_PyString_Resize(outobj, requiredsize)) {
4490         return 0;
4491     }
4492     return 1;
4493 }
4494
/* Outcome of a single charmap encoding step. */
typedef enum charmapencode_result {
    enc_SUCCESS,     /* the character was written to the output */
    enc_FAILED,      /* the character has no mapping */
    enc_EXCEPTION    /* an error occurred */
}charmapencode_result;
4498 /* lookup the character, put the result in the output string and adjust
4499    various state variables. Reallocate the output string if not enough
4500    space is available. Return a new reference to the object that
4501    was put in the output buffer, or Py_None, if the mapping was undefined
4502    (in which case no character was written) or NULL, if a
4503    reallocation error occurred. The caller must decref the result */
4504 static
4505 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4506                                           PyObject **outobj, Py_ssize_t *outpos)
4507 {
4508     PyObject *rep;
4509     char *outstart;
4510     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4511
4512     if (Py_TYPE(mapping) == &EncodingMapType) {
4513         int res = encoding_map_lookup(c, mapping);
4514         Py_ssize_t requiredsize = *outpos+1;
4515         if (res == -1)
4516             return enc_FAILED;
4517         if (outsize<requiredsize)
4518             if (!charmapencode_resize(outobj, outpos, requiredsize))
4519                 return enc_EXCEPTION;
4520         outstart = PyString_AS_STRING(*outobj);
4521         outstart[(*outpos)++] = (char)res;
4522         return enc_SUCCESS;
4523     }
4524
4525     rep = charmapencode_lookup(c, mapping);
4526     if (rep==NULL)
4527         return enc_EXCEPTION;
4528     else if (rep==Py_None) {
4529         Py_DECREF(rep);
4530         return enc_FAILED;
4531     } else {
4532         if (PyInt_Check(rep)) {
4533             Py_ssize_t requiredsize = *outpos+1;
4534             if (outsize<requiredsize)
4535                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4536                     Py_DECREF(rep);
4537                     return enc_EXCEPTION;
4538                 }
4539             outstart = PyString_AS_STRING(*outobj);
4540             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4541         }
4542         else {
4543             const char *repchars = PyString_AS_STRING(rep);
4544             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4545             Py_ssize_t requiredsize = *outpos+repsize;
4546             if (outsize<requiredsize)
4547                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4548                     Py_DECREF(rep);
4549                     return enc_EXCEPTION;
4550                 }
4551             outstart = PyString_AS_STRING(*outobj);
4552             memcpy(outstart + *outpos, repchars, repsize);
4553             *outpos += repsize;
4554         }
4555     }
4556     Py_DECREF(rep);
4557     return enc_SUCCESS;
4558 }
4559
4560 /* handle an error in PyUnicode_EncodeCharmap
4561    Return 0 on success, -1 on error */
4562 static
4563 int charmap_encoding_error(
4564     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4565     PyObject **exceptionObject,
4566     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4567     PyObject **res, Py_ssize_t *respos)
4568 {
4569     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4570     Py_ssize_t repsize;
4571     Py_ssize_t newpos;
4572     Py_UNICODE *uni2;
4573     /* startpos for collecting unencodable chars */
4574     Py_ssize_t collstartpos = *inpos;
4575     Py_ssize_t collendpos = *inpos+1;
4576     Py_ssize_t collpos;
4577     char *encoding = "charmap";
4578     char *reason = "character maps to <undefined>";
4579     charmapencode_result x;
4580
4581     /* find all unencodable characters */
4582     while (collendpos < size) {
4583         PyObject *rep;
4584         if (Py_TYPE(mapping) == &EncodingMapType) {
4585             int res = encoding_map_lookup(p[collendpos], mapping);
4586             if (res != -1)
4587                 break;
4588             ++collendpos;
4589             continue;
4590         }
4591
4592         rep = charmapencode_lookup(p[collendpos], mapping);
4593         if (rep==NULL)
4594             return -1;
4595         else if (rep!=Py_None) {
4596             Py_DECREF(rep);
4597             break;
4598         }
4599         Py_DECREF(rep);
4600         ++collendpos;
4601     }
4602     /* cache callback name lookup
4603      * (if not done yet, i.e. it's the first error) */
4604     if (*known_errorHandler==-1) {
4605         if ((errors==NULL) || (!strcmp(errors, "strict")))
4606             *known_errorHandler = 1;
4607         else if (!strcmp(errors, "replace"))
4608             *known_errorHandler = 2;
4609         else if (!strcmp(errors, "ignore"))
4610             *known_errorHandler = 3;
4611         else if (!strcmp(errors, "xmlcharrefreplace"))
4612             *known_errorHandler = 4;
4613         else
4614             *known_errorHandler = 0;
4615     }
4616     switch (*known_errorHandler) {
4617     case 1: /* strict */
4618         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619         return -1;
4620     case 2: /* replace */
4621         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4622             x = charmapencode_output('?', mapping, res, respos);
4623             if (x==enc_EXCEPTION) {
4624                 return -1;
4625             }
4626             else if (x==enc_FAILED) {
4627                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4628                 return -1;
4629             }
4630         }
4631         /* fall through */
4632     case 3: /* ignore */
4633         *inpos = collendpos;
4634         break;
4635     case 4: /* xmlcharrefreplace */
4636         /* generate replacement (temporarily (mis)uses p) */
4637         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4638             char buffer[2+29+1+1];
4639             char *cp;
4640             sprintf(buffer, "&#%d;", (int)p[collpos]);
4641             for (cp = buffer; *cp; ++cp) {
4642                 x = charmapencode_output(*cp, mapping, res, respos);
4643                 if (x==enc_EXCEPTION)
4644                     return -1;
4645                 else if (x==enc_FAILED) {
4646                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4647                     return -1;
4648                 }
4649             }
4650         }
4651         *inpos = collendpos;
4652         break;
4653     default:
4654         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4655                                                       encoding, reason, p, size, exceptionObject,
4656                                                       collstartpos, collendpos, &newpos);
4657         if (repunicode == NULL)
4658             return -1;
4659         /* generate replacement  */
4660         repsize = PyUnicode_GET_SIZE(repunicode);
4661         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4662             x = charmapencode_output(*uni2, mapping, res, respos);
4663             if (x==enc_EXCEPTION) {
4664                 return -1;
4665             }
4666             else if (x==enc_FAILED) {
4667                 Py_DECREF(repunicode);
4668                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4669                 return -1;
4670             }
4671         }
4672         *inpos = newpos;
4673         Py_DECREF(repunicode);
4674     }
4675     return 0;
4676 }
4677
4678 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4679                                   Py_ssize_t size,
4680                                   PyObject *mapping,
4681                                   const char *errors)
4682 {
4683     /* output object */
4684     PyObject *res = NULL;
4685     /* current input position */
4686     Py_ssize_t inpos = 0;
4687     /* current output position */
4688     Py_ssize_t respos = 0;
4689     PyObject *errorHandler = NULL;
4690     PyObject *exc = NULL;
4691     /* the following variable is used for caching string comparisons
4692      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4693      * 3=ignore, 4=xmlcharrefreplace */
4694     int known_errorHandler = -1;
4695
4696     /* Default to Latin-1 */
4697     if (mapping == NULL)
4698         return PyUnicode_EncodeLatin1(p, size, errors);
4699
4700     /* allocate enough for a simple encoding without
4701        replacements, if we need more, we'll resize */
4702     res = PyString_FromStringAndSize(NULL, size);
4703     if (res == NULL)
4704         goto onError;
4705     if (size == 0)
4706         return res;
4707
4708     while (inpos<size) {
4709         /* try to encode it */
4710         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4711         if (x==enc_EXCEPTION) /* error */
4712             goto onError;
4713         if (x==enc_FAILED) { /* unencodable character */
4714             if (charmap_encoding_error(p, size, &inpos, mapping,
4715                                        &exc,
4716                                        &known_errorHandler, &errorHandler, errors,
4717                                        &res, &respos)) {
4718                 goto onError;
4719             }
4720         }
4721         else
4722             /* done with this character => adjust input position */
4723             ++inpos;
4724     }
4725
4726     /* Resize if we allocated to much */
4727     if (respos<PyString_GET_SIZE(res)) {
4728         if (_PyString_Resize(&res, respos))
4729             goto onError;
4730     }
4731     Py_XDECREF(exc);
4732     Py_XDECREF(errorHandler);
4733     return res;
4734
4735   onError:
4736     Py_XDECREF(res);
4737     Py_XDECREF(exc);
4738     Py_XDECREF(errorHandler);
4739     return NULL;
4740 }
4741
4742 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4743                                     PyObject *mapping)
4744 {
4745     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4746         PyErr_BadArgument();
4747         return NULL;
4748     }
4749     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4750                                    PyUnicode_GET_SIZE(unicode),
4751                                    mapping,
4752                                    NULL);
4753 }
4754
4755 /* create or adjust a UnicodeTranslateError */
4756 static void make_translate_exception(PyObject **exceptionObject,
4757                                      const Py_UNICODE *unicode, Py_ssize_t size,
4758                                      Py_ssize_t startpos, Py_ssize_t endpos,
4759                                      const char *reason)
4760 {
4761     if (*exceptionObject == NULL) {
4762         *exceptionObject = PyUnicodeTranslateError_Create(
4763             unicode, size, startpos, endpos, reason);
4764     }
4765     else {
4766         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4767             goto onError;
4768         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4769             goto onError;
4770         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4771             goto onError;
4772         return;
4773       onError:
4774         Py_DECREF(*exceptionObject);
4775         *exceptionObject = NULL;
4776     }
4777 }
4778
4779 /* raises a UnicodeTranslateError */
4780 static void raise_translate_exception(PyObject **exceptionObject,
4781                                       const Py_UNICODE *unicode, Py_ssize_t size,
4782                                       Py_ssize_t startpos, Py_ssize_t endpos,
4783                                       const char *reason)
4784 {
4785     make_translate_exception(exceptionObject,
4786                              unicode, size, startpos, endpos, reason);
4787     if (*exceptionObject != NULL)
4788         PyCodec_StrictErrors(*exceptionObject);
4789 }
4790
4791 /* error handling callback helper:
4792    build arguments, call the callback and check the arguments,
4793    put the result into newpos and return the replacement string, which
4794    has to be freed by the caller */
4795 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4796                                                      PyObject **errorHandler,
4797                                                      const char *reason,
4798                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4799                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4800                                                      Py_ssize_t *newpos)
4801 {
4802     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4803
4804     Py_ssize_t i_newpos;
4805     PyObject *restuple;
4806     PyObject *resunicode;
4807
4808     if (*errorHandler == NULL) {
4809         *errorHandler = PyCodec_LookupError(errors);
4810         if (*errorHandler == NULL)
4811             return NULL;
4812     }
4813
4814     make_translate_exception(exceptionObject,
4815                              unicode, size, startpos, endpos, reason);
4816     if (*exceptionObject == NULL)
4817         return NULL;
4818
4819     restuple = PyObject_CallFunctionObjArgs(
4820         *errorHandler, *exceptionObject, NULL);
4821     if (restuple == NULL)
4822         return NULL;
4823     if (!PyTuple_Check(restuple)) {
4824         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4825         Py_DECREF(restuple);
4826         return NULL;
4827     }
4828     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4829                           &resunicode, &i_newpos)) {
4830         Py_DECREF(restuple);
4831         return NULL;
4832     }
4833     if (i_newpos<0)
4834         *newpos = size+i_newpos;
4835     else
4836         *newpos = i_newpos;
4837     if (*newpos<0 || *newpos>size) {
4838         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4839         Py_DECREF(restuple);
4840         return NULL;
4841     }
4842     Py_INCREF(resunicode);
4843     Py_DECREF(restuple);
4844     return resunicode;
4845 }
4846
4847 /* Lookup the character ch in the mapping and put the result in result,
4848    which must be decrefed by the caller.
4849    Return 0 on success, -1 on error */
4850 static
4851 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4852 {
4853     PyObject *w = PyInt_FromLong((long)c);
4854     PyObject *x;
4855
4856     if (w == NULL)
4857         return -1;
4858     x = PyObject_GetItem(mapping, w);
4859     Py_DECREF(w);
4860     if (x == NULL) {
4861         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4862             /* No mapping found means: use 1:1 mapping. */
4863             PyErr_Clear();
4864             *result = NULL;
4865             return 0;
4866         } else
4867             return -1;
4868     }
4869     else if (x == Py_None) {
4870         *result = x;
4871         return 0;
4872     }
4873     else if (PyInt_Check(x)) {
4874         long value = PyInt_AS_LONG(x);
4875         long max = PyUnicode_GetMax();
4876         if (value < 0 || value > max) {
4877             PyErr_Format(PyExc_TypeError,
4878                          "character mapping must be in range(0x%lx)", max+1);
4879             Py_DECREF(x);
4880             return -1;
4881         }
4882         *result = x;
4883         return 0;
4884     }
4885     else if (PyUnicode_Check(x)) {
4886         *result = x;
4887         return 0;
4888     }
4889     else {
4890         /* wrong return value */
4891         PyErr_SetString(PyExc_TypeError,
4892                         "character mapping must return integer, None or unicode");
4893         Py_DECREF(x);
4894         return -1;
4895     }
4896 }
4897 /* ensure that *outobj is at least requiredsize characters long,
4898    if not reallocate and adjust various state variables.
4899    Return 0 on success, -1 on error */
4900 static
4901 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4902                                Py_ssize_t requiredsize)
4903 {
4904     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4905     if (requiredsize > oldsize) {
4906         /* remember old output position */
4907         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4908         /* exponentially overallocate to minimize reallocations */
4909         if (requiredsize < 2 * oldsize)
4910             requiredsize = 2 * oldsize;
4911         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4912             return -1;
4913         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4914     }
4915     return 0;
4916 }
4917 /* lookup the character, put the result in the output string and adjust
4918    various state variables. Return a new reference to the object that
4919    was put in the output buffer in *result, or Py_None, if the mapping was
4920    undefined (in which case no character was written).
4921    The called must decref result.
4922    Return 0 on success, -1 on error. */
4923 static
4924 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4925                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4926                             PyObject **res)
4927 {
4928     if (charmaptranslate_lookup(*curinp, mapping, res))
4929         return -1;
4930     if (*res==NULL) {
4931         /* not found => default to 1:1 mapping */
4932         *(*outp)++ = *curinp;
4933     }
4934     else if (*res==Py_None)
4935         ;
4936     else if (PyInt_Check(*res)) {
4937         /* no overflow check, because we know that the space is enough */
4938         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4939     }
4940     else if (PyUnicode_Check(*res)) {
4941         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4942         if (repsize==1) {
4943             /* no overflow check, because we know that the space is enough */
4944             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4945         }
4946         else if (repsize!=0) {
4947             /* more than one character */
4948             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4949                 (insize - (curinp-startinp)) +
4950                 repsize - 1;
4951             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4952                 return -1;
4953             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4954             *outp += repsize;
4955         }
4956     }
4957     else
4958         return -1;
4959     return 0;
4960 }
4961
4962 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4963                                      Py_ssize_t size,
4964                                      PyObject *mapping,
4965                                      const char *errors)
4966 {
4967     /* output object */
4968     PyObject *res = NULL;
4969     /* pointers to the beginning and end+1 of input */
4970     const Py_UNICODE *startp = p;
4971     const Py_UNICODE *endp = p + size;
4972     /* pointer into the output */
4973     Py_UNICODE *str;
4974     /* current output position */
4975     Py_ssize_t respos = 0;
4976     char *reason = "character maps to <undefined>";
4977     PyObject *errorHandler = NULL;
4978     PyObject *exc = NULL;
4979     /* the following variable is used for caching string comparisons
4980      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4981      * 3=ignore, 4=xmlcharrefreplace */
4982     int known_errorHandler = -1;
4983
4984     if (mapping == NULL) {
4985         PyErr_BadArgument();
4986         return NULL;
4987     }
4988
4989     /* allocate enough for a simple 1:1 translation without
4990        replacements, if we need more, we'll resize */
4991     res = PyUnicode_FromUnicode(NULL, size);
4992     if (res == NULL)
4993         goto onError;
4994     if (size == 0)
4995         return res;
4996     str = PyUnicode_AS_UNICODE(res);
4997
4998     while (p<endp) {
4999         /* try to encode it */
5000         PyObject *x = NULL;
5001         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5002             Py_XDECREF(x);
5003             goto onError;
5004         }
5005         Py_XDECREF(x);
5006         if (x!=Py_None) /* it worked => adjust input pointer */
5007             ++p;
5008         else { /* untranslatable character */
5009             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5010             Py_ssize_t repsize;
5011             Py_ssize_t newpos;
5012             Py_UNICODE *uni2;
5013             /* startpos for collecting untranslatable chars */
5014             const Py_UNICODE *collstart = p;
5015             const Py_UNICODE *collend = p+1;
5016             const Py_UNICODE *coll;
5017
5018             /* find all untranslatable characters */
5019             while (collend < endp) {
5020                 if (charmaptranslate_lookup(*collend, mapping, &x))
5021                     goto onError;
5022                 Py_XDECREF(x);
5023                 if (x!=Py_None)
5024                     break;
5025                 ++collend;
5026             }
5027             /* cache callback name lookup
5028              * (if not done yet, i.e. it's the first error) */
5029             if (known_errorHandler==-1) {
5030                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5031                     known_errorHandler = 1;
5032                 else if (!strcmp(errors, "replace"))
5033                     known_errorHandler = 2;
5034                 else if (!strcmp(errors, "ignore"))
5035                     known_errorHandler = 3;
5036                 else if (!strcmp(errors, "xmlcharrefreplace"))
5037                     known_errorHandler = 4;
5038                 else
5039                     known_errorHandler = 0;
5040             }
5041             switch (known_errorHandler) {
5042             case 1: /* strict */
5043                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5044                 goto onError;
5045             case 2: /* replace */
5046                 /* No need to check for space, this is a 1:1 replacement */
5047                 for (coll = collstart; coll<collend; ++coll)
5048                     *str++ = '?';
5049                 /* fall through */
5050             case 3: /* ignore */
5051                 p = collend;
5052                 break;
5053             case 4: /* xmlcharrefreplace */
5054                 /* generate replacement (temporarily (mis)uses p) */
5055                 for (p = collstart; p < collend; ++p) {
5056                     char buffer[2+29+1+1];
5057                     char *cp;
5058                     sprintf(buffer, "&#%d;", (int)*p);
5059                     if (charmaptranslate_makespace(&res, &str,
5060                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5061                         goto onError;
5062                     for (cp = buffer; *cp; ++cp)
5063                         *str++ = *cp;
5064                 }
5065                 p = collend;
5066                 break;
5067             default:
5068                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5069                                                                  reason, startp, size, &exc,
5070                                                                  collstart-startp, collend-startp, &newpos);
5071                 if (repunicode == NULL)
5072                     goto onError;
5073                 /* generate replacement  */
5074                 repsize = PyUnicode_GET_SIZE(repunicode);
5075                 if (charmaptranslate_makespace(&res, &str,
5076                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5077                     Py_DECREF(repunicode);
5078                     goto onError;
5079                 }
5080                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5081                     *str++ = *uni2;
5082                 p = startp + newpos;
5083                 Py_DECREF(repunicode);
5084             }
5085         }
5086     }
5087     /* Resize if we allocated to much */
5088     respos = str-PyUnicode_AS_UNICODE(res);
5089     if (respos<PyUnicode_GET_SIZE(res)) {
5090         if (PyUnicode_Resize(&res, respos) < 0)
5091             goto onError;
5092     }
5093     Py_XDECREF(exc);
5094     Py_XDECREF(errorHandler);
5095     return res;
5096
5097   onError:
5098     Py_XDECREF(res);
5099     Py_XDECREF(exc);
5100     Py_XDECREF(errorHandler);
5101     return NULL;
5102 }
5103
5104 PyObject *PyUnicode_Translate(PyObject *str,
5105                               PyObject *mapping,
5106                               const char *errors)
5107 {
5108     PyObject *result;
5109
5110     str = PyUnicode_FromObject(str);
5111     if (str == NULL)
5112         goto onError;
5113     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5114                                         PyUnicode_GET_SIZE(str),
5115                                         mapping,
5116                                         errors);
5117     Py_DECREF(str);
5118     return result;
5119
5120   onError:
5121     Py_XDECREF(str);
5122     return NULL;
5123 }
5124
5125 /* --- Decimal Encoder ---------------------------------------------------- */
5126
5127 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5128                             Py_ssize_t length,
5129                             char *output,
5130                             const char *errors)
5131 {
5132     Py_UNICODE *p, *end;
5133     PyObject *errorHandler = NULL;
5134     PyObject *exc = NULL;
5135     const char *encoding = "decimal";
5136     const char *reason = "invalid decimal Unicode string";
5137     /* the following variable is used for caching string comparisons
5138      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5139     int known_errorHandler = -1;
5140
5141     if (output == NULL) {
5142         PyErr_BadArgument();
5143         return -1;
5144     }
5145
5146     p = s;
5147     end = s + length;
5148     while (p < end) {
5149         register Py_UNICODE ch = *p;
5150         int decimal;
5151         PyObject *repunicode;
5152         Py_ssize_t repsize;
5153         Py_ssize_t newpos;
5154         Py_UNICODE *uni2;
5155         Py_UNICODE *collstart;
5156         Py_UNICODE *collend;
5157
5158         if (Py_UNICODE_ISSPACE(ch)) {
5159             *output++ = ' ';
5160             ++p;
5161             continue;
5162         }
5163         decimal = Py_UNICODE_TODECIMAL(ch);
5164         if (decimal >= 0) {
5165             *output++ = '0' + decimal;
5166             ++p;
5167             continue;
5168         }
5169         if (0 < ch && ch < 256) {
5170             *output++ = (char)ch;
5171             ++p;
5172             continue;
5173         }
5174         /* All other characters are considered unencodable */
5175         collstart = p;
5176         collend = p+1;
5177         while (collend < end) {
5178             if ((0 < *collend && *collend < 256) ||
5179                 !Py_UNICODE_ISSPACE(*collend) ||
5180                 Py_UNICODE_TODECIMAL(*collend))
5181                 break;
5182         }
5183         /* cache callback name lookup
5184          * (if not done yet, i.e. it's the first error) */
5185         if (known_errorHandler==-1) {
5186             if ((errors==NULL) || (!strcmp(errors, "strict")))
5187                 known_errorHandler = 1;
5188             else if (!strcmp(errors, "replace"))
5189                 known_errorHandler = 2;
5190             else if (!strcmp(errors, "ignore"))
5191                 known_errorHandler = 3;
5192             else if (!strcmp(errors, "xmlcharrefreplace"))
5193                 known_errorHandler = 4;
5194             else
5195                 known_errorHandler = 0;
5196         }
5197         switch (known_errorHandler) {
5198         case 1: /* strict */
5199             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5200             goto onError;
5201         case 2: /* replace */
5202             for (p = collstart; p < collend; ++p)
5203                 *output++ = '?';
5204             /* fall through */
5205         case 3: /* ignore */
5206             p = collend;
5207             break;
5208         case 4: /* xmlcharrefreplace */
5209             /* generate replacement (temporarily (mis)uses p) */
5210             for (p = collstart; p < collend; ++p)
5211                 output += sprintf(output, "&#%d;", (int)*p);
5212             p = collend;
5213             break;
5214         default:
5215             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5216                                                           encoding, reason, s, length, &exc,
5217                                                           collstart-s, collend-s, &newpos);
5218             if (repunicode == NULL)
5219                 goto onError;
5220             /* generate replacement  */
5221             repsize = PyUnicode_GET_SIZE(repunicode);
5222             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5223                 Py_UNICODE ch = *uni2;
5224                 if (Py_UNICODE_ISSPACE(ch))
5225                     *output++ = ' ';
5226                 else {
5227                     decimal = Py_UNICODE_TODECIMAL(ch);
5228                     if (decimal >= 0)
5229                         *output++ = '0' + decimal;
5230                     else if (0 < ch && ch < 256)
5231                         *output++ = (char)ch;
5232                     else {
5233                         Py_DECREF(repunicode);
5234                         raise_encode_exception(&exc, encoding,
5235                                                s, length, collstart-s, collend-s, reason);
5236                         goto onError;
5237                     }
5238                 }
5239             }
5240             p = s + newpos;
5241             Py_DECREF(repunicode);
5242         }
5243     }
5244     /* 0-terminate the output string */
5245     *output++ = '\0';
5246     Py_XDECREF(exc);
5247     Py_XDECREF(errorHandler);
5248     return 0;
5249
5250   onError:
5251     Py_XDECREF(exc);
5252     Py_XDECREF(errorHandler);
5253     return -1;
5254 }
5255
5256 /* --- Helpers ------------------------------------------------------------ */
5257
5258 #include "stringlib/unicodedefs.h"
5259 #include "stringlib/fastsearch.h"
5260
5261 #include "stringlib/count.h"
5262 #include "stringlib/find.h"
5263 #include "stringlib/partition.h"
5264 #include "stringlib/split.h"
5265
/* Helper macro that clamps slice bounds in place: an end beyond len is
   cut back to len, and a negative start or end is counted from len and
   then floored at 0.  start and end must be modifiable lvalues.  The
   expansion is a pair of bare if statements, so use it only as a
   complete statement. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
5280
5281 Py_ssize_t PyUnicode_Count(PyObject *str,
5282                            PyObject *substr,
5283                            Py_ssize_t start,
5284                            Py_ssize_t end)
5285 {
5286     Py_ssize_t result;
5287     PyUnicodeObject* str_obj;
5288     PyUnicodeObject* sub_obj;
5289
5290     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5291     if (!str_obj)
5292         return -1;
5293     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5294     if (!sub_obj) {
5295         Py_DECREF(str_obj);
5296         return -1;
5297     }
5298
5299     ADJUST_INDICES(start, end, str_obj->length);
5300     result = stringlib_count(
5301         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5302         PY_SSIZE_T_MAX
5303         );
5304
5305     Py_DECREF(sub_obj);
5306     Py_DECREF(str_obj);
5307
5308     return result;
5309 }
5310
5311 Py_ssize_t PyUnicode_Find(PyObject *str,
5312                           PyObject *sub,
5313                           Py_ssize_t start,
5314                           Py_ssize_t end,
5315                           int direction)
5316 {
5317     Py_ssize_t result;
5318
5319     str = PyUnicode_FromObject(str);
5320     if (!str)
5321         return -2;
5322     sub = PyUnicode_FromObject(sub);
5323     if (!sub) {
5324         Py_DECREF(str);
5325         return -2;
5326     }
5327
5328     if (direction > 0)
5329         result = stringlib_find_slice(
5330             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5331             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5332             start, end
5333             );
5334     else
5335         result = stringlib_rfind_slice(
5336             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5337             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5338             start, end
5339             );
5340
5341     Py_DECREF(str);
5342     Py_DECREF(sub);
5343
5344     return result;
5345 }
5346
5347 static
5348 int tailmatch(PyUnicodeObject *self,
5349               PyUnicodeObject *substring,
5350               Py_ssize_t start,
5351               Py_ssize_t end,
5352               int direction)
5353 {
5354     if (substring->length == 0)
5355         return 1;
5356
5357     ADJUST_INDICES(start, end, self->length);
5358     end -= substring->length;
5359     if (end < start)
5360         return 0;
5361
5362     if (direction > 0) {
5363         if (Py_UNICODE_MATCH(self, end, substring))
5364             return 1;
5365     } else {
5366         if (Py_UNICODE_MATCH(self, start, substring))
5367             return 1;
5368     }
5369
5370     return 0;
5371 }
5372
5373 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5374                                PyObject *substr,
5375                                Py_ssize_t start,
5376                                Py_ssize_t end,
5377                                int direction)
5378 {
5379     Py_ssize_t result;
5380
5381     str = PyUnicode_FromObject(str);
5382     if (str == NULL)
5383         return -1;
5384     substr = PyUnicode_FromObject(substr);
5385     if (substr == NULL) {
5386         Py_DECREF(str);
5387         return -1;
5388     }
5389
5390     result = tailmatch((PyUnicodeObject *)str,
5391                        (PyUnicodeObject *)substr,
5392                        start, end, direction);
5393     Py_DECREF(str);
5394     Py_DECREF(substr);
5395     return result;
5396 }
5397
5398 /* Apply fixfct filter to the Unicode object self and return a
5399    reference to the modified object */
5400
5401 static
5402 PyObject *fixup(PyUnicodeObject *self,
5403                 int (*fixfct)(PyUnicodeObject *s))
5404 {
5405
5406     PyUnicodeObject *u;
5407
5408     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5409     if (u == NULL)
5410         return NULL;
5411
5412     Py_UNICODE_COPY(u->str, self->str, self->length);
5413
5414     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5415         /* fixfct should return TRUE if it modified the buffer. If
5416            FALSE, return a reference to the original buffer instead
5417            (to save space, not time) */
5418         Py_INCREF(self);
5419         Py_DECREF(u);
5420         return (PyObject*) self;
5421     }
5422     return (PyObject*) u;
5423 }
5424
5425 static
5426 int fixupper(PyUnicodeObject *self)
5427 {
5428     Py_ssize_t len = self->length;
5429     Py_UNICODE *s = self->str;
5430     int status = 0;
5431
5432     while (len-- > 0) {
5433         register Py_UNICODE ch;
5434
5435         ch = Py_UNICODE_TOUPPER(*s);
5436         if (ch != *s) {
5437             status = 1;
5438             *s = ch;
5439         }
5440         s++;
5441     }
5442
5443     return status;
5444 }
5445
5446 static
5447 int fixlower(PyUnicodeObject *self)
5448 {
5449     Py_ssize_t len = self->length;
5450     Py_UNICODE *s = self->str;
5451     int status = 0;
5452
5453     while (len-- > 0) {
5454         register Py_UNICODE ch;
5455
5456         ch = Py_UNICODE_TOLOWER(*s);
5457         if (ch != *s) {
5458             status = 1;
5459             *s = ch;
5460         }
5461         s++;
5462     }
5463
5464     return status;
5465 }
5466
5467 static
5468 int fixswapcase(PyUnicodeObject *self)
5469 {
5470     Py_ssize_t len = self->length;
5471     Py_UNICODE *s = self->str;
5472     int status = 0;
5473
5474     while (len-- > 0) {
5475         if (Py_UNICODE_ISUPPER(*s)) {
5476             *s = Py_UNICODE_TOLOWER(*s);
5477             status = 1;
5478         } else if (Py_UNICODE_ISLOWER(*s)) {
5479             *s = Py_UNICODE_TOUPPER(*s);
5480             status = 1;
5481         }
5482         s++;
5483     }
5484
5485     return status;
5486 }
5487
5488 static
5489 int fixcapitalize(PyUnicodeObject *self)
5490 {
5491     Py_ssize_t len = self->length;
5492     Py_UNICODE *s = self->str;
5493     int status = 0;
5494
5495     if (len == 0)
5496         return 0;
5497     if (Py_UNICODE_ISLOWER(*s)) {
5498         *s = Py_UNICODE_TOUPPER(*s);
5499         status = 1;
5500     }
5501     s++;
5502     while (--len > 0) {
5503         if (Py_UNICODE_ISUPPER(*s)) {
5504             *s = Py_UNICODE_TOLOWER(*s);
5505             status = 1;
5506         }
5507         s++;
5508     }
5509     return status;
5510 }
5511
5512 static
5513 int fixtitle(PyUnicodeObject *self)
5514 {
5515     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5516     register Py_UNICODE *e;
5517     int previous_is_cased;
5518
5519     /* Shortcut for single character strings */
5520     if (PyUnicode_GET_SIZE(self) == 1) {
5521         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5522         if (*p != ch) {
5523             *p = ch;
5524             return 1;
5525         }
5526         else
5527             return 0;
5528     }
5529
5530     e = p + PyUnicode_GET_SIZE(self);
5531     previous_is_cased = 0;
5532     for (; p < e; p++) {
5533         register const Py_UNICODE ch = *p;
5534
5535         if (previous_is_cased)
5536             *p = Py_UNICODE_TOLOWER(ch);
5537         else
5538             *p = Py_UNICODE_TOTITLE(ch);
5539
5540         if (Py_UNICODE_ISLOWER(ch) ||
5541             Py_UNICODE_ISUPPER(ch) ||
5542             Py_UNICODE_ISTITLE(ch))
5543             previous_is_cased = 1;
5544         else
5545             previous_is_cased = 0;
5546     }
5547     return 1;
5548 }
5549
5550 PyObject *
5551 PyUnicode_Join(PyObject *separator, PyObject *seq)
5552 {
5553     PyObject *internal_separator = NULL;
5554     const Py_UNICODE blank = ' ';
5555     const Py_UNICODE *sep = &blank;
5556     Py_ssize_t seplen = 1;
5557     PyUnicodeObject *res = NULL; /* the result */
5558     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5559     Py_ssize_t res_used;         /* # used bytes */
5560     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5561     PyObject *fseq;          /* PySequence_Fast(seq) */
5562     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5563     PyObject *item;
5564     Py_ssize_t i;
5565
5566     fseq = PySequence_Fast(seq, "");
5567     if (fseq == NULL) {
5568         return NULL;
5569     }
5570
5571     /* Grrrr.  A codec may be invoked to convert str objects to
5572      * Unicode, and so it's possible to call back into Python code
5573      * during PyUnicode_FromObject(), and so it's possible for a sick
5574      * codec to change the size of fseq (if seq is a list).  Therefore
5575      * we have to keep refetching the size -- can't assume seqlen
5576      * is invariant.
5577      */
5578     seqlen = PySequence_Fast_GET_SIZE(fseq);
5579     /* If empty sequence, return u"". */
5580     if (seqlen == 0) {
5581         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5582         goto Done;
5583     }
5584     /* If singleton sequence with an exact Unicode, return that. */
5585     if (seqlen == 1) {
5586         item = PySequence_Fast_GET_ITEM(fseq, 0);
5587         if (PyUnicode_CheckExact(item)) {
5588             Py_INCREF(item);
5589             res = (PyUnicodeObject *)item;
5590             goto Done;
5591         }
5592     }
5593
5594     /* At least two items to join, or one that isn't exact Unicode. */
5595     if (seqlen > 1) {
5596         /* Set up sep and seplen -- they're needed. */
5597         if (separator == NULL) {
5598             sep = &blank;
5599             seplen = 1;
5600         }
5601         else {
5602             internal_separator = PyUnicode_FromObject(separator);
5603             if (internal_separator == NULL)
5604                 goto onError;
5605             sep = PyUnicode_AS_UNICODE(internal_separator);
5606             seplen = PyUnicode_GET_SIZE(internal_separator);
5607             /* In case PyUnicode_FromObject() mutated seq. */
5608             seqlen = PySequence_Fast_GET_SIZE(fseq);
5609         }
5610     }
5611
5612     /* Get space. */
5613     res = _PyUnicode_New(res_alloc);
5614     if (res == NULL)
5615         goto onError;
5616     res_p = PyUnicode_AS_UNICODE(res);
5617     res_used = 0;
5618
5619     for (i = 0; i < seqlen; ++i) {
5620         Py_ssize_t itemlen;
5621         Py_ssize_t new_res_used;
5622
5623         item = PySequence_Fast_GET_ITEM(fseq, i);
5624         /* Convert item to Unicode. */
5625         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5626             PyErr_Format(PyExc_TypeError,
5627                          "sequence item %zd: expected string or Unicode,"
5628                          " %.80s found",
5629                          i, Py_TYPE(item)->tp_name);
5630             goto onError;
5631         }
5632         item = PyUnicode_FromObject(item);
5633         if (item == NULL)
5634             goto onError;
5635         /* We own a reference to item from here on. */
5636
5637         /* In case PyUnicode_FromObject() mutated seq. */
5638         seqlen = PySequence_Fast_GET_SIZE(fseq);
5639
5640         /* Make sure we have enough space for the separator and the item. */
5641         itemlen = PyUnicode_GET_SIZE(item);
5642         new_res_used = res_used + itemlen;
5643         if (new_res_used < 0)
5644             goto Overflow;
5645         if (i < seqlen - 1) {
5646             new_res_used += seplen;
5647             if (new_res_used < 0)
5648                 goto Overflow;
5649         }
5650         if (new_res_used > res_alloc) {
5651             /* double allocated size until it's big enough */
5652             do {
5653                 res_alloc += res_alloc;
5654                 if (res_alloc <= 0)
5655                     goto Overflow;
5656             } while (new_res_used > res_alloc);
5657             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5658                 Py_DECREF(item);
5659                 goto onError;
5660             }
5661             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5662         }
5663
5664         /* Copy item, and maybe the separator. */
5665         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5666         res_p += itemlen;
5667         if (i < seqlen - 1) {
5668             Py_UNICODE_COPY(res_p, sep, seplen);
5669             res_p += seplen;
5670         }
5671         Py_DECREF(item);
5672         res_used = new_res_used;
5673     }
5674
5675     /* Shrink res to match the used area; this probably can't fail,
5676      * but it's cheap to check.
5677      */
5678     if (_PyUnicode_Resize(&res, res_used) < 0)
5679         goto onError;
5680
5681   Done:
5682     Py_XDECREF(internal_separator);
5683     Py_DECREF(fseq);
5684     return (PyObject *)res;
5685
5686   Overflow:
5687     PyErr_SetString(PyExc_OverflowError,
5688                     "join() result is too long for a Python string");
5689     Py_DECREF(item);
5690     /* fall through */
5691
5692   onError:
5693     Py_XDECREF(internal_separator);
5694     Py_DECREF(fseq);
5695     Py_XDECREF(res);
5696     return NULL;
5697 }
5698
5699 static
5700 PyUnicodeObject *pad(PyUnicodeObject *self,
5701                      Py_ssize_t left,
5702                      Py_ssize_t right,
5703                      Py_UNICODE fill)
5704 {
5705     PyUnicodeObject *u;
5706
5707     if (left < 0)
5708         left = 0;
5709     if (right < 0)
5710         right = 0;
5711
5712     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5713         Py_INCREF(self);
5714         return self;
5715     }
5716
5717     if (left > PY_SSIZE_T_MAX - self->length ||
5718         right > PY_SSIZE_T_MAX - (left + self->length)) {
5719         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5720         return NULL;
5721     }
5722     u = _PyUnicode_New(left + self->length + right);
5723     if (u) {
5724         if (left)
5725             Py_UNICODE_FILL(u->str, fill, left);
5726         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5727         if (right)
5728             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5729     }
5730
5731     return u;
5732 }
5733
5734 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5735 {
5736     PyObject *list;
5737
5738     string = PyUnicode_FromObject(string);
5739     if (string == NULL)
5740         return NULL;
5741
5742     list = stringlib_splitlines(
5743         (PyObject*) string, PyUnicode_AS_UNICODE(string),
5744         PyUnicode_GET_SIZE(string), keepends);
5745
5746     Py_DECREF(string);
5747     return list;
5748 }
5749
5750 static
5751 PyObject *split(PyUnicodeObject *self,
5752                 PyUnicodeObject *substring,
5753                 Py_ssize_t maxcount)
5754 {
5755     if (maxcount < 0)
5756         maxcount = PY_SSIZE_T_MAX;
5757
5758     if (substring == NULL)
5759         return stringlib_split_whitespace(
5760             (PyObject*) self,  self->str, self->length, maxcount
5761             );
5762
5763     return stringlib_split(
5764         (PyObject*) self,  self->str, self->length,
5765         substring->str, substring->length,
5766         maxcount
5767         );
5768 }
5769
5770 static
5771 PyObject *rsplit(PyUnicodeObject *self,
5772                  PyUnicodeObject *substring,
5773                  Py_ssize_t maxcount)
5774 {
5775     if (maxcount < 0)
5776         maxcount = PY_SSIZE_T_MAX;
5777
5778     if (substring == NULL)
5779         return stringlib_rsplit_whitespace(
5780             (PyObject*) self,  self->str, self->length, maxcount
5781             );
5782
5783     return stringlib_rsplit(
5784         (PyObject*) self,  self->str, self->length,
5785         substring->str, substring->length,
5786         maxcount
5787         );
5788 }
5789
5790 static
5791 PyObject *replace(PyUnicodeObject *self,
5792                   PyUnicodeObject *str1,
5793                   PyUnicodeObject *str2,
5794                   Py_ssize_t maxcount)
5795 {
5796     PyUnicodeObject *u;
5797
5798     if (maxcount < 0)
5799         maxcount = PY_SSIZE_T_MAX;
5800     else if (maxcount == 0 || self->length == 0)
5801         goto nothing;
5802
5803     if (str1->length == str2->length) {
5804         Py_ssize_t i;
5805         /* same length */
5806         if (str1->length == 0)
5807             goto nothing;
5808         if (str1->length == 1) {
5809             /* replace characters */
5810             Py_UNICODE u1, u2;
5811             if (!findchar(self->str, self->length, str1->str[0]))
5812                 goto nothing;
5813             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5814             if (!u)
5815                 return NULL;
5816             Py_UNICODE_COPY(u->str, self->str, self->length);
5817             u1 = str1->str[0];
5818             u2 = str2->str[0];
5819             for (i = 0; i < u->length; i++)
5820                 if (u->str[i] == u1) {
5821                     if (--maxcount < 0)
5822                         break;
5823                     u->str[i] = u2;
5824                 }
5825         } else {
5826             i = stringlib_find(
5827                 self->str, self->length, str1->str, str1->length, 0
5828                 );
5829             if (i < 0)
5830                 goto nothing;
5831             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5832             if (!u)
5833                 return NULL;
5834             Py_UNICODE_COPY(u->str, self->str, self->length);
5835
5836             /* change everything in-place, starting with this one */
5837             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5838             i += str1->length;
5839
5840             while ( --maxcount > 0) {
5841                 i = stringlib_find(self->str+i, self->length-i,
5842                                    str1->str, str1->length,
5843                                    i);
5844                 if (i == -1)
5845                     break;
5846                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5847                 i += str1->length;
5848             }
5849         }
5850     } else {
5851
5852         Py_ssize_t n, i, j, e;
5853         Py_ssize_t product, new_size, delta;
5854         Py_UNICODE *p;
5855
5856         /* replace strings */
5857         n = stringlib_count(self->str, self->length, str1->str, str1->length,
5858                             maxcount);
5859         if (n == 0)
5860             goto nothing;
5861         /* new_size = self->length + n * (str2->length - str1->length)); */
5862         delta = (str2->length - str1->length);
5863         if (delta == 0) {
5864             new_size = self->length;
5865         } else {
5866             product = n * (str2->length - str1->length);
5867             if ((product / (str2->length - str1->length)) != n) {
5868                 PyErr_SetString(PyExc_OverflowError,
5869                                 "replace string is too long");
5870                 return NULL;
5871             }
5872             new_size = self->length + product;
5873             if (new_size < 0) {
5874                 PyErr_SetString(PyExc_OverflowError,
5875                                 "replace string is too long");
5876                 return NULL;
5877             }
5878         }
5879         u = _PyUnicode_New(new_size);
5880         if (!u)
5881             return NULL;
5882         i = 0;
5883         p = u->str;
5884         e = self->length - str1->length;
5885         if (str1->length > 0) {
5886             while (n-- > 0) {
5887                 /* look for next match */
5888                 j = stringlib_find(self->str+i, self->length-i,
5889                                    str1->str, str1->length,
5890                                    i);
5891                 if (j == -1)
5892                     break;
5893                 else if (j > i) {
5894                     /* copy unchanged part [i:j] */
5895                     Py_UNICODE_COPY(p, self->str+i, j-i);
5896                     p += j - i;
5897                 }
5898                 /* copy substitution string */
5899                 if (str2->length > 0) {
5900                     Py_UNICODE_COPY(p, str2->str, str2->length);
5901                     p += str2->length;
5902                 }
5903                 i = j + str1->length;
5904             }
5905             if (i < self->length)
5906                 /* copy tail [i:] */
5907                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5908         } else {
5909             /* interleave */
5910             while (n > 0) {
5911                 Py_UNICODE_COPY(p, str2->str, str2->length);
5912                 p += str2->length;
5913                 if (--n <= 0)
5914                     break;
5915                 *p++ = self->str[i++];
5916             }
5917             Py_UNICODE_COPY(p, self->str+i, self->length-i);
5918         }
5919     }
5920     return (PyObject *) u;
5921
5922   nothing:
5923     /* nothing to replace; return original string (when possible) */
5924     if (PyUnicode_CheckExact(self)) {
5925         Py_INCREF(self);
5926         return (PyObject *) self;
5927     }
5928     return PyUnicode_FromUnicode(self->str, self->length);
5929 }
5930
5931 /* --- Unicode Object Methods --------------------------------------------- */
5932
5933 PyDoc_STRVAR(title__doc__,
5934              "S.title() -> unicode\n\
5935 \n\
5936 Return a titlecased version of S, i.e. words start with title case\n\
5937 characters, all remaining cased characters have lower case.");
5938
5939 static PyObject*
5940 unicode_title(PyUnicodeObject *self)
5941 {
5942     return fixup(self, fixtitle);
5943 }
5944
5945 PyDoc_STRVAR(capitalize__doc__,
5946              "S.capitalize() -> unicode\n\
5947 \n\
5948 Return a capitalized version of S, i.e. make the first character\n\
5949 have upper case.");
5950
5951 static PyObject*
5952 unicode_capitalize(PyUnicodeObject *self)
5953 {
5954     return fixup(self, fixcapitalize);
5955 }
5956
#if 0
/* Disabled: this whole section is compiled out by the #if 0 guard. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto cleanup;       /* joined is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string (NULL separator = blank) */
    joined = PyUnicode_Join(NULL, words);

  cleanup:
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5994
5995 /* Argument converter.  Coerces to a single unicode character */
5996
5997 static int
5998 convert_uc(PyObject *obj, void *addr)
5999 {
6000     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6001     PyObject *uniobj;
6002     Py_UNICODE *unistr;
6003
6004     uniobj = PyUnicode_FromObject(obj);
6005     if (uniobj == NULL) {
6006         PyErr_SetString(PyExc_TypeError,
6007                         "The fill character cannot be converted to Unicode");
6008         return 0;
6009     }
6010     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6011         PyErr_SetString(PyExc_TypeError,
6012                         "The fill character must be exactly one character long");
6013         Py_DECREF(uniobj);
6014         return 0;
6015     }
6016     unistr = PyUnicode_AS_UNICODE(uniobj);
6017     *fillcharloc = unistr[0];
6018     Py_DECREF(uniobj);
6019     return 1;
6020 }
6021
6022 PyDoc_STRVAR(center__doc__,
6023              "S.center(width[, fillchar]) -> unicode\n\
6024 \n\
6025 Return S centered in a Unicode string of length width. Padding is\n\
6026 done using the specified fill character (default is a space)");
6027
6028 static PyObject *
6029 unicode_center(PyUnicodeObject *self, PyObject *args)
6030 {
6031     Py_ssize_t marg, left;
6032     Py_ssize_t width;
6033     Py_UNICODE fillchar = ' ';
6034
6035     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6036         return NULL;
6037
6038     if (self->length >= width && PyUnicode_CheckExact(self)) {
6039         Py_INCREF(self);
6040         return (PyObject*) self;
6041     }
6042
6043     marg = width - self->length;
6044     left = marg / 2 + (marg & width & 1);
6045
6046     return (PyObject*) pad(self, left, marg - left, fillchar);
6047 }
6048
6049 #if 0
6050
6051 /* This code should go into some future Unicode collation support
6052    module. The basic comparison should compare ordinals on a naive
6053    basis (this is what Java does and thus Jython too). */
6054
6055 /* speedy UTF-16 code point order comparison */
6056 /* gleaned from: */
6057 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6058
6059 static short utf16Fixup[32] =
6060 {
6061     0, 0, 0, 0, 0, 0, 0, 0,
6062     0, 0, 0, 0, 0, 0, 0, 0,
6063     0, 0, 0, 0, 0, 0, 0, 0,
6064     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6065 };
6066
6067 static int
6068 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6069 {
6070     Py_ssize_t len1, len2;
6071
6072     Py_UNICODE *s1 = str1->str;
6073     Py_UNICODE *s2 = str2->str;
6074
6075     len1 = str1->length;
6076     len2 = str2->length;
6077
6078     while (len1 > 0 && len2 > 0) {
6079         Py_UNICODE c1, c2;
6080
6081         c1 = *s1++;
6082         c2 = *s2++;
6083
6084         if (c1 > (1<<11) * 26)
6085             c1 += utf16Fixup[c1>>11];
6086         if (c2 > (1<<11) * 26)
6087             c2 += utf16Fixup[c2>>11];
6088         /* now c1 and c2 are in UTF-32-compatible order */
6089
6090         if (c1 != c2)
6091             return (c1 < c2) ? -1 : 1;
6092
6093         len1--; len2--;
6094     }
6095
6096     return (len1 < len2) ? -1 : (len1 != len2);
6097 }
6098
6099 #else
6100
6101 static int
6102 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6103 {
6104     register Py_ssize_t len1, len2;
6105
6106     Py_UNICODE *s1 = str1->str;
6107     Py_UNICODE *s2 = str2->str;
6108
6109     len1 = str1->length;
6110     len2 = str2->length;
6111
6112     while (len1 > 0 && len2 > 0) {
6113         Py_UNICODE c1, c2;
6114
6115         c1 = *s1++;
6116         c2 = *s2++;
6117
6118         if (c1 != c2)
6119             return (c1 < c2) ? -1 : 1;
6120
6121         len1--; len2--;
6122     }
6123
6124     return (len1 < len2) ? -1 : (len1 != len2);
6125 }
6126
6127 #endif
6128
6129 int PyUnicode_Compare(PyObject *left,
6130                       PyObject *right)
6131 {
6132     PyUnicodeObject *u = NULL, *v = NULL;
6133     int result;
6134
6135     /* Coerce the two arguments */
6136     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6137     if (u == NULL)
6138         goto onError;
6139     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6140     if (v == NULL)
6141         goto onError;
6142
6143     /* Shortcut for empty or interned objects */
6144     if (v == u) {
6145         Py_DECREF(u);
6146         Py_DECREF(v);
6147         return 0;
6148     }
6149
6150     result = unicode_compare(u, v);
6151
6152     Py_DECREF(u);
6153     Py_DECREF(v);
6154     return result;
6155
6156   onError:
6157     Py_XDECREF(u);
6158     Py_XDECREF(v);
6159     return -1;
6160 }
6161
6162 PyObject *PyUnicode_RichCompare(PyObject *left,
6163                                 PyObject *right,
6164                                 int op)
6165 {
6166     int result;
6167
6168     result = PyUnicode_Compare(left, right);
6169     if (result == -1 && PyErr_Occurred())
6170         goto onError;
6171
6172     /* Convert the return value to a Boolean */
6173     switch (op) {
6174     case Py_EQ:
6175         result = (result == 0);
6176         break;
6177     case Py_NE:
6178         result = (result != 0);
6179         break;
6180     case Py_LE:
6181         result = (result <= 0);
6182         break;
6183     case Py_GE:
6184         result = (result >= 0);
6185         break;
6186     case Py_LT:
6187         result = (result == -1);
6188         break;
6189     case Py_GT:
6190         result = (result == 1);
6191         break;
6192     }
6193     return PyBool_FromLong(result);
6194
6195   onError:
6196
6197     /* Standard case
6198
6199        Type errors mean that PyUnicode_FromObject() could not convert
6200        one of the arguments (usually the right hand side) to Unicode,
6201        ie. we can't handle the comparison request. However, it is
6202        possible that the other object knows a comparison method, which
6203        is why we return Py_NotImplemented to give the other object a
6204        chance.
6205
6206     */
6207     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6208         PyErr_Clear();
6209         Py_INCREF(Py_NotImplemented);
6210         return Py_NotImplemented;
6211     }
6212     if (op != Py_EQ && op != Py_NE)
6213         return NULL;
6214
6215     /* Equality comparison.
6216
6217        This is a special case: we silence any PyExc_UnicodeDecodeError
6218        and instead turn it into a PyErr_UnicodeWarning.
6219
6220     */
6221     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6222         return NULL;
6223     PyErr_Clear();
6224     if (PyErr_Warn(PyExc_UnicodeWarning,
6225                    (op == Py_EQ) ?
6226                    "Unicode equal comparison "
6227                    "failed to convert both arguments to Unicode - "
6228                    "interpreting them as being unequal" :
6229                    "Unicode unequal comparison "
6230                    "failed to convert both arguments to Unicode - "
6231                    "interpreting them as being unequal"
6232             ) < 0)
6233         return NULL;
6234     result = (op == Py_NE);
6235     return PyBool_FromLong(result);
6236 }
6237
6238 int PyUnicode_Contains(PyObject *container,
6239                        PyObject *element)
6240 {
6241     PyObject *str, *sub;
6242     int result;
6243
6244     /* Coerce the two arguments */
6245     sub = PyUnicode_FromObject(element);
6246     if (!sub) {
6247         return -1;
6248     }
6249
6250     str = PyUnicode_FromObject(container);
6251     if (!str) {
6252         Py_DECREF(sub);
6253         return -1;
6254     }
6255
6256     result = stringlib_contains_obj(str, sub);
6257
6258     Py_DECREF(str);
6259     Py_DECREF(sub);
6260
6261     return result;
6262 }
6263
6264 /* Concat to string or Unicode object giving a new Unicode object. */
6265
6266 PyObject *PyUnicode_Concat(PyObject *left,
6267                            PyObject *right)
6268 {
6269     PyUnicodeObject *u = NULL, *v = NULL, *w;
6270
6271     /* Coerce the two arguments */
6272     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6273     if (u == NULL)
6274         goto onError;
6275     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6276     if (v == NULL)
6277         goto onError;
6278
6279     /* Shortcuts */
6280     if (v == unicode_empty) {
6281         Py_DECREF(v);
6282         return (PyObject *)u;
6283     }
6284     if (u == unicode_empty) {
6285         Py_DECREF(u);
6286         return (PyObject *)v;
6287     }
6288
6289     /* Concat the two Unicode strings */
6290     w = _PyUnicode_New(u->length + v->length);
6291     if (w == NULL)
6292         goto onError;
6293     Py_UNICODE_COPY(w->str, u->str, u->length);
6294     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6295
6296     Py_DECREF(u);
6297     Py_DECREF(v);
6298     return (PyObject *)w;
6299
6300   onError:
6301     Py_XDECREF(u);
6302     Py_XDECREF(v);
6303     return NULL;
6304 }
6305
6306 PyDoc_STRVAR(count__doc__,
6307              "S.count(sub[, start[, end]]) -> int\n\
6308 \n\
6309 Return the number of non-overlapping occurrences of substring sub in\n\
6310 Unicode string S[start:end].  Optional arguments start and end are\n\
6311 interpreted as in slice notation.");
6312
6313 static PyObject *
6314 unicode_count(PyUnicodeObject *self, PyObject *args)
6315 {
6316     PyUnicodeObject *substring;
6317     Py_ssize_t start = 0;
6318     Py_ssize_t end = PY_SSIZE_T_MAX;
6319     PyObject *result;
6320
6321     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6322                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6323         return NULL;
6324
6325     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6326         (PyObject *)substring);
6327     if (substring == NULL)
6328         return NULL;
6329
6330     ADJUST_INDICES(start, end, self->length);
6331     result = PyInt_FromSsize_t(
6332         stringlib_count(self->str + start, end - start,
6333                         substring->str, substring->length,
6334                         PY_SSIZE_T_MAX)
6335         );
6336
6337     Py_DECREF(substring);
6338
6339     return result;
6340 }
6341
6342 PyDoc_STRVAR(encode__doc__,
6343              "S.encode([encoding[,errors]]) -> string or unicode\n\
6344 \n\
6345 Encodes S using the codec registered for encoding. encoding defaults\n\
6346 to the default encoding. errors may be given to set a different error\n\
6347 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6348 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6349 'xmlcharrefreplace' as well as any other name registered with\n\
6350 codecs.register_error that can handle UnicodeEncodeErrors.");
6351
6352 static PyObject *
6353 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6354 {
6355     static char *kwlist[] = {"encoding", "errors", 0};
6356     char *encoding = NULL;
6357     char *errors = NULL;
6358     PyObject *v;
6359
6360     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6361                                      kwlist, &encoding, &errors))
6362         return NULL;
6363     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6364     if (v == NULL)
6365         goto onError;
6366     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6367         PyErr_Format(PyExc_TypeError,
6368                      "encoder did not return a string/unicode object "
6369                      "(type=%.400s)",
6370                      Py_TYPE(v)->tp_name);
6371         Py_DECREF(v);
6372         return NULL;
6373     }
6374     return v;
6375
6376   onError:
6377     return NULL;
6378 }
6379
6380 PyDoc_STRVAR(decode__doc__,
6381              "S.decode([encoding[,errors]]) -> string or unicode\n\
6382 \n\
6383 Decodes S using the codec registered for encoding. encoding defaults\n\
6384 to the default encoding. errors may be given to set a different error\n\
6385 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6386 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6387 as well as any other name registerd with codecs.register_error that is\n\
6388 able to handle UnicodeDecodeErrors.");
6389
6390 static PyObject *
6391 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6392 {
6393     static char *kwlist[] = {"encoding", "errors", 0};
6394     char *encoding = NULL;
6395     char *errors = NULL;
6396     PyObject *v;
6397
6398     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6399                                      kwlist, &encoding, &errors))
6400         return NULL;
6401     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6402     if (v == NULL)
6403         goto onError;
6404     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6405         PyErr_Format(PyExc_TypeError,
6406                      "decoder did not return a string/unicode object "
6407                      "(type=%.400s)",
6408                      Py_TYPE(v)->tp_name);
6409         Py_DECREF(v);
6410         return NULL;
6411     }
6412     return v;
6413
6414   onError:
6415     return NULL;
6416 }
6417
PyDoc_STRVAR(expandtabs__doc__,
             "S.expandtabs([tabsize]) -> unicode\n\
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
If tabsize is not given, a tab size of 8 characters is assumed.");

/* S.expandtabs(): works in two passes over S.  The first pass only computes
   the length of the result, checking every addition against
   PY_SSIZE_T_MAX; the second pass allocates the result and fills it.
   Columns restart after each '\n' or '\r'.  A tab produces no output at
   all when tabsize <= 0.  Returns a new unicode object, NULL if parsing or
   allocation fails, or NULL with OverflowError if the result would be too
   long. */
static PyObject*
unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
{
    Py_UNICODE *e;
    Py_UNICODE *p;
    Py_UNICODE *q;
    Py_UNICODE *qe;
    Py_ssize_t i, j, incr;
    PyUnicodeObject *u;
    int tabsize = 8;

    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
        return NULL;

    /* First pass: determine size of output string */
    i = 0; /* chars up to and including most recent \n or \r */
    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
    e = self->str + self->length; /* end of input */
    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* advance j to the next multiple of tabsize */
                incr = tabsize - (j % tabsize); /* cannot overflow */
                if (j > PY_SSIZE_T_MAX - incr)
                    goto overflow1;
                j += incr;
            }
        }
        else {
            if (j > PY_SSIZE_T_MAX - 1)
                goto overflow1;
            j++;
            if (*p == '\n' || *p == '\r') {
                /* line finished: fold its length into i, restart column */
                if (i > PY_SSIZE_T_MAX - j)
                    goto overflow1;
                i += j;
                j = 0;
            }
        }

    /* i + j is the total output length; make sure the sum itself fits */
    if (i > PY_SSIZE_T_MAX - j)
        goto overflow1;

    /* Second pass: create output string and fill it */
    u = _PyUnicode_New(i + j);
    if (!u)
        return NULL;

    j = 0; /* same as in first pass */
    q = u->str; /* next output char */
    qe = u->str + u->length; /* end of output */

    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* i is reused here as the number of spaces to emit */
                i = tabsize - (j % tabsize);
                j += i;
                while (i--) {
                    if (q >= qe)
                        goto overflow2;
                    *q++ = ' ';
                }
            }
        }
        else {
            if (q >= qe)
                goto overflow2;
            *q++ = *p;
            j++;
            if (*p == '\n' || *p == '\r')
                j = 0;
        }

    return (PyObject*) u;

    /* overflow2 releases the already allocated result and then falls
       through to overflow1, which only sets the exception. */
  overflow2:
    Py_DECREF(u);
  overflow1:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
6504
6505 PyDoc_STRVAR(find__doc__,
6506              "S.find(sub [,start [,end]]) -> int\n\
6507 \n\
6508 Return the lowest index in S where substring sub is found,\n\
6509 such that sub is contained within s[start:end].  Optional\n\
6510 arguments start and end are interpreted as in slice notation.\n\
6511 \n\
6512 Return -1 on failure.");
6513
6514 static PyObject *
6515 unicode_find(PyUnicodeObject *self, PyObject *args)
6516 {
6517     PyObject *substring;
6518     Py_ssize_t start;
6519     Py_ssize_t end;
6520     Py_ssize_t result;
6521
6522     if (!_ParseTupleFinds(args, &substring, &start, &end))
6523         return NULL;
6524
6525     result = stringlib_find_slice(
6526         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6527         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6528         start, end
6529         );
6530
6531     Py_DECREF(substring);
6532
6533     return PyInt_FromSsize_t(result);
6534 }
6535
6536 static PyObject *
6537 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6538 {
6539     if (index < 0 || index >= self->length) {
6540         PyErr_SetString(PyExc_IndexError, "string index out of range");
6541         return NULL;
6542     }
6543
6544     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6545 }
6546
6547 static long
6548 unicode_hash(PyUnicodeObject *self)
6549 {
6550     /* Since Unicode objects compare equal to their ASCII string
6551        counterparts, they should use the individual character values
6552        as basis for their hash value.  This is needed to assure that
6553        strings and Unicode objects behave in the same way as
6554        dictionary keys. */
6555
6556     register Py_ssize_t len;
6557     register Py_UNICODE *p;
6558     register long x;
6559
6560     if (self->hash != -1)
6561         return self->hash;
6562     len = PyUnicode_GET_SIZE(self);
6563     p = PyUnicode_AS_UNICODE(self);
6564     x = *p << 7;
6565     while (--len >= 0)
6566         x = (1000003*x) ^ *p++;
6567     x ^= PyUnicode_GET_SIZE(self);
6568     if (x == -1)
6569         x = -2;
6570     self->hash = x;
6571     return x;
6572 }
6573
6574 PyDoc_STRVAR(index__doc__,
6575              "S.index(sub [,start [,end]]) -> int\n\
6576 \n\
6577 Like S.find() but raise ValueError when the substring is not found.");
6578
6579 static PyObject *
6580 unicode_index(PyUnicodeObject *self, PyObject *args)
6581 {
6582     Py_ssize_t result;
6583     PyObject *substring;
6584     Py_ssize_t start;
6585     Py_ssize_t end;
6586
6587     if (!_ParseTupleFinds(args, &substring, &start, &end))
6588         return NULL;
6589
6590     result = stringlib_find_slice(
6591         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6592         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6593         start, end
6594         );
6595
6596     Py_DECREF(substring);
6597
6598     if (result < 0) {
6599         PyErr_SetString(PyExc_ValueError, "substring not found");
6600         return NULL;
6601     }
6602
6603     return PyInt_FromSsize_t(result);
6604 }
6605
6606 PyDoc_STRVAR(islower__doc__,
6607              "S.islower() -> bool\n\
6608 \n\
6609 Return True if all cased characters in S are lowercase and there is\n\
6610 at least one cased character in S, False otherwise.");
6611
6612 static PyObject*
6613 unicode_islower(PyUnicodeObject *self)
6614 {
6615     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6616     register const Py_UNICODE *e;
6617     int cased;
6618
6619     /* Shortcut for single character strings */
6620     if (PyUnicode_GET_SIZE(self) == 1)
6621         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6622
6623     /* Special case for empty strings */
6624     if (PyUnicode_GET_SIZE(self) == 0)
6625         return PyBool_FromLong(0);
6626
6627     e = p + PyUnicode_GET_SIZE(self);
6628     cased = 0;
6629     for (; p < e; p++) {
6630         register const Py_UNICODE ch = *p;
6631
6632         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6633             return PyBool_FromLong(0);
6634         else if (!cased && Py_UNICODE_ISLOWER(ch))
6635             cased = 1;
6636     }
6637     return PyBool_FromLong(cased);
6638 }
6639
6640 PyDoc_STRVAR(isupper__doc__,
6641              "S.isupper() -> bool\n\
6642 \n\
6643 Return True if all cased characters in S are uppercase and there is\n\
6644 at least one cased character in S, False otherwise.");
6645
6646 static PyObject*
6647 unicode_isupper(PyUnicodeObject *self)
6648 {
6649     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6650     register const Py_UNICODE *e;
6651     int cased;
6652
6653     /* Shortcut for single character strings */
6654     if (PyUnicode_GET_SIZE(self) == 1)
6655         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6656
6657     /* Special case for empty strings */
6658     if (PyUnicode_GET_SIZE(self) == 0)
6659         return PyBool_FromLong(0);
6660
6661     e = p + PyUnicode_GET_SIZE(self);
6662     cased = 0;
6663     for (; p < e; p++) {
6664         register const Py_UNICODE ch = *p;
6665
6666         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6667             return PyBool_FromLong(0);
6668         else if (!cased && Py_UNICODE_ISUPPER(ch))
6669             cased = 1;
6670     }
6671     return PyBool_FromLong(cased);
6672 }
6673
6674 PyDoc_STRVAR(istitle__doc__,
6675              "S.istitle() -> bool\n\
6676 \n\
6677 Return True if S is a titlecased string and there is at least one\n\
6678 character in S, i.e. upper- and titlecase characters may only\n\
6679 follow uncased characters and lowercase characters only cased ones.\n\
6680 Return False otherwise.");
6681
6682 static PyObject*
6683 unicode_istitle(PyUnicodeObject *self)
6684 {
6685     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6686     register const Py_UNICODE *e;
6687     int cased, previous_is_cased;
6688
6689     /* Shortcut for single character strings */
6690     if (PyUnicode_GET_SIZE(self) == 1)
6691         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6692                                (Py_UNICODE_ISUPPER(*p) != 0));
6693
6694     /* Special case for empty strings */
6695     if (PyUnicode_GET_SIZE(self) == 0)
6696         return PyBool_FromLong(0);
6697
6698     e = p + PyUnicode_GET_SIZE(self);
6699     cased = 0;
6700     previous_is_cased = 0;
6701     for (; p < e; p++) {
6702         register const Py_UNICODE ch = *p;
6703
6704         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6705             if (previous_is_cased)
6706                 return PyBool_FromLong(0);
6707             previous_is_cased = 1;
6708             cased = 1;
6709         }
6710         else if (Py_UNICODE_ISLOWER(ch)) {
6711             if (!previous_is_cased)
6712                 return PyBool_FromLong(0);
6713             previous_is_cased = 1;
6714             cased = 1;
6715         }
6716         else
6717             previous_is_cased = 0;
6718     }
6719     return PyBool_FromLong(cased);
6720 }
6721
6722 PyDoc_STRVAR(isspace__doc__,
6723              "S.isspace() -> bool\n\
6724 \n\
6725 Return True if all characters in S are whitespace\n\
6726 and there is at least one character in S, False otherwise.");
6727
6728 static PyObject*
6729 unicode_isspace(PyUnicodeObject *self)
6730 {
6731     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6732     register const Py_UNICODE *e;
6733
6734     /* Shortcut for single character strings */
6735     if (PyUnicode_GET_SIZE(self) == 1 &&
6736         Py_UNICODE_ISSPACE(*p))
6737         return PyBool_FromLong(1);
6738
6739     /* Special case for empty strings */
6740     if (PyUnicode_GET_SIZE(self) == 0)
6741         return PyBool_FromLong(0);
6742
6743     e = p + PyUnicode_GET_SIZE(self);
6744     for (; p < e; p++) {
6745         if (!Py_UNICODE_ISSPACE(*p))
6746             return PyBool_FromLong(0);
6747     }
6748     return PyBool_FromLong(1);
6749 }
6750
6751 PyDoc_STRVAR(isalpha__doc__,
6752              "S.isalpha() -> bool\n\
6753 \n\
6754 Return True if all characters in S are alphabetic\n\
6755 and there is at least one character in S, False otherwise.");
6756
6757 static PyObject*
6758 unicode_isalpha(PyUnicodeObject *self)
6759 {
6760     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6761     register const Py_UNICODE *e;
6762
6763     /* Shortcut for single character strings */
6764     if (PyUnicode_GET_SIZE(self) == 1 &&
6765         Py_UNICODE_ISALPHA(*p))
6766         return PyBool_FromLong(1);
6767
6768     /* Special case for empty strings */
6769     if (PyUnicode_GET_SIZE(self) == 0)
6770         return PyBool_FromLong(0);
6771
6772     e = p + PyUnicode_GET_SIZE(self);
6773     for (; p < e; p++) {
6774         if (!Py_UNICODE_ISALPHA(*p))
6775             return PyBool_FromLong(0);
6776     }
6777     return PyBool_FromLong(1);
6778 }
6779
6780 PyDoc_STRVAR(isalnum__doc__,
6781              "S.isalnum() -> bool\n\
6782 \n\
6783 Return True if all characters in S are alphanumeric\n\
6784 and there is at least one character in S, False otherwise.");
6785
6786 static PyObject*
6787 unicode_isalnum(PyUnicodeObject *self)
6788 {
6789     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6790     register const Py_UNICODE *e;
6791
6792     /* Shortcut for single character strings */
6793     if (PyUnicode_GET_SIZE(self) == 1 &&
6794         Py_UNICODE_ISALNUM(*p))
6795         return PyBool_FromLong(1);
6796
6797     /* Special case for empty strings */
6798     if (PyUnicode_GET_SIZE(self) == 0)
6799         return PyBool_FromLong(0);
6800
6801     e = p + PyUnicode_GET_SIZE(self);
6802     for (; p < e; p++) {
6803         if (!Py_UNICODE_ISALNUM(*p))
6804             return PyBool_FromLong(0);
6805     }
6806     return PyBool_FromLong(1);
6807 }
6808
6809 PyDoc_STRVAR(isdecimal__doc__,
6810              "S.isdecimal() -> bool\n\
6811 \n\
6812 Return True if there are only decimal characters in S,\n\
6813 False otherwise.");
6814
6815 static PyObject*
6816 unicode_isdecimal(PyUnicodeObject *self)
6817 {
6818     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6819     register const Py_UNICODE *e;
6820
6821     /* Shortcut for single character strings */
6822     if (PyUnicode_GET_SIZE(self) == 1 &&
6823         Py_UNICODE_ISDECIMAL(*p))
6824         return PyBool_FromLong(1);
6825
6826     /* Special case for empty strings */
6827     if (PyUnicode_GET_SIZE(self) == 0)
6828         return PyBool_FromLong(0);
6829
6830     e = p + PyUnicode_GET_SIZE(self);
6831     for (; p < e; p++) {
6832         if (!Py_UNICODE_ISDECIMAL(*p))
6833             return PyBool_FromLong(0);
6834     }
6835     return PyBool_FromLong(1);
6836 }
6837
6838 PyDoc_STRVAR(isdigit__doc__,
6839              "S.isdigit() -> bool\n\
6840 \n\
6841 Return True if all characters in S are digits\n\
6842 and there is at least one character in S, False otherwise.");
6843
6844 static PyObject*
6845 unicode_isdigit(PyUnicodeObject *self)
6846 {
6847     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6848     register const Py_UNICODE *e;
6849
6850     /* Shortcut for single character strings */
6851     if (PyUnicode_GET_SIZE(self) == 1 &&
6852         Py_UNICODE_ISDIGIT(*p))
6853         return PyBool_FromLong(1);
6854
6855     /* Special case for empty strings */
6856     if (PyUnicode_GET_SIZE(self) == 0)
6857         return PyBool_FromLong(0);
6858
6859     e = p + PyUnicode_GET_SIZE(self);
6860     for (; p < e; p++) {
6861         if (!Py_UNICODE_ISDIGIT(*p))
6862             return PyBool_FromLong(0);
6863     }
6864     return PyBool_FromLong(1);
6865 }
6866
6867 PyDoc_STRVAR(isnumeric__doc__,
6868              "S.isnumeric() -> bool\n\
6869 \n\
6870 Return True if there are only numeric characters in S,\n\
6871 False otherwise.");
6872
6873 static PyObject*
6874 unicode_isnumeric(PyUnicodeObject *self)
6875 {
6876     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6877     register const Py_UNICODE *e;
6878
6879     /* Shortcut for single character strings */
6880     if (PyUnicode_GET_SIZE(self) == 1 &&
6881         Py_UNICODE_ISNUMERIC(*p))
6882         return PyBool_FromLong(1);
6883
6884     /* Special case for empty strings */
6885     if (PyUnicode_GET_SIZE(self) == 0)
6886         return PyBool_FromLong(0);
6887
6888     e = p + PyUnicode_GET_SIZE(self);
6889     for (; p < e; p++) {
6890         if (!Py_UNICODE_ISNUMERIC(*p))
6891             return PyBool_FromLong(0);
6892     }
6893     return PyBool_FromLong(1);
6894 }
6895
6896 PyDoc_STRVAR(join__doc__,
6897              "S.join(iterable) -> unicode\n\
6898 \n\
6899 Return a string which is the concatenation of the strings in the\n\
6900 iterable.  The separator between elements is S.");
6901
6902 static PyObject*
6903 unicode_join(PyObject *self, PyObject *data)
6904 {
6905     return PyUnicode_Join(self, data);
6906 }
6907
6908 static Py_ssize_t
6909 unicode_length(PyUnicodeObject *self)
6910 {
6911     return self->length;
6912 }
6913
6914 PyDoc_STRVAR(ljust__doc__,
6915              "S.ljust(width[, fillchar]) -> int\n\
6916 \n\
6917 Return S left-justified in a Unicode string of length width. Padding is\n\
6918 done using the specified fill character (default is a space).");
6919
6920 static PyObject *
6921 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6922 {
6923     Py_ssize_t width;
6924     Py_UNICODE fillchar = ' ';
6925
6926     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6927         return NULL;
6928
6929     if (self->length >= width && PyUnicode_CheckExact(self)) {
6930         Py_INCREF(self);
6931         return (PyObject*) self;
6932     }
6933
6934     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6935 }
6936
6937 PyDoc_STRVAR(lower__doc__,
6938              "S.lower() -> unicode\n\
6939 \n\
6940 Return a copy of the string S converted to lowercase.");
6941
6942 static PyObject*
6943 unicode_lower(PyUnicodeObject *self)
6944 {
6945     return fixup(self, fixlower);
6946 }
6947
/* Strip modes used by the strip helpers in this file.  The values double
   as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is the PyArg_ParseTuple() format for one strip method: a
   single optional object argument, then ':' and the method name. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of the
   corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6956
6957 /* externally visible for str.strip(unicode) */
6958 PyObject *
6959 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6960 {
6961     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6962     Py_ssize_t len = PyUnicode_GET_SIZE(self);
6963     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6964     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6965     Py_ssize_t i, j;
6966
6967     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6968
6969     i = 0;
6970     if (striptype != RIGHTSTRIP) {
6971         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6972             i++;
6973         }
6974     }
6975
6976     j = len;
6977     if (striptype != LEFTSTRIP) {
6978         do {
6979             j--;
6980         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6981         j++;
6982     }
6983
6984     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6985         Py_INCREF(self);
6986         return (PyObject*)self;
6987     }
6988     else
6989         return PyUnicode_FromUnicode(s+i, j-i);
6990 }
6991
6992
6993 static PyObject *
6994 do_strip(PyUnicodeObject *self, int striptype)
6995 {
6996     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6997     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6998
6999     i = 0;
7000     if (striptype != RIGHTSTRIP) {
7001         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7002             i++;
7003         }
7004     }
7005
7006     j = len;
7007     if (striptype != LEFTSTRIP) {
7008         do {
7009             j--;
7010         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7011         j++;
7012     }
7013
7014     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7015         Py_INCREF(self);
7016         return (PyObject*)self;
7017     }
7018     else
7019         return PyUnicode_FromUnicode(s+i, j-i);
7020 }
7021
7022
7023 static PyObject *
7024 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7025 {
7026     PyObject *sep = NULL;
7027
7028     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7029         return NULL;
7030
7031     if (sep != NULL && sep != Py_None) {
7032         if (PyUnicode_Check(sep))
7033             return _PyUnicode_XStrip(self, striptype, sep);
7034         else if (PyString_Check(sep)) {
7035             PyObject *res;
7036             sep = PyUnicode_FromObject(sep);
7037             if (sep==NULL)
7038                 return NULL;
7039             res = _PyUnicode_XStrip(self, striptype, sep);
7040             Py_DECREF(sep);
7041             return res;
7042         }
7043         else {
7044             PyErr_Format(PyExc_TypeError,
7045                          "%s arg must be None, unicode or str",
7046                          STRIPNAME(striptype));
7047             return NULL;
7048         }
7049     }
7050
7051     return do_strip(self, striptype);
7052 }
7053
7054
7055 PyDoc_STRVAR(strip__doc__,
7056              "S.strip([chars]) -> unicode\n\
7057 \n\
7058 Return a copy of the string S with leading and trailing\n\
7059 whitespace removed.\n\
7060 If chars is given and not None, remove characters in chars instead.\n\
7061 If chars is a str, it will be converted to unicode before stripping");
7062
7063 static PyObject *
7064 unicode_strip(PyUnicodeObject *self, PyObject *args)
7065 {
7066     if (PyTuple_GET_SIZE(args) == 0)
7067         return do_strip(self, BOTHSTRIP); /* Common case */
7068     else
7069         return do_argstrip(self, BOTHSTRIP, args);
7070 }
7071
7072
7073 PyDoc_STRVAR(lstrip__doc__,
7074              "S.lstrip([chars]) -> unicode\n\
7075 \n\
7076 Return a copy of the string S with leading whitespace removed.\n\
7077 If chars is given and not None, remove characters in chars instead.\n\
7078 If chars is a str, it will be converted to unicode before stripping");
7079
7080 static PyObject *
7081 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7082 {
7083     if (PyTuple_GET_SIZE(args) == 0)
7084         return do_strip(self, LEFTSTRIP); /* Common case */
7085     else
7086         return do_argstrip(self, LEFTSTRIP, args);
7087 }
7088
7089
7090 PyDoc_STRVAR(rstrip__doc__,
7091              "S.rstrip([chars]) -> unicode\n\
7092 \n\
7093 Return a copy of the string S with trailing whitespace removed.\n\
7094 If chars is given and not None, remove characters in chars instead.\n\
7095 If chars is a str, it will be converted to unicode before stripping");
7096
7097 static PyObject *
7098 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7099 {
7100     if (PyTuple_GET_SIZE(args) == 0)
7101         return do_strip(self, RIGHTSTRIP); /* Common case */
7102     else
7103         return do_argstrip(self, RIGHTSTRIP, args);
7104 }
7105
7106
7107 static PyObject*
7108 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7109 {
7110     PyUnicodeObject *u;
7111     Py_UNICODE *p;
7112     Py_ssize_t nchars;
7113     size_t nbytes;
7114
7115     if (len < 0)
7116         len = 0;
7117
7118     if (len == 1 && PyUnicode_CheckExact(str)) {
7119         /* no repeat, return original string */
7120         Py_INCREF(str);
7121         return (PyObject*) str;
7122     }
7123
7124     /* ensure # of chars needed doesn't overflow int and # of bytes
7125      * needed doesn't overflow size_t
7126      */
7127     nchars = len * str->length;
7128     if (len && nchars / len != str->length) {
7129         PyErr_SetString(PyExc_OverflowError,
7130                         "repeated string is too long");
7131         return NULL;
7132     }
7133     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7134     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7135         PyErr_SetString(PyExc_OverflowError,
7136                         "repeated string is too long");
7137         return NULL;
7138     }
7139     u = _PyUnicode_New(nchars);
7140     if (!u)
7141         return NULL;
7142
7143     p = u->str;
7144
7145     if (str->length == 1 && len > 0) {
7146         Py_UNICODE_FILL(p, str->str[0], len);
7147     } else {
7148         Py_ssize_t done = 0; /* number of characters copied this far */
7149         if (done < nchars) {
7150             Py_UNICODE_COPY(p, str->str, str->length);
7151             done = str->length;
7152         }
7153         while (done < nchars) {
7154             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7155             Py_UNICODE_COPY(p+done, p, n);
7156             done += n;
7157         }
7158     }
7159
7160     return (PyObject*) u;
7161 }
7162
7163 PyObject *PyUnicode_Replace(PyObject *obj,
7164                             PyObject *subobj,
7165                             PyObject *replobj,
7166                             Py_ssize_t maxcount)
7167 {
7168     PyObject *self;
7169     PyObject *str1;
7170     PyObject *str2;
7171     PyObject *result;
7172
7173     self = PyUnicode_FromObject(obj);
7174     if (self == NULL)
7175         return NULL;
7176     str1 = PyUnicode_FromObject(subobj);
7177     if (str1 == NULL) {
7178         Py_DECREF(self);
7179         return NULL;
7180     }
7181     str2 = PyUnicode_FromObject(replobj);
7182     if (str2 == NULL) {
7183         Py_DECREF(self);
7184         Py_DECREF(str1);
7185         return NULL;
7186     }
7187     result = replace((PyUnicodeObject *)self,
7188                      (PyUnicodeObject *)str1,
7189                      (PyUnicodeObject *)str2,
7190                      maxcount);
7191     Py_DECREF(self);
7192     Py_DECREF(str1);
7193     Py_DECREF(str2);
7194     return result;
7195 }
7196
7197 PyDoc_STRVAR(replace__doc__,
7198              "S.replace (old, new[, count]) -> unicode\n\
7199 \n\
7200 Return a copy of S with all occurrences of substring\n\
7201 old replaced by new.  If the optional argument count is\n\
7202 given, only the first count occurrences are replaced.");
7203
7204 static PyObject*
7205 unicode_replace(PyUnicodeObject *self, PyObject *args)
7206 {
7207     PyUnicodeObject *str1;
7208     PyUnicodeObject *str2;
7209     Py_ssize_t maxcount = -1;
7210     PyObject *result;
7211
7212     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7213         return NULL;
7214     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7215     if (str1 == NULL)
7216         return NULL;
7217     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7218     if (str2 == NULL) {
7219         Py_DECREF(str1);
7220         return NULL;
7221     }
7222
7223     result = replace(self, str1, str2, maxcount);
7224
7225     Py_DECREF(str1);
7226     Py_DECREF(str2);
7227     return result;
7228 }
7229
7230 static
7231 PyObject *unicode_repr(PyObject *unicode)
7232 {
7233     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7234                                 PyUnicode_GET_SIZE(unicode),
7235                                 1);
7236 }
7237
7238 PyDoc_STRVAR(rfind__doc__,
7239              "S.rfind(sub [,start [,end]]) -> int\n\
7240 \n\
7241 Return the highest index in S where substring sub is found,\n\
7242 such that sub is contained within s[start:end].  Optional\n\
7243 arguments start and end are interpreted as in slice notation.\n\
7244 \n\
7245 Return -1 on failure.");
7246
7247 static PyObject *
7248 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7249 {
7250     PyObject *substring;
7251     Py_ssize_t start;
7252     Py_ssize_t end;
7253     Py_ssize_t result;
7254
7255     if (!_ParseTupleFinds(args, &substring, &start, &end))
7256         return NULL;
7257
7258     result = stringlib_rfind_slice(
7259         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7260         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7261         start, end
7262         );
7263
7264     Py_DECREF(substring);
7265
7266     return PyInt_FromSsize_t(result);
7267 }
7268
7269 PyDoc_STRVAR(rindex__doc__,
7270              "S.rindex(sub [,start [,end]]) -> int\n\
7271 \n\
7272 Like S.rfind() but raise ValueError when the substring is not found.");
7273
7274 static PyObject *
7275 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7276 {
7277     PyObject *substring;
7278     Py_ssize_t start;
7279     Py_ssize_t end;
7280     Py_ssize_t result;
7281
7282     if (!_ParseTupleFinds(args, &substring, &start, &end))
7283         return NULL;
7284
7285     result = stringlib_rfind_slice(
7286         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7287         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7288         start, end
7289         );
7290
7291     Py_DECREF(substring);
7292
7293     if (result < 0) {
7294         PyErr_SetString(PyExc_ValueError, "substring not found");
7295         return NULL;
7296     }
7297     return PyInt_FromSsize_t(result);
7298 }
7299
7300 PyDoc_STRVAR(rjust__doc__,
7301              "S.rjust(width[, fillchar]) -> unicode\n\
7302 \n\
7303 Return S right-justified in a Unicode string of length width. Padding is\n\
7304 done using the specified fill character (default is a space).");
7305
7306 static PyObject *
7307 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7308 {
7309     Py_ssize_t width;
7310     Py_UNICODE fillchar = ' ';
7311
7312     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7313         return NULL;
7314
7315     if (self->length >= width && PyUnicode_CheckExact(self)) {
7316         Py_INCREF(self);
7317         return (PyObject*) self;
7318     }
7319
7320     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7321 }
7322
7323 static PyObject*
7324 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7325 {
7326     /* standard clamping */
7327     if (start < 0)
7328         start = 0;
7329     if (end < 0)
7330         end = 0;
7331     if (end > self->length)
7332         end = self->length;
7333     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7334         /* full slice, return original string */
7335         Py_INCREF(self);
7336         return (PyObject*) self;
7337     }
7338     if (start > end)
7339         start = end;
7340     /* copy slice */
7341     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7342                                              end - start);
7343 }
7344
7345 PyObject *PyUnicode_Split(PyObject *s,
7346                           PyObject *sep,
7347                           Py_ssize_t maxsplit)
7348 {
7349     PyObject *result;
7350
7351     s = PyUnicode_FromObject(s);
7352     if (s == NULL)
7353         return NULL;
7354     if (sep != NULL) {
7355         sep = PyUnicode_FromObject(sep);
7356         if (sep == NULL) {
7357             Py_DECREF(s);
7358             return NULL;
7359         }
7360     }
7361
7362     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7363
7364     Py_DECREF(s);
7365     Py_XDECREF(sep);
7366     return result;
7367 }
7368
7369 PyDoc_STRVAR(split__doc__,
7370              "S.split([sep [,maxsplit]]) -> list of strings\n\
7371 \n\
7372 Return a list of the words in S, using sep as the\n\
7373 delimiter string.  If maxsplit is given, at most maxsplit\n\
7374 splits are done. If sep is not specified or is None, any\n\
7375 whitespace string is a separator and empty strings are\n\
7376 removed from the result.");
7377
7378 static PyObject*
7379 unicode_split(PyUnicodeObject *self, PyObject *args)
7380 {
7381     PyObject *substring = Py_None;
7382     Py_ssize_t maxcount = -1;
7383
7384     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7385         return NULL;
7386
7387     if (substring == Py_None)
7388         return split(self, NULL, maxcount);
7389     else if (PyUnicode_Check(substring))
7390         return split(self, (PyUnicodeObject *)substring, maxcount);
7391     else
7392         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7393 }
7394
7395 PyObject *
7396 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7397 {
7398     PyObject* str_obj;
7399     PyObject* sep_obj;
7400     PyObject* out;
7401
7402     str_obj = PyUnicode_FromObject(str_in);
7403     if (!str_obj)
7404         return NULL;
7405     sep_obj = PyUnicode_FromObject(sep_in);
7406     if (!sep_obj) {
7407         Py_DECREF(str_obj);
7408         return NULL;
7409     }
7410
7411     out = stringlib_partition(
7412         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7413         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7414         );
7415
7416     Py_DECREF(sep_obj);
7417     Py_DECREF(str_obj);
7418
7419     return out;
7420 }
7421
7422
7423 PyObject *
7424 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7425 {
7426     PyObject* str_obj;
7427     PyObject* sep_obj;
7428     PyObject* out;
7429
7430     str_obj = PyUnicode_FromObject(str_in);
7431     if (!str_obj)
7432         return NULL;
7433     sep_obj = PyUnicode_FromObject(sep_in);
7434     if (!sep_obj) {
7435         Py_DECREF(str_obj);
7436         return NULL;
7437     }
7438
7439     out = stringlib_rpartition(
7440         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7441         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7442         );
7443
7444     Py_DECREF(sep_obj);
7445     Py_DECREF(str_obj);
7446
7447     return out;
7448 }
7449
7450 PyDoc_STRVAR(partition__doc__,
7451              "S.partition(sep) -> (head, sep, tail)\n\
7452 \n\
7453 Search for the separator sep in S, and return the part before it,\n\
7454 the separator itself, and the part after it.  If the separator is not\n\
7455 found, return S and two empty strings.");
7456
7457 static PyObject*
7458 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7459 {
7460     return PyUnicode_Partition((PyObject *)self, separator);
7461 }
7462
7463 PyDoc_STRVAR(rpartition__doc__,
7464              "S.rpartition(sep) -> (tail, sep, head)\n\
7465 \n\
7466 Search for the separator sep in S, starting at the end of S, and return\n\
7467 the part before it, the separator itself, and the part after it.  If the\n\
7468 separator is not found, return two empty strings and S.");
7469
7470 static PyObject*
7471 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7472 {
7473     return PyUnicode_RPartition((PyObject *)self, separator);
7474 }
7475
7476 PyObject *PyUnicode_RSplit(PyObject *s,
7477                            PyObject *sep,
7478                            Py_ssize_t maxsplit)
7479 {
7480     PyObject *result;
7481
7482     s = PyUnicode_FromObject(s);
7483     if (s == NULL)
7484         return NULL;
7485     if (sep != NULL) {
7486         sep = PyUnicode_FromObject(sep);
7487         if (sep == NULL) {
7488             Py_DECREF(s);
7489             return NULL;
7490         }
7491     }
7492
7493     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7494
7495     Py_DECREF(s);
7496     Py_XDECREF(sep);
7497     return result;
7498 }
7499
7500 PyDoc_STRVAR(rsplit__doc__,
7501              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7502 \n\
7503 Return a list of the words in S, using sep as the\n\
7504 delimiter string, starting at the end of the string and\n\
7505 working to the front.  If maxsplit is given, at most maxsplit\n\
7506 splits are done. If sep is not specified, any whitespace string\n\
7507 is a separator.");
7508
7509 static PyObject*
7510 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7511 {
7512     PyObject *substring = Py_None;
7513     Py_ssize_t maxcount = -1;
7514
7515     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7516         return NULL;
7517
7518     if (substring == Py_None)
7519         return rsplit(self, NULL, maxcount);
7520     else if (PyUnicode_Check(substring))
7521         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7522     else
7523         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7524 }
7525
7526 PyDoc_STRVAR(splitlines__doc__,
7527              "S.splitlines([keepends]) -> list of strings\n\
7528 \n\
7529 Return a list of the lines in S, breaking at line boundaries.\n\
7530 Line breaks are not included in the resulting list unless keepends\n\
7531 is given and true.");
7532
7533 static PyObject*
7534 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7535 {
7536     int keepends = 0;
7537
7538     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7539         return NULL;
7540
7541     return PyUnicode_Splitlines((PyObject *)self, keepends);
7542 }
7543
7544 static
7545 PyObject *unicode_str(PyUnicodeObject *self)
7546 {
7547     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7548 }
7549
7550 PyDoc_STRVAR(swapcase__doc__,
7551              "S.swapcase() -> unicode\n\
7552 \n\
7553 Return a copy of S with uppercase characters converted to lowercase\n\
7554 and vice versa.");
7555
7556 static PyObject*
7557 unicode_swapcase(PyUnicodeObject *self)
7558 {
7559     return fixup(self, fixswapcase);
7560 }
7561
7562 PyDoc_STRVAR(translate__doc__,
7563              "S.translate(table) -> unicode\n\
7564 \n\
7565 Return a copy of the string S, where all characters have been mapped\n\
7566 through the given translation table, which must be a mapping of\n\
7567 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7568 Unmapped characters are left untouched. Characters mapped to None\n\
7569 are deleted.");
7570
7571 static PyObject*
7572 unicode_translate(PyUnicodeObject *self, PyObject *table)
7573 {
7574     return PyUnicode_TranslateCharmap(self->str,
7575                                       self->length,
7576                                       table,
7577                                       "ignore");
7578 }
7579
7580 PyDoc_STRVAR(upper__doc__,
7581              "S.upper() -> unicode\n\
7582 \n\
7583 Return a copy of S converted to uppercase.");
7584
7585 static PyObject*
7586 unicode_upper(PyUnicodeObject *self)
7587 {
7588     return fixup(self, fixupper);
7589 }
7590
7591 PyDoc_STRVAR(zfill__doc__,
7592              "S.zfill(width) -> unicode\n\
7593 \n\
7594 Pad a numeric string S with zeros on the left, to fill a field\n\
7595 of the specified width. The string S is never truncated.");
7596
7597 static PyObject *
7598 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7599 {
7600     Py_ssize_t fill;
7601     PyUnicodeObject *u;
7602
7603     Py_ssize_t width;
7604     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7605         return NULL;
7606
7607     if (self->length >= width) {
7608         if (PyUnicode_CheckExact(self)) {
7609             Py_INCREF(self);
7610             return (PyObject*) self;
7611         }
7612         else
7613             return PyUnicode_FromUnicode(
7614                 PyUnicode_AS_UNICODE(self),
7615                 PyUnicode_GET_SIZE(self)
7616                 );
7617     }
7618
7619     fill = width - self->length;
7620
7621     u = pad(self, fill, 0, '0');
7622
7623     if (u == NULL)
7624         return NULL;
7625
7626     if (u->str[fill] == '+' || u->str[fill] == '-') {
7627         /* move sign to beginning of string */
7628         u->str[0] = u->str[fill];
7629         u->str[fill] = '0';
7630     }
7631
7632     return (PyObject*) u;
7633 }
7634
#if 0
/* Compiled out.  Returns the file-level `numfree` counter as a Python
   int; the matching method-table entry is described there as being
   for debugging the implementation. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7642
7643 PyDoc_STRVAR(startswith__doc__,
7644              "S.startswith(prefix[, start[, end]]) -> bool\n\
7645 \n\
7646 Return True if S starts with the specified prefix, False otherwise.\n\
7647 With optional start, test S beginning at that position.\n\
7648 With optional end, stop comparing S at that position.\n\
7649 prefix can also be a tuple of strings to try.");
7650
7651 static PyObject *
7652 unicode_startswith(PyUnicodeObject *self,
7653                    PyObject *args)
7654 {
7655     PyObject *subobj;
7656     PyUnicodeObject *substring;
7657     Py_ssize_t start = 0;
7658     Py_ssize_t end = PY_SSIZE_T_MAX;
7659     int result;
7660
7661     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7662                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7663         return NULL;
7664     if (PyTuple_Check(subobj)) {
7665         Py_ssize_t i;
7666         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7667             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7668                 PyTuple_GET_ITEM(subobj, i));
7669             if (substring == NULL)
7670                 return NULL;
7671             result = tailmatch(self, substring, start, end, -1);
7672             Py_DECREF(substring);
7673             if (result) {
7674                 Py_RETURN_TRUE;
7675             }
7676         }
7677         /* nothing matched */
7678         Py_RETURN_FALSE;
7679     }
7680     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7681     if (substring == NULL)
7682         return NULL;
7683     result = tailmatch(self, substring, start, end, -1);
7684     Py_DECREF(substring);
7685     return PyBool_FromLong(result);
7686 }
7687
7688
7689 PyDoc_STRVAR(endswith__doc__,
7690              "S.endswith(suffix[, start[, end]]) -> bool\n\
7691 \n\
7692 Return True if S ends with the specified suffix, False otherwise.\n\
7693 With optional start, test S beginning at that position.\n\
7694 With optional end, stop comparing S at that position.\n\
7695 suffix can also be a tuple of strings to try.");
7696
7697 static PyObject *
7698 unicode_endswith(PyUnicodeObject *self,
7699                  PyObject *args)
7700 {
7701     PyObject *subobj;
7702     PyUnicodeObject *substring;
7703     Py_ssize_t start = 0;
7704     Py_ssize_t end = PY_SSIZE_T_MAX;
7705     int result;
7706
7707     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7708                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7709         return NULL;
7710     if (PyTuple_Check(subobj)) {
7711         Py_ssize_t i;
7712         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7713             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7714                 PyTuple_GET_ITEM(subobj, i));
7715             if (substring == NULL)
7716                 return NULL;
7717             result = tailmatch(self, substring, start, end, +1);
7718             Py_DECREF(substring);
7719             if (result) {
7720                 Py_RETURN_TRUE;
7721             }
7722         }
7723         Py_RETURN_FALSE;
7724     }
7725     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7726     if (substring == NULL)
7727         return NULL;
7728
7729     result = tailmatch(self, substring, start, end, +1);
7730     Py_DECREF(substring);
7731     return PyBool_FromLong(result);
7732 }
7733
7734
7735 /* Implements do_string_format, which is unicode because of stringlib */
7736 #include "stringlib/string_format.h"
7737
7738 PyDoc_STRVAR(format__doc__,
7739              "S.format(*args, **kwargs) -> unicode\n\
7740 \n\
7741 ");
7742
7743 static PyObject *
7744 unicode__format__(PyObject *self, PyObject *args)
7745 {
7746     PyObject *format_spec;
7747     PyObject *result = NULL;
7748     PyObject *tmp = NULL;
7749
7750     /* If 2.x, convert format_spec to the same type as value */
7751     /* This is to allow things like u''.format('') */
7752     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7753         goto done;
7754     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7755         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7756                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7757         goto done;
7758     }
7759     tmp = PyObject_Unicode(format_spec);
7760     if (tmp == NULL)
7761         goto done;
7762     format_spec = tmp;
7763
7764     result = _PyUnicode_FormatAdvanced(self,
7765                                        PyUnicode_AS_UNICODE(format_spec),
7766                                        PyUnicode_GET_SIZE(format_spec));
7767   done:
7768     Py_XDECREF(tmp);
7769     return result;
7770 }
7771
7772 PyDoc_STRVAR(p_format__doc__,
7773              "S.__format__(format_spec) -> unicode\n\
7774 \n\
7775 ");
7776
7777 static PyObject *
7778 unicode__sizeof__(PyUnicodeObject *v)
7779 {
7780     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7781                              sizeof(Py_UNICODE) * (v->length + 1));
7782 }
7783
7784 PyDoc_STRVAR(sizeof__doc__,
7785              "S.__sizeof__() -> size of S in memory, in bytes\n\
7786 \n\
7787 ");
7788
7789 static PyObject *
7790 unicode_getnewargs(PyUnicodeObject *v)
7791 {
7792     return Py_BuildValue("(u#)", v->str, v->length);
7793 }
7794
7795
7796 static PyMethodDef unicode_methods[] = {
7797
7798     /* Order is according to common usage: often used methods should
7799        appear first, since lookup is done sequentially. */
7800
7801     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7802     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7803     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7804     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7805     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7806     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7807     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7808     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7809     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7810     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7811     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7812     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7813     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7814     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7815     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7816     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7817     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7818 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7819     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7820     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7821     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7822     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7823     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7824     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7825     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7826     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7827     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7828     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7829     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7830     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7831     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7832     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7833     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7834     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7835     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7836     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7837     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7838     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7839     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7840     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7841     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7842     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7843     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7844     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7845     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7846 #if 0
7847     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7848 #endif
7849
7850 #if 0
7851     /* This one is just used for debugging the implementation. */
7852     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7853 #endif
7854
7855     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
7856     {NULL, NULL}
7857 };
7858
7859 static PyObject *
7860 unicode_mod(PyObject *v, PyObject *w)
7861 {
7862     if (!PyUnicode_Check(v)) {
7863         Py_INCREF(Py_NotImplemented);
7864         return Py_NotImplemented;
7865     }
7866     return PyUnicode_Format(v, w);
7867 }
7868
7869 static PyNumberMethods unicode_as_number = {
7870     0,              /*nb_add*/
7871     0,              /*nb_subtract*/
7872     0,              /*nb_multiply*/
7873     0,              /*nb_divide*/
7874     unicode_mod,            /*nb_remainder*/
7875 };
7876
7877 static PySequenceMethods unicode_as_sequence = {
7878     (lenfunc) unicode_length,       /* sq_length */
7879     PyUnicode_Concat,           /* sq_concat */
7880     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
7881     (ssizeargfunc) unicode_getitem,     /* sq_item */
7882     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
7883     0,                  /* sq_ass_item */
7884     0,                  /* sq_ass_slice */
7885     PyUnicode_Contains,         /* sq_contains */
7886 };
7887
7888 static PyObject*
7889 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7890 {
7891     if (PyIndex_Check(item)) {
7892         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7893         if (i == -1 && PyErr_Occurred())
7894             return NULL;
7895         if (i < 0)
7896             i += PyUnicode_GET_SIZE(self);
7897         return unicode_getitem(self, i);
7898     } else if (PySlice_Check(item)) {
7899         Py_ssize_t start, stop, step, slicelength, cur, i;
7900         Py_UNICODE* source_buf;
7901         Py_UNICODE* result_buf;
7902         PyObject* result;
7903
7904         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7905                                  &start, &stop, &step, &slicelength) < 0) {
7906             return NULL;
7907         }
7908
7909         if (slicelength <= 0) {
7910             return PyUnicode_FromUnicode(NULL, 0);
7911         } else if (start == 0 && step == 1 && slicelength == self->length &&
7912                    PyUnicode_CheckExact(self)) {
7913             Py_INCREF(self);
7914             return (PyObject *)self;
7915         } else if (step == 1) {
7916             return PyUnicode_FromUnicode(self->str + start, slicelength);
7917         } else {
7918             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7919             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7920                                                        sizeof(Py_UNICODE));
7921
7922             if (result_buf == NULL)
7923                 return PyErr_NoMemory();
7924
7925             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7926                 result_buf[i] = source_buf[cur];
7927             }
7928
7929             result = PyUnicode_FromUnicode(result_buf, slicelength);
7930             PyObject_FREE(result_buf);
7931             return result;
7932         }
7933     } else {
7934         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7935         return NULL;
7936     }
7937 }
7938
7939 static PyMappingMethods unicode_as_mapping = {
7940     (lenfunc)unicode_length,        /* mp_length */
7941     (binaryfunc)unicode_subscript,  /* mp_subscript */
7942     (objobjargproc)0,           /* mp_ass_subscript */
7943 };
7944
7945 static Py_ssize_t
7946 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7947                           Py_ssize_t index,
7948                           const void **ptr)
7949 {
7950     if (index != 0) {
7951         PyErr_SetString(PyExc_SystemError,
7952                         "accessing non-existent unicode segment");
7953         return -1;
7954     }
7955     *ptr = (void *) self->str;
7956     return PyUnicode_GET_DATA_SIZE(self);
7957 }
7958
7959 static Py_ssize_t
7960 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7961                            const void **ptr)
7962 {
7963     PyErr_SetString(PyExc_TypeError,
7964                     "cannot use unicode as modifiable buffer");
7965     return -1;
7966 }
7967
7968 static int
7969 unicode_buffer_getsegcount(PyUnicodeObject *self,
7970                            Py_ssize_t *lenp)
7971 {
7972     if (lenp)
7973         *lenp = PyUnicode_GET_DATA_SIZE(self);
7974     return 1;
7975 }
7976
7977 static Py_ssize_t
7978 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7979                           Py_ssize_t index,
7980                           const void **ptr)
7981 {
7982     PyObject *str;
7983
7984     if (index != 0) {
7985         PyErr_SetString(PyExc_SystemError,
7986                         "accessing non-existent unicode segment");
7987         return -1;
7988     }
7989     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7990     if (str == NULL)
7991         return -1;
7992     *ptr = (void *) PyString_AS_STRING(str);
7993     return PyString_GET_SIZE(str);
7994 }
7995
7996 /* Helpers for PyUnicode_Format() */
7997
7998 static PyObject *
7999 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8000 {
8001     Py_ssize_t argidx = *p_argidx;
8002     if (argidx < arglen) {
8003         (*p_argidx)++;
8004         if (arglen < 0)
8005             return args;
8006         else
8007             return PyTuple_GetItem(args, argidx);
8008     }
8009     PyErr_SetString(PyExc_TypeError,
8010                     "not enough arguments for format string");
8011     return NULL;
8012 }
8013
/* Bit flags passed to the format helpers in their `flags` argument.
   Of these, the helpers in this part of the file test only F_ALT. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8019
8020 static Py_ssize_t
8021 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8022 {
8023     register Py_ssize_t i;
8024     Py_ssize_t len = strlen(charbuffer);
8025     for (i = len - 1; i >= 0; i--)
8026         buffer[i] = (Py_UNICODE) charbuffer[i];
8027
8028     return len;
8029 }
8030
8031 static int
8032 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8033 {
8034     Py_ssize_t result;
8035
8036     PyOS_snprintf((char *)buffer, len, format, x);
8037     result = strtounicode(buffer, (char *)buffer);
8038     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8039 }
8040
8041 /* XXX To save some code duplication, formatfloat/long/int could have been
8042    shared with stringobject.c, converting from 8-bit to Unicode after the
8043    formatting is done. */
8044
8045 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8046
8047 static PyObject *
8048 formatfloat(PyObject *v, int flags, int prec, int type)
8049 {
8050     char *p;
8051     PyObject *result;
8052     double x;
8053
8054     x = PyFloat_AsDouble(v);
8055     if (x == -1.0 && PyErr_Occurred())
8056         return NULL;
8057
8058     if (prec < 0)
8059         prec = 6;
8060
8061     p = PyOS_double_to_string(x, type, prec,
8062                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8063     if (p == NULL)
8064         return NULL;
8065     result = PyUnicode_FromStringAndSize(p, strlen(p));
8066     PyMem_Free(p);
8067     return result;
8068 }
8069
8070 static PyObject*
8071 formatlong(PyObject *val, int flags, int prec, int type)
8072 {
8073     char *buf;
8074     int i, len;
8075     PyObject *str; /* temporary string object. */
8076     PyUnicodeObject *result;
8077
8078     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8079     if (!str)
8080         return NULL;
8081     result = _PyUnicode_New(len);
8082     if (!result) {
8083         Py_DECREF(str);
8084         return NULL;
8085     }
8086     for (i = 0; i < len; i++)
8087         result->str[i] = buf[i];
8088     result->str[len] = 0;
8089     Py_DECREF(str);
8090     return (PyObject*)result;
8091 }
8092
8093 static int
8094 formatint(Py_UNICODE *buf,
8095           size_t buflen,
8096           int flags,
8097           int prec,
8098           int type,
8099           PyObject *v)
8100 {
8101     /* fmt = '%#.' + `prec` + 'l' + `type`
8102      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8103      *                     + 1 + 1
8104      *                   = 24
8105      */
8106     char fmt[64]; /* plenty big enough! */
8107     char *sign;
8108     long x;
8109
8110     x = PyInt_AsLong(v);
8111     if (x == -1 && PyErr_Occurred())
8112         return -1;
8113     if (x < 0 && type == 'u') {
8114         type = 'd';
8115     }
8116     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8117         sign = "-";
8118     else
8119         sign = "";
8120     if (prec < 0)
8121         prec = 1;
8122
8123     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8124      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8125      */
8126     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8127         PyErr_SetString(PyExc_OverflowError,
8128                         "formatted integer is too long (precision too large?)");
8129         return -1;
8130     }
8131
8132     if ((flags & F_ALT) &&
8133         (type == 'x' || type == 'X')) {
8134         /* When converting under %#x or %#X, there are a number
8135          * of issues that cause pain:
8136          * - when 0 is being converted, the C standard leaves off
8137          *   the '0x' or '0X', which is inconsistent with other
8138          *   %#x/%#X conversions and inconsistent with Python's
8139          *   hex() function
8140          * - there are platforms that violate the standard and
8141          *   convert 0 with the '0x' or '0X'
8142          *   (Metrowerks, Compaq Tru64)
8143          * - there are platforms that give '0x' when converting
8144          *   under %#X, but convert 0 in accordance with the
8145          *   standard (OS/2 EMX)
8146          *
8147          * We can achieve the desired consistency by inserting our
8148          * own '0x' or '0X' prefix, and substituting %x/%X in place
8149          * of %#x/%#X.
8150          *
8151          * Note that this is the same approach as used in
8152          * formatint() in stringobject.c
8153          */
8154         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8155                       sign, type, prec, type);
8156     }
8157     else {
8158         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8159                       sign, (flags&F_ALT) ? "#" : "",
8160                       prec, type);
8161     }
8162     if (sign[0])
8163         return longtounicode(buf, buflen, fmt, -x);
8164     else
8165         return longtounicode(buf, buflen, fmt, x);
8166 }
8167
8168 static int
8169 formatchar(Py_UNICODE *buf,
8170            size_t buflen,
8171            PyObject *v)
8172 {
8173     /* presume that the buffer is at least 2 characters long */
8174     if (PyUnicode_Check(v)) {
8175         if (PyUnicode_GET_SIZE(v) != 1)
8176             goto onError;
8177         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8178     }
8179
8180     else if (PyString_Check(v)) {
8181         if (PyString_GET_SIZE(v) != 1)
8182             goto onError;
8183         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8184     }
8185
8186     else {
8187         /* Integer input truncated to a character */
8188         long x;
8189         x = PyInt_AsLong(v);
8190         if (x == -1 && PyErr_Occurred())
8191             goto onError;
8192 #ifdef Py_UNICODE_WIDE
8193         if (x < 0 || x > 0x10ffff) {
8194             PyErr_SetString(PyExc_OverflowError,
8195                             "%c arg not in range(0x110000) "
8196                             "(wide Python build)");
8197             return -1;
8198         }
8199 #else
8200         if (x < 0 || x > 0xffff) {
8201             PyErr_SetString(PyExc_OverflowError,
8202                             "%c arg not in range(0x10000) "
8203                             "(narrow Python build)");
8204             return -1;
8205         }
8206 #endif
8207         buf[0] = (Py_UNICODE) x;
8208     }
8209     buf[1] = '\0';
8210     return 1;
8211
8212   onError:
8213     PyErr_SetString(PyExc_TypeError,
8214                     "%c requires int or char");
8215     return -1;
8216 }
8217
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints and
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
8227
8228 PyObject *PyUnicode_Format(PyObject *format,
8229                            PyObject *args)
8230 {
8231     Py_UNICODE *fmt, *res;
8232     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8233     int args_owned = 0;
8234     PyUnicodeObject *result = NULL;
8235     PyObject *dict = NULL;
8236     PyObject *uformat;
8237
8238     if (format == NULL || args == NULL) {
8239         PyErr_BadInternalCall();
8240         return NULL;
8241     }
8242     uformat = PyUnicode_FromObject(format);
8243     if (uformat == NULL)
8244         return NULL;
8245     fmt = PyUnicode_AS_UNICODE(uformat);
8246     fmtcnt = PyUnicode_GET_SIZE(uformat);
8247
8248     reslen = rescnt = fmtcnt + 100;
8249     result = _PyUnicode_New(reslen);
8250     if (result == NULL)
8251         goto onError;
8252     res = PyUnicode_AS_UNICODE(result);
8253
8254     if (PyTuple_Check(args)) {
8255         arglen = PyTuple_Size(args);
8256         argidx = 0;
8257     }
8258     else {
8259         arglen = -1;
8260         argidx = -2;
8261     }
8262     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8263         !PyObject_TypeCheck(args, &PyBaseString_Type))
8264         dict = args;
8265
8266     while (--fmtcnt >= 0) {
8267         if (*fmt != '%') {
8268             if (--rescnt < 0) {
8269                 rescnt = fmtcnt + 100;
8270                 reslen += rescnt;
8271                 if (_PyUnicode_Resize(&result, reslen) < 0)
8272                     goto onError;
8273                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8274                 --rescnt;
8275             }
8276             *res++ = *fmt++;
8277         }
8278         else {
8279             /* Got a format specifier */
8280             int flags = 0;
8281             Py_ssize_t width = -1;
8282             int prec = -1;
8283             Py_UNICODE c = '\0';
8284             Py_UNICODE fill;
8285             int isnumok;
8286             PyObject *v = NULL;
8287             PyObject *temp = NULL;
8288             Py_UNICODE *pbuf;
8289             Py_UNICODE sign;
8290             Py_ssize_t len;
8291             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8292
8293             fmt++;
8294             if (*fmt == '(') {
8295                 Py_UNICODE *keystart;
8296                 Py_ssize_t keylen;
8297                 PyObject *key;
8298                 int pcount = 1;
8299
8300                 if (dict == NULL) {
8301                     PyErr_SetString(PyExc_TypeError,
8302                                     "format requires a mapping");
8303                     goto onError;
8304                 }
8305                 ++fmt;
8306                 --fmtcnt;
8307                 keystart = fmt;
8308                 /* Skip over balanced parentheses */
8309                 while (pcount > 0 && --fmtcnt >= 0) {
8310                     if (*fmt == ')')
8311                         --pcount;
8312                     else if (*fmt == '(')
8313                         ++pcount;
8314                     fmt++;
8315                 }
8316                 keylen = fmt - keystart - 1;
8317                 if (fmtcnt < 0 || pcount > 0) {
8318                     PyErr_SetString(PyExc_ValueError,
8319                                     "incomplete format key");
8320                     goto onError;
8321                 }
8322 #if 0
8323                 /* keys are converted to strings using UTF-8 and
8324                    then looked up since Python uses strings to hold
8325                    variables names etc. in its namespaces and we
8326                    wouldn't want to break common idioms. */
8327                 key = PyUnicode_EncodeUTF8(keystart,
8328                                            keylen,
8329                                            NULL);
8330 #else
8331                 key = PyUnicode_FromUnicode(keystart, keylen);
8332 #endif
8333                 if (key == NULL)
8334                     goto onError;
8335                 if (args_owned) {
8336                     Py_DECREF(args);
8337                     args_owned = 0;
8338                 }
8339                 args = PyObject_GetItem(dict, key);
8340                 Py_DECREF(key);
8341                 if (args == NULL) {
8342                     goto onError;
8343                 }
8344                 args_owned = 1;
8345                 arglen = -1;
8346                 argidx = -2;
8347             }
8348             while (--fmtcnt >= 0) {
8349                 switch (c = *fmt++) {
8350                 case '-': flags |= F_LJUST; continue;
8351                 case '+': flags |= F_SIGN; continue;
8352                 case ' ': flags |= F_BLANK; continue;
8353                 case '#': flags |= F_ALT; continue;
8354                 case '0': flags |= F_ZERO; continue;
8355                 }
8356                 break;
8357             }
8358             if (c == '*') {
8359                 v = getnextarg(args, arglen, &argidx);
8360                 if (v == NULL)
8361                     goto onError;
8362                 if (!PyInt_Check(v)) {
8363                     PyErr_SetString(PyExc_TypeError,
8364                                     "* wants int");
8365                     goto onError;
8366                 }
8367                 width = PyInt_AsLong(v);
8368                 if (width < 0) {
8369                     flags |= F_LJUST;
8370                     width = -width;
8371                 }
8372                 if (--fmtcnt >= 0)
8373                     c = *fmt++;
8374             }
8375             else if (c >= '0' && c <= '9') {
8376                 width = c - '0';
8377                 while (--fmtcnt >= 0) {
8378                     c = *fmt++;
8379                     if (c < '0' || c > '9')
8380                         break;
8381                     if ((width*10) / 10 != width) {
8382                         PyErr_SetString(PyExc_ValueError,
8383                                         "width too big");
8384                         goto onError;
8385                     }
8386                     width = width*10 + (c - '0');
8387                 }
8388             }
8389             if (c == '.') {
8390                 prec = 0;
8391                 if (--fmtcnt >= 0)
8392                     c = *fmt++;
8393                 if (c == '*') {
8394                     v = getnextarg(args, arglen, &argidx);
8395                     if (v == NULL)
8396                         goto onError;
8397                     if (!PyInt_Check(v)) {
8398                         PyErr_SetString(PyExc_TypeError,
8399                                         "* wants int");
8400                         goto onError;
8401                     }
8402                     prec = PyInt_AsLong(v);
8403                     if (prec < 0)
8404                         prec = 0;
8405                     if (--fmtcnt >= 0)
8406                         c = *fmt++;
8407                 }
8408                 else if (c >= '0' && c <= '9') {
8409                     prec = c - '0';
8410                     while (--fmtcnt >= 0) {
8411                         c = Py_CHARMASK(*fmt++);
8412                         if (c < '0' || c > '9')
8413                             break;
8414                         if ((prec*10) / 10 != prec) {
8415                             PyErr_SetString(PyExc_ValueError,
8416                                             "prec too big");
8417                             goto onError;
8418                         }
8419                         prec = prec*10 + (c - '0');
8420                     }
8421                 }
8422             } /* prec */
8423             if (fmtcnt >= 0) {
8424                 if (c == 'h' || c == 'l' || c == 'L') {
8425                     if (--fmtcnt >= 0)
8426                         c = *fmt++;
8427                 }
8428             }
8429             if (fmtcnt < 0) {
8430                 PyErr_SetString(PyExc_ValueError,
8431                                 "incomplete format");
8432                 goto onError;
8433             }
8434             if (c != '%') {
8435                 v = getnextarg(args, arglen, &argidx);
8436                 if (v == NULL)
8437                     goto onError;
8438             }
8439             sign = 0;
8440             fill = ' ';
8441             switch (c) {
8442
8443             case '%':
8444                 pbuf = formatbuf;
8445                 /* presume that buffer length is at least 1 */
8446                 pbuf[0] = '%';
8447                 len = 1;
8448                 break;
8449
8450             case 's':
8451             case 'r':
8452                 if (PyUnicode_Check(v) && c == 's') {
8453                     temp = v;
8454                     Py_INCREF(temp);
8455                 }
8456                 else {
8457                     PyObject *unicode;
8458                     if (c == 's')
8459                         temp = PyObject_Unicode(v);
8460                     else
8461                         temp = PyObject_Repr(v);
8462                     if (temp == NULL)
8463                         goto onError;
8464                     if (PyUnicode_Check(temp))
8465                         /* nothing to do */;
8466                     else if (PyString_Check(temp)) {
8467                         /* convert to string to Unicode */
8468                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8469                                                    PyString_GET_SIZE(temp),
8470                                                    NULL,
8471                                                    "strict");
8472                         Py_DECREF(temp);
8473                         temp = unicode;
8474                         if (temp == NULL)
8475                             goto onError;
8476                     }
8477                     else {
8478                         Py_DECREF(temp);
8479                         PyErr_SetString(PyExc_TypeError,
8480                                         "%s argument has non-string str()");
8481                         goto onError;
8482                     }
8483                 }
8484                 pbuf = PyUnicode_AS_UNICODE(temp);
8485                 len = PyUnicode_GET_SIZE(temp);
8486                 if (prec >= 0 && len > prec)
8487                     len = prec;
8488                 break;
8489
8490             case 'i':
8491             case 'd':
8492             case 'u':
8493             case 'o':
8494             case 'x':
8495             case 'X':
8496                 if (c == 'i')
8497                     c = 'd';
8498                 isnumok = 0;
8499                 if (PyNumber_Check(v)) {
8500                     PyObject *iobj=NULL;
8501
8502                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8503                         iobj = v;
8504                         Py_INCREF(iobj);
8505                     }
8506                     else {
8507                         iobj = PyNumber_Int(v);
8508                         if (iobj==NULL) iobj = PyNumber_Long(v);
8509                     }
8510                     if (iobj!=NULL) {
8511                         if (PyInt_Check(iobj)) {
8512                             isnumok = 1;
8513                             pbuf = formatbuf;
8514                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8515                                             flags, prec, c, iobj);
8516                             Py_DECREF(iobj);
8517                             if (len < 0)
8518                                 goto onError;
8519                             sign = 1;
8520                         }
8521                         else if (PyLong_Check(iobj)) {
8522                             isnumok = 1;
8523                             temp = formatlong(iobj, flags, prec, c);
8524                             Py_DECREF(iobj);
8525                             if (!temp)
8526                                 goto onError;
8527                             pbuf = PyUnicode_AS_UNICODE(temp);
8528                             len = PyUnicode_GET_SIZE(temp);
8529                             sign = 1;
8530                         }
8531                         else {
8532                             Py_DECREF(iobj);
8533                         }
8534                     }
8535                 }
8536                 if (!isnumok) {
8537                     PyErr_Format(PyExc_TypeError,
8538                                  "%%%c format: a number is required, "
8539                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8540                     goto onError;
8541                 }
8542                 if (flags & F_ZERO)
8543                     fill = '0';
8544                 break;
8545
8546             case 'e':
8547             case 'E':
8548             case 'f':
8549             case 'F':
8550             case 'g':
8551             case 'G':
8552                 temp = formatfloat(v, flags, prec, c);
8553                 if (temp == NULL)
8554                     goto onError;
8555                 pbuf = PyUnicode_AS_UNICODE(temp);
8556                 len = PyUnicode_GET_SIZE(temp);
8557                 sign = 1;
8558                 if (flags & F_ZERO)
8559                     fill = '0';
8560                 break;
8561
8562             case 'c':
8563                 pbuf = formatbuf;
8564                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8565                 if (len < 0)
8566                     goto onError;
8567                 break;
8568
8569             default:
8570                 PyErr_Format(PyExc_ValueError,
8571                              "unsupported format character '%c' (0x%x) "
8572                              "at index %zd",
8573                              (31<=c && c<=126) ? (char)c : '?',
8574                              (int)c,
8575                              (Py_ssize_t)(fmt - 1 -
8576                                           PyUnicode_AS_UNICODE(uformat)));
8577                 goto onError;
8578             }
8579             if (sign) {
8580                 if (*pbuf == '-' || *pbuf == '+') {
8581                     sign = *pbuf++;
8582                     len--;
8583                 }
8584                 else if (flags & F_SIGN)
8585                     sign = '+';
8586                 else if (flags & F_BLANK)
8587                     sign = ' ';
8588                 else
8589                     sign = 0;
8590             }
8591             if (width < len)
8592                 width = len;
8593             if (rescnt - (sign != 0) < width) {
8594                 reslen -= rescnt;
8595                 rescnt = width + fmtcnt + 100;
8596                 reslen += rescnt;
8597                 if (reslen < 0) {
8598                     Py_XDECREF(temp);
8599                     PyErr_NoMemory();
8600                     goto onError;
8601                 }
8602                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8603                     Py_XDECREF(temp);
8604                     goto onError;
8605                 }
8606                 res = PyUnicode_AS_UNICODE(result)
8607                     + reslen - rescnt;
8608             }
8609             if (sign) {
8610                 if (fill != ' ')
8611                     *res++ = sign;
8612                 rescnt--;
8613                 if (width > len)
8614                     width--;
8615             }
8616             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8617                 assert(pbuf[0] == '0');
8618                 assert(pbuf[1] == c);
8619                 if (fill != ' ') {
8620                     *res++ = *pbuf++;
8621                     *res++ = *pbuf++;
8622                 }
8623                 rescnt -= 2;
8624                 width -= 2;
8625                 if (width < 0)
8626                     width = 0;
8627                 len -= 2;
8628             }
8629             if (width > len && !(flags & F_LJUST)) {
8630                 do {
8631                     --rescnt;
8632                     *res++ = fill;
8633                 } while (--width > len);
8634             }
8635             if (fill == ' ') {
8636                 if (sign)
8637                     *res++ = sign;
8638                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8639                     assert(pbuf[0] == '0');
8640                     assert(pbuf[1] == c);
8641                     *res++ = *pbuf++;
8642                     *res++ = *pbuf++;
8643                 }
8644             }
8645             Py_UNICODE_COPY(res, pbuf, len);
8646             res += len;
8647             rescnt -= len;
8648             while (--width >= len) {
8649                 --rescnt;
8650                 *res++ = ' ';
8651             }
8652             if (dict && (argidx < arglen) && c != '%') {
8653                 PyErr_SetString(PyExc_TypeError,
8654                                 "not all arguments converted during string formatting");
8655                 Py_XDECREF(temp);
8656                 goto onError;
8657             }
8658             Py_XDECREF(temp);
8659         } /* '%' */
8660     } /* until end */
8661     if (argidx < arglen && !dict) {
8662         PyErr_SetString(PyExc_TypeError,
8663                         "not all arguments converted during string formatting");
8664         goto onError;
8665     }
8666
8667     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8668         goto onError;
8669     if (args_owned) {
8670         Py_DECREF(args);
8671     }
8672     Py_DECREF(uformat);
8673     return (PyObject *)result;
8674
8675   onError:
8676     Py_XDECREF(result);
8677     Py_DECREF(uformat);
8678     if (args_owned) {
8679         Py_DECREF(args);
8680     }
8681     return NULL;
8682 }
8683
/* Buffer-protocol slot table for unicode objects; referenced from the
   tp_as_buffer slot of PyUnicode_Type.  The four entries are, in order,
   the read-buffer, write-buffer, segment-count and char-buffer handlers. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8690
/* Forward declaration: unicode_new() hands construction of subtype
   instances over to this function, which is defined after it. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8693
8694 static PyObject *
8695 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8696 {
8697     PyObject *x = NULL;
8698     static char *kwlist[] = {"string", "encoding", "errors", 0};
8699     char *encoding = NULL;
8700     char *errors = NULL;
8701
8702     if (type != &PyUnicode_Type)
8703         return unicode_subtype_new(type, args, kwds);
8704     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8705                                      kwlist, &x, &encoding, &errors))
8706         return NULL;
8707     if (x == NULL)
8708         return (PyObject *)_PyUnicode_New(0);
8709     if (encoding == NULL && errors == NULL)
8710         return PyObject_Unicode(x);
8711     else
8712         return PyUnicode_FromEncodedObject(x, encoding, errors);
8713 }
8714
8715 static PyObject *
8716 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8717 {
8718     PyUnicodeObject *tmp, *pnew;
8719     Py_ssize_t n;
8720
8721     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8722     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8723     if (tmp == NULL)
8724         return NULL;
8725     assert(PyUnicode_Check(tmp));
8726     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8727     if (pnew == NULL) {
8728         Py_DECREF(tmp);
8729         return NULL;
8730     }
8731     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8732     if (pnew->str == NULL) {
8733         _Py_ForgetReference((PyObject *)pnew);
8734         PyObject_Del(pnew);
8735         Py_DECREF(tmp);
8736         return PyErr_NoMemory();
8737     }
8738     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8739     pnew->length = n;
8740     pnew->hash = tmp->hash;
8741     Py_DECREF(tmp);
8742     return (PyObject *)pnew;
8743 }
8744
/* Docstring of the unicode type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8751
/* Type object of the built-in "unicode" type.  It derives from
   PyBaseString_Type, may itself be subclassed (Py_TPFLAGS_BASETYPE), and
   creates instances through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8795
8796 /* Initialize the Unicode implementation */
8797
8798 void _PyUnicode_Init(void)
8799 {
8800     int i;
8801
8802     /* XXX - move this array to unicodectype.c ? */
8803     Py_UNICODE linebreak[] = {
8804         0x000A, /* LINE FEED */
8805         0x000D, /* CARRIAGE RETURN */
8806         0x001C, /* FILE SEPARATOR */
8807         0x001D, /* GROUP SEPARATOR */
8808         0x001E, /* RECORD SEPARATOR */
8809         0x0085, /* NEXT LINE */
8810         0x2028, /* LINE SEPARATOR */
8811         0x2029, /* PARAGRAPH SEPARATOR */
8812     };
8813
8814     /* Init the implementation */
8815     free_list = NULL;
8816     numfree = 0;
8817     unicode_empty = _PyUnicode_New(0);
8818     if (!unicode_empty)
8819         return;
8820
8821     strcpy(unicode_default_encoding, "ascii");
8822     for (i = 0; i < 256; i++)
8823         unicode_latin1[i] = NULL;
8824     if (PyType_Ready(&PyUnicode_Type) < 0)
8825         Py_FatalError("Can't initialize 'unicode'");
8826
8827     /* initialize the linebreak bloom filter */
8828     bloom_linebreak = make_bloom_mask(
8829         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8830         );
8831
8832     PyType_Ready(&EncodingMapType);
8833 }
8834
8835 /* Finalize the Unicode implementation */
8836
8837 int
8838 PyUnicode_ClearFreeList(void)
8839 {
8840     int freelist_size = numfree;
8841     PyUnicodeObject *u;
8842
8843     for (u = free_list; u != NULL;) {
8844         PyUnicodeObject *v = u;
8845         u = *(PyUnicodeObject **)u;
8846         if (v->str)
8847             PyObject_DEL(v->str);
8848         Py_XDECREF(v->defenc);
8849         PyObject_Del(v);
8850         numfree--;
8851     }
8852     free_list = NULL;
8853     assert(numfree == 0);
8854     return freelist_size;
8855 }
8856
8857 void
8858 _PyUnicode_Fini(void)
8859 {
8860     int i;
8861
8862     Py_XDECREF(unicode_empty);
8863     unicode_empty = NULL;
8864
8865     for (i = 0; i < 256; i++) {
8866         if (unicode_latin1[i]) {
8867             Py_DECREF(unicode_latin1[i]);
8868             unicode_latin1[i] = NULL;
8869         }
8870     }
8871     (void)PyUnicode_ClearFreeList();
8872 }
8873
8874 #ifdef __cplusplus
8875 }
8876 #endif