Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15     Copyright (c) 1999 by Secret Labs AB
  16     Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define MAX_UNICODE_FREELIST_SIZE       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
/* Free list for Unicode objects */
/* Head of the list of recycled objects.  The link to the next entry is
   stored in the first pointer-sized slot of each recycled object. */
static PyUnicodeObject *unicode_freelist;
/* Number of objects currently linked on unicode_freelist. */
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's ordinal;
   an entry stays NULL until that character is first requested. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
 114
 115 Py_UNICODE
 116 PyUnicode_GetMax(void)
 117 {
 118 #ifdef Py_UNICODE_WIDE
 119         return 0x10FFFF;
 120 #else
 121         /* This is actually an illegal character, so it should
 122            not be passed to unichr. */
 123         return 0xFFFF;
 124 #endif
 125 }
 126
 127 /* --- Bloom Filters ----------------------------------------------------- */
 128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the least
   significant 5 bits of each Unicode character as the bit index. */
 132
 133 /* the linebreak mask is set up by Unicode_Init below */
 134
 135 #define BLOOM_MASK unsigned long
 136
 137 static BLOOM_MASK bloom_linebreak;
 138
 139 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 140
 141 #define BLOOM_LINEBREAK(ch)\
 142     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
 143
 144 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 145 {
 146     /* calculate simple bloom-style bitmask for a given unicode string */
 147
 148     long mask;
 149     Py_ssize_t i;
 150
 151     mask = 0;
 152     for (i = 0; i < len; i++)
 153         mask |= (1 << (ptr[i] & 0x1F));
 154
 155     return mask;
 156 }
 157
 158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 159 {
 160     Py_ssize_t i;
 161
 162     for (i = 0; i < setlen; i++)
 163         if (set[i] == chr)
 164             return 1;
 165
 166     return 0;
 167 }
 168
 169 #define BLOOM_MEMBER(mask, chr, set, setlen)\
 170     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 171
 172 /* --- Unicode Object ----------------------------------------------------- */
 173
 174 static
 175 int unicode_resize(register PyUnicodeObject *unicode,
 176                       Py_ssize_t length)
 177 {
 178     void *oldstr;
 179
 180     /* Shortcut if there's nothing much to do. */
 181     if (unicode->length == length)
 182         goto reset;
 183
 184     /* Resizing shared object (unicode_empty or single character
 185        objects) in-place is not allowed. Use PyUnicode_Resize()
 186        instead ! */
 187
 188     if (unicode == unicode_empty ||
 189         (unicode->length == 1 &&
 190          unicode->str[0] < 256U &&
 191          unicode_latin1[unicode->str[0]] == unicode)) {
 192         PyErr_SetString(PyExc_SystemError,
 193                         "can't resize shared unicode objects");
 194         return -1;
 195     }
 196
 197     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 198        The overallocation is also used by fastsearch, which assumes that it's
 199        safe to look at str[length] (without making any assumptions about what
 200        it contains). */
 201
 202     oldstr = unicode->str;
 203     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 204     if (!unicode->str) {
 205         unicode->str = (Py_UNICODE *)oldstr;
 206         PyErr_NoMemory();
 207         return -1;
 208     }
 209     unicode->str[length] = 0;
 210     unicode->length = length;
 211
 212  reset:
 213     /* Reset the object caches */
 214     if (unicode->defenc) {
 215         Py_DECREF(unicode->defenc);
 216         unicode->defenc = NULL;
 217     }
 218     unicode->hash = -1;
 219
 220     return 0;
 221 }
 222
 223 /* We allocate one more byte to make sure the string is
 224    Ux0000 terminated -- XXX is this needed ?
 225
 226    XXX This allocator could further be enhanced by assuring that the
 227        free list never reduces its size below 1.
 228
 229 */
 230
 231 static
 232 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 233 {
 234     register PyUnicodeObject *unicode;
 235
 236     /* Optimization for empty strings */
 237     if (length == 0 && unicode_empty != NULL) {
 238         Py_INCREF(unicode_empty);
 239         return unicode_empty;
 240     }
 241
 242     /* Unicode freelist & memory allocation */
 243     if (unicode_freelist) {
 244         unicode = unicode_freelist;
 245         unicode_freelist = *(PyUnicodeObject **)unicode;
 246         unicode_freelist_size--;
 247         if (unicode->str) {
 248             /* Keep-Alive optimization: we only upsize the buffer,
 249                never downsize it. */
 250             if ((unicode->length < length) &&
 251                 unicode_resize(unicode, length) < 0) {
 252                 PyMem_DEL(unicode->str);
 253                 goto onError;
 254             }
 255         }
 256         else {
 257             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 258         }
 259         PyObject_INIT(unicode, &PyUnicode_Type);
 260     }
 261     else {
 262         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 263         if (unicode == NULL)
 264             return NULL;
 265         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 266     }
 267
 268     if (!unicode->str) {
 269         PyErr_NoMemory();
 270         goto onError;
 271     }
 272     /* Initialize the first element to guard against cases where
 273      * the caller fails before initializing str -- unicode_resize()
 274      * reads str[0], and the Keep-Alive optimization can keep memory
 275      * allocated for str alive across a call to unicode_dealloc(unicode).
 276      * We don't want unicode_resize to read uninitialized memory in
 277      * that case.
 278      */
 279     unicode->str[0] = 0;
 280     unicode->str[length] = 0;
 281     unicode->length = length;
 282     unicode->hash = -1;
 283     unicode->defenc = NULL;
 284     return unicode;
 285
 286  onError:
 287     _Py_ForgetReference((PyObject *)unicode);
 288     PyObject_Del(unicode);
 289     return NULL;
 290 }
 291
 292 static
 293 void unicode_dealloc(register PyUnicodeObject *unicode)
 294 {
 295     if (PyUnicode_CheckExact(unicode) &&
 296         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 297         /* Keep-Alive optimization */
 298         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 299             PyMem_DEL(unicode->str);
 300             unicode->str = NULL;
 301             unicode->length = 0;
 302         }
 303         if (unicode->defenc) {
 304             Py_DECREF(unicode->defenc);
 305             unicode->defenc = NULL;
 306         }
 307         /* Add to free list */
 308         *(PyUnicodeObject **)unicode = unicode_freelist;
 309         unicode_freelist = unicode;
 310         unicode_freelist_size++;
 311     }
 312     else {
 313         PyMem_DEL(unicode->str);
 314         Py_XDECREF(unicode->defenc);
 315         unicode->ob_type->tp_free((PyObject *)unicode);
 316     }
 317 }
 318
 319 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 320 {
 321     register PyUnicodeObject *v;
 322
 323     /* Argument checks */
 324     if (unicode == NULL) {
 325         PyErr_BadInternalCall();
 326         return -1;
 327     }
 328     v = (PyUnicodeObject *)*unicode;
 329     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
 330         PyErr_BadInternalCall();
 331         return -1;
 332     }
 333
 334     /* Resizing unicode_empty and single character objects is not
 335        possible since these are being shared. We simply return a fresh
 336        copy with the same Unicode content. */
 337     if (v->length != length &&
 338         (v == unicode_empty || v->length == 1)) {
 339         PyUnicodeObject *w = _PyUnicode_New(length);
 340         if (w == NULL)
 341             return -1;
 342         Py_UNICODE_COPY(w->str, v->str,
 343                         length < v->length ? length : v->length);
 344         Py_DECREF(*unicode);
 345         *unicode = (PyObject *)w;
 346         return 0;
 347     }
 348
 349     /* Note that we don't have to modify *unicode for unshared Unicode
 350        objects, since we can modify them in-place. */
 351     return unicode_resize(v, length);
 352 }
 353
 354 /* Internal API for use in unicodeobject.c only ! */
 355 #define _PyUnicode_Resize(unicodevar, length) \
 356         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 357
 358 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 359                                 Py_ssize_t size)
 360 {
 361     PyUnicodeObject *unicode;
 362
 363     /* If the Unicode data is known at construction time, we can apply
 364        some optimizations which share commonly used objects. */
 365     if (u != NULL) {
 366
 367         /* Optimization for empty strings */
 368         if (size == 0 && unicode_empty != NULL) {
 369             Py_INCREF(unicode_empty);
 370             return (PyObject *)unicode_empty;
 371         }
 372
 373         /* Single character Unicode objects in the Latin-1 range are
 374            shared when using this constructor */
 375         if (size == 1 && *u < 256) {
 376             unicode = unicode_latin1[*u];
 377             if (!unicode) {
 378                 unicode = _PyUnicode_New(1);
 379                 if (!unicode)
 380                     return NULL;
 381                 unicode->str[0] = *u;
 382                 unicode_latin1[*u] = unicode;
 383             }
 384             Py_INCREF(unicode);
 385             return (PyObject *)unicode;
 386         }
 387     }
 388
 389     unicode = _PyUnicode_New(size);
 390     if (!unicode)
 391         return NULL;
 392
 393     /* Copy the Unicode data into the new object */
 394     if (u != NULL)
 395         Py_UNICODE_COPY(unicode->str, u, size);
 396
 397     return (PyObject *)unicode;
 398 }
 399
 400 #ifdef HAVE_WCHAR_H
 401
 402 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 403                                  Py_ssize_t size)
 404 {
 405     PyUnicodeObject *unicode;
 406
 407     if (w == NULL) {
 408         PyErr_BadInternalCall();
 409         return NULL;
 410     }
 411
 412     unicode = _PyUnicode_New(size);
 413     if (!unicode)
 414         return NULL;
 415
 416     /* Copy the wchar_t data into the new object */
 417 #ifdef HAVE_USABLE_WCHAR_T
 418     memcpy(unicode->str, w, size * sizeof(wchar_t));
 419 #else
 420     {
 421         register Py_UNICODE *u;
 422         register Py_ssize_t i;
 423         u = PyUnicode_AS_UNICODE(unicode);
 424         for (i = size; i > 0; i--)
 425             *u++ = *w++;
 426     }
 427 #endif
 428
 429     return (PyObject *)unicode;
 430 }
 431
 432 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 433                                 wchar_t *w,
 434                                 Py_ssize_t size)
 435 {
 436     if (unicode == NULL) {
 437         PyErr_BadInternalCall();
 438         return -1;
 439     }
 440
 441     /* If possible, try to copy the 0-termination as well */
 442     if (size > PyUnicode_GET_SIZE(unicode))
 443         size = PyUnicode_GET_SIZE(unicode) + 1;
 444
 445 #ifdef HAVE_USABLE_WCHAR_T
 446     memcpy(w, unicode->str, size * sizeof(wchar_t));
 447 #else
 448     {
 449         register Py_UNICODE *u;
 450         register Py_ssize_t i;
 451         u = PyUnicode_AS_UNICODE(unicode);
 452         for (i = size; i > 0; i--)
 453             *w++ = *u++;
 454     }
 455 #endif
 456
 457     if (size > PyUnicode_GET_SIZE(unicode))
 458         return PyUnicode_GET_SIZE(unicode);
 459     else
 460     return size;
 461 }
 462
 463 #endif
 464
 465 PyObject *PyUnicode_FromOrdinal(int ordinal)
 466 {
 467     Py_UNICODE s[1];
 468
 469 #ifdef Py_UNICODE_WIDE
 470     if (ordinal < 0 || ordinal > 0x10ffff) {
 471         PyErr_SetString(PyExc_ValueError,
 472                         "unichr() arg not in range(0x110000) "
 473                         "(wide Python build)");
 474         return NULL;
 475     }
 476 #else
 477     if (ordinal < 0 || ordinal > 0xffff) {
 478         PyErr_SetString(PyExc_ValueError,
 479                         "unichr() arg not in range(0x10000) "
 480                         "(narrow Python build)");
 481         return NULL;
 482     }
 483 #endif
 484
 485     s[0] = (Py_UNICODE)ordinal;
 486     return PyUnicode_FromUnicode(s, 1);
 487 }
 488
 489 PyObject *PyUnicode_FromObject(register PyObject *obj)
 490 {
 491     /* XXX Perhaps we should make this API an alias of
 492            PyObject_Unicode() instead ?! */
 493     if (PyUnicode_CheckExact(obj)) {
 494         Py_INCREF(obj);
 495         return obj;
 496     }
 497     if (PyUnicode_Check(obj)) {
 498         /* For a Unicode subtype that's not a Unicode object,
 499            return a true Unicode object with the same data. */
 500         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 501                                      PyUnicode_GET_SIZE(obj));
 502     }
 503     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 504 }
 505
 506 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 507                                       const char *encoding,
 508                                       const char *errors)
 509 {
 510     const char *s = NULL;
 511     Py_ssize_t len;
 512     PyObject *v;
 513
 514     if (obj == NULL) {
 515         PyErr_BadInternalCall();
 516         return NULL;
 517     }
 518
 519 #if 0
 520     /* For b/w compatibility we also accept Unicode objects provided
 521        that no encodings is given and then redirect to
 522        PyObject_Unicode() which then applies the additional logic for
 523        Unicode subclasses.
 524
 525        NOTE: This API should really only be used for object which
 526              represent *encoded* Unicode !
 527
 528     */
 529         if (PyUnicode_Check(obj)) {
 530             if (encoding) {
 531                 PyErr_SetString(PyExc_TypeError,
 532                                 "decoding Unicode is not supported");
 533             return NULL;
 534             }
 535         return PyObject_Unicode(obj);
 536             }
 537 #else
 538     if (PyUnicode_Check(obj)) {
 539         PyErr_SetString(PyExc_TypeError,
 540                         "decoding Unicode is not supported");
 541         return NULL;
 542         }
 543 #endif
 544
 545     /* Coerce object */
 546     if (PyString_Check(obj)) {
 547             s = PyString_AS_STRING(obj);
 548             len = PyString_GET_SIZE(obj);
 549             }
 550     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 551         /* Overwrite the error message with something more useful in
 552            case of a TypeError. */
 553         if (PyErr_ExceptionMatches(PyExc_TypeError))
 554         PyErr_Format(PyExc_TypeError,
 555                          "coercing to Unicode: need string or buffer, "
 556                          "%.80s found",
 557                      obj->ob_type->tp_name);
 558         goto onError;
 559     }
 560
 561     /* Convert to Unicode */
 562     if (len == 0) {
 563         Py_INCREF(unicode_empty);
 564         v = (PyObject *)unicode_empty;
 565     }
 566     else
 567         v = PyUnicode_Decode(s, len, encoding, errors);
 568
 569     return v;
 570
 571  onError:
 572     return NULL;
 573 }
 574
 575 PyObject *PyUnicode_Decode(const char *s,
 576                            Py_ssize_t size,
 577                            const char *encoding,
 578                            const char *errors)
 579 {
 580     PyObject *buffer = NULL, *unicode;
 581
 582     if (encoding == NULL)
 583         encoding = PyUnicode_GetDefaultEncoding();
 584
 585     /* Shortcuts for common default encodings */
 586     if (strcmp(encoding, "utf-8") == 0)
 587         return PyUnicode_DecodeUTF8(s, size, errors);
 588     else if (strcmp(encoding, "latin-1") == 0)
 589         return PyUnicode_DecodeLatin1(s, size, errors);
 590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 591     else if (strcmp(encoding, "mbcs") == 0)
 592         return PyUnicode_DecodeMBCS(s, size, errors);
 593 #endif
 594     else if (strcmp(encoding, "ascii") == 0)
 595         return PyUnicode_DecodeASCII(s, size, errors);
 596
 597     /* Decode via the codec registry */
 598     buffer = PyBuffer_FromMemory((void *)s, size);
 599     if (buffer == NULL)
 600         goto onError;
 601     unicode = PyCodec_Decode(buffer, encoding, errors);
 602     if (unicode == NULL)
 603         goto onError;
 604     if (!PyUnicode_Check(unicode)) {
 605         PyErr_Format(PyExc_TypeError,
 606                      "decoder did not return an unicode object (type=%.400s)",
 607                      unicode->ob_type->tp_name);
 608         Py_DECREF(unicode);
 609         goto onError;
 610     }
 611     Py_DECREF(buffer);
 612     return unicode;
 613
 614  onError:
 615     Py_XDECREF(buffer);
 616     return NULL;
 617 }
 618
 619 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
 620                                     const char *encoding,
 621                                     const char *errors)
 622 {
 623     PyObject *v;
 624
 625     if (!PyUnicode_Check(unicode)) {
 626         PyErr_BadArgument();
 627         goto onError;
 628     }
 629
 630     if (encoding == NULL)
 631         encoding = PyUnicode_GetDefaultEncoding();
 632
 633     /* Decode via the codec registry */
 634     v = PyCodec_Decode(unicode, encoding, errors);
 635     if (v == NULL)
 636         goto onError;
 637     return v;
 638
 639  onError:
 640     return NULL;
 641 }
 642
 643 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 644                            Py_ssize_t size,
 645                            const char *encoding,
 646                            const char *errors)
 647 {
 648     PyObject *v, *unicode;
 649
 650     unicode = PyUnicode_FromUnicode(s, size);
 651     if (unicode == NULL)
 652         return NULL;
 653     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 654     Py_DECREF(unicode);
 655     return v;
 656 }
 657
 658 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
 659                                     const char *encoding,
 660                                     const char *errors)
 661 {
 662     PyObject *v;
 663
 664     if (!PyUnicode_Check(unicode)) {
 665         PyErr_BadArgument();
 666         goto onError;
 667     }
 668
 669     if (encoding == NULL)
 670         encoding = PyUnicode_GetDefaultEncoding();
 671
 672     /* Encode via the codec registry */
 673     v = PyCodec_Encode(unicode, encoding, errors);
 674     if (v == NULL)
 675         goto onError;
 676     return v;
 677
 678  onError:
 679     return NULL;
 680 }
 681
 682 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 683                                     const char *encoding,
 684                                     const char *errors)
 685 {
 686     PyObject *v;
 687
 688     if (!PyUnicode_Check(unicode)) {
 689         PyErr_BadArgument();
 690         goto onError;
 691     }
 692
 693     if (encoding == NULL)
 694         encoding = PyUnicode_GetDefaultEncoding();
 695
 696     /* Shortcuts for common default encodings */
 697     if (errors == NULL) {
 698         if (strcmp(encoding, "utf-8") == 0)
 699             return PyUnicode_AsUTF8String(unicode);
 700         else if (strcmp(encoding, "latin-1") == 0)
 701             return PyUnicode_AsLatin1String(unicode);
 702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 703         else if (strcmp(encoding, "mbcs") == 0)
 704             return PyUnicode_AsMBCSString(unicode);
 705 #endif
 706         else if (strcmp(encoding, "ascii") == 0)
 707             return PyUnicode_AsASCIIString(unicode);
 708     }
 709
 710     /* Encode via the codec registry */
 711     v = PyCodec_Encode(unicode, encoding, errors);
 712     if (v == NULL)
 713         goto onError;
 714     if (!PyString_Check(v)) {
 715         PyErr_Format(PyExc_TypeError,
 716                      "encoder did not return a string object (type=%.400s)",
 717                      v->ob_type->tp_name);
 718         Py_DECREF(v);
 719         goto onError;
 720     }
 721     return v;
 722
 723  onError:
 724     return NULL;
 725 }
 726
 727 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 728                                             const char *errors)
 729 {
 730     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 731
 732     if (v)
 733         return v;
 734     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 735     if (v && errors == NULL)
 736         ((PyUnicodeObject *)unicode)->defenc = v;
 737     return v;
 738 }
 739
 740 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 741 {
 742     if (!PyUnicode_Check(unicode)) {
 743         PyErr_BadArgument();
 744         goto onError;
 745     }
 746     return PyUnicode_AS_UNICODE(unicode);
 747
 748  onError:
 749     return NULL;
 750 }
 751
 752 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
 753 {
 754     if (!PyUnicode_Check(unicode)) {
 755         PyErr_BadArgument();
 756         goto onError;
 757     }
 758     return PyUnicode_GET_SIZE(unicode);
 759
 760  onError:
 761     return -1;
 762 }
 763
 764 const char *PyUnicode_GetDefaultEncoding(void)
 765 {
 766     return unicode_default_encoding;
 767 }
 768
 769 int PyUnicode_SetDefaultEncoding(const char *encoding)
 770 {
 771     PyObject *v;
 772
 773     /* Make sure the encoding is valid. As side effect, this also
 774        loads the encoding into the codec registry cache. */
 775     v = _PyCodec_Lookup(encoding);
 776     if (v == NULL)
 777         goto onError;
 778     Py_DECREF(v);
 779     strncpy(unicode_default_encoding,
 780             encoding,
 781             sizeof(unicode_default_encoding));
 782     return 0;
 783
 784  onError:
 785     return -1;
 786 }
 787
 788 /* error handling callback helper:
 789    build arguments, call the callback and check the arguments,
 790    if no exception occurred, copy the replacement to the output
 791    and adjust various state variables.
 792    return 0 on success, -1 on error
 793 */
 794
 795 static
 796 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
 797                  const char *encoding, const char *reason,
 798                  const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
 799                  PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
 800 {
 801     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
 802
 803     PyObject *restuple = NULL;
 804     PyObject *repunicode = NULL;
 805     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
 806     Py_ssize_t requiredsize;
 807     Py_ssize_t newpos;
 808     Py_UNICODE *repptr;
 809     Py_ssize_t repsize;
 810     int res = -1;
 811
 812     if (*errorHandler == NULL) {
 813         *errorHandler = PyCodec_LookupError(errors);
 814         if (*errorHandler == NULL)
 815            goto onError;
 816     }
 817
 818     if (*exceptionObject == NULL) {
 819         *exceptionObject = PyUnicodeDecodeError_Create(
 820             encoding, input, insize, *startinpos, *endinpos, reason);
 821         if (*exceptionObject == NULL)
 822            goto onError;
 823     }
 824     else {
 825         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
 826             goto onError;
 827         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
 828             goto onError;
 829         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
 830             goto onError;
 831     }
 832
 833     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
 834     if (restuple == NULL)
 835         goto onError;
 836     if (!PyTuple_Check(restuple)) {
 837         PyErr_Format(PyExc_TypeError, &argparse[4]);
 838         goto onError;
 839     }
 840     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
 841         goto onError;
 842     if (newpos<0)
 843         newpos = insize+newpos;
 844     if (newpos<0 || newpos>insize) {
 845         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
 846         goto onError;
 847     }
 848
 849     /* need more space? (at least enough for what we
 850        have+the replacement+the rest of the string (starting
 851        at the new input position), so we won't have to check space
 852        when there are no errors in the rest of the string) */
 853     repptr = PyUnicode_AS_UNICODE(repunicode);
 854     repsize = PyUnicode_GET_SIZE(repunicode);
 855     requiredsize = *outpos + repsize + insize-newpos;
 856     if (requiredsize > outsize) {
 857         if (requiredsize<2*outsize)
 858             requiredsize = 2*outsize;
 859         if (PyUnicode_Resize(output, requiredsize) < 0)
 860             goto onError;
 861         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
 862     }
 863     *endinpos = newpos;
 864     *inptr = input + newpos;
 865     Py_UNICODE_COPY(*outptr, repptr, repsize);
 866     *outptr += repsize;
 867     *outpos += repsize;
 868     /* we made it! */
 869     res = 0;
 870
 871     onError:
 872     Py_XDECREF(restuple);
 873     return res;
 874 }
 875
 876 /* --- UTF-7 Codec -------------------------------------------------------- */
 877
 878 /* see RFC2152 for details */
 879
 880 static
 881 char utf7_special[128] = {
 882     /* indicate whether a UTF-7 character is special i.e. cannot be directly
 883        encoded:
 884            0 - not special
 885            1 - special
 886            2 - whitespace (optional)
 887            3 - RFC2152 Set O (optional) */
 888     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
 889     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 890     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
 891     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
 892     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 893     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
 894     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 895     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
 896
 897 };
 898
 899 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
 900    warnings about the comparison always being false; since
 901    utf7_special[0] is 1, we can safely make that one comparison
 902    true  */
 903
 904 #define SPECIAL(c, encodeO, encodeWS) \
 905     ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
 906      (encodeWS && (utf7_special[(c)] == 2)) || \
 907      (encodeO && (utf7_special[(c)] == 3)))
 908
 909 #define B64(n)  \
 910     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
 911 #define B64CHAR(c) \
 912     (isalnum(c) || (c) == '+' || (c) == '/')
 913 #define UB64(c) \
 914     ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
 915      (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
 916
 917 #define ENCODE(out, ch, bits)                   \
 918     while (bits >= 6) {                         \
 919         *out++ = B64(ch >> (bits-6));           \
 920         bits -= 6;                              \
 921     }
 922
 923 #define DECODE(out, ch, bits, surrogate)                                \
 924     while (bits >= 16) {                                                \
 925         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
 926         bits -= 16;                                                     \
 927         if (surrogate) {                                                \
 928             /* We have already generated an error for the high surrogate \
 929                so let's not bother seeing if the low surrogate is correct or not */ \
 930             surrogate = 0;                                              \
 931         } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
 932             /* This is a surrogate pair. Unfortunately we can't represent \
 933                it in a 16-bit character */                              \
 934             surrogate = 1;                                              \
 935             errmsg = "code pairs are not supported";                    \
 936             goto utf7Error;                                             \
 937         } else {                                                        \
 938             *out++ = outCh;                                             \
 939         }                                                               \
 940     }
 941
 942 PyObject *PyUnicode_DecodeUTF7(const char *s,
 943                                Py_ssize_t size,
 944                                const char *errors)
 945 {
 946     const char *starts = s;
 947     Py_ssize_t startinpos;
 948     Py_ssize_t endinpos;
 949     Py_ssize_t outpos;
 950     const char *e;
 951     PyUnicodeObject *unicode;
 952     Py_UNICODE *p;
 953     const char *errmsg = "";
 954     int inShift = 0;
 955     unsigned int bitsleft = 0;
 956     unsigned long charsleft = 0;
 957     int surrogate = 0;
 958     PyObject *errorHandler = NULL;
 959     PyObject *exc = NULL;
 960
 961     unicode = _PyUnicode_New(size);
 962     if (!unicode)
 963         return NULL;
 964     if (size == 0)
 965         return (PyObject *)unicode;
 966
 967     p = unicode->str;
 968     e = s + size;
 969
 970     while (s < e) {
 971         Py_UNICODE ch;
 972         restart:
 973         ch = *s;
 974
 975         if (inShift) {
 976             if ((ch == '-') || !B64CHAR(ch)) {
 977                 inShift = 0;
 978                 s++;
 979
 980                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 981                 if (bitsleft >= 6) {
 982                     /* The shift sequence has a partial character in it. If
 983                        bitsleft < 6 then we could just classify it as padding
 984                        but that is not the case here */
 985
 986                     errmsg = "partial character in shift sequence";
 987                     goto utf7Error;
 988                 }
 989                 /* According to RFC2152 the remaining bits should be zero. We
 990                    choose to signal an error/insert a replacement character
 991                    here so indicate the potential of a misencoded character. */
 992
 993                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 994                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 995                     errmsg = "non-zero padding bits in shift sequence";
 996                     goto utf7Error;
 997                 }
 998
 999                 if (ch == '-') {
1000                     if ((s < e) && (*(s) == '-')) {
1001                         *p++ = '-';
1002                         inShift = 1;
1003                     }
1004                 } else if (SPECIAL(ch,0,0)) {
1005                     errmsg = "unexpected special character";
1006                         goto utf7Error;
1007                 } else  {
1008                     *p++ = ch;
1009                 }
1010             } else {
1011                 charsleft = (charsleft << 6) | UB64(ch);
1012                 bitsleft += 6;
1013                 s++;
1014                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015             }
1016         }
1017         else if ( ch == '+' ) {
1018             startinpos = s-starts;
1019             s++;
1020             if (s < e && *s == '-') {
1021                 s++;
1022                 *p++ = '+';
1023             } else
1024             {
1025                 inShift = 1;
1026                 bitsleft = 0;
1027             }
1028         }
1029         else if (SPECIAL(ch,0,0)) {
1030             errmsg = "unexpected special character";
1031             s++;
1032                 goto utf7Error;
1033         }
1034         else {
1035             *p++ = ch;
1036             s++;
1037         }
1038         continue;
1039     utf7Error:
1040         outpos = p-PyUnicode_AS_UNICODE(unicode);
1041         endinpos = s-starts;
1042         if (unicode_decode_call_errorhandler(
1043              errors, &errorHandler,
1044              "utf7", errmsg,
1045              starts, size, &startinpos, &endinpos, &exc, &s,
1046              (PyObject **)&unicode, &outpos, &p))
1047         goto onError;
1048     }
1049
1050     if (inShift) {
1051         outpos = p-PyUnicode_AS_UNICODE(unicode);
1052         endinpos = size;
1053         if (unicode_decode_call_errorhandler(
1054              errors, &errorHandler,
1055              "utf7", "unterminated shift sequence",
1056              starts, size, &startinpos, &endinpos, &exc, &s,
1057              (PyObject **)&unicode, &outpos, &p))
1058             goto onError;
1059         if (s < e)
1060            goto restart;
1061     }
1062
1063     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1064         goto onError;
1065
1066     Py_XDECREF(errorHandler);
1067     Py_XDECREF(exc);
1068     return (PyObject *)unicode;
1069
1070 onError:
1071     Py_XDECREF(errorHandler);
1072     Py_XDECREF(exc);
1073     Py_DECREF(unicode);
1074     return NULL;
1075 }
1076
1077
1078 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1079                    Py_ssize_t size,
1080                    int encodeSetO,
1081                    int encodeWhiteSpace,
1082                    const char *errors)
1083 {
1084     PyObject *v;
1085     /* It might be possible to tighten this worst case */
1086     Py_ssize_t cbAllocated = 5 * size;
1087     int inShift = 0;
1088     Py_ssize_t i = 0;
1089     unsigned int bitsleft = 0;
1090     unsigned long charsleft = 0;
1091     char * out;
1092     char * start;
1093
1094     if (size == 0)
1095                 return PyString_FromStringAndSize(NULL, 0);
1096
1097     v = PyString_FromStringAndSize(NULL, cbAllocated);
1098     if (v == NULL)
1099         return NULL;
1100
1101     start = out = PyString_AS_STRING(v);
1102     for (;i < size; ++i) {
1103         Py_UNICODE ch = s[i];
1104
1105         if (!inShift) {
1106             if (ch == '+') {
1107                 *out++ = '+';
1108                 *out++ = '-';
1109             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110                 charsleft = ch;
1111                 bitsleft = 16;
1112                 *out++ = '+';
1113                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1114                 inShift = bitsleft > 0;
1115             } else {
1116                 *out++ = (char) ch;
1117             }
1118         } else {
1119             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120                 *out++ = B64(charsleft << (6-bitsleft));
1121                 charsleft = 0;
1122                 bitsleft = 0;
1123                 /* Characters not in the BASE64 set implicitly unshift the sequence
1124                    so no '-' is required, except if the character is itself a '-' */
1125                 if (B64CHAR(ch) || ch == '-') {
1126                     *out++ = '-';
1127                 }
1128                 inShift = 0;
1129                 *out++ = (char) ch;
1130             } else {
1131                 bitsleft += 16;
1132                 charsleft = (charsleft << 16) | ch;
1133                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135                 /* If the next character is special then we dont' need to terminate
1136                    the shift sequence. If the next character is not a BASE64 character
1137                    or '-' then the shift sequence will be terminated implicitly and we
1138                    don't have to insert a '-'. */
1139
1140                 if (bitsleft == 0) {
1141                     if (i + 1 < size) {
1142                         Py_UNICODE ch2 = s[i+1];
1143
1144                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1145
1146                         } else if (B64CHAR(ch2) || ch2 == '-') {
1147                             *out++ = '-';
1148                             inShift = 0;
1149                         } else {
1150                             inShift = 0;
1151                         }
1152
1153                     }
1154                     else {
1155                         *out++ = '-';
1156                         inShift = 0;
1157                     }
1158                 }
1159             }
1160         }
1161     }
1162     if (bitsleft) {
1163         *out++= B64(charsleft << (6-bitsleft) );
1164         *out++ = '-';
1165     }
1166
1167     _PyString_Resize(&v, out - start);
1168     return v;
1169 }
1170
1171 #undef SPECIAL
1172 #undef B64
1173 #undef B64CHAR
1174 #undef UB64
1175 #undef ENCODE
1176 #undef DECODE
1177
1178 /* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Length in bytes of the UTF-8 sequence introduced by each possible lead
   byte.  A zero marks a byte that cannot start a sequence.  See RFC 2279
   for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* B0-BF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* E0-EF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0      /* F0-FF */
};
1201
1202 PyObject *PyUnicode_DecodeUTF8(const char *s,
1203                                Py_ssize_t size,
1204                                const char *errors)
1205 {
1206     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207 }
1208
/* Decode UTF-8 data into a new Unicode object.

   s, size  - the encoded input and its length in bytes
   errors   - error handling scheme, handed unchanged to
              unicode_decode_call_errorhandler() for malformed input
   consumed - may be NULL.  If non-NULL, a sequence that is cut off by the
              end of the input is not an error: decoding stops in front of
              it and *consumed receives the number of bytes processed.

   Returns the decoded object, or NULL if the error handler reported a
   failure or the result could not be resized. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                        Py_ssize_t size,
                                        const char *errors,
                                        Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;                      /* length of the current sequence in bytes */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size is never smaller than the resulting Unicode
       character count, so it is used as the initial allocation */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: a single ASCII byte. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length announced by the lead byte; 0 = invalid lead. */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* The sequence runs past the end of the input. */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were handled above and every table entry
               from 0x80 upwards is something other than 1. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            /* The continuation byte must look like 10xxxxxx. */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            if (ch < 0x80) {
                /* Overlong: the value would have fit in one byte. */
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            if (ch < 0x0800) {
                /* Overlong: the value would have fit in fewer bytes.

                   Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences and are not rejected here;

                   XXX For wide builds (UCS-4) we should probably try
                       to recombine the surrogates into a single code
                       unit.
                */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        /* A complete sequence was decoded: step over it. */
        s += n;
        continue;

    /* Shared error exit for this loop: s, p and possibly unicode are passed
       by address and may be updated by the handler before the loop resumes. */
    utf8Error:
    outpos = p-PyUnicode_AS_UNICODE(unicode);
    if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf8", errmsg,
             starts, size, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1395
1396 /* Allocation strategy:  if the string is short, convert into a stack buffer
1397    and allocate exactly as much space needed at the end.  Else allocate the
1398    maximum possible needed (4 result bytes per Unicode character), and return
1399    the excess memory at the end.
1400 */
1401 PyObject *
1402 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1403                      Py_ssize_t size,
1404                      const char *errors)
1405 {
1406 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1407
1408     Py_ssize_t i;           /* index into s of next input byte */
1409     PyObject *v;        /* result string object */
1410     char *p;            /* next free byte in output buffer */
1411     Py_ssize_t nallocated;  /* number of result bytes allocated */
1412     Py_ssize_t nneeded;        /* number of result bytes needed */
1413     char stackbuf[MAX_SHORT_UNICHARS * 4];
1414
1415     assert(s != NULL);
1416     assert(size >= 0);
1417
1418     if (size <= MAX_SHORT_UNICHARS) {
1419         /* Write into the stack buffer; nallocated can't overflow.
1420          * At the end, we'll allocate exactly as much heap space as it
1421          * turns out we need.
1422          */
1423         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424         v = NULL;   /* will allocate after we're done */
1425         p = stackbuf;
1426     }
1427     else {
1428         /* Overallocate on the heap, and give the excess back at the end. */
1429         nallocated = size * 4;
1430         if (nallocated / 4 != size)  /* overflow! */
1431             return PyErr_NoMemory();
1432         v = PyString_FromStringAndSize(NULL, nallocated);
1433         if (v == NULL)
1434             return NULL;
1435         p = PyString_AS_STRING(v);
1436     }
1437
1438     for (i = 0; i < size;) {
1439         Py_UCS4 ch = s[i++];
1440
1441         if (ch < 0x80)
1442             /* Encode ASCII */
1443             *p++ = (char) ch;
1444
1445         else if (ch < 0x0800) {
1446             /* Encode Latin-1 */
1447             *p++ = (char)(0xc0 | (ch >> 6));
1448             *p++ = (char)(0x80 | (ch & 0x3f));
1449         }
1450         else {
1451             /* Encode UCS2 Unicode ordinals */
1452             if (ch < 0x10000) {
1453                 /* Special case: check for high surrogate */
1454                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455                     Py_UCS4 ch2 = s[i];
1456                     /* Check for low surrogate and combine the two to
1457                        form a UCS4 value */
1458                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1459                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1460                         i++;
1461                         goto encodeUCS4;
1462                     }
1463                     /* Fall through: handles isolated high surrogates */
1464                 }
1465                 *p++ = (char)(0xe0 | (ch >> 12));
1466                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467                 *p++ = (char)(0x80 | (ch & 0x3f));
1468                 continue;
1469             }
1470 encodeUCS4:
1471             /* Encode UCS4 Unicode ordinals */
1472             *p++ = (char)(0xf0 | (ch >> 18));
1473             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475             *p++ = (char)(0x80 | (ch & 0x3f));
1476         }
1477     }
1478
1479     if (v == NULL) {
1480         /* This was stack allocated. */
1481         nneeded = p - stackbuf;
1482         assert(nneeded <= nallocated);
1483         v = PyString_FromStringAndSize(stackbuf, nneeded);
1484     }
1485     else {
1486         /* Cut back to size actually needed. */
1487         nneeded = p - PyString_AS_STRING(v);
1488         assert(nneeded <= nallocated);
1489         _PyString_Resize(&v, nneeded);
1490     }
1491     return v;
1492
1493 #undef MAX_SHORT_UNICHARS
1494 }
1495
1496 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497 {
1498     if (!PyUnicode_Check(unicode)) {
1499         PyErr_BadArgument();
1500         return NULL;
1501     }
1502     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503                                 PyUnicode_GET_SIZE(unicode),
1504                                 NULL);
1505 }
1506
1507 /* --- UTF-16 Codec ------------------------------------------------------- */
1508
1509 PyObject *
1510 PyUnicode_DecodeUTF16(const char *s,
1511                       Py_ssize_t size,
1512                       const char *errors,
1513                       int *byteorder)
1514 {
1515     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516 }
1517
1518 PyObject *
1519 PyUnicode_DecodeUTF16Stateful(const char *s,
1520                               Py_ssize_t size,
1521                               const char *errors,
1522                               int *byteorder,
1523                               Py_ssize_t *consumed)
1524 {
1525     const char *starts = s;
1526     Py_ssize_t startinpos;
1527     Py_ssize_t endinpos;
1528     Py_ssize_t outpos;
1529     PyUnicodeObject *unicode;
1530     Py_UNICODE *p;
1531     const unsigned char *q, *e;
1532     int bo = 0;       /* assume native ordering by default */
1533     const char *errmsg = "";
1534     /* Offsets from q for retrieving byte pairs in the right order. */
1535 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536     int ihi = 1, ilo = 0;
1537 #else
1538     int ihi = 0, ilo = 1;
1539 #endif
1540     PyObject *errorHandler = NULL;
1541     PyObject *exc = NULL;
1542
1543     /* Note: size will always be longer than the resulting Unicode
1544        character count */
1545     unicode = _PyUnicode_New(size);
1546     if (!unicode)
1547         return NULL;
1548     if (size == 0)
1549         return (PyObject *)unicode;
1550
1551     /* Unpack UTF-16 encoded data */
1552     p = unicode->str;
1553     q = (unsigned char *)s;
1554     e = q + size;
1555
1556     if (byteorder)
1557         bo = *byteorder;
1558
1559     /* Check for BOM marks (U+FEFF) in the input and adjust current
1560        byte order setting accordingly. In native mode, the leading BOM
1561        mark is skipped, in all other modes, it is copied to the output
1562        stream as-is (giving a ZWNBSP character). */
1563     if (bo == 0) {
1564         if (size >= 2) {
1565             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1566 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567             if (bom == 0xFEFF) {
1568                 q += 2;
1569                 bo = -1;
1570             }
1571             else if (bom == 0xFFFE) {
1572                 q += 2;
1573                 bo = 1;
1574             }
1575 #else
1576             if (bom == 0xFEFF) {
1577                 q += 2;
1578                 bo = 1;
1579             }
1580             else if (bom == 0xFFFE) {
1581                 q += 2;
1582                 bo = -1;
1583             }
1584 #endif
1585         }
1586     }
1587
1588     if (bo == -1) {
1589         /* force LE */
1590         ihi = 1;
1591         ilo = 0;
1592     }
1593     else if (bo == 1) {
1594         /* force BE */
1595         ihi = 0;
1596         ilo = 1;
1597     }
1598
1599     while (q < e) {
1600         Py_UNICODE ch;
1601         /* remaining bytes at the end? (size should be even) */
1602         if (e-q<2) {
1603             if (consumed)
1604                 break;
1605             errmsg = "truncated data";
1606             startinpos = ((const char *)q)-starts;
1607             endinpos = ((const char *)e)-starts;
1608             goto utf16Error;
1609             /* The remaining input chars are ignored if the callback
1610                chooses to skip the input */
1611         }
1612         ch = (q[ihi] << 8) | q[ilo];
1613
1614         q += 2;
1615
1616         if (ch < 0xD800 || ch > 0xDFFF) {
1617             *p++ = ch;
1618             continue;
1619         }
1620
1621         /* UTF-16 code pair: */
1622         if (q >= e) {
1623             errmsg = "unexpected end of data";
1624             startinpos = (((const char *)q)-2)-starts;
1625             endinpos = ((const char *)e)-starts;
1626             goto utf16Error;
1627         }
1628         if (0xD800 <= ch && ch <= 0xDBFF) {
1629             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630             q += 2;
1631             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1632 #ifndef Py_UNICODE_WIDE
1633                 *p++ = ch;
1634                 *p++ = ch2;
1635 #else
1636                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1637 #endif
1638                 continue;
1639             }
1640             else {
1641                 errmsg = "illegal UTF-16 surrogate";
1642                 startinpos = (((const char *)q)-4)-starts;
1643                 endinpos = startinpos+2;
1644                 goto utf16Error;
1645             }
1646
1647         }
1648         errmsg = "illegal encoding";
1649         startinpos = (((const char *)q)-2)-starts;
1650         endinpos = startinpos+2;
1651         /* Fall through to report the error */
1652
1653     utf16Error:
1654         outpos = p-PyUnicode_AS_UNICODE(unicode);
1655         if (unicode_decode_call_errorhandler(
1656                  errors, &errorHandler,
1657                  "utf16", errmsg,
1658                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659                  (PyObject **)&unicode, &outpos, &p))
1660             goto onError;
1661     }
1662
1663     if (byteorder)
1664         *byteorder = bo;
1665
1666     if (consumed)
1667         *consumed = (const char *)q-starts;
1668
1669     /* Adjust length */
1670     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1671         goto onError;
1672
1673     Py_XDECREF(errorHandler);
1674     Py_XDECREF(exc);
1675     return (PyObject *)unicode;
1676
1677 onError:
1678     Py_DECREF(unicode);
1679     Py_XDECREF(errorHandler);
1680     Py_XDECREF(exc);
1681     return NULL;
1682 }
1683
1684 PyObject *
1685 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1686                       Py_ssize_t size,
1687                       const char *errors,
1688                       int byteorder)
1689 {
1690     PyObject *v;
1691     unsigned char *p;
1692 #ifdef Py_UNICODE_WIDE
1693     int i, pairs;
1694 #else
1695     const int pairs = 0;
1696 #endif
1697     /* Offsets from p for storing byte pairs in the right order. */
1698 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699     int ihi = 1, ilo = 0;
1700 #else
1701     int ihi = 0, ilo = 1;
1702 #endif
1703
1704 #define STORECHAR(CH)                   \
1705     do {                                \
1706         p[ihi] = ((CH) >> 8) & 0xff;    \
1707         p[ilo] = (CH) & 0xff;           \
1708         p += 2;                         \
1709     } while(0)
1710
1711 #ifdef Py_UNICODE_WIDE
1712     for (i = pairs = 0; i < size; i++)
1713         if (s[i] >= 0x10000)
1714             pairs++;
1715 #endif
1716     v = PyString_FromStringAndSize(NULL,
1717                   2 * (size + pairs + (byteorder == 0)));
1718     if (v == NULL)
1719         return NULL;
1720
1721     p = (unsigned char *)PyString_AS_STRING(v);
1722     if (byteorder == 0)
1723         STORECHAR(0xFEFF);
1724     if (size == 0)
1725         return v;
1726
1727     if (byteorder == -1) {
1728         /* force LE */
1729         ihi = 1;
1730         ilo = 0;
1731     }
1732     else if (byteorder == 1) {
1733         /* force BE */
1734         ihi = 0;
1735         ilo = 1;
1736     }
1737
1738     while (size-- > 0) {
1739         Py_UNICODE ch = *s++;
1740         Py_UNICODE ch2 = 0;
1741 #ifdef Py_UNICODE_WIDE
1742         if (ch >= 0x10000) {
1743             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744             ch  = 0xD800 | ((ch-0x10000) >> 10);
1745         }
1746 #endif
1747         STORECHAR(ch);
1748         if (ch2)
1749             STORECHAR(ch2);
1750     }
1751     return v;
1752 #undef STORECHAR
1753 }
1754
1755 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756 {
1757     if (!PyUnicode_Check(unicode)) {
1758         PyErr_BadArgument();
1759         return NULL;
1760     }
1761     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762                                  PyUnicode_GET_SIZE(unicode),
1763                                  NULL,
1764                                  0);
1765 }
1766
1767 /* --- Unicode Escape Codec ----------------------------------------------- */
1768
/* File-local pointer to a _PyUnicode_Name_CAPI structure; it starts out NULL.
   NOTE(review): presumably filled in on first use with the character-name
   lookup API needed by the escape decoder below -- the code that sets and
   reads it is outside this excerpt, so confirm there. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1770
1771 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1772                                         Py_ssize_t size,
1773                                         const char *errors)
1774 {
1775     const char *starts = s;
1776     Py_ssize_t startinpos;
1777     Py_ssize_t endinpos;
1778     Py_ssize_t outpos;
1779     int i;
1780     PyUnicodeObject *v;
1781     Py_UNICODE *p;
1782     const char *end;
1783     char* message;
1784     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1785     PyObject *errorHandler = NULL;
1786     PyObject *exc = NULL;
1787
1788     /* Escaped strings will always be longer than the resulting
1789        Unicode string, so we start with size here and then reduce the
1790        length after conversion to the true value.
1791        (but if the error callback returns a long replacement string
1792        we'll have to allocate more space) */
1793     v = _PyUnicode_New(size);
1794     if (v == NULL)
1795         goto onError;
1796     if (size == 0)
1797         return (PyObject *)v;
1798
1799     p = PyUnicode_AS_UNICODE(v);
1800     end = s + size;
1801
1802     while (s < end) {
1803         unsigned char c;
1804         Py_UNICODE x;
1805         int digits;
1806
1807         /* Non-escape characters are interpreted as Unicode ordinals */
1808         if (*s != '\\') {
1809             *p++ = (unsigned char) *s++;
1810             continue;
1811         }
1812
1813         startinpos = s-starts;
1814         /* \ - Escapes */
1815         s++;
1816         switch (*s++) {
1817
1818         /* \x escapes */
1819         case '\n': break;
1820         case '\\': *p++ = '\\'; break;
1821         case '\'': *p++ = '\''; break;
1822         case '\"': *p++ = '\"'; break;
1823         case 'b': *p++ = '\b'; break;
1824         case 'f': *p++ = '\014'; break; /* FF */
1825         case 't': *p++ = '\t'; break;
1826         case 'n': *p++ = '\n'; break;
1827         case 'r': *p++ = '\r'; break;
1828         case 'v': *p++ = '\013'; break; /* VT */
1829         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831         /* \OOO (octal) escapes */
1832         case '0': case '1': case '2': case '3':
1833         case '4': case '5': case '6': case '7':
1834             x = s[-1] - '0';
1835             if ('0' <= *s && *s <= '7') {
1836                 x = (x<<3) + *s++ - '0';
1837                 if ('0' <= *s && *s <= '7')
1838                     x = (x<<3) + *s++ - '0';
1839             }
1840             *p++ = x;
1841             break;
1842
1843         /* hex escapes */
1844         /* \xXX */
1845         case 'x':
1846             digits = 2;
1847             message = "truncated \\xXX escape";
1848             goto hexescape;
1849
1850         /* \uXXXX */
1851         case 'u':
1852             digits = 4;
1853             message = "truncated \\uXXXX escape";
1854             goto hexescape;
1855
1856         /* \UXXXXXXXX */
1857         case 'U':
1858             digits = 8;
1859             message = "truncated \\UXXXXXXXX escape";
1860         hexescape:
1861             chr = 0;
1862             outpos = p-PyUnicode_AS_UNICODE(v);
1863             if (s+digits>end) {
1864                 endinpos = size;
1865                 if (unicode_decode_call_errorhandler(
1866                     errors, &errorHandler,
1867                     "unicodeescape", "end of string in escape sequence",
1868                     starts, size, &startinpos, &endinpos, &exc, &s,
1869                     (PyObject **)&v, &outpos, &p))
1870                     goto onError;
1871                 goto nextByte;
1872             }
1873             for (i = 0; i < digits; ++i) {
1874                 c = (unsigned char) s[i];
1875                 if (!isxdigit(c)) {
1876                     endinpos = (s+i+1)-starts;
1877                     if (unicode_decode_call_errorhandler(
1878                         errors, &errorHandler,
1879                         "unicodeescape", message,
1880                         starts, size, &startinpos, &endinpos, &exc, &s,
1881                         (PyObject **)&v, &outpos, &p))
1882                         goto onError;
1883                     goto nextByte;
1884                 }
1885                 chr = (chr<<4) & ~0xF;
1886                 if (c >= '0' && c <= '9')
1887                     chr += c - '0';
1888                 else if (c >= 'a' && c <= 'f')
1889                     chr += 10 + c - 'a';
1890                 else
1891                     chr += 10 + c - 'A';
1892             }
1893             s += i;
1894             if (chr == 0xffffffff && PyErr_Occurred())
1895                 /* _decoding_error will have already written into the
1896                    target buffer. */
1897                 break;
1898         store:
1899             /* when we get here, chr is a 32-bit unicode character */
1900             if (chr <= 0xffff)
1901                 /* UCS-2 character */
1902                 *p++ = (Py_UNICODE) chr;
1903             else if (chr <= 0x10ffff) {
1904                 /* UCS-4 character. Either store directly, or as
1905                    surrogate pair. */
1906 #ifdef Py_UNICODE_WIDE
1907                 *p++ = chr;
1908 #else
1909                 chr -= 0x10000L;
1910                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1911                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1912 #endif
1913             } else {
1914                 endinpos = s-starts;
1915                 outpos = p-PyUnicode_AS_UNICODE(v);
1916                 if (unicode_decode_call_errorhandler(
1917                     errors, &errorHandler,
1918                     "unicodeescape", "illegal Unicode character",
1919                     starts, size, &startinpos, &endinpos, &exc, &s,
1920                     (PyObject **)&v, &outpos, &p))
1921                     goto onError;
1922             }
1923             break;
1924
1925         /* \N{name} */
1926         case 'N':
1927             message = "malformed \\N character escape";
1928             if (ucnhash_CAPI == NULL) {
1929                 /* load the unicode data module */
1930                 PyObject *m, *api;
1931                 m = PyImport_ImportModule("unicodedata");
1932                 if (m == NULL)
1933                     goto ucnhashError;
1934                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
1935                 Py_DECREF(m);
1936                 if (api == NULL)
1937                     goto ucnhashError;
1938                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
1939                 Py_DECREF(api);
1940                 if (ucnhash_CAPI == NULL)
1941                     goto ucnhashError;
1942             }
1943             if (*s == '{') {
1944                 const char *start = s+1;
1945                 /* look for the closing brace */
1946                 while (*s != '}' && s < end)
1947                     s++;
1948                 if (s > start && s < end && *s == '}') {
1949                     /* found a name.  look it up in the unicode database */
1950                     message = "unknown Unicode character name";
1951                     s++;
1952                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
1953                         goto store;
1954                 }
1955             }
1956             endinpos = s-starts;
1957             outpos = p-PyUnicode_AS_UNICODE(v);
1958             if (unicode_decode_call_errorhandler(
1959                 errors, &errorHandler,
1960                 "unicodeescape", message,
1961                 starts, size, &startinpos, &endinpos, &exc, &s,
1962                 (PyObject **)&v, &outpos, &p))
1963                 goto onError;
1964             break;
1965
1966         default:
1967             if (s > end) {
1968                 message = "\\ at end of string";
1969                 s--;
1970                 endinpos = s-starts;
1971                 outpos = p-PyUnicode_AS_UNICODE(v);
1972                 if (unicode_decode_call_errorhandler(
1973                     errors, &errorHandler,
1974                     "unicodeescape", message,
1975                     starts, size, &startinpos, &endinpos, &exc, &s,
1976                     (PyObject **)&v, &outpos, &p))
1977                     goto onError;
1978             }
1979             else {
1980                 *p++ = '\\';
1981                 *p++ = (unsigned char)s[-1];
1982             }
1983             break;
1984         }
1985         nextByte:
1986         ;
1987     }
1988     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
1989         goto onError;
1990     Py_XDECREF(errorHandler);
1991     Py_XDECREF(exc);
1992     return (PyObject *)v;
1993
1994 ucnhashError:
1995     PyErr_SetString(
1996         PyExc_UnicodeError,
1997         "\\N escapes not supported (can't load unicodedata module)"
1998         );
1999     Py_XDECREF(v);
2000     Py_XDECREF(errorHandler);
2001     Py_XDECREF(exc);
2002     return NULL;
2003
2004 onError:
2005     Py_XDECREF(v);
2006     Py_XDECREF(errorHandler);
2007     Py_XDECREF(exc);
2008     return NULL;
2009 }
2010
2011 /* Return a Unicode-Escape string version of the Unicode object.
2012
2013    If quotes is true, the string is enclosed in u"" or u'' quotes as
2014    appropriate.
2015
2016 */
2017
2018 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019                                       Py_ssize_t size,
2020                                       Py_UNICODE ch)
2021 {
2022     /* like wcschr, but doesn't stop at NULL characters */
2023
2024     while (size-- > 0) {
2025         if (*s == ch)
2026             return s;
2027         s++;
2028     }
2029
2030     return NULL;
2031 }
2032
2033 static
2034 PyObject *unicodeescape_string(const Py_UNICODE *s,
2035                                Py_ssize_t size,
2036                                int quotes)
2037 {
2038     PyObject *repr;
2039     char *p;
2040
2041     static const char *hexdigit = "0123456789abcdef";
2042
2043     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2044     if (repr == NULL)
2045         return NULL;
2046
2047     p = PyString_AS_STRING(repr);
2048
2049     if (quotes) {
2050         *p++ = 'u';
2051         *p++ = (findchar(s, size, '\'') &&
2052                 !findchar(s, size, '"')) ? '"' : '\'';
2053     }
2054     while (size-- > 0) {
2055         Py_UNICODE ch = *s++;
2056
2057         /* Escape quotes and backslashes */
2058         if ((quotes &&
2059              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2060             *p++ = '\\';
2061             *p++ = (char) ch;
2062             continue;
2063         }
2064
2065 #ifdef Py_UNICODE_WIDE
2066         /* Map 21-bit characters to '\U00xxxxxx' */
2067         else if (ch >= 0x10000) {
2068             Py_ssize_t offset = p - PyString_AS_STRING(repr);
2069
2070             /* Resize the string if necessary */
2071             if (offset + 12 > PyString_GET_SIZE(repr)) {
2072                 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
2073                     return NULL;
2074                 p = PyString_AS_STRING(repr) + offset;
2075             }
2076
2077             *p++ = '\\';
2078             *p++ = 'U';
2079             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2080             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2081             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2082             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2083             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2084             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2085             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2086             *p++ = hexdigit[ch & 0x0000000F];
2087             continue;
2088         }
2089 #endif
2090         /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091         else if (ch >= 0xD800 && ch < 0xDC00) {
2092             Py_UNICODE ch2;
2093             Py_UCS4 ucs;
2094
2095             ch2 = *s++;
2096             size--;
2097             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2098                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2099                 *p++ = '\\';
2100                 *p++ = 'U';
2101                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2102                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2103                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2104                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2105                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2106                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2107                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2108                 *p++ = hexdigit[ucs & 0x0000000F];
2109                 continue;
2110             }
2111             /* Fall through: isolated surrogates are copied as-is */
2112             s--;
2113             size++;
2114         }
2115
2116         /* Map 16-bit characters to '\uxxxx' */
2117         if (ch >= 256) {
2118             *p++ = '\\';
2119             *p++ = 'u';
2120             *p++ = hexdigit[(ch >> 12) & 0x000F];
2121             *p++ = hexdigit[(ch >> 8) & 0x000F];
2122             *p++ = hexdigit[(ch >> 4) & 0x000F];
2123             *p++ = hexdigit[ch & 0x000F];
2124         }
2125
2126         /* Map special whitespace to '\t', \n', '\r' */
2127         else if (ch == '\t') {
2128             *p++ = '\\';
2129             *p++ = 't';
2130         }
2131         else if (ch == '\n') {
2132             *p++ = '\\';
2133             *p++ = 'n';
2134         }
2135         else if (ch == '\r') {
2136             *p++ = '\\';
2137             *p++ = 'r';
2138         }
2139
2140         /* Map non-printable US ASCII to '\xhh' */
2141         else if (ch < ' ' || ch >= 0x7F) {
2142             *p++ = '\\';
2143             *p++ = 'x';
2144             *p++ = hexdigit[(ch >> 4) & 0x000F];
2145             *p++ = hexdigit[ch & 0x000F];
2146         }
2147
2148         /* Copy everything else as-is */
2149         else
2150             *p++ = (char) ch;
2151     }
2152     if (quotes)
2153         *p++ = PyString_AS_STRING(repr)[1];
2154
2155     *p = '\0';
2156     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2157     return repr;
2158 }
2159
2160 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2161                                         Py_ssize_t size)
2162 {
2163     return unicodeescape_string(s, size, 0);
2164 }
2165
2166 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2167 {
2168     if (!PyUnicode_Check(unicode)) {
2169         PyErr_BadArgument();
2170         return NULL;
2171     }
2172     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2173                                          PyUnicode_GET_SIZE(unicode));
2174 }
2175
2176 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2177
2178 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2179                                            Py_ssize_t size,
2180                                            const char *errors)
2181 {
2182     const char *starts = s;
2183     Py_ssize_t startinpos;
2184     Py_ssize_t endinpos;
2185     Py_ssize_t outpos;
2186     PyUnicodeObject *v;
2187     Py_UNICODE *p;
2188     const char *end;
2189     const char *bs;
2190     PyObject *errorHandler = NULL;
2191     PyObject *exc = NULL;
2192
2193     /* Escaped strings will always be longer than the resulting
2194        Unicode string, so we start with size here and then reduce the
2195        length after conversion to the true value. (But decoding error
2196        handler might have to resize the string) */
2197     v = _PyUnicode_New(size);
2198     if (v == NULL)
2199         goto onError;
2200     if (size == 0)
2201         return (PyObject *)v;
2202     p = PyUnicode_AS_UNICODE(v);
2203     end = s + size;
2204     while (s < end) {
2205         unsigned char c;
2206         Py_UCS4 x;
2207         int i;
2208         int count;
2209
2210         /* Non-escape characters are interpreted as Unicode ordinals */
2211         if (*s != '\\') {
2212             *p++ = (unsigned char)*s++;
2213             continue;
2214         }
2215         startinpos = s-starts;
2216
2217         /* \u-escapes are only interpreted iff the number of leading
2218            backslashes if odd */
2219         bs = s;
2220         for (;s < end;) {
2221             if (*s != '\\')
2222                 break;
2223             *p++ = (unsigned char)*s++;
2224         }
2225         if (((s - bs) & 1) == 0 ||
2226             s >= end ||
2227             (*s != 'u' && *s != 'U')) {
2228             continue;
2229         }
2230         p--;
2231         count = *s=='u' ? 4 : 8;
2232         s++;
2233
2234         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2235         outpos = p-PyUnicode_AS_UNICODE(v);
2236         for (x = 0, i = 0; i < count; ++i, ++s) {
2237             c = (unsigned char)*s;
2238             if (!isxdigit(c)) {
2239                 endinpos = s-starts;
2240                 if (unicode_decode_call_errorhandler(
2241                     errors, &errorHandler,
2242                     "rawunicodeescape", "truncated \\uXXXX",
2243                     starts, size, &startinpos, &endinpos, &exc, &s,
2244                     (PyObject **)&v, &outpos, &p))
2245                     goto onError;
2246                 goto nextByte;
2247             }
2248             x = (x<<4) & ~0xF;
2249             if (c >= '0' && c <= '9')
2250                 x += c - '0';
2251             else if (c >= 'a' && c <= 'f')
2252                 x += 10 + c - 'a';
2253             else
2254                 x += 10 + c - 'A';
2255         }
2256 #ifndef Py_UNICODE_WIDE
2257         if (x > 0x10000) {
2258             if (unicode_decode_call_errorhandler(
2259                     errors, &errorHandler,
2260                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261                     starts, size, &startinpos, &endinpos, &exc, &s,
2262                     (PyObject **)&v, &outpos, &p))
2263                     goto onError;
2264         }
2265 #endif
2266         *p++ = x;
2267         nextByte:
2268         ;
2269     }
2270     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2271         goto onError;
2272     Py_XDECREF(errorHandler);
2273     Py_XDECREF(exc);
2274     return (PyObject *)v;
2275
2276  onError:
2277     Py_XDECREF(v);
2278     Py_XDECREF(errorHandler);
2279     Py_XDECREF(exc);
2280     return NULL;
2281 }
2282
2283 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2284                                            Py_ssize_t size)
2285 {
2286     PyObject *repr;
2287     char *p;
2288     char *q;
2289
2290     static const char *hexdigit = "0123456789abcdef";
2291
2292 #ifdef Py_UNICODE_WIDE
2293     repr = PyString_FromStringAndSize(NULL, 10 * size);
2294 #else
2295     repr = PyString_FromStringAndSize(NULL, 6 * size);
2296 #endif
2297     if (repr == NULL)
2298         return NULL;
2299     if (size == 0)
2300         return repr;
2301
2302     p = q = PyString_AS_STRING(repr);
2303     while (size-- > 0) {
2304         Py_UNICODE ch = *s++;
2305 #ifdef Py_UNICODE_WIDE
2306         /* Map 32-bit characters to '\Uxxxxxxxx' */
2307         if (ch >= 0x10000) {
2308             *p++ = '\\';
2309             *p++ = 'U';
2310             *p++ = hexdigit[(ch >> 28) & 0xf];
2311             *p++ = hexdigit[(ch >> 24) & 0xf];
2312             *p++ = hexdigit[(ch >> 20) & 0xf];
2313             *p++ = hexdigit[(ch >> 16) & 0xf];
2314             *p++ = hexdigit[(ch >> 12) & 0xf];
2315             *p++ = hexdigit[(ch >> 8) & 0xf];
2316             *p++ = hexdigit[(ch >> 4) & 0xf];
2317             *p++ = hexdigit[ch & 15];
2318         }
2319         else
2320 #endif
2321         /* Map 16-bit characters to '\uxxxx' */
2322         if (ch >= 256) {
2323             *p++ = '\\';
2324             *p++ = 'u';
2325             *p++ = hexdigit[(ch >> 12) & 0xf];
2326             *p++ = hexdigit[(ch >> 8) & 0xf];
2327             *p++ = hexdigit[(ch >> 4) & 0xf];
2328             *p++ = hexdigit[ch & 15];
2329         }
2330         /* Copy everything else as-is */
2331         else
2332             *p++ = (char) ch;
2333     }
2334     *p = '\0';
2335     _PyString_Resize(&repr, p - q);
2336     return repr;
2337 }
2338
2339 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2340 {
2341     if (!PyUnicode_Check(unicode)) {
2342         PyErr_BadArgument();
2343         return NULL;
2344     }
2345     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2346                                             PyUnicode_GET_SIZE(unicode));
2347 }
2348
2349 /* --- Unicode Internal Codec ------------------------------------------- */
2350
2351 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2352                                            Py_ssize_t size,
2353                                            const char *errors)
2354 {
2355     const char *starts = s;
2356     Py_ssize_t startinpos;
2357     Py_ssize_t endinpos;
2358     Py_ssize_t outpos;
2359     PyUnicodeObject *v;
2360     Py_UNICODE *p;
2361     const char *end;
2362     const char *reason;
2363     PyObject *errorHandler = NULL;
2364     PyObject *exc = NULL;
2365
2366 #ifdef Py_UNICODE_WIDE
2367     Py_UNICODE unimax = PyUnicode_GetMax();
2368 #endif
2369
2370     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2371     if (v == NULL)
2372         goto onError;
2373     if (PyUnicode_GetSize((PyObject *)v) == 0)
2374         return (PyObject *)v;
2375     p = PyUnicode_AS_UNICODE(v);
2376     end = s + size;
2377
2378     while (s < end) {
2379         memcpy(p, s, sizeof(Py_UNICODE));
2380         /* We have to sanity check the raw data, otherwise doom looms for
2381            some malformed UCS-4 data. */
2382         if (
2383             #ifdef Py_UNICODE_WIDE
2384             *p > unimax || *p < 0 ||
2385             #endif
2386             end-s < Py_UNICODE_SIZE
2387             )
2388             {
2389             startinpos = s - starts;
2390             if (end-s < Py_UNICODE_SIZE) {
2391                 endinpos = end-starts;
2392                 reason = "truncated input";
2393             }
2394             else {
2395                 endinpos = s - starts + Py_UNICODE_SIZE;
2396                 reason = "illegal code point (> 0x10FFFF)";
2397             }
2398             outpos = p - PyUnicode_AS_UNICODE(v);
2399             if (unicode_decode_call_errorhandler(
2400                     errors, &errorHandler,
2401                     "unicode_internal", reason,
2402                     starts, size, &startinpos, &endinpos, &exc, &s,
2403                     (PyObject **)&v, &outpos, &p)) {
2404                 goto onError;
2405             }
2406         }
2407         else {
2408             p++;
2409             s += Py_UNICODE_SIZE;
2410         }
2411     }
2412
2413     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2414         goto onError;
2415     Py_XDECREF(errorHandler);
2416     Py_XDECREF(exc);
2417     return (PyObject *)v;
2418
2419  onError:
2420     Py_XDECREF(v);
2421     Py_XDECREF(errorHandler);
2422     Py_XDECREF(exc);
2423     return NULL;
2424 }
2425
2426 /* --- Latin-1 Codec ------------------------------------------------------ */
2427
2428 PyObject *PyUnicode_DecodeLatin1(const char *s,
2429                                  Py_ssize_t size,
2430                                  const char *errors)
2431 {
2432     PyUnicodeObject *v;
2433     Py_UNICODE *p;
2434
2435     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2436     if (size == 1) {
2437         Py_UNICODE r = *(unsigned char*)s;
2438         return PyUnicode_FromUnicode(&r, 1);
2439     }
2440
2441     v = _PyUnicode_New(size);
2442     if (v == NULL)
2443         goto onError;
2444     if (size == 0)
2445         return (PyObject *)v;
2446     p = PyUnicode_AS_UNICODE(v);
2447     while (size-- > 0)
2448         *p++ = (unsigned char)*s++;
2449     return (PyObject *)v;
2450
2451  onError:
2452     Py_XDECREF(v);
2453     return NULL;
2454 }
2455
2456 /* create or adjust a UnicodeEncodeError */
2457 static void make_encode_exception(PyObject **exceptionObject,
2458     const char *encoding,
2459     const Py_UNICODE *unicode, Py_ssize_t size,
2460     Py_ssize_t startpos, Py_ssize_t endpos,
2461     const char *reason)
2462 {
2463     if (*exceptionObject == NULL) {
2464         *exceptionObject = PyUnicodeEncodeError_Create(
2465             encoding, unicode, size, startpos, endpos, reason);
2466     }
2467     else {
2468         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2469             goto onError;
2470         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2471             goto onError;
2472         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2473             goto onError;
2474         return;
2475         onError:
2476         Py_DECREF(*exceptionObject);
2477         *exceptionObject = NULL;
2478     }
2479 }
2480
2481 /* raises a UnicodeEncodeError */
2482 static void raise_encode_exception(PyObject **exceptionObject,
2483     const char *encoding,
2484     const Py_UNICODE *unicode, Py_ssize_t size,
2485     Py_ssize_t startpos, Py_ssize_t endpos,
2486     const char *reason)
2487 {
2488     make_encode_exception(exceptionObject,
2489         encoding, unicode, size, startpos, endpos, reason);
2490     if (*exceptionObject != NULL)
2491         PyCodec_StrictErrors(*exceptionObject);
2492 }
2493
2494 /* error handling callback helper:
2495    build arguments, call the callback and check the arguments,
2496    put the result into newpos and return the replacement string, which
2497    has to be freed by the caller */
2498 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2499     PyObject **errorHandler,
2500     const char *encoding, const char *reason,
2501     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2502     Py_ssize_t startpos, Py_ssize_t endpos,
2503     Py_ssize_t *newpos)
2504 {
2505     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2506
2507     PyObject *restuple;
2508     PyObject *resunicode;
2509
2510     if (*errorHandler == NULL) {
2511         *errorHandler = PyCodec_LookupError(errors);
2512         if (*errorHandler == NULL)
2513             return NULL;
2514     }
2515
2516     make_encode_exception(exceptionObject,
2517         encoding, unicode, size, startpos, endpos, reason);
2518     if (*exceptionObject == NULL)
2519         return NULL;
2520
2521     restuple = PyObject_CallFunctionObjArgs(
2522         *errorHandler, *exceptionObject, NULL);
2523     if (restuple == NULL)
2524         return NULL;
2525     if (!PyTuple_Check(restuple)) {
2526         PyErr_Format(PyExc_TypeError, &argparse[4]);
2527         Py_DECREF(restuple);
2528         return NULL;
2529     }
2530     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2531         &resunicode, newpos)) {
2532         Py_DECREF(restuple);
2533         return NULL;
2534     }
2535     if (*newpos<0)
2536         *newpos = size+*newpos;
2537     if (*newpos<0 || *newpos>size) {
2538         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2539         Py_DECREF(restuple);
2540         return NULL;
2541     }
2542     Py_INCREF(resunicode);
2543     Py_DECREF(restuple);
2544     return resunicode;
2545 }
2546
2547 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2548                                  Py_ssize_t size,
2549                                  const char *errors,
2550                                  int limit)
2551 {
2552     /* output object */
2553     PyObject *res;
2554     /* pointers to the beginning and end+1 of input */
2555     const Py_UNICODE *startp = p;
2556     const Py_UNICODE *endp = p + size;
2557     /* pointer to the beginning of the unencodable characters */
2558     /* const Py_UNICODE *badp = NULL; */
2559     /* pointer into the output */
2560     char *str;
2561     /* current output position */
2562     Py_ssize_t respos = 0;
2563     Py_ssize_t ressize;
2564     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2565     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2566     PyObject *errorHandler = NULL;
2567     PyObject *exc = NULL;
2568     /* the following variable is used for caching string comparisons
2569      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2570     int known_errorHandler = -1;
2571
2572     /* allocate enough for a simple encoding without
2573        replacements, if we need more, we'll resize */
2574     res = PyString_FromStringAndSize(NULL, size);
2575     if (res == NULL)
2576         goto onError;
2577     if (size == 0)
2578         return res;
2579     str = PyString_AS_STRING(res);
2580     ressize = size;
2581
2582     while (p<endp) {
2583         Py_UNICODE c = *p;
2584
2585         /* can we encode this? */
2586         if (c<limit) {
2587             /* no overflow check, because we know that the space is enough */
2588             *str++ = (char)c;
2589             ++p;
2590         }
2591         else {
2592             Py_ssize_t unicodepos = p-startp;
2593             Py_ssize_t requiredsize;
2594             PyObject *repunicode;
2595             Py_ssize_t repsize;
2596             Py_ssize_t newpos;
2597             Py_ssize_t respos;
2598             Py_UNICODE *uni2;
2599             /* startpos for collecting unencodable chars */
2600             const Py_UNICODE *collstart = p;
2601             const Py_UNICODE *collend = p;
2602             /* find all unecodable characters */
2603             while ((collend < endp) && ((*collend)>=limit))
2604                 ++collend;
2605             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2606             if (known_errorHandler==-1) {
2607                 if ((errors==NULL) || (!strcmp(errors, "strict")))
2608                     known_errorHandler = 1;
2609                 else if (!strcmp(errors, "replace"))
2610                     known_errorHandler = 2;
2611                 else if (!strcmp(errors, "ignore"))
2612                     known_errorHandler = 3;
2613                 else if (!strcmp(errors, "xmlcharrefreplace"))
2614                     known_errorHandler = 4;
2615                 else
2616                     known_errorHandler = 0;
2617             }
2618             switch (known_errorHandler) {
2619                 case 1: /* strict */
2620                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2621                     goto onError;
2622                 case 2: /* replace */
2623                     while (collstart++<collend)
2624                         *str++ = '?'; /* fall through */
2625                 case 3: /* ignore */
2626                     p = collend;
2627                     break;
2628                 case 4: /* xmlcharrefreplace */
2629                     respos = str-PyString_AS_STRING(res);
2630                     /* determine replacement size (temporarily (mis)uses p) */
2631                     for (p = collstart, repsize = 0; p < collend; ++p) {
2632                         if (*p<10)
2633                             repsize += 2+1+1;
2634                         else if (*p<100)
2635                             repsize += 2+2+1;
2636                         else if (*p<1000)
2637                             repsize += 2+3+1;
2638                         else if (*p<10000)
2639                             repsize += 2+4+1;
2640 #ifndef Py_UNICODE_WIDE
2641                         else
2642                             repsize += 2+5+1;
2643 #else
2644                         else if (*p<100000)
2645                             repsize += 2+5+1;
2646                         else if (*p<1000000)
2647                             repsize += 2+6+1;
2648                         else
2649                             repsize += 2+7+1;
2650 #endif
2651                     }
2652                     requiredsize = respos+repsize+(endp-collend);
2653                     if (requiredsize > ressize) {
2654                         if (requiredsize<2*ressize)
2655                             requiredsize = 2*ressize;
2656                         if (_PyString_Resize(&res, requiredsize))
2657                             goto onError;
2658                         str = PyString_AS_STRING(res) + respos;
2659                         ressize = requiredsize;
2660                     }
2661                     /* generate replacement (temporarily (mis)uses p) */
2662                     for (p = collstart; p < collend; ++p) {
2663                         str += sprintf(str, "&#%d;", (int)*p);
2664                     }
2665                     p = collend;
2666                     break;
2667                 default:
2668                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2669                         encoding, reason, startp, size, &exc,
2670                         collstart-startp, collend-startp, &newpos);
2671                     if (repunicode == NULL)
2672                         goto onError;
2673                     /* need more space? (at least enough for what we
2674                        have+the replacement+the rest of the string, so
2675                        we won't have to check space for encodable characters) */
2676                     respos = str-PyString_AS_STRING(res);
2677                     repsize = PyUnicode_GET_SIZE(repunicode);
2678                     requiredsize = respos+repsize+(endp-collend);
2679                     if (requiredsize > ressize) {
2680                         if (requiredsize<2*ressize)
2681                             requiredsize = 2*ressize;
2682                         if (_PyString_Resize(&res, requiredsize)) {
2683                             Py_DECREF(repunicode);
2684                             goto onError;
2685                         }
2686                         str = PyString_AS_STRING(res) + respos;
2687                         ressize = requiredsize;
2688                     }
2689                     /* check if there is anything unencodable in the replacement
2690                        and copy it to the output */
2691                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2692                         c = *uni2;
2693                         if (c >= limit) {
2694                             raise_encode_exception(&exc, encoding, startp, size,
2695                                 unicodepos, unicodepos+1, reason);
2696                             Py_DECREF(repunicode);
2697                             goto onError;
2698                         }
2699                         *str = (char)c;
2700                     }
2701                     p = startp + newpos;
2702                     Py_DECREF(repunicode);
2703             }
2704         }
2705     }
2706     /* Resize if we allocated to much */
2707     respos = str-PyString_AS_STRING(res);
2708     if (respos<ressize)
2709        /* If this falls res will be NULL */
2710         _PyString_Resize(&res, respos);
2711     Py_XDECREF(errorHandler);
2712     Py_XDECREF(exc);
2713     return res;
2714
2715     onError:
2716     Py_XDECREF(res);
2717     Py_XDECREF(errorHandler);
2718     Py_XDECREF(exc);
2719     return NULL;
2720 }
2721
2722 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2723                                  Py_ssize_t size,
2724                                  const char *errors)
2725 {
2726     return unicode_encode_ucs1(p, size, errors, 256);
2727 }
2728
2729 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2730 {
2731     if (!PyUnicode_Check(unicode)) {
2732         PyErr_BadArgument();
2733         return NULL;
2734     }
2735     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2736                                   PyUnicode_GET_SIZE(unicode),
2737                                   NULL);
2738 }
2739
2740 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2741
2742 PyObject *PyUnicode_DecodeASCII(const char *s,
2743                                 Py_ssize_t size,
2744                                 const char *errors)
2745 {
2746     const char *starts = s;
2747     PyUnicodeObject *v;
2748     Py_UNICODE *p;
2749     Py_ssize_t startinpos;
2750     Py_ssize_t endinpos;
2751     Py_ssize_t outpos;
2752     const char *e;
2753     PyObject *errorHandler = NULL;
2754     PyObject *exc = NULL;
2755
2756     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2757     if (size == 1 && *(unsigned char*)s < 128) {
2758         Py_UNICODE r = *(unsigned char*)s;
2759         return PyUnicode_FromUnicode(&r, 1);
2760     }
2761
2762     v = _PyUnicode_New(size);
2763     if (v == NULL)
2764         goto onError;
2765     if (size == 0)
2766         return (PyObject *)v;
2767     p = PyUnicode_AS_UNICODE(v);
2768     e = s + size;
2769     while (s < e) {
2770         register unsigned char c = (unsigned char)*s;
2771         if (c < 128) {
2772             *p++ = c;
2773             ++s;
2774         }
2775         else {
2776             startinpos = s-starts;
2777             endinpos = startinpos + 1;
2778             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2779             if (unicode_decode_call_errorhandler(
2780                  errors, &errorHandler,
2781                  "ascii", "ordinal not in range(128)",
2782                  starts, size, &startinpos, &endinpos, &exc, &s,
2783                  (PyObject **)&v, &outpos, &p))
2784                 goto onError;
2785         }
2786     }
2787     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2788         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2789             goto onError;
2790     Py_XDECREF(errorHandler);
2791     Py_XDECREF(exc);
2792     return (PyObject *)v;
2793
2794  onError:
2795     Py_XDECREF(v);
2796     Py_XDECREF(errorHandler);
2797     Py_XDECREF(exc);
2798     return NULL;
2799 }
2800
2801 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2802                                 Py_ssize_t size,
2803                                 const char *errors)
2804 {
2805     return unicode_encode_ucs1(p, size, errors, 128);
2806 }
2807
2808 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2809 {
2810     if (!PyUnicode_Check(unicode)) {
2811         PyErr_BadArgument();
2812         return NULL;
2813     }
2814     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2815                                  PyUnicode_GET_SIZE(unicode),
2816                                  NULL);
2817 }
2818
2819 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2820
2821 /* --- MBCS codecs for Windows -------------------------------------------- */
2822
2823 #if SIZEOF_INT < SIZEOF_SSIZE_T
2824 #define NEED_RETRY
2825 #endif
2826
2827 /* XXX This code is limited to "true" double-byte encodings, as
2828    a) it assumes an incomplete character consists of a single byte, and
2829    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2830       encodings, see IsDBCSLeadByteEx documentation. */
2831
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts is counted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2842
2843 /*
2844  * Decode MBCS string into unicode object. If 'final' is set, converts
2845  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2846  */
2847 static int decode_mbcs(PyUnicodeObject **v,
2848                         const char *s, /* MBCS string */
2849                         int size, /* sizeof MBCS string */
2850                         int final)
2851 {
2852     Py_UNICODE *p;
2853     Py_ssize_t n = 0;
2854     int usize = 0;
2855
2856     assert(size >= 0);
2857
2858     /* Skip trailing lead-byte unless 'final' is set */
2859     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2860         --size;
2861
2862     /* First get the size of the result */
2863     if (size > 0) {
2864         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2865         if (usize == 0) {
2866             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2867             return -1;
2868         }
2869     }
2870
2871     if (*v == NULL) {
2872         /* Create unicode object */
2873         *v = _PyUnicode_New(usize);
2874         if (*v == NULL)
2875             return -1;
2876     }
2877     else {
2878         /* Extend unicode object */
2879         n = PyUnicode_GET_SIZE(*v);
2880         if (_PyUnicode_Resize(v, n + usize) < 0)
2881             return -1;
2882     }
2883
2884     /* Do the conversion */
2885     if (size > 0) {
2886         p = PyUnicode_AS_UNICODE(*v) + n;
2887         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2888             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2889             return -1;
2890         }
2891     }
2892
2893     return size;
2894 }
2895
2896 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2897                                         Py_ssize_t size,
2898                                         const char *errors,
2899                                         Py_ssize_t *consumed)
2900 {
2901     PyUnicodeObject *v = NULL;
2902     int done;
2903
2904     if (consumed)
2905         *consumed = 0;
2906
2907 #ifdef NEED_RETRY
2908   retry:
2909     if (size > INT_MAX)
2910         done = decode_mbcs(&v, s, INT_MAX, 0);
2911     else
2912 #endif
2913         done = decode_mbcs(&v, s, (int)size, !consumed);
2914
2915     if (done < 0) {
2916         Py_XDECREF(v);
2917         return NULL;
2918     }
2919
2920     if (consumed)
2921         *consumed += done;
2922
2923 #ifdef NEED_RETRY
2924     if (size > INT_MAX) {
2925         s += done;
2926         size -= done;
2927         goto retry;
2928     }
2929 #endif
2930
2931     return (PyObject *)v;
2932 }
2933
2934 PyObject *PyUnicode_DecodeMBCS(const char *s,
2935                                 Py_ssize_t size,
2936                                 const char *errors)
2937 {
2938     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2939 }
2940
2941 /*
2942  * Convert unicode into string object (MBCS).
2943  * Returns 0 if succeed, -1 otherwise.
2944  */
2945 static int encode_mbcs(PyObject **repr,
2946                         const Py_UNICODE *p, /* unicode */
2947                         int size) /* size of unicode */
2948 {
2949     int mbcssize = 0;
2950     Py_ssize_t n = 0;
2951
2952     assert(size >= 0);
2953
2954     /* First get the size of the result */
2955     if (size > 0) {
2956         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2957         if (mbcssize == 0) {
2958             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2959             return -1;
2960         }
2961     }
2962
2963     if (*repr == NULL) {
2964         /* Create string object */
2965         *repr = PyString_FromStringAndSize(NULL, mbcssize);
2966         if (*repr == NULL)
2967             return -1;
2968     }
2969     else {
2970         /* Extend string object */
2971         n = PyString_Size(*repr);
2972         if (_PyString_Resize(repr, n + mbcssize) < 0)
2973             return -1;
2974     }
2975
2976     /* Do the conversion */
2977     if (size > 0) {
2978         char *s = PyString_AS_STRING(*repr) + n;
2979         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2980             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2981             return -1;
2982         }
2983     }
2984
2985     return 0;
2986 }
2987
2988 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2989                                 Py_ssize_t size,
2990                                 const char *errors)
2991 {
2992     PyObject *repr = NULL;
2993     int ret;
2994
2995 #ifdef NEED_RETRY
2996  retry:
2997     if (size > INT_MAX)
2998         ret = encode_mbcs(&repr, p, INT_MAX);
2999     else
3000 #endif
3001         ret = encode_mbcs(&repr, p, (int)size);
3002
3003     if (ret < 0) {
3004         Py_XDECREF(repr);
3005         return NULL;
3006     }
3007
3008 #ifdef NEED_RETRY
3009     if (size > INT_MAX) {
3010         p += INT_MAX;
3011         size -= INT_MAX;
3012         goto retry;
3013     }
3014 #endif
3015
3016     return repr;
3017 }
3018
3019 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3020 {
3021     if (!PyUnicode_Check(unicode)) {
3022         PyErr_BadArgument();
3023         return NULL;
3024     }
3025     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3026                                 PyUnicode_GET_SIZE(unicode),
3027                                 NULL);
3028 }
3029
3030 #undef NEED_RETRY
3031
3032 #endif /* MS_WINDOWS */
3033
3034 /* --- Character Mapping Codec -------------------------------------------- */
3035
3036 PyObject *PyUnicode_DecodeCharmap(const char *s,
3037                                   Py_ssize_t size,
3038                                   PyObject *mapping,
3039                                   const char *errors)
3040 {
3041     const char *starts = s;
3042     Py_ssize_t startinpos;
3043     Py_ssize_t endinpos;
3044     Py_ssize_t outpos;
3045     const char *e;
3046     PyUnicodeObject *v;
3047     Py_UNICODE *p;
3048     Py_ssize_t extrachars = 0;
3049     PyObject *errorHandler = NULL;
3050     PyObject *exc = NULL;
3051     Py_UNICODE *mapstring = NULL;
3052     Py_ssize_t maplen = 0;
3053
3054     /* Default to Latin-1 */
3055     if (mapping == NULL)
3056         return PyUnicode_DecodeLatin1(s, size, errors);
3057
3058     v = _PyUnicode_New(size);
3059     if (v == NULL)
3060         goto onError;
3061     if (size == 0)
3062         return (PyObject *)v;
3063     p = PyUnicode_AS_UNICODE(v);
3064     e = s + size;
3065     if (PyUnicode_CheckExact(mapping)) {
3066         mapstring = PyUnicode_AS_UNICODE(mapping);
3067         maplen = PyUnicode_GET_SIZE(mapping);
3068         while (s < e) {
3069             unsigned char ch = *s;
3070             Py_UNICODE x = 0xfffe; /* illegal value */
3071
3072             if (ch < maplen)
3073                 x = mapstring[ch];
3074
3075             if (x == 0xfffe) {
3076                 /* undefined mapping */
3077                 outpos = p-PyUnicode_AS_UNICODE(v);
3078                 startinpos = s-starts;
3079                 endinpos = startinpos+1;
3080                 if (unicode_decode_call_errorhandler(
3081                      errors, &errorHandler,
3082                      "charmap", "character maps to <undefined>",
3083                      starts, size, &startinpos, &endinpos, &exc, &s,
3084                      (PyObject **)&v, &outpos, &p)) {
3085                     goto onError;
3086                 }
3087                 continue;
3088             }
3089             *p++ = x;
3090             ++s;
3091         }
3092     }
3093     else {
3094         while (s < e) {
3095             unsigned char ch = *s;
3096             PyObject *w, *x;
3097
3098             /* Get mapping (char ordinal -> integer, Unicode char or None) */
3099             w = PyInt_FromLong((long)ch);
3100             if (w == NULL)
3101                 goto onError;
3102             x = PyObject_GetItem(mapping, w);
3103             Py_DECREF(w);
3104             if (x == NULL) {
3105                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3106                     /* No mapping found means: mapping is undefined. */
3107                     PyErr_Clear();
3108                     x = Py_None;
3109                     Py_INCREF(x);
3110                 } else
3111                     goto onError;
3112             }
3113
3114             /* Apply mapping */
3115             if (PyInt_Check(x)) {
3116                 long value = PyInt_AS_LONG(x);
3117                 if (value < 0 || value > 65535) {
3118                     PyErr_SetString(PyExc_TypeError,
3119                                     "character mapping must be in range(65536)");
3120                     Py_DECREF(x);
3121                     goto onError;
3122                 }
3123                 *p++ = (Py_UNICODE)value;
3124             }
3125             else if (x == Py_None) {
3126                 /* undefined mapping */
3127                 outpos = p-PyUnicode_AS_UNICODE(v);
3128                 startinpos = s-starts;
3129                 endinpos = startinpos+1;
3130                 if (unicode_decode_call_errorhandler(
3131                      errors, &errorHandler,
3132                      "charmap", "character maps to <undefined>",
3133                      starts, size, &startinpos, &endinpos, &exc, &s,
3134                      (PyObject **)&v, &outpos, &p)) {
3135                     Py_DECREF(x);
3136                     goto onError;
3137                 }
3138                 Py_DECREF(x);
3139                 continue;
3140             }
3141             else if (PyUnicode_Check(x)) {
3142                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3143
3144                 if (targetsize == 1)
3145                     /* 1-1 mapping */
3146                     *p++ = *PyUnicode_AS_UNICODE(x);
3147
3148                 else if (targetsize > 1) {
3149                     /* 1-n mapping */
3150                     if (targetsize > extrachars) {
3151                         /* resize first */
3152                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3153                         Py_ssize_t needed = (targetsize - extrachars) + \
3154                                      (targetsize << 2);
3155                         extrachars += needed;
3156                         if (_PyUnicode_Resize(&v,
3157                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
3158                             Py_DECREF(x);
3159                             goto onError;
3160                         }
3161                         p = PyUnicode_AS_UNICODE(v) + oldpos;
3162                     }
3163                     Py_UNICODE_COPY(p,
3164                                     PyUnicode_AS_UNICODE(x),
3165                                     targetsize);
3166                     p += targetsize;
3167                     extrachars -= targetsize;
3168                 }
3169                 /* 1-0 mapping: skip the character */
3170             }
3171             else {
3172                 /* wrong return value */
3173                 PyErr_SetString(PyExc_TypeError,
3174                       "character mapping must return integer, None or unicode");
3175                 Py_DECREF(x);
3176                 goto onError;
3177             }
3178             Py_DECREF(x);
3179             ++s;
3180         }
3181     }
3182     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3183         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3184             goto onError;
3185     Py_XDECREF(errorHandler);
3186     Py_XDECREF(exc);
3187     return (PyObject *)v;
3188
3189  onError:
3190     Py_XDECREF(errorHandler);
3191     Py_XDECREF(exc);
3192     Py_XDECREF(v);
3193     return NULL;
3194 }
3195
3196 /* Charmap encoding: the lookup table */
3197
3198 struct encoding_map{
3199   PyObject_HEAD
3200   unsigned char level1[32];
3201   int count2, count3;
3202   unsigned char level23[1];
3203 };
3204
3205 static PyObject*
3206 encoding_map_size(PyObject *obj, PyObject* args)
3207 {
3208     struct encoding_map *map = (struct encoding_map*)obj;
3209     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3210                           128*map->count3);
3211 }
3212
3213 static PyMethodDef encoding_map_methods[] = {
3214         {"size", encoding_map_size, METH_NOARGS,
3215          PyDoc_STR("Return the size (in bytes) of this object") },
3216         { 0 }
3217 };
3218
3219 static void
3220 encoding_map_dealloc(PyObject* o)
3221 {
3222         PyObject_FREE(o);
3223 }
3224
3225 static PyTypeObject EncodingMapType = {
3226         PyObject_HEAD_INIT(NULL)
3227         0,                      /*ob_size*/
3228         "EncodingMap",          /*tp_name*/
3229         sizeof(struct encoding_map),   /*tp_basicsize*/
3230         0,                      /*tp_itemsize*/
3231         /* methods */
3232         encoding_map_dealloc,   /*tp_dealloc*/
3233         0,                      /*tp_print*/
3234         0,                      /*tp_getattr*/
3235         0,                      /*tp_setattr*/
3236         0,                      /*tp_compare*/
3237         0,                      /*tp_repr*/
3238         0,                      /*tp_as_number*/
3239         0,                      /*tp_as_sequence*/
3240         0,                      /*tp_as_mapping*/
3241         0,                      /*tp_hash*/
3242         0,                      /*tp_call*/
3243         0,                      /*tp_str*/
3244         0,                      /*tp_getattro*/
3245         0,                      /*tp_setattro*/
3246         0,                      /*tp_as_buffer*/
3247         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
3248         0,                      /*tp_doc*/
3249         0,                      /*tp_traverse*/
3250         0,                      /*tp_clear*/
3251         0,                      /*tp_richcompare*/
3252         0,                      /*tp_weaklistoffset*/
3253         0,                      /*tp_iter*/
3254         0,                      /*tp_iternext*/
3255         encoding_map_methods,   /*tp_methods*/
3256         0,                      /*tp_members*/
3257         0,                      /*tp_getset*/
3258         0,                      /*tp_base*/
3259         0,                      /*tp_dict*/
3260         0,                      /*tp_descr_get*/
3261         0,                      /*tp_descr_set*/
3262         0,                      /*tp_dictoffset*/
3263         0,                      /*tp_init*/
3264         0,                      /*tp_alloc*/
3265         0,                      /*tp_new*/
3266         0,                      /*tp_free*/
3267         0,                      /*tp_is_gc*/
3268 };
3269
3270 PyObject*
3271 PyUnicode_BuildEncodingMap(PyObject* string)
3272 {
3273     Py_UNICODE *decode;
3274     PyObject *result;
3275     struct encoding_map *mresult;
3276     int i;
3277     int need_dict = 0;
3278     unsigned char level1[32];
3279     unsigned char level2[512];
3280     unsigned char *mlevel1, *mlevel2, *mlevel3;
3281     int count2 = 0, count3 = 0;
3282
3283     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3284         PyErr_BadArgument();
3285         return NULL;
3286     }
3287     decode = PyUnicode_AS_UNICODE(string);
3288     memset(level1, 0xFF, sizeof level1);
3289     memset(level2, 0xFF, sizeof level2);
3290
3291     /* If there isn't a one-to-one mapping of NULL to \0,
3292        or if there are non-BMP characters, we need to use
3293        a mapping dictionary. */
3294     if (decode[0] != 0)
3295         need_dict = 1;
3296     for (i = 1; i < 256; i++) {
3297         int l1, l2;
3298         if (decode[i] == 0
3299             #ifdef Py_UNICODE_WIDE
3300             || decode[i] > 0xFFFF
3301             #endif
3302         ) {
3303             need_dict = 1;
3304             break;
3305         }
3306         if (decode[i] == 0xFFFE)
3307             /* unmapped character */
3308             continue;
3309         l1 = decode[i] >> 11;
3310         l2 = decode[i] >> 7;
3311         if (level1[l1] == 0xFF)
3312             level1[l1] = count2++;
3313         if (level2[l2] == 0xFF)
3314             level2[l2] = count3++;
3315     }
3316
3317     if (count2 >= 0xFF || count3 >= 0xFF)
3318         need_dict = 1;
3319
3320     if (need_dict) {
3321         PyObject *result = PyDict_New();
3322         PyObject *key, *value;
3323         if (!result)
3324             return NULL;
3325         for (i = 0; i < 256; i++) {
3326             key = value = NULL;
3327             key = PyInt_FromLong(decode[i]);
3328             value = PyInt_FromLong(i);
3329             if (!key || !value)
3330                 goto failed1;
3331             if (PyDict_SetItem(result, key, value) == -1)
3332                 goto failed1;
3333             Py_DECREF(key);
3334             Py_DECREF(value);
3335         }
3336         return result;
3337       failed1:
3338         Py_XDECREF(key);
3339         Py_XDECREF(value);
3340         Py_DECREF(result);
3341         return NULL;
3342     }
3343
3344     /* Create a three-level trie */
3345     result = PyObject_MALLOC(sizeof(struct encoding_map) +
3346                              16*count2 + 128*count3 - 1);
3347     if (!result)
3348         return PyErr_NoMemory();
3349     PyObject_Init(result, &EncodingMapType);
3350     mresult = (struct encoding_map*)result;
3351     mresult->count2 = count2;
3352     mresult->count3 = count3;
3353     mlevel1 = mresult->level1;
3354     mlevel2 = mresult->level23;
3355     mlevel3 = mresult->level23 + 16*count2;
3356     memcpy(mlevel1, level1, 32);
3357     memset(mlevel2, 0xFF, 16*count2);
3358     memset(mlevel3, 0, 128*count3);
3359     count3 = 0;
3360     for (i = 1; i < 256; i++) {
3361         int o1, o2, o3, i2, i3;
3362         if (decode[i] == 0xFFFE)
3363             /* unmapped character */
3364             continue;
3365         o1 = decode[i]>>11;
3366         o2 = (decode[i]>>7) & 0xF;
3367         i2 = 16*mlevel1[o1] + o2;
3368         if (mlevel2[i2] == 0xFF)
3369             mlevel2[i2] = count3++;
3370         o3 = decode[i] & 0x7F;
3371         i3 = 128*mlevel2[i2] + o3;
3372         mlevel3[i3] = i;
3373     }
3374     return result;
3375 }
3376
3377 static int
3378 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3379 {
3380     struct encoding_map *map = (struct encoding_map*)mapping;
3381     int l1 = c>>11;
3382     int l2 = (c>>7) & 0xF;
3383     int l3 = c & 0x7F;
3384     int i;
3385
3386 #ifdef Py_UNICODE_WIDE
3387     if (c > 0xFFFF) {
3388         return -1;
3389     }
3390 #endif
3391     if (c == 0)
3392         return 0;
3393     /* level 1*/
3394     i = map->level1[l1];
3395     if (i == 0xFF) {
3396         return -1;
3397     }
3398     /* level 2*/
3399     i = map->level23[16*i+l2];
3400     if (i == 0xFF) {
3401         return -1;
3402     }
3403     /* level 3 */
3404     i = map->level23[16*map->count2 + 128*i + l3];
3405     if (i == 0) {
3406         return -1;
3407     }
3408     return i;
3409 }
3410
3411 /* Lookup the character ch in the mapping. If the character
3412    can't be found, Py_None is returned (or NULL, if another
3413    error occurred). */
3414 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3415 {
3416     PyObject *w = PyInt_FromLong((long)c);
3417     PyObject *x;
3418
3419     if (w == NULL)
3420          return NULL;
3421     x = PyObject_GetItem(mapping, w);
3422     Py_DECREF(w);
3423     if (x == NULL) {
3424         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3425             /* No mapping found means: mapping is undefined. */
3426             PyErr_Clear();
3427             x = Py_None;
3428             Py_INCREF(x);
3429             return x;
3430         } else
3431             return NULL;
3432     }
3433     else if (x == Py_None)
3434         return x;
3435     else if (PyInt_Check(x)) {
3436         long value = PyInt_AS_LONG(x);
3437         if (value < 0 || value > 255) {
3438             PyErr_SetString(PyExc_TypeError,
3439                              "character mapping must be in range(256)");
3440             Py_DECREF(x);
3441             return NULL;
3442         }
3443         return x;
3444     }
3445     else if (PyString_Check(x))
3446         return x;
3447     else {
3448         /* wrong return value */
3449         PyErr_SetString(PyExc_TypeError,
3450               "character mapping must return integer, None or str");
3451         Py_DECREF(x);
3452         return NULL;
3453     }
3454 }
3455
3456 static int
3457 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3458 {
3459         Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3460         /* exponentially overallocate to minimize reallocations */
3461         if (requiredsize < 2*outsize)
3462             requiredsize = 2*outsize;
3463         if (_PyString_Resize(outobj, requiredsize)) {
3464             return 0;
3465         }
3466         return 1;
3467 }
3468
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
3472 /* lookup the character, put the result in the output string and adjust
3473    various state variables. Reallocate the output string if not enough
3474    space is available. Return a new reference to the object that
3475    was put in the output buffer, or Py_None, if the mapping was undefined
3476    (in which case no character was written) or NULL, if a
3477    reallocation error occurred. The caller must decref the result */
3478 static
3479 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3480     PyObject **outobj, Py_ssize_t *outpos)
3481 {
3482     PyObject *rep;
3483     char *outstart;
3484     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3485
3486     if (mapping->ob_type == &EncodingMapType) {
3487         int res = encoding_map_lookup(c, mapping);
3488         Py_ssize_t requiredsize = *outpos+1;
3489         if (res == -1)
3490             return enc_FAILED;
3491         if (outsize<requiredsize)
3492             if (!charmapencode_resize(outobj, outpos, requiredsize))
3493                 return enc_EXCEPTION;
3494         outstart = PyString_AS_STRING(*outobj);
3495         outstart[(*outpos)++] = (char)res;
3496         return enc_SUCCESS;
3497     }
3498
3499     rep = charmapencode_lookup(c, mapping);
3500     if (rep==NULL)
3501         return enc_EXCEPTION;
3502     else if (rep==Py_None) {
3503         Py_DECREF(rep);
3504         return enc_FAILED;
3505     } else {
3506         if (PyInt_Check(rep)) {
3507             Py_ssize_t requiredsize = *outpos+1;
3508             if (outsize<requiredsize)
3509                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3510                     Py_DECREF(rep);
3511                     return enc_EXCEPTION;
3512                 }
3513             outstart = PyString_AS_STRING(*outobj);
3514             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3515         }
3516         else {
3517             const char *repchars = PyString_AS_STRING(rep);
3518             Py_ssize_t repsize = PyString_GET_SIZE(rep);
3519             Py_ssize_t requiredsize = *outpos+repsize;
3520             if (outsize<requiredsize)
3521                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3522                     Py_DECREF(rep);
3523                     return enc_EXCEPTION;
3524                 }
3525             outstart = PyString_AS_STRING(*outobj);
3526             memcpy(outstart + *outpos, repchars, repsize);
3527             *outpos += repsize;
3528         }
3529     }
3530     Py_DECREF(rep);
3531     return enc_SUCCESS;
3532 }
3533
3534 /* handle an error in PyUnicode_EncodeCharmap
3535    Return 0 on success, -1 on error */
3536 static
3537 int charmap_encoding_error(
3538     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3539     PyObject **exceptionObject,
3540     int *known_errorHandler, PyObject **errorHandler, const char *errors,
3541     PyObject **res, Py_ssize_t *respos)
3542 {
3543     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3544     Py_ssize_t repsize;
3545     Py_ssize_t newpos;
3546     Py_UNICODE *uni2;
3547     /* startpos for collecting unencodable chars */
3548     Py_ssize_t collstartpos = *inpos;
3549     Py_ssize_t collendpos = *inpos+1;
3550     Py_ssize_t collpos;
3551     char *encoding = "charmap";
3552     char *reason = "character maps to <undefined>";
3553     charmapencode_result x;
3554
3555     /* find all unencodable characters */
3556     while (collendpos < size) {
3557         PyObject *rep;
3558         if (mapping->ob_type == &EncodingMapType) {
3559             int res = encoding_map_lookup(p[collendpos], mapping);
3560             if (res != -1)
3561                 break;
3562             ++collendpos;
3563             continue;
3564         }
3565
3566         rep = charmapencode_lookup(p[collendpos], mapping);
3567         if (rep==NULL)
3568             return -1;
3569         else if (rep!=Py_None) {
3570             Py_DECREF(rep);
3571             break;
3572         }
3573         Py_DECREF(rep);
3574         ++collendpos;
3575     }
3576     /* cache callback name lookup
3577      * (if not done yet, i.e. it's the first error) */
3578     if (*known_errorHandler==-1) {
3579         if ((errors==NULL) || (!strcmp(errors, "strict")))
3580             *known_errorHandler = 1;
3581         else if (!strcmp(errors, "replace"))
3582             *known_errorHandler = 2;
3583         else if (!strcmp(errors, "ignore"))
3584             *known_errorHandler = 3;
3585         else if (!strcmp(errors, "xmlcharrefreplace"))
3586             *known_errorHandler = 4;
3587         else
3588             *known_errorHandler = 0;
3589     }
3590     switch (*known_errorHandler) {
3591         case 1: /* strict */
3592             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3593             return -1;
3594         case 2: /* replace */
3595             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3596                 x = charmapencode_output('?', mapping, res, respos);
3597                 if (x==enc_EXCEPTION) {
3598                     return -1;
3599                 }
3600                 else if (x==enc_FAILED) {
3601                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3602                     return -1;
3603                 }
3604             }
3605             /* fall through */
3606         case 3: /* ignore */
3607             *inpos = collendpos;
3608             break;
3609         case 4: /* xmlcharrefreplace */
3610             /* generate replacement (temporarily (mis)uses p) */
3611             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3612                 char buffer[2+29+1+1];
3613                 char *cp;
3614                 sprintf(buffer, "&#%d;", (int)p[collpos]);
3615                 for (cp = buffer; *cp; ++cp) {
3616                     x = charmapencode_output(*cp, mapping, res, respos);
3617                     if (x==enc_EXCEPTION)
3618                         return -1;
3619                     else if (x==enc_FAILED) {
3620                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621                         return -1;
3622                     }
3623                 }
3624             }
3625             *inpos = collendpos;
3626             break;
3627         default:
3628             repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3629                 encoding, reason, p, size, exceptionObject,
3630                 collstartpos, collendpos, &newpos);
3631             if (repunicode == NULL)
3632                 return -1;
3633             /* generate replacement  */
3634             repsize = PyUnicode_GET_SIZE(repunicode);
3635             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3636                 x = charmapencode_output(*uni2, mapping, res, respos);
3637                 if (x==enc_EXCEPTION) {
3638                     return -1;
3639                 }
3640                 else if (x==enc_FAILED) {
3641                     Py_DECREF(repunicode);
3642                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3643                     return -1;
3644                 }
3645             }
3646             *inpos = newpos;
3647             Py_DECREF(repunicode);
3648     }
3649     return 0;
3650 }
3651
3652 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3653                                   Py_ssize_t size,
3654                                   PyObject *mapping,
3655                                   const char *errors)
3656 {
3657     /* output object */
3658     PyObject *res = NULL;
3659     /* current input position */
3660     Py_ssize_t inpos = 0;
3661     /* current output position */
3662     Py_ssize_t respos = 0;
3663     PyObject *errorHandler = NULL;
3664     PyObject *exc = NULL;
3665     /* the following variable is used for caching string comparisons
3666      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3667      * 3=ignore, 4=xmlcharrefreplace */
3668     int known_errorHandler = -1;
3669
3670     /* Default to Latin-1 */
3671     if (mapping == NULL)
3672         return PyUnicode_EncodeLatin1(p, size, errors);
3673
3674     /* allocate enough for a simple encoding without
3675        replacements, if we need more, we'll resize */
3676     res = PyString_FromStringAndSize(NULL, size);
3677     if (res == NULL)
3678         goto onError;
3679     if (size == 0)
3680         return res;
3681
3682     while (inpos<size) {
3683         /* try to encode it */
3684         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3685         if (x==enc_EXCEPTION) /* error */
3686             goto onError;
3687         if (x==enc_FAILED) { /* unencodable character */
3688             if (charmap_encoding_error(p, size, &inpos, mapping,
3689                 &exc,
3690                 &known_errorHandler, &errorHandler, errors,
3691                 &res, &respos)) {
3692                 goto onError;
3693             }
3694         }
3695         else
3696             /* done with this character => adjust input position */
3697             ++inpos;
3698     }
3699
3700     /* Resize if we allocated to much */
3701     if (respos<PyString_GET_SIZE(res)) {
3702         if (_PyString_Resize(&res, respos))
3703             goto onError;
3704     }
3705     Py_XDECREF(exc);
3706     Py_XDECREF(errorHandler);
3707     return res;
3708
3709     onError:
3710     Py_XDECREF(res);
3711     Py_XDECREF(exc);
3712     Py_XDECREF(errorHandler);
3713     return NULL;
3714 }
3715
3716 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3717                                     PyObject *mapping)
3718 {
3719     if (!PyUnicode_Check(unicode) || mapping == NULL) {
3720         PyErr_BadArgument();
3721         return NULL;
3722     }
3723     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3724                                    PyUnicode_GET_SIZE(unicode),
3725                                    mapping,
3726                                    NULL);
3727 }
3728
3729 /* create or adjust a UnicodeTranslateError */
3730 static void make_translate_exception(PyObject **exceptionObject,
3731     const Py_UNICODE *unicode, Py_ssize_t size,
3732     Py_ssize_t startpos, Py_ssize_t endpos,
3733     const char *reason)
3734 {
3735     if (*exceptionObject == NULL) {
3736         *exceptionObject = PyUnicodeTranslateError_Create(
3737             unicode, size, startpos, endpos, reason);
3738     }
3739     else {
3740         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3741             goto onError;
3742         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3743             goto onError;
3744         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3745             goto onError;
3746         return;
3747         onError:
3748         Py_DECREF(*exceptionObject);
3749         *exceptionObject = NULL;
3750     }
3751 }
3752
3753 /* raises a UnicodeTranslateError */
3754 static void raise_translate_exception(PyObject **exceptionObject,
3755     const Py_UNICODE *unicode, Py_ssize_t size,
3756     Py_ssize_t startpos, Py_ssize_t endpos,
3757     const char *reason)
3758 {
3759     make_translate_exception(exceptionObject,
3760         unicode, size, startpos, endpos, reason);
3761     if (*exceptionObject != NULL)
3762         PyCodec_StrictErrors(*exceptionObject);
3763 }
3764
3765 /* error handling callback helper:
3766    build arguments, call the callback and check the arguments,
3767    put the result into newpos and return the replacement string, which
3768    has to be freed by the caller */
3769 static PyObject *unicode_translate_call_errorhandler(const char *errors,
3770     PyObject **errorHandler,
3771     const char *reason,
3772     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3773     Py_ssize_t startpos, Py_ssize_t endpos,
3774     Py_ssize_t *newpos)
3775 {
3776     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
3777
3778     Py_ssize_t i_newpos;
3779     PyObject *restuple;
3780     PyObject *resunicode;
3781
3782     if (*errorHandler == NULL) {
3783         *errorHandler = PyCodec_LookupError(errors);
3784         if (*errorHandler == NULL)
3785             return NULL;
3786     }
3787
3788     make_translate_exception(exceptionObject,
3789         unicode, size, startpos, endpos, reason);
3790     if (*exceptionObject == NULL)
3791         return NULL;
3792
3793     restuple = PyObject_CallFunctionObjArgs(
3794         *errorHandler, *exceptionObject, NULL);
3795     if (restuple == NULL)
3796         return NULL;
3797     if (!PyTuple_Check(restuple)) {
3798         PyErr_Format(PyExc_TypeError, &argparse[4]);
3799         Py_DECREF(restuple);
3800         return NULL;
3801     }
3802     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3803         &resunicode, &i_newpos)) {
3804         Py_DECREF(restuple);
3805         return NULL;
3806     }
3807     if (i_newpos<0)
3808         *newpos = size+i_newpos;
3809     else
3810         *newpos = i_newpos;
3811     if (*newpos<0 || *newpos>size) {
3812         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3813         Py_DECREF(restuple);
3814         return NULL;
3815     }
3816     Py_INCREF(resunicode);
3817     Py_DECREF(restuple);
3818     return resunicode;
3819 }
3820
3821 /* Lookup the character ch in the mapping and put the result in result,
3822    which must be decrefed by the caller.
3823    Return 0 on success, -1 on error */
3824 static
3825 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3826 {
3827     PyObject *w = PyInt_FromLong((long)c);
3828     PyObject *x;
3829
3830     if (w == NULL)
3831          return -1;
3832     x = PyObject_GetItem(mapping, w);
3833     Py_DECREF(w);
3834     if (x == NULL) {
3835         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3836             /* No mapping found means: use 1:1 mapping. */
3837             PyErr_Clear();
3838             *result = NULL;
3839             return 0;
3840         } else
3841             return -1;
3842     }
3843     else if (x == Py_None) {
3844         *result = x;
3845         return 0;
3846     }
3847     else if (PyInt_Check(x)) {
3848         long value = PyInt_AS_LONG(x);
3849         long max = PyUnicode_GetMax();
3850         if (value < 0 || value > max) {
3851             PyErr_Format(PyExc_TypeError,
3852                              "character mapping must be in range(0x%lx)", max+1);
3853             Py_DECREF(x);
3854             return -1;
3855         }
3856         *result = x;
3857         return 0;
3858     }
3859     else if (PyUnicode_Check(x)) {
3860         *result = x;
3861         return 0;
3862     }
3863     else {
3864         /* wrong return value */
3865         PyErr_SetString(PyExc_TypeError,
3866               "character mapping must return integer, None or unicode");
3867         Py_DECREF(x);
3868         return -1;
3869     }
3870 }
3871 /* ensure that *outobj is at least requiredsize characters long,
3872 if not reallocate and adjust various state variables.
3873 Return 0 on success, -1 on error */
3874 static
3875 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3876     Py_ssize_t requiredsize)
3877 {
3878     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
3879     if (requiredsize > oldsize) {
3880         /* remember old output position */
3881         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3882         /* exponentially overallocate to minimize reallocations */
3883         if (requiredsize < 2 * oldsize)
3884             requiredsize = 2 * oldsize;
3885         if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3886             return -1;
3887         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3888     }
3889     return 0;
3890 }
3891 /* lookup the character, put the result in the output string and adjust
3892    various state variables. Return a new reference to the object that
3893    was put in the output buffer in *result, or Py_None, if the mapping was
3894    undefined (in which case no character was written).
3895    The called must decref result.
3896    Return 0 on success, -1 on error. */
3897 static
3898 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3899     Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3900     PyObject **res)
3901 {
3902     if (charmaptranslate_lookup(*curinp, mapping, res))
3903         return -1;
3904     if (*res==NULL) {
3905         /* not found => default to 1:1 mapping */
3906         *(*outp)++ = *curinp;
3907     }
3908     else if (*res==Py_None)
3909         ;
3910     else if (PyInt_Check(*res)) {
3911         /* no overflow check, because we know that the space is enough */
3912         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3913     }
3914     else if (PyUnicode_Check(*res)) {
3915         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
3916         if (repsize==1) {
3917             /* no overflow check, because we know that the space is enough */
3918             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3919         }
3920         else if (repsize!=0) {
3921             /* more than one character */
3922             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3923                 (insize - (curinp-startinp)) +
3924                 repsize - 1;
3925             if (charmaptranslate_makespace(outobj, outp, requiredsize))
3926                 return -1;
3927             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3928             *outp += repsize;
3929         }
3930     }
3931     else
3932         return -1;
3933     return 0;
3934 }
3935
/* Translate the size Py_UNICODE characters starting at p through mapping and
   return the result as a new unicode object, or NULL on error.

   Each character is run through charmaptranslate_output.  A character for
   which the lookup yields Py_None is untranslatable; the run of consecutive
   untranslatable characters is collected and handled according to errors:
     NULL or "strict"     raise via raise_translate_exception and fail
     "replace"            write one '?' per character
     "ignore"             skip the run
     "xmlcharrefreplace"  write "&#N;" for each character
     anything else        call unicode_translate_call_errorhandler, copy the
                          replacement it returns and resume at the position
                          it reports
   A NULL mapping is a bad-argument error.  An empty input returns the
   freshly allocated object immediately. */
3936 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3937                                      Py_ssize_t size,
3938                                      PyObject *mapping,
3939                                      const char *errors)
3940 {
3941     /* output object */
3942     PyObject *res = NULL;
3943     /* pointers to the beginning and end+1 of input */
3944     const Py_UNICODE *startp = p;
3945     const Py_UNICODE *endp = p + size;
3946     /* pointer into the output */
3947     Py_UNICODE *str;
3948     /* current output position */
3949     Py_ssize_t respos = 0;
3950     char *reason = "character maps to <undefined>";
3951     PyObject *errorHandler = NULL;
3952     PyObject *exc = NULL;
3953     /* the following variable is used for caching string comparisons
3954      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3955      * 3=ignore, 4=xmlcharrefreplace */
3956     int known_errorHandler = -1;
3957
3958     if (mapping == NULL) {
3959         PyErr_BadArgument();
3960         return NULL;
3961     }
3962
3963     /* allocate enough for a simple 1:1 translation without
3964        replacements, if we need more, we'll resize */
3965     res = PyUnicode_FromUnicode(NULL, size);
3966     if (res == NULL)
3967         goto onError;
3968     if (size == 0)
3969         return res;
3970     str = PyUnicode_AS_UNICODE(res);
3971
3972     while (p<endp) {
3973         /* try to translate it */
3974         PyObject *x = NULL;
3975         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3976             Py_XDECREF(x);
3977             goto onError;
3978         }
3979         Py_XDECREF(x);
             /* from here on x is only compared by address, never dereferenced */
3980         if (x!=Py_None) /* it worked => adjust input pointer */
3981             ++p;
3982         else { /* untranslatable character */
3983             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3984             Py_ssize_t repsize;
3985             Py_ssize_t newpos;
3986             Py_UNICODE *uni2;
3987             /* startpos for collecting untranslatable chars */
3988             const Py_UNICODE *collstart = p;
3989             const Py_UNICODE *collend = p+1;
3990             const Py_UNICODE *coll;
3991
3992             /* find all untranslatable characters: extend collend while the
                    lookup keeps yielding Py_None */
3993             while (collend < endp) {
3994                 if (charmaptranslate_lookup(*collend, mapping, &x))
3995                     goto onError;
3996                 Py_XDECREF(x);
3997                 if (x!=Py_None)
3998                     break;
3999                 ++collend;
4000             }
4001             /* cache callback name lookup
4002              * (if not done yet, i.e. it's the first error) */
4003             if (known_errorHandler==-1) {
4004                 if ((errors==NULL) || (!strcmp(errors, "strict")))
4005                     known_errorHandler = 1;
4006                 else if (!strcmp(errors, "replace"))
4007                     known_errorHandler = 2;
4008                 else if (!strcmp(errors, "ignore"))
4009                     known_errorHandler = 3;
4010                 else if (!strcmp(errors, "xmlcharrefreplace"))
4011                     known_errorHandler = 4;
4012                 else
4013                     known_errorHandler = 0;
4014             }
4015             switch (known_errorHandler) {
4016                 case 1: /* strict */
4017                     raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4018                     goto onError;
4019                 case 2: /* replace */
4020                     /* No need to check for space, this is a 1:1 replacement */
4021                     for (coll = collstart; coll<collend; ++coll)
4022                         *str++ = '?';
4023                     /* fall through */
4024                 case 3: /* ignore */
4025                     p = collend;
4026                     break;
4027                 case 4: /* xmlcharrefreplace */
4028                     /* generate replacement (temporarily (mis)uses p) */
4029                     for (p = collstart; p < collend; ++p) {
4030                         char buffer[2+29+1+1];
4031                         char *cp;
4032                         sprintf(buffer, "&#%d;", (int)*p);
                             /* room for what is already written, this reference,
                                and the input that follows the collected run */
4033                         if (charmaptranslate_makespace(&res, &str,
4034                             (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4035                             goto onError;
4036                         for (cp = buffer; *cp; ++cp)
4037                             *str++ = *cp;
4038                     }
4039                     p = collend;
4040                     break;
4041                 default:
4042                     repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4043                         reason, startp, size, &exc,
4044                         collstart-startp, collend-startp, &newpos);
4045                     if (repunicode == NULL)
4046                         goto onError;
4047                     /* generate replacement  */
4048                     repsize = PyUnicode_GET_SIZE(repunicode);
4049                     if (charmaptranslate_makespace(&res, &str,
4050                         (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4051                         Py_DECREF(repunicode);
4052                         goto onError;
4053                     }
4054                     for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4055                         *str++ = *uni2;
                         /* resume where the error handler told us to */
4056                     p = startp + newpos;
4057                     Py_DECREF(repunicode);
4058             }
4059         }
4060     }
4061     /* Resize if we allocated too much */
4062     respos = str-PyUnicode_AS_UNICODE(res);
4063     if (respos<PyUnicode_GET_SIZE(res)) {
4064         if (_PyUnicode_Resize(&res, respos) < 0)
4065             goto onError;
4066     }
4067     Py_XDECREF(exc);
4068     Py_XDECREF(errorHandler);
4069     return res;
4070
4071     onError:
4072     Py_XDECREF(res);
4073     Py_XDECREF(exc);
4074     Py_XDECREF(errorHandler);
4075     return NULL;
4076 }
4077
4078 PyObject *PyUnicode_Translate(PyObject *str,
4079                               PyObject *mapping,
4080                               const char *errors)
4081 {
4082     PyObject *result;
4083
4084     str = PyUnicode_FromObject(str);
4085     if (str == NULL)
4086         goto onError;
4087     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4088                                         PyUnicode_GET_SIZE(str),
4089                                         mapping,
4090                                         errors);
4091     Py_DECREF(str);
4092     return result;
4093
4094  onError:
4095     Py_XDECREF(str);
4096     return NULL;
4097 }
4098
4099 /* --- Decimal Encoder ---------------------------------------------------- */
4100
4101 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4102                             Py_ssize_t length,
4103                             char *output,
4104                             const char *errors)
4105 {
4106     Py_UNICODE *p, *end;
4107     PyObject *errorHandler = NULL;
4108     PyObject *exc = NULL;
4109     const char *encoding = "decimal";
4110     const char *reason = "invalid decimal Unicode string";
4111     /* the following variable is used for caching string comparisons
4112      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4113     int known_errorHandler = -1;
4114
4115     if (output == NULL) {
4116         PyErr_BadArgument();
4117         return -1;
4118     }
4119
4120     p = s;
4121     end = s + length;
4122     while (p < end) {
4123         register Py_UNICODE ch = *p;
4124         int decimal;
4125         PyObject *repunicode;
4126         Py_ssize_t repsize;
4127         Py_ssize_t newpos;
4128         Py_UNICODE *uni2;
4129         Py_UNICODE *collstart;
4130         Py_UNICODE *collend;
4131
4132         if (Py_UNICODE_ISSPACE(ch)) {
4133             *output++ = ' ';
4134             ++p;
4135             continue;
4136         }
4137         decimal = Py_UNICODE_TODECIMAL(ch);
4138         if (decimal >= 0) {
4139             *output++ = '0' + decimal;
4140             ++p;
4141             continue;
4142         }
4143         if (0 < ch && ch < 256) {
4144             *output++ = (char)ch;
4145             ++p;
4146             continue;
4147         }
4148         /* All other characters are considered unencodable */
4149         collstart = p;
4150         collend = p+1;
4151         while (collend < end) {
4152             if ((0 < *collend && *collend < 256) ||
4153                 !Py_UNICODE_ISSPACE(*collend) ||
4154                 Py_UNICODE_TODECIMAL(*collend))
4155                 break;
4156         }
4157         /* cache callback name lookup
4158          * (if not done yet, i.e. it's the first error) */
4159         if (known_errorHandler==-1) {
4160             if ((errors==NULL) || (!strcmp(errors, "strict")))
4161                 known_errorHandler = 1;
4162             else if (!strcmp(errors, "replace"))
4163                 known_errorHandler = 2;
4164             else if (!strcmp(errors, "ignore"))
4165                 known_errorHandler = 3;
4166             else if (!strcmp(errors, "xmlcharrefreplace"))
4167                 known_errorHandler = 4;
4168             else
4169                 known_errorHandler = 0;
4170         }
4171         switch (known_errorHandler) {
4172             case 1: /* strict */
4173                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4174                 goto onError;
4175             case 2: /* replace */
4176                 for (p = collstart; p < collend; ++p)
4177                     *output++ = '?';
4178                 /* fall through */
4179             case 3: /* ignore */
4180                 p = collend;
4181                 break;
4182             case 4: /* xmlcharrefreplace */
4183                 /* generate replacement (temporarily (mis)uses p) */
4184                 for (p = collstart; p < collend; ++p)
4185                     output += sprintf(output, "&#%d;", (int)*p);
4186                 p = collend;
4187                 break;
4188             default:
4189                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4190                     encoding, reason, s, length, &exc,
4191                     collstart-s, collend-s, &newpos);
4192                 if (repunicode == NULL)
4193                     goto onError;
4194                 /* generate replacement  */
4195                 repsize = PyUnicode_GET_SIZE(repunicode);
4196                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4197                     Py_UNICODE ch = *uni2;
4198                     if (Py_UNICODE_ISSPACE(ch))
4199                         *output++ = ' ';
4200                     else {
4201                         decimal = Py_UNICODE_TODECIMAL(ch);
4202                         if (decimal >= 0)
4203                             *output++ = '0' + decimal;
4204                         else if (0 < ch && ch < 256)
4205                             *output++ = (char)ch;
4206                         else {
4207                             Py_DECREF(repunicode);
4208                             raise_encode_exception(&exc, encoding,
4209                                 s, length, collstart-s, collend-s, reason);
4210                             goto onError;
4211                         }
4212                     }
4213                 }
4214                 p = s + newpos;
4215                 Py_DECREF(repunicode);
4216         }
4217     }
4218     /* 0-terminate the output string */
4219     *output++ = '\0';
4220     Py_XDECREF(exc);
4221     Py_XDECREF(errorHandler);
4222     return 0;
4223
4224  onError:
4225     Py_XDECREF(exc);
4226     Py_XDECREF(errorHandler);
4227     return -1;
4228 }
4229
4230 /* --- Helpers ------------------------------------------------------------ */
4231
4232 #define STRINGLIB_CHAR Py_UNICODE
4233
4234 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4235 #define STRINGLIB_NEW PyUnicode_FromUnicode
4236 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4237
4238 Py_LOCAL_INLINE(int)
4239 STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4240 {
4241     if (str[0] != other[0])
4242         return 1;
4243     return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4244 }
4245
4246 #define STRINGLIB_EMPTY unicode_empty
4247
4248 #include "stringlib/fastsearch.h"
4249
4250 #include "stringlib/count.h"
4251 #include "stringlib/find.h"
4252 #include "stringlib/partition.h"
4253
4254 /* helper macro to fixup start/end slice values.

   Operates on the variables named start and end in the scope where it is
   expanded, using (obj)->length as the limit:
     - a negative start has the length added and is then clamped to 0
     - an end above the length is clamped to the length
     - a negative end has the length added and is then clamped to 0
   start is not clamped to the length, so start may still exceed end (or the
   length) afterwards.

   NOTE: the expansion is a bare sequence of if statements with no
   do { } while (0) wrapper, so it must not be used as the unbraced body of
   an if/else or a loop. */
4255 #define FIX_START_END(obj)                      \
4256     if (start < 0)                              \
4257         start += (obj)->length;                 \
4258     if (start < 0)                              \
4259         start = 0;                              \
4260     if (end > (obj)->length)                    \
4261         end = (obj)->length;                    \
4262     if (end < 0)                                \
4263         end += (obj)->length;                   \
4264     if (end < 0)                                \
4265         end = 0;
4266
4267 Py_ssize_t PyUnicode_Count(PyObject *str,
4268                            PyObject *substr,
4269                            Py_ssize_t start,
4270                            Py_ssize_t end)
4271 {
4272     Py_ssize_t result;
4273     PyUnicodeObject* str_obj;
4274     PyUnicodeObject* sub_obj;
4275
4276     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4277     if (!str_obj)
4278         return -1;
4279     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4280     if (!sub_obj) {
4281         Py_DECREF(str_obj);
4282         return -1;
4283     }
4284
4285     FIX_START_END(str_obj);
4286
4287     result = stringlib_count(
4288         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4289         );
4290
4291     Py_DECREF(sub_obj);
4292     Py_DECREF(str_obj);
4293
4294     return result;
4295 }
4296
4297 Py_ssize_t PyUnicode_Find(PyObject *str,
4298                           PyObject *sub,
4299                           Py_ssize_t start,
4300                           Py_ssize_t end,
4301                           int direction)
4302 {
4303     Py_ssize_t result;
4304
4305     str = PyUnicode_FromObject(str);
4306     if (!str)
4307         return -2;
4308     sub = PyUnicode_FromObject(sub);
4309     if (!sub) {
4310         Py_DECREF(str);
4311         return -2;
4312     }
4313
4314     if (direction > 0)
4315         result = stringlib_find_slice(
4316             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4317             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4318             start, end
4319             );
4320     else
4321         result = stringlib_rfind_slice(
4322             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4323             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4324             start, end
4325             );
4326
4327     Py_DECREF(str);
4328     Py_DECREF(sub);
4329
4330     return result;
4331 }
4332
4333 static
4334 int tailmatch(PyUnicodeObject *self,
4335               PyUnicodeObject *substring,
4336               Py_ssize_t start,
4337               Py_ssize_t end,
4338               int direction)
4339 {
4340     if (substring->length == 0)
4341         return 1;
4342
4343     FIX_START_END(self);
4344
4345     end -= substring->length;
4346     if (end < start)
4347         return 0;
4348
4349     if (direction > 0) {
4350         if (Py_UNICODE_MATCH(self, end, substring))
4351             return 1;
4352     } else {
4353         if (Py_UNICODE_MATCH(self, start, substring))
4354             return 1;
4355     }
4356
4357     return 0;
4358 }
4359
4360 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4361                         PyObject *substr,
4362                         Py_ssize_t start,
4363                         Py_ssize_t end,
4364                         int direction)
4365 {
4366     Py_ssize_t result;
4367
4368     str = PyUnicode_FromObject(str);
4369     if (str == NULL)
4370         return -1;
4371     substr = PyUnicode_FromObject(substr);
4372     if (substr == NULL) {
4373         Py_DECREF(str);
4374         return -1;
4375     }
4376
4377     result = tailmatch((PyUnicodeObject *)str,
4378                        (PyUnicodeObject *)substr,
4379                        start, end, direction);
4380     Py_DECREF(str);
4381     Py_DECREF(substr);
4382     return result;
4383 }
4384
4385 /* Apply fixfct filter to the Unicode object self and return a
4386    reference to the modified object */
4387
4388 static
4389 PyObject *fixup(PyUnicodeObject *self,
4390                 int (*fixfct)(PyUnicodeObject *s))
4391 {
4392
4393     PyUnicodeObject *u;
4394
4395     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4396     if (u == NULL)
4397         return NULL;
4398
4399     Py_UNICODE_COPY(u->str, self->str, self->length);
4400
4401     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4402         /* fixfct should return TRUE if it modified the buffer. If
4403            FALSE, return a reference to the original buffer instead
4404            (to save space, not time) */
4405         Py_INCREF(self);
4406         Py_DECREF(u);
4407         return (PyObject*) self;
4408     }
4409     return (PyObject*) u;
4410 }
4411
4412 static
4413 int fixupper(PyUnicodeObject *self)
4414 {
4415     Py_ssize_t len = self->length;
4416     Py_UNICODE *s = self->str;
4417     int status = 0;
4418
4419     while (len-- > 0) {
4420         register Py_UNICODE ch;
4421
4422         ch = Py_UNICODE_TOUPPER(*s);
4423         if (ch != *s) {
4424             status = 1;
4425             *s = ch;
4426         }
4427         s++;
4428     }
4429
4430     return status;
4431 }
4432
4433 static
4434 int fixlower(PyUnicodeObject *self)
4435 {
4436     Py_ssize_t len = self->length;
4437     Py_UNICODE *s = self->str;
4438     int status = 0;
4439
4440     while (len-- > 0) {
4441         register Py_UNICODE ch;
4442
4443         ch = Py_UNICODE_TOLOWER(*s);
4444         if (ch != *s) {
4445             status = 1;
4446             *s = ch;
4447         }
4448         s++;
4449     }
4450
4451     return status;
4452 }
4453
4454 static
4455 int fixswapcase(PyUnicodeObject *self)
4456 {
4457     Py_ssize_t len = self->length;
4458     Py_UNICODE *s = self->str;
4459     int status = 0;
4460
4461     while (len-- > 0) {
4462         if (Py_UNICODE_ISUPPER(*s)) {
4463             *s = Py_UNICODE_TOLOWER(*s);
4464             status = 1;
4465         } else if (Py_UNICODE_ISLOWER(*s)) {
4466             *s = Py_UNICODE_TOUPPER(*s);
4467             status = 1;
4468         }
4469         s++;
4470     }
4471
4472     return status;
4473 }
4474
4475 static
4476 int fixcapitalize(PyUnicodeObject *self)
4477 {
4478     Py_ssize_t len = self->length;
4479     Py_UNICODE *s = self->str;
4480     int status = 0;
4481
4482     if (len == 0)
4483         return 0;
4484     if (Py_UNICODE_ISLOWER(*s)) {
4485         *s = Py_UNICODE_TOUPPER(*s);
4486         status = 1;
4487     }
4488     s++;
4489     while (--len > 0) {
4490         if (Py_UNICODE_ISUPPER(*s)) {
4491             *s = Py_UNICODE_TOLOWER(*s);
4492             status = 1;
4493         }
4494         s++;
4495     }
4496     return status;
4497 }
4498
4499 static
4500 int fixtitle(PyUnicodeObject *self)
4501 {
4502     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4503     register Py_UNICODE *e;
4504     int previous_is_cased;
4505
4506     /* Shortcut for single character strings */
4507     if (PyUnicode_GET_SIZE(self) == 1) {
4508         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4509         if (*p != ch) {
4510             *p = ch;
4511             return 1;
4512         }
4513         else
4514             return 0;
4515     }
4516
4517     e = p + PyUnicode_GET_SIZE(self);
4518     previous_is_cased = 0;
4519     for (; p < e; p++) {
4520         register const Py_UNICODE ch = *p;
4521
4522         if (previous_is_cased)
4523             *p = Py_UNICODE_TOLOWER(ch);
4524         else
4525             *p = Py_UNICODE_TOTITLE(ch);
4526
4527         if (Py_UNICODE_ISLOWER(ch) ||
4528             Py_UNICODE_ISUPPER(ch) ||
4529             Py_UNICODE_ISTITLE(ch))
4530             previous_is_cased = 1;
4531         else
4532             previous_is_cased = 0;
4533     }
4534     return 1;
4535 }
4536
/* Join the items of seq into one new Unicode object, inserting
 * separator between consecutive items.
 *
 * separator: object convertible with PyUnicode_FromObject(), or NULL
 *            to use a single blank.
 * seq:       any object accepted by PySequence_Fast(); every item must
 *            be a str or unicode object.
 *
 * Returns a new reference, or NULL with an exception set (TypeError for
 * a non-string item, OverflowError if the length computation wraps, or
 * whatever the called conversion/allocation functions raised).
 */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *internal_separator = NULL;
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_ssize_t res_alloc = 100;  /* allocated size of res, in Py_UNICODE
                                    units (it is compared against sums of
                                    PyUnicode_GET_SIZE() values) */
    Py_ssize_t res_used;         /* used part of res, same unit */
    Py_UNICODE *res_p;       /* pointer to the first free slot in res */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
    PyObject *item;
    Py_ssize_t i;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* Grrrr.  A codec may be invoked to convert str objects to
     * Unicode, and so it's possible to call back into Python code
     * during PyUnicode_FromObject(), and so it's possible for a sick
     * codec to change the size of fseq (if seq is a list).  Therefore
     * we have to keep refetching the size -- can't assume seqlen
     * is invariant.
     */
    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
        goto Done;
    }
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(fseq, 0);
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }

    /* At least two items to join, or one that isn't exact Unicode. */
    if (seqlen > 1) {
        /* Set up sep and seplen -- they're needed. */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            internal_separator = PyUnicode_FromObject(separator);
            if (internal_separator == NULL)
                goto onError;
            sep = PyUnicode_AS_UNICODE(internal_separator);
            seplen = PyUnicode_GET_SIZE(internal_separator);
            /* In case PyUnicode_FromObject() mutated seq. */
            seqlen = PySequence_Fast_GET_SIZE(fseq);
        }
    }

    /* Get space. */
    res = _PyUnicode_New(res_alloc);
    if (res == NULL)
        goto onError;
    res_p = PyUnicode_AS_UNICODE(res);
    res_used = 0;

    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        Py_ssize_t new_res_used;

        item = PySequence_Fast_GET_ITEM(fseq, i);
        /* Convert item to Unicode. */
        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
            /* item is still a borrowed reference here, so jump straight
               to onError (not Overflow, which releases item). */
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected string or Unicode,"
                         " %.80s found",
                         i, item->ob_type->tp_name);
            goto onError;
        }
        item = PyUnicode_FromObject(item);
        if (item == NULL)
            goto onError;
        /* We own a reference to item from here on. */

        /* In case PyUnicode_FromObject() mutated seq. */
        seqlen = PySequence_Fast_GET_SIZE(fseq);

        /* Make sure we have enough space for the separator and the item. */
        itemlen = PyUnicode_GET_SIZE(item);
        new_res_used = res_used + itemlen;
        /* A negative sum is taken as a sign that the addition wrapped. */
        if (new_res_used < 0)
            goto Overflow;
        if (i < seqlen - 1) {
            /* Not the last item: a separator follows it. */
            new_res_used += seplen;
            if (new_res_used < 0)
                goto Overflow;
        }
        if (new_res_used > res_alloc) {
            /* double allocated size until it's big enough */
            do {
                res_alloc += res_alloc;
                if (res_alloc <= 0)
                    goto Overflow;
            } while (new_res_used > res_alloc);
            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
                Py_DECREF(item);
                goto onError;
            }
            /* The buffer may have moved; recompute the write pointer. */
            res_p = PyUnicode_AS_UNICODE(res) + res_used;
        }

        /* Copy item, and maybe the separator. */
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
        if (i < seqlen - 1) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_DECREF(item);
        res_used = new_res_used;
    }

    /* Shrink res to match the used area; this probably can't fail,
     * but it's cheap to check.
     */
    if (_PyUnicode_Resize(&res, res_used) < 0)
        goto onError;

 Done:
    /* Success exit: release the temporaries, hand res to the caller. */
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    return (PyObject *)res;

 Overflow:
    /* Only reached while we own a reference to item. */
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    Py_DECREF(item);
    /* fall through */

 onError:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}
4685
4686 static
4687 PyUnicodeObject *pad(PyUnicodeObject *self,
4688                      Py_ssize_t left,
4689                      Py_ssize_t right,
4690                      Py_UNICODE fill)
4691 {
4692     PyUnicodeObject *u;
4693
4694     if (left < 0)
4695         left = 0;
4696     if (right < 0)
4697         right = 0;
4698
4699     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4700         Py_INCREF(self);
4701         return self;
4702     }
4703
4704     u = _PyUnicode_New(left + self->length + right);
4705     if (u) {
4706         if (left)
4707             Py_UNICODE_FILL(u->str, fill, left);
4708         Py_UNICODE_COPY(u->str + left, self->str, self->length);
4709         if (right)
4710             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4711     }
4712
4713     return u;
4714 }
4715
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on names from the expanding function: a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to if creating the slice or appending it fails.  The temporary
   reference held in `str` is always released.

   Wrapped in do { } while (0) so that the expansion is one statement
   and is safe in unbraced if/else bodies; invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4726
4727 static
4728 PyObject *split_whitespace(PyUnicodeObject *self,
4729                            PyObject *list,
4730                            Py_ssize_t maxcount)
4731 {
4732     register Py_ssize_t i;
4733     register Py_ssize_t j;
4734     Py_ssize_t len = self->length;
4735     PyObject *str;
4736
4737     for (i = j = 0; i < len; ) {
4738         /* find a token */
4739         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4740             i++;
4741         j = i;
4742         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4743             i++;
4744         if (j < i) {
4745             if (maxcount-- <= 0)
4746                 break;
4747             SPLIT_APPEND(self->str, j, i);
4748             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4749                 i++;
4750             j = i;
4751         }
4752     }
4753     if (j < len) {
4754         SPLIT_APPEND(self->str, j, len);
4755     }
4756     return list;
4757
4758  onError:
4759     Py_DECREF(list);
4760     return NULL;
4761 }
4762
4763 PyObject *PyUnicode_Splitlines(PyObject *string,
4764                                int keepends)
4765 {
4766     register Py_ssize_t i;
4767     register Py_ssize_t j;
4768     Py_ssize_t len;
4769     PyObject *list;
4770     PyObject *str;
4771     Py_UNICODE *data;
4772
4773     string = PyUnicode_FromObject(string);
4774     if (string == NULL)
4775         return NULL;
4776     data = PyUnicode_AS_UNICODE(string);
4777     len = PyUnicode_GET_SIZE(string);
4778
4779     list = PyList_New(0);
4780     if (!list)
4781         goto onError;
4782
4783     for (i = j = 0; i < len; ) {
4784         Py_ssize_t eol;
4785
4786         /* Find a line and append it */
4787         while (i < len && !BLOOM_LINEBREAK(data[i]))
4788             i++;
4789
4790         /* Skip the line break reading CRLF as one line break */
4791         eol = i;
4792         if (i < len) {
4793             if (data[i] == '\r' && i + 1 < len &&
4794                 data[i+1] == '\n')
4795                 i += 2;
4796             else
4797                 i++;
4798             if (keepends)
4799                 eol = i;
4800         }
4801         SPLIT_APPEND(data, j, eol);
4802         j = i;
4803     }
4804     if (j < len) {
4805         SPLIT_APPEND(data, j, len);
4806     }
4807
4808     Py_DECREF(string);
4809     return list;
4810
4811  onError:
4812     Py_XDECREF(list);
4813     Py_DECREF(string);
4814     return NULL;
4815 }
4816
4817 static
4818 PyObject *split_char(PyUnicodeObject *self,
4819                      PyObject *list,
4820                      Py_UNICODE ch,
4821                      Py_ssize_t maxcount)
4822 {
4823     register Py_ssize_t i;
4824     register Py_ssize_t j;
4825     Py_ssize_t len = self->length;
4826     PyObject *str;
4827
4828     for (i = j = 0; i < len; ) {
4829         if (self->str[i] == ch) {
4830             if (maxcount-- <= 0)
4831                 break;
4832             SPLIT_APPEND(self->str, j, i);
4833             i = j = i + 1;
4834         } else
4835             i++;
4836     }
4837     if (j <= len) {
4838         SPLIT_APPEND(self->str, j, len);
4839     }
4840     return list;
4841
4842  onError:
4843     Py_DECREF(list);
4844     return NULL;
4845 }
4846
4847 static
4848 PyObject *split_substring(PyUnicodeObject *self,
4849                           PyObject *list,
4850                           PyUnicodeObject *substring,
4851                           Py_ssize_t maxcount)
4852 {
4853     register Py_ssize_t i;
4854     register Py_ssize_t j;
4855     Py_ssize_t len = self->length;
4856     Py_ssize_t sublen = substring->length;
4857     PyObject *str;
4858
4859     for (i = j = 0; i <= len - sublen; ) {
4860         if (Py_UNICODE_MATCH(self, i, substring)) {
4861             if (maxcount-- <= 0)
4862                 break;
4863             SPLIT_APPEND(self->str, j, i);
4864             i = j = i + sublen;
4865         } else
4866             i++;
4867     }
4868     if (j <= len) {
4869         SPLIT_APPEND(self->str, j, len);
4870     }
4871     return list;
4872
4873  onError:
4874     Py_DECREF(list);
4875     return NULL;
4876 }
4877
4878 static
4879 PyObject *rsplit_whitespace(PyUnicodeObject *self,
4880                             PyObject *list,
4881                             Py_ssize_t maxcount)
4882 {
4883     register Py_ssize_t i;
4884     register Py_ssize_t j;
4885     Py_ssize_t len = self->length;
4886     PyObject *str;
4887
4888     for (i = j = len - 1; i >= 0; ) {
4889         /* find a token */
4890         while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4891             i--;
4892         j = i;
4893         while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4894             i--;
4895         if (j > i) {
4896             if (maxcount-- <= 0)
4897                 break;
4898             SPLIT_APPEND(self->str, i + 1, j + 1);
4899             while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4900                 i--;
4901             j = i;
4902         }
4903     }
4904     if (j >= 0) {
4905         SPLIT_APPEND(self->str, 0, j + 1);
4906     }
4907     if (PyList_Reverse(list) < 0)
4908         goto onError;
4909     return list;
4910
4911  onError:
4912     Py_DECREF(list);
4913     return NULL;
4914 }
4915
4916 static
4917 PyObject *rsplit_char(PyUnicodeObject *self,
4918                       PyObject *list,
4919                       Py_UNICODE ch,
4920                       Py_ssize_t maxcount)
4921 {
4922     register Py_ssize_t i;
4923     register Py_ssize_t j;
4924     Py_ssize_t len = self->length;
4925     PyObject *str;
4926
4927     for (i = j = len - 1; i >= 0; ) {
4928         if (self->str[i] == ch) {
4929             if (maxcount-- <= 0)
4930                 break;
4931             SPLIT_APPEND(self->str, i + 1, j + 1);
4932             j = i = i - 1;
4933         } else
4934             i--;
4935     }
4936     if (j >= -1) {
4937         SPLIT_APPEND(self->str, 0, j + 1);
4938     }
4939     if (PyList_Reverse(list) < 0)
4940         goto onError;
4941     return list;
4942
4943  onError:
4944     Py_DECREF(list);
4945     return NULL;
4946 }
4947
4948 static
4949 PyObject *rsplit_substring(PyUnicodeObject *self,
4950                            PyObject *list,
4951                            PyUnicodeObject *substring,
4952                            Py_ssize_t maxcount)
4953 {
4954     register Py_ssize_t i;
4955     register Py_ssize_t j;
4956     Py_ssize_t len = self->length;
4957     Py_ssize_t sublen = substring->length;
4958     PyObject *str;
4959
4960     for (i = len - sublen, j = len; i >= 0; ) {
4961         if (Py_UNICODE_MATCH(self, i, substring)) {
4962             if (maxcount-- <= 0)
4963                 break;
4964             SPLIT_APPEND(self->str, i + sublen, j);
4965             j = i;
4966             i -= sublen;
4967         } else
4968             i--;
4969     }
4970     if (j >= 0) {
4971         SPLIT_APPEND(self->str, 0, j);
4972     }
4973     if (PyList_Reverse(list) < 0)
4974         goto onError;
4975     return list;
4976
4977  onError:
4978     Py_DECREF(list);
4979     return NULL;
4980 }
4981
4982 #undef SPLIT_APPEND
4983
4984 static
4985 PyObject *split(PyUnicodeObject *self,
4986                 PyUnicodeObject *substring,
4987                 Py_ssize_t maxcount)
4988 {
4989     PyObject *list;
4990
4991     if (maxcount < 0)
4992         maxcount = PY_SSIZE_T_MAX;
4993
4994     list = PyList_New(0);
4995     if (!list)
4996         return NULL;
4997
4998     if (substring == NULL)
4999         return split_whitespace(self,list,maxcount);
5000
5001     else if (substring->length == 1)
5002         return split_char(self,list,substring->str[0],maxcount);
5003
5004     else if (substring->length == 0) {
5005         Py_DECREF(list);
5006         PyErr_SetString(PyExc_ValueError, "empty separator");
5007         return NULL;
5008     }
5009     else
5010         return split_substring(self,list,substring,maxcount);
5011 }
5012
5013 static
5014 PyObject *rsplit(PyUnicodeObject *self,
5015                  PyUnicodeObject *substring,
5016                  Py_ssize_t maxcount)
5017 {
5018     PyObject *list;
5019
5020     if (maxcount < 0)
5021         maxcount = PY_SSIZE_T_MAX;
5022
5023     list = PyList_New(0);
5024     if (!list)
5025         return NULL;
5026
5027     if (substring == NULL)
5028         return rsplit_whitespace(self,list,maxcount);
5029
5030     else if (substring->length == 1)
5031         return rsplit_char(self,list,substring->str[0],maxcount);
5032
5033     else if (substring->length == 0) {
5034         Py_DECREF(list);
5035         PyErr_SetString(PyExc_ValueError, "empty separator");
5036         return NULL;
5037     }
5038     else
5039         return rsplit_substring(self,list,substring,maxcount);
5040 }
5041
5042 static
5043 PyObject *replace(PyUnicodeObject *self,
5044                   PyUnicodeObject *str1,
5045                   PyUnicodeObject *str2,
5046                   Py_ssize_t maxcount)
5047 {
5048     PyUnicodeObject *u;
5049
5050     if (maxcount < 0)
5051         maxcount = PY_SSIZE_T_MAX;
5052
5053     if (str1->length == str2->length) {
5054         /* same length */
5055         Py_ssize_t i;
5056         if (str1->length == 1) {
5057             /* replace characters */
5058             Py_UNICODE u1, u2;
5059             if (!findchar(self->str, self->length, str1->str[0]))
5060                 goto nothing;
5061             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5062             if (!u)
5063                 return NULL;
5064             Py_UNICODE_COPY(u->str, self->str, self->length);
5065             u1 = str1->str[0];
5066             u2 = str2->str[0];
5067             for (i = 0; i < u->length; i++)
5068                 if (u->str[i] == u1) {
5069                     if (--maxcount < 0)
5070                         break;
5071                     u->str[i] = u2;
5072                 }
5073         } else {
5074             i = fastsearch(
5075                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5076                 );
5077             if (i < 0)
5078                 goto nothing;
5079             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5080             if (!u)
5081                 return NULL;
5082             Py_UNICODE_COPY(u->str, self->str, self->length);
5083             while (i <= self->length - str1->length)
5084                 if (Py_UNICODE_MATCH(self, i, str1)) {
5085                     if (--maxcount < 0)
5086                         break;
5087                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5088                     i += str1->length;
5089                 } else
5090                     i++;
5091         }
5092     } else {
5093
5094         Py_ssize_t n, i, j, e;
5095         Py_ssize_t product, new_size, delta;
5096         Py_UNICODE *p;
5097
5098         /* replace strings */
5099         n = stringlib_count(self->str, self->length, str1->str, str1->length);
5100         if (n > maxcount)
5101             n = maxcount;
5102         if (n == 0)
5103             goto nothing;
5104         /* new_size = self->length + n * (str2->length - str1->length)); */
5105         delta = (str2->length - str1->length);
5106         if (delta == 0) {
5107             new_size = self->length;
5108         } else {
5109             product = n * (str2->length - str1->length);
5110             if ((product / (str2->length - str1->length)) != n) {
5111                 PyErr_SetString(PyExc_OverflowError,
5112                                 "replace string is too long");
5113                 return NULL;
5114             }
5115             new_size = self->length + product;
5116             if (new_size < 0) {
5117                 PyErr_SetString(PyExc_OverflowError,
5118                                 "replace string is too long");
5119                 return NULL;
5120             }
5121         }
5122         u = _PyUnicode_New(new_size);
5123         if (!u)
5124             return NULL;
5125         i = 0;
5126         p = u->str;
5127         e = self->length - str1->length;
5128         if (str1->length > 0) {
5129             while (n-- > 0) {
5130                 /* look for next match */
5131                 j = i;
5132                 while (j <= e) {
5133                     if (Py_UNICODE_MATCH(self, j, str1))
5134                         break;
5135                     j++;
5136                 }
5137                 if (j > i) {
5138                     if (j > e)
5139                         break;
5140                     /* copy unchanged part [i:j] */
5141                     Py_UNICODE_COPY(p, self->str+i, j-i);
5142                     p += j - i;
5143                 }
5144                 /* copy substitution string */
5145                 if (str2->length > 0) {
5146                     Py_UNICODE_COPY(p, str2->str, str2->length);
5147                     p += str2->length;
5148                 }
5149                 i = j + str1->length;
5150             }
5151             if (i < self->length)
5152                 /* copy tail [i:] */
5153                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5154         } else {
5155             /* interleave */
5156             while (n > 0) {
5157                 Py_UNICODE_COPY(p, str2->str, str2->length);
5158                 p += str2->length;
5159                 if (--n <= 0)
5160                     break;
5161                 *p++ = self->str[i++];
5162             }
5163             Py_UNICODE_COPY(p, self->str+i, self->length-i);
5164         }
5165     }
5166     return (PyObject *) u;
5167
5168 nothing:
5169     /* nothing to replace; return original string (when possible) */
5170     if (PyUnicode_CheckExact(self)) {
5171         Py_INCREF(self);
5172         return (PyObject *) self;
5173     }
5174     return PyUnicode_FromUnicode(self->str, self->length);
5175 }
5176
5177 /* --- Unicode Object Methods --------------------------------------------- */
5178
PyDoc_STRVAR(title__doc__,
"S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implementation of S.title(): delegates to fixup() with fixtitle as
   the in-place conversion callback and returns fixup()'s result. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
5190
5191 PyDoc_STRVAR(capitalize__doc__,
5192 "S.capitalize() -> unicode\n\
5193 \n\
5194 Return a capitalized version of S, i.e. make the first character\n\
5195 have upper case.");
5196
5197 static PyObject*
5198 unicode_capitalize(PyUnicodeObject *self)
5199 {
5200     return fixup(self, fixcapitalize);
5201 }
5202
#if 0
/* Disabled: this block is compiled out by the #if 0 above. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Break the string into whitespace-separated words. */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized form. */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Glue the words back together with single blanks. */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5240
5241 /* Argument converter.  Coerces to a single unicode character */
5242
5243 static int
5244 convert_uc(PyObject *obj, void *addr)
5245 {
5246         Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5247         PyObject *uniobj;
5248         Py_UNICODE *unistr;
5249
5250         uniobj = PyUnicode_FromObject(obj);
5251         if (uniobj == NULL) {
5252                 PyErr_SetString(PyExc_TypeError,
5253                         "The fill character cannot be converted to Unicode");
5254                 return 0;
5255         }
5256         if (PyUnicode_GET_SIZE(uniobj) != 1) {
5257                 PyErr_SetString(PyExc_TypeError,
5258                         "The fill character must be exactly one character long");
5259                 Py_DECREF(uniobj);
5260                 return 0;
5261         }
5262         unistr = PyUnicode_AS_UNICODE(uniobj);
5263         *fillcharloc = unistr[0];
5264         Py_DECREF(uniobj);
5265         return 1;
5266 }
5267
5268 PyDoc_STRVAR(center__doc__,
5269 "S.center(width[, fillchar]) -> unicode\n\
5270 \n\
5271 Return S centered in a Unicode string of length width. Padding is\n\
5272 done using the specified fill character (default is a space)");
5273
5274 static PyObject *
5275 unicode_center(PyUnicodeObject *self, PyObject *args)
5276 {
5277     Py_ssize_t marg, left;
5278     Py_ssize_t width;
5279     Py_UNICODE fillchar = ' ';
5280
5281     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5282         return NULL;
5283
5284     if (self->length >= width && PyUnicode_CheckExact(self)) {
5285         Py_INCREF(self);
5286         return (PyObject*) self;
5287     }
5288
5289     marg = width - self->length;
5290     left = marg / 2 + (marg & width & 1);
5291
5292     return (PyObject*) pad(self, left, marg - left, fillchar);
5293 }
5294
5295 #if 0
5296
5297 /* This code should go into some future Unicode collation support
5298    module. The basic comparison should compare ordinals on a naive
5299    basis (this is what Java does and thus JPython too). */
5300
5301 /* speedy UTF-16 code point order comparison */
5302 /* gleaned from: */
5303 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5304
5305 static short utf16Fixup[32] =
5306 {
5307     0, 0, 0, 0, 0, 0, 0, 0,
5308     0, 0, 0, 0, 0, 0, 0, 0,
5309     0, 0, 0, 0, 0, 0, 0, 0,
5310     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5311 };
5312
5313 static int
5314 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5315 {
5316     Py_ssize_t len1, len2;
5317
5318     Py_UNICODE *s1 = str1->str;
5319     Py_UNICODE *s2 = str2->str;
5320
5321     len1 = str1->length;
5322     len2 = str2->length;
5323
5324     while (len1 > 0 && len2 > 0) {
5325         Py_UNICODE c1, c2;
5326
5327         c1 = *s1++;
5328         c2 = *s2++;
5329
5330         if (c1 > (1<<11) * 26)
5331             c1 += utf16Fixup[c1>>11];
5332         if (c2 > (1<<11) * 26)
5333             c2 += utf16Fixup[c2>>11];
5334         /* now c1 and c2 are in UTF-32-compatible order */
5335
5336         if (c1 != c2)
5337             return (c1 < c2) ? -1 : 1;
5338
5339         len1--; len2--;
5340     }
5341
5342     return (len1 < len2) ? -1 : (len1 != len2);
5343 }
5344
5345 #else
5346
5347 static int
5348 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5349 {
5350     register Py_ssize_t len1, len2;
5351
5352     Py_UNICODE *s1 = str1->str;
5353     Py_UNICODE *s2 = str2->str;
5354
5355     len1 = str1->length;
5356     len2 = str2->length;
5357
5358     while (len1 > 0 && len2 > 0) {
5359         Py_UNICODE c1, c2;
5360
5361         c1 = *s1++;
5362         c2 = *s2++;
5363
5364         if (c1 != c2)
5365             return (c1 < c2) ? -1 : 1;
5366
5367         len1--; len2--;
5368     }
5369
5370     return (len1 < len2) ? -1 : (len1 != len2);
5371 }
5372
5373 #endif
5374
5375 int PyUnicode_Compare(PyObject *left,
5376                       PyObject *right)
5377 {
5378     PyUnicodeObject *u = NULL, *v = NULL;
5379     int result;
5380
5381     /* Coerce the two arguments */
5382     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5383     if (u == NULL)
5384         goto onError;
5385     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5386     if (v == NULL)
5387         goto onError;
5388
5389     /* Shortcut for empty or interned objects */
5390     if (v == u) {
5391         Py_DECREF(u);
5392         Py_DECREF(v);
5393         return 0;
5394     }
5395
5396     result = unicode_compare(u, v);
5397
5398     Py_DECREF(u);
5399     Py_DECREF(v);
5400     return result;
5401
5402 onError:
5403     Py_XDECREF(u);
5404     Py_XDECREF(v);
5405     return -1;
5406 }
5407
5408 int PyUnicode_Contains(PyObject *container,
5409                        PyObject *element)
5410 {
5411     PyObject *str, *sub;
5412     int result;
5413
5414     /* Coerce the two arguments */
5415     sub = PyUnicode_FromObject(element);
5416     if (!sub) {
5417         PyErr_SetString(PyExc_TypeError,
5418             "'in <string>' requires string as left operand");
5419         return -1;
5420     }
5421
5422     str = PyUnicode_FromObject(container);
5423     if (!str) {
5424         Py_DECREF(sub);
5425         return -1;
5426     }
5427
5428     result = stringlib_contains_obj(str, sub);
5429
5430     Py_DECREF(str);
5431     Py_DECREF(sub);
5432
5433     return result;
5434 }
5435
5436 /* Concat to string or Unicode object giving a new Unicode object. */
5437
5438 PyObject *PyUnicode_Concat(PyObject *left,
5439                            PyObject *right)
5440 {
5441     PyUnicodeObject *u = NULL, *v = NULL, *w;
5442
5443     /* Coerce the two arguments */
5444     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5445     if (u == NULL)
5446         goto onError;
5447     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5448     if (v == NULL)
5449         goto onError;
5450
5451     /* Shortcuts */
5452     if (v == unicode_empty) {
5453         Py_DECREF(v);
5454         return (PyObject *)u;
5455     }
5456     if (u == unicode_empty) {
5457         Py_DECREF(u);
5458         return (PyObject *)v;
5459     }
5460
5461     /* Concat the two Unicode strings */
5462     w = _PyUnicode_New(u->length + v->length);
5463     if (w == NULL)
5464         goto onError;
5465     Py_UNICODE_COPY(w->str, u->str, u->length);
5466     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5467
5468     Py_DECREF(u);
5469     Py_DECREF(v);
5470     return (PyObject *)w;
5471
5472 onError:
5473     Py_XDECREF(u);
5474     Py_XDECREF(v);
5475     return NULL;
5476 }
5477
5478 PyDoc_STRVAR(count__doc__,
5479 "S.count(sub[, start[, end]]) -> int\n\
5480 \n\
5481 Return the number of non-overlapping occurrences of substring sub in\n\
5482 Unicode string S[start:end].  Optional arguments start and end are\n\
5483 interpreted as in slice notation.");
5484
5485 static PyObject *
5486 unicode_count(PyUnicodeObject *self, PyObject *args)
5487 {
5488     PyUnicodeObject *substring;
5489     Py_ssize_t start = 0;
5490     Py_ssize_t end = PY_SSIZE_T_MAX;
5491     PyObject *result;
5492
5493     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5494                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5495         return NULL;
5496
5497     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5498         (PyObject *)substring);
5499     if (substring == NULL)
5500         return NULL;
5501
5502     FIX_START_END(self);
5503
5504     result = PyInt_FromSsize_t(
5505         stringlib_count(self->str + start, end - start,
5506                         substring->str, substring->length)
5507         );
5508
5509     Py_DECREF(substring);
5510
5511     return result;
5512 }
5513
5514 PyDoc_STRVAR(encode__doc__,
5515 "S.encode([encoding[,errors]]) -> string or unicode\n\
5516 \n\
5517 Encodes S using the codec registered for encoding. encoding defaults\n\
5518 to the default encoding. errors may be given to set a different error\n\
5519 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5520 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5521 'xmlcharrefreplace' as well as any other name registered with\n\
5522 codecs.register_error that can handle UnicodeEncodeErrors.");
5523
5524 static PyObject *
5525 unicode_encode(PyUnicodeObject *self, PyObject *args)
5526 {
5527     char *encoding = NULL;
5528     char *errors = NULL;
5529     PyObject *v;
5530
5531     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5532         return NULL;
5533     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5534     if (v == NULL)
5535         goto onError;
5536     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5537         PyErr_Format(PyExc_TypeError,
5538                      "encoder did not return a string/unicode object "
5539                      "(type=%.400s)",
5540                      v->ob_type->tp_name);
5541         Py_DECREF(v);
5542         return NULL;
5543     }
5544     return v;
5545
5546  onError:
5547     return NULL;
5548 }
5549
5550 PyDoc_STRVAR(decode__doc__,
5551 "S.decode([encoding[,errors]]) -> string or unicode\n\
5552 \n\
5553 Decodes S using the codec registered for encoding. encoding defaults\n\
5554 to the default encoding. errors may be given to set a different error\n\
5555 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5556 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5557 as well as any other name registerd with codecs.register_error that is\n\
5558 able to handle UnicodeDecodeErrors.");
5559
5560 static PyObject *
5561 unicode_decode(PyUnicodeObject *self, PyObject *args)
5562 {
5563     char *encoding = NULL;
5564     char *errors = NULL;
5565     PyObject *v;
5566
5567     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5568         return NULL;
5569     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5570     if (v == NULL)
5571         goto onError;
5572     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5573         PyErr_Format(PyExc_TypeError,
5574                      "decoder did not return a string/unicode object "
5575                      "(type=%.400s)",
5576                      v->ob_type->tp_name);
5577         Py_DECREF(v);
5578         return NULL;
5579     }
5580     return v;
5581
5582  onError:
5583     return NULL;
5584 }
5585
5586 PyDoc_STRVAR(expandtabs__doc__,
5587 "S.expandtabs([tabsize]) -> unicode\n\
5588 \n\
5589 Return a copy of S where all tab characters are expanded using spaces.\n\
5590 If tabsize is not given, a tab size of 8 characters is assumed.");
5591
5592 static PyObject*
5593 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5594 {
5595     Py_UNICODE *e;
5596     Py_UNICODE *p;
5597     Py_UNICODE *q;
5598     Py_ssize_t i, j;
5599     PyUnicodeObject *u;
5600     int tabsize = 8;
5601
5602     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5603         return NULL;
5604
5605     /* First pass: determine size of output string */
5606     i = j = 0;
5607     e = self->str + self->length;
5608     for (p = self->str; p < e; p++)
5609         if (*p == '\t') {
5610             if (tabsize > 0)
5611                 j += tabsize - (j % tabsize);
5612         }
5613         else {
5614             j++;
5615             if (*p == '\n' || *p == '\r') {
5616                 i += j;
5617                 j = 0;
5618             }
5619         }
5620
5621     /* Second pass: create output string and fill it */
5622     u = _PyUnicode_New(i + j);
5623     if (!u)
5624         return NULL;
5625
5626     j = 0;
5627     q = u->str;
5628
5629     for (p = self->str; p < e; p++)
5630         if (*p == '\t') {
5631             if (tabsize > 0) {
5632                 i = tabsize - (j % tabsize);
5633                 j += i;
5634                 while (i--)
5635                     *q++ = ' ';
5636             }
5637         }
5638         else {
5639             j++;
5640             *q++ = *p;
5641             if (*p == '\n' || *p == '\r')
5642                 j = 0;
5643         }
5644
5645     return (PyObject*) u;
5646 }
5647
5648 PyDoc_STRVAR(find__doc__,
5649 "S.find(sub [,start [,end]]) -> int\n\
5650 \n\
5651 Return the lowest index in S where substring sub is found,\n\
5652 such that sub is contained within s[start,end].  Optional\n\
5653 arguments start and end are interpreted as in slice notation.\n\
5654 \n\
5655 Return -1 on failure.");
5656
5657 static PyObject *
5658 unicode_find(PyUnicodeObject *self, PyObject *args)
5659 {
5660     PyObject *substring;
5661     Py_ssize_t start = 0;
5662     Py_ssize_t end = PY_SSIZE_T_MAX;
5663     Py_ssize_t result;
5664
5665     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5666                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5667         return NULL;
5668     substring = PyUnicode_FromObject(substring);
5669     if (!substring)
5670         return NULL;
5671
5672     result = stringlib_find_slice(
5673         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5674         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5675         start, end
5676         );
5677
5678     Py_DECREF(substring);
5679
5680     return PyInt_FromSsize_t(result);
5681 }
5682
5683 static PyObject *
5684 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
5685 {
5686     if (index < 0 || index >= self->length) {
5687         PyErr_SetString(PyExc_IndexError, "string index out of range");
5688         return NULL;
5689     }
5690
5691     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5692 }
5693
5694 static long
5695 unicode_hash(PyUnicodeObject *self)
5696 {
5697     /* Since Unicode objects compare equal to their ASCII string
5698        counterparts, they should use the individual character values
5699        as basis for their hash value.  This is needed to assure that
5700        strings and Unicode objects behave in the same way as
5701        dictionary keys. */
5702
5703     register Py_ssize_t len;
5704     register Py_UNICODE *p;
5705     register long x;
5706
5707     if (self->hash != -1)
5708         return self->hash;
5709     len = PyUnicode_GET_SIZE(self);
5710     p = PyUnicode_AS_UNICODE(self);
5711     x = *p << 7;
5712     while (--len >= 0)
5713         x = (1000003*x) ^ *p++;
5714     x ^= PyUnicode_GET_SIZE(self);
5715     if (x == -1)
5716         x = -2;
5717     self->hash = x;
5718     return x;
5719 }
5720
5721 PyDoc_STRVAR(index__doc__,
5722 "S.index(sub [,start [,end]]) -> int\n\
5723 \n\
5724 Like S.find() but raise ValueError when the substring is not found.");
5725
5726 static PyObject *
5727 unicode_index(PyUnicodeObject *self, PyObject *args)
5728 {
5729     Py_ssize_t result;
5730     PyObject *substring;
5731     Py_ssize_t start = 0;
5732     Py_ssize_t end = PY_SSIZE_T_MAX;
5733
5734     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5735                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5736         return NULL;
5737     substring = PyUnicode_FromObject(substring);
5738     if (!substring)
5739         return NULL;
5740
5741     result = stringlib_find_slice(
5742         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5743         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5744         start, end
5745         );
5746
5747     Py_DECREF(substring);
5748
5749     if (result < 0) {
5750         PyErr_SetString(PyExc_ValueError, "substring not found");
5751         return NULL;
5752     }
5753
5754     return PyInt_FromSsize_t(result);
5755 }
5756
5757 PyDoc_STRVAR(islower__doc__,
5758 "S.islower() -> bool\n\
5759 \n\
5760 Return True if all cased characters in S are lowercase and there is\n\
5761 at least one cased character in S, False otherwise.");
5762
5763 static PyObject*
5764 unicode_islower(PyUnicodeObject *self)
5765 {
5766     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5767     register const Py_UNICODE *e;
5768     int cased;
5769
5770     /* Shortcut for single character strings */
5771     if (PyUnicode_GET_SIZE(self) == 1)
5772         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5773
5774     /* Special case for empty strings */
5775     if (PyUnicode_GET_SIZE(self) == 0)
5776         return PyBool_FromLong(0);
5777
5778     e = p + PyUnicode_GET_SIZE(self);
5779     cased = 0;
5780     for (; p < e; p++) {
5781         register const Py_UNICODE ch = *p;
5782
5783         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5784             return PyBool_FromLong(0);
5785         else if (!cased && Py_UNICODE_ISLOWER(ch))
5786             cased = 1;
5787     }
5788     return PyBool_FromLong(cased);
5789 }
5790
5791 PyDoc_STRVAR(isupper__doc__,
5792 "S.isupper() -> bool\n\
5793 \n\
5794 Return True if all cased characters in S are uppercase and there is\n\
5795 at least one cased character in S, False otherwise.");
5796
5797 static PyObject*
5798 unicode_isupper(PyUnicodeObject *self)
5799 {
5800     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5801     register const Py_UNICODE *e;
5802     int cased;
5803
5804     /* Shortcut for single character strings */
5805     if (PyUnicode_GET_SIZE(self) == 1)
5806         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5807
5808     /* Special case for empty strings */
5809     if (PyUnicode_GET_SIZE(self) == 0)
5810         return PyBool_FromLong(0);
5811
5812     e = p + PyUnicode_GET_SIZE(self);
5813     cased = 0;
5814     for (; p < e; p++) {
5815         register const Py_UNICODE ch = *p;
5816
5817         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5818             return PyBool_FromLong(0);
5819         else if (!cased && Py_UNICODE_ISUPPER(ch))
5820             cased = 1;
5821     }
5822     return PyBool_FromLong(cased);
5823 }
5824
5825 PyDoc_STRVAR(istitle__doc__,
5826 "S.istitle() -> bool\n\
5827 \n\
5828 Return True if S is a titlecased string and there is at least one\n\
5829 character in S, i.e. upper- and titlecase characters may only\n\
5830 follow uncased characters and lowercase characters only cased ones.\n\
5831 Return False otherwise.");
5832
5833 static PyObject*
5834 unicode_istitle(PyUnicodeObject *self)
5835 {
5836     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5837     register const Py_UNICODE *e;
5838     int cased, previous_is_cased;
5839
5840     /* Shortcut for single character strings */
5841     if (PyUnicode_GET_SIZE(self) == 1)
5842         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5843                                (Py_UNICODE_ISUPPER(*p) != 0));
5844
5845     /* Special case for empty strings */
5846     if (PyUnicode_GET_SIZE(self) == 0)
5847         return PyBool_FromLong(0);
5848
5849     e = p + PyUnicode_GET_SIZE(self);
5850     cased = 0;
5851     previous_is_cased = 0;
5852     for (; p < e; p++) {
5853         register const Py_UNICODE ch = *p;
5854
5855         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5856             if (previous_is_cased)
5857                 return PyBool_FromLong(0);
5858             previous_is_cased = 1;
5859             cased = 1;
5860         }
5861         else if (Py_UNICODE_ISLOWER(ch)) {
5862             if (!previous_is_cased)
5863                 return PyBool_FromLong(0);
5864             previous_is_cased = 1;
5865             cased = 1;
5866         }
5867         else
5868             previous_is_cased = 0;
5869     }
5870     return PyBool_FromLong(cased);
5871 }
5872
5873 PyDoc_STRVAR(isspace__doc__,
5874 "S.isspace() -> bool\n\
5875 \n\
5876 Return True if all characters in S are whitespace\n\
5877 and there is at least one character in S, False otherwise.");
5878
5879 static PyObject*
5880 unicode_isspace(PyUnicodeObject *self)
5881 {
5882     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5883     register const Py_UNICODE *e;
5884
5885     /* Shortcut for single character strings */
5886     if (PyUnicode_GET_SIZE(self) == 1 &&
5887         Py_UNICODE_ISSPACE(*p))
5888         return PyBool_FromLong(1);
5889
5890     /* Special case for empty strings */
5891     if (PyUnicode_GET_SIZE(self) == 0)
5892         return PyBool_FromLong(0);
5893
5894     e = p + PyUnicode_GET_SIZE(self);
5895     for (; p < e; p++) {
5896         if (!Py_UNICODE_ISSPACE(*p))
5897             return PyBool_FromLong(0);
5898     }
5899     return PyBool_FromLong(1);
5900 }
5901
5902 PyDoc_STRVAR(isalpha__doc__,
5903 "S.isalpha() -> bool\n\
5904 \n\
5905 Return True if all characters in S are alphabetic\n\
5906 and there is at least one character in S, False otherwise.");
5907
5908 static PyObject*
5909 unicode_isalpha(PyUnicodeObject *self)
5910 {
5911     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5912     register const Py_UNICODE *e;
5913
5914     /* Shortcut for single character strings */
5915     if (PyUnicode_GET_SIZE(self) == 1 &&
5916         Py_UNICODE_ISALPHA(*p))
5917         return PyBool_FromLong(1);
5918
5919     /* Special case for empty strings */
5920     if (PyUnicode_GET_SIZE(self) == 0)
5921         return PyBool_FromLong(0);
5922
5923     e = p + PyUnicode_GET_SIZE(self);
5924     for (; p < e; p++) {
5925         if (!Py_UNICODE_ISALPHA(*p))
5926             return PyBool_FromLong(0);
5927     }
5928     return PyBool_FromLong(1);
5929 }
5930
5931 PyDoc_STRVAR(isalnum__doc__,
5932 "S.isalnum() -> bool\n\
5933 \n\
5934 Return True if all characters in S are alphanumeric\n\
5935 and there is at least one character in S, False otherwise.");
5936
5937 static PyObject*
5938 unicode_isalnum(PyUnicodeObject *self)
5939 {
5940     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5941     register const Py_UNICODE *e;
5942
5943     /* Shortcut for single character strings */
5944     if (PyUnicode_GET_SIZE(self) == 1 &&
5945         Py_UNICODE_ISALNUM(*p))
5946         return PyBool_FromLong(1);
5947
5948     /* Special case for empty strings */
5949     if (PyUnicode_GET_SIZE(self) == 0)
5950         return PyBool_FromLong(0);
5951
5952     e = p + PyUnicode_GET_SIZE(self);
5953     for (; p < e; p++) {
5954         if (!Py_UNICODE_ISALNUM(*p))
5955             return PyBool_FromLong(0);
5956     }
5957     return PyBool_FromLong(1);
5958 }
5959
5960 PyDoc_STRVAR(isdecimal__doc__,
5961 "S.isdecimal() -> bool\n\
5962 \n\
5963 Return True if there are only decimal characters in S,\n\
5964 False otherwise.");
5965
5966 static PyObject*
5967 unicode_isdecimal(PyUnicodeObject *self)
5968 {
5969     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5970     register const Py_UNICODE *e;
5971
5972     /* Shortcut for single character strings */
5973     if (PyUnicode_GET_SIZE(self) == 1 &&
5974         Py_UNICODE_ISDECIMAL(*p))
5975         return PyBool_FromLong(1);
5976
5977     /* Special case for empty strings */
5978     if (PyUnicode_GET_SIZE(self) == 0)
5979         return PyBool_FromLong(0);
5980
5981     e = p + PyUnicode_GET_SIZE(self);
5982     for (; p < e; p++) {
5983         if (!Py_UNICODE_ISDECIMAL(*p))
5984             return PyBool_FromLong(0);
5985     }
5986     return PyBool_FromLong(1);
5987 }
5988
5989 PyDoc_STRVAR(isdigit__doc__,
5990 "S.isdigit() -> bool\n\
5991 \n\
5992 Return True if all characters in S are digits\n\
5993 and there is at least one character in S, False otherwise.");
5994
5995 static PyObject*
5996 unicode_isdigit(PyUnicodeObject *self)
5997 {
5998     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5999     register const Py_UNICODE *e;
6000
6001     /* Shortcut for single character strings */
6002     if (PyUnicode_GET_SIZE(self) == 1 &&
6003         Py_UNICODE_ISDIGIT(*p))
6004         return PyBool_FromLong(1);
6005
6006     /* Special case for empty strings */
6007     if (PyUnicode_GET_SIZE(self) == 0)
6008         return PyBool_FromLong(0);
6009
6010     e = p + PyUnicode_GET_SIZE(self);
6011     for (; p < e; p++) {
6012         if (!Py_UNICODE_ISDIGIT(*p))
6013             return PyBool_FromLong(0);
6014     }
6015     return PyBool_FromLong(1);
6016 }
6017
6018 PyDoc_STRVAR(isnumeric__doc__,
6019 "S.isnumeric() -> bool\n\
6020 \n\
6021 Return True if there are only numeric characters in S,\n\
6022 False otherwise.");
6023
6024 static PyObject*
6025 unicode_isnumeric(PyUnicodeObject *self)
6026 {
6027     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6028     register const Py_UNICODE *e;
6029
6030     /* Shortcut for single character strings */
6031     if (PyUnicode_GET_SIZE(self) == 1 &&
6032         Py_UNICODE_ISNUMERIC(*p))
6033         return PyBool_FromLong(1);
6034
6035     /* Special case for empty strings */
6036     if (PyUnicode_GET_SIZE(self) == 0)
6037         return PyBool_FromLong(0);
6038
6039     e = p + PyUnicode_GET_SIZE(self);
6040     for (; p < e; p++) {
6041         if (!Py_UNICODE_ISNUMERIC(*p))
6042             return PyBool_FromLong(0);
6043     }
6044     return PyBool_FromLong(1);
6045 }
6046
6047 PyDoc_STRVAR(join__doc__,
6048 "S.join(sequence) -> unicode\n\
6049 \n\
6050 Return a string which is the concatenation of the strings in the\n\
6051 sequence.  The separator between elements is S.");
6052
6053 static PyObject*
6054 unicode_join(PyObject *self, PyObject *data)
6055 {
6056     return PyUnicode_Join(self, data);
6057 }
6058
6059 static Py_ssize_t
6060 unicode_length(PyUnicodeObject *self)
6061 {
6062     return self->length;
6063 }
6064
6065 PyDoc_STRVAR(ljust__doc__,
6066 "S.ljust(width[, fillchar]) -> int\n\
6067 \n\
6068 Return S left justified in a Unicode string of length width. Padding is\n\
6069 done using the specified fill character (default is a space).");
6070
6071 static PyObject *
6072 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6073 {
6074     Py_ssize_t width;
6075     Py_UNICODE fillchar = ' ';
6076
6077     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6078         return NULL;
6079
6080     if (self->length >= width && PyUnicode_CheckExact(self)) {
6081         Py_INCREF(self);
6082         return (PyObject*) self;
6083     }
6084
6085     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6086 }
6087
6088 PyDoc_STRVAR(lower__doc__,
6089 "S.lower() -> unicode\n\
6090 \n\
6091 Return a copy of the string S converted to lowercase.");
6092
6093 static PyObject*
6094 unicode_lower(PyUnicodeObject *self)
6095 {
6096     return fixup(self, fixlower);
6097 }
6098
/* Strip-type selectors passed to the strip helpers below; they also
   serve as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip type */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6107
6108 /* externally visible for str.strip(unicode) */
6109 PyObject *
6110 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6111 {
6112         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6113         Py_ssize_t len = PyUnicode_GET_SIZE(self);
6114         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6115         Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6116         Py_ssize_t i, j;
6117
6118         BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6119
6120         i = 0;
6121         if (striptype != RIGHTSTRIP) {
6122             while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6123                 i++;
6124             }
6125         }
6126
6127         j = len;
6128         if (striptype != LEFTSTRIP) {
6129             do {
6130                 j--;
6131             } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6132             j++;
6133         }
6134
6135         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6136             Py_INCREF(self);
6137             return (PyObject*)self;
6138         }
6139         else
6140             return PyUnicode_FromUnicode(s+i, j-i);
6141 }
6142
6143
6144 static PyObject *
6145 do_strip(PyUnicodeObject *self, int striptype)
6146 {
6147         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6148         Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6149
6150         i = 0;
6151         if (striptype != RIGHTSTRIP) {
6152                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6153                         i++;
6154                 }
6155         }
6156
6157         j = len;
6158         if (striptype != LEFTSTRIP) {
6159                 do {
6160                         j--;
6161                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6162                 j++;
6163         }
6164
6165         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6166                 Py_INCREF(self);
6167                 return (PyObject*)self;
6168         }
6169         else
6170                 return PyUnicode_FromUnicode(s+i, j-i);
6171 }
6172
6173
6174 static PyObject *
6175 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6176 {
6177         PyObject *sep = NULL;
6178
6179         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6180                 return NULL;
6181
6182         if (sep != NULL && sep != Py_None) {
6183                 if (PyUnicode_Check(sep))
6184                         return _PyUnicode_XStrip(self, striptype, sep);
6185                 else if (PyString_Check(sep)) {
6186                         PyObject *res;
6187                         sep = PyUnicode_FromObject(sep);
6188                         if (sep==NULL)
6189                                 return NULL;
6190                         res = _PyUnicode_XStrip(self, striptype, sep);
6191                         Py_DECREF(sep);
6192                         return res;
6193                 }
6194                 else {
6195                         PyErr_Format(PyExc_TypeError,
6196                                      "%s arg must be None, unicode or str",
6197                                      STRIPNAME(striptype));
6198                         return NULL;
6199                 }
6200         }
6201
6202         return do_strip(self, striptype);
6203 }
6204
6205
6206 PyDoc_STRVAR(strip__doc__,
6207 "S.strip([chars]) -> unicode\n\
6208 \n\
6209 Return a copy of the string S with leading and trailing\n\
6210 whitespace removed.\n\
6211 If chars is given and not None, remove characters in chars instead.\n\
6212 If chars is a str, it will be converted to unicode before stripping");
6213
6214 static PyObject *
6215 unicode_strip(PyUnicodeObject *self, PyObject *args)
6216 {
6217         if (PyTuple_GET_SIZE(args) == 0)
6218                 return do_strip(self, BOTHSTRIP); /* Common case */
6219         else
6220                 return do_argstrip(self, BOTHSTRIP, args);
6221 }
6222
6223
6224 PyDoc_STRVAR(lstrip__doc__,
6225 "S.lstrip([chars]) -> unicode\n\
6226 \n\
6227 Return a copy of the string S with leading whitespace removed.\n\
6228 If chars is given and not None, remove characters in chars instead.\n\
6229 If chars is a str, it will be converted to unicode before stripping");
6230
6231 static PyObject *
6232 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6233 {
6234         if (PyTuple_GET_SIZE(args) == 0)
6235                 return do_strip(self, LEFTSTRIP); /* Common case */
6236         else
6237                 return do_argstrip(self, LEFTSTRIP, args);
6238 }
6239
6240
6241 PyDoc_STRVAR(rstrip__doc__,
6242 "S.rstrip([chars]) -> unicode\n\
6243 \n\
6244 Return a copy of the string S with trailing whitespace removed.\n\
6245 If chars is given and not None, remove characters in chars instead.\n\
6246 If chars is a str, it will be converted to unicode before stripping");
6247
6248 static PyObject *
6249 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6250 {
6251         if (PyTuple_GET_SIZE(args) == 0)
6252                 return do_strip(self, RIGHTSTRIP); /* Common case */
6253         else
6254                 return do_argstrip(self, RIGHTSTRIP, args);
6255 }
6256
6257
6258 static PyObject*
6259 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6260 {
6261     PyUnicodeObject *u;
6262     Py_UNICODE *p;
6263     Py_ssize_t nchars;
6264     size_t nbytes;
6265
6266     if (len < 0)
6267         len = 0;
6268
6269     if (len == 1 && PyUnicode_CheckExact(str)) {
6270         /* no repeat, return original string */
6271         Py_INCREF(str);
6272         return (PyObject*) str;
6273     }
6274
6275     /* ensure # of chars needed doesn't overflow int and # of bytes
6276      * needed doesn't overflow size_t
6277      */
6278     nchars = len * str->length;
6279     if (len && nchars / len != str->length) {
6280         PyErr_SetString(PyExc_OverflowError,
6281                         "repeated string is too long");
6282         return NULL;
6283     }
6284     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6285     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6286         PyErr_SetString(PyExc_OverflowError,
6287                         "repeated string is too long");
6288         return NULL;
6289     }
6290     u = _PyUnicode_New(nchars);
6291     if (!u)
6292         return NULL;
6293
6294     p = u->str;
6295
6296     if (str->length == 1 && len > 0) {
6297         Py_UNICODE_FILL(p, str->str[0], len);
6298     } else {
6299         Py_ssize_t done = 0; /* number of characters copied this far */
6300         if (done < nchars) {
6301             Py_UNICODE_COPY(p, str->str, str->length);
6302             done = str->length;
6303         }
6304         while (done < nchars) {
6305             int n = (done <= nchars-done) ? done : nchars-done;
6306             Py_UNICODE_COPY(p+done, p, n);
6307             done += n;
6308         }
6309     }
6310
6311     return (PyObject*) u;
6312 }
6313
6314 PyObject *PyUnicode_Replace(PyObject *obj,
6315                             PyObject *subobj,
6316                             PyObject *replobj,
6317                             Py_ssize_t maxcount)
6318 {
6319     PyObject *self;
6320     PyObject *str1;
6321     PyObject *str2;
6322     PyObject *result;
6323
6324     self = PyUnicode_FromObject(obj);
6325     if (self == NULL)
6326         return NULL;
6327     str1 = PyUnicode_FromObject(subobj);
6328     if (str1 == NULL) {
6329         Py_DECREF(self);
6330         return NULL;
6331     }
6332     str2 = PyUnicode_FromObject(replobj);
6333     if (str2 == NULL) {
6334         Py_DECREF(self);
6335         Py_DECREF(str1);
6336         return NULL;
6337     }
6338     result = replace((PyUnicodeObject *)self,
6339                      (PyUnicodeObject *)str1,
6340                      (PyUnicodeObject *)str2,
6341                      maxcount);
6342     Py_DECREF(self);
6343     Py_DECREF(str1);
6344     Py_DECREF(str2);
6345     return result;
6346 }
6347
6348 PyDoc_STRVAR(replace__doc__,
6349 "S.replace (old, new[, maxsplit]) -> unicode\n\
6350 \n\
6351 Return a copy of S with all occurrences of substring\n\
6352 old replaced by new.  If the optional argument maxsplit is\n\
6353 given, only the first maxsplit occurrences are replaced.");
6354
6355 static PyObject*
6356 unicode_replace(PyUnicodeObject *self, PyObject *args)
6357 {
6358     PyUnicodeObject *str1;
6359     PyUnicodeObject *str2;
6360     Py_ssize_t maxcount = -1;
6361     PyObject *result;
6362
6363     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6364         return NULL;
6365     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6366     if (str1 == NULL)
6367         return NULL;
6368     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6369     if (str2 == NULL) {
6370         Py_DECREF(str1);
6371         return NULL;
6372     }
6373
6374     result = replace(self, str1, str2, maxcount);
6375
6376     Py_DECREF(str1);
6377     Py_DECREF(str2);
6378     return result;
6379 }
6380
6381 static
6382 PyObject *unicode_repr(PyObject *unicode)
6383 {
6384     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6385                                 PyUnicode_GET_SIZE(unicode),
6386                                 1);
6387 }
6388
6389 PyDoc_STRVAR(rfind__doc__,
6390 "S.rfind(sub [,start [,end]]) -> int\n\
6391 \n\
6392 Return the highest index in S where substring sub is found,\n\
6393 such that sub is contained within s[start,end].  Optional\n\
6394 arguments start and end are interpreted as in slice notation.\n\
6395 \n\
6396 Return -1 on failure.");
6397
6398 static PyObject *
6399 unicode_rfind(PyUnicodeObject *self, PyObject *args)
6400 {
6401     PyObject *substring;
6402     Py_ssize_t start = 0;
6403     Py_ssize_t end = PY_SSIZE_T_MAX;
6404     Py_ssize_t result;
6405
6406     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6407                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6408         return NULL;
6409     substring = PyUnicode_FromObject(substring);
6410     if (!substring)
6411         return NULL;
6412
6413     result = stringlib_rfind_slice(
6414         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6415         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6416         start, end
6417         );
6418
6419     Py_DECREF(substring);
6420
6421     return PyInt_FromSsize_t(result);
6422 }
6423
6424 PyDoc_STRVAR(rindex__doc__,
6425 "S.rindex(sub [,start [,end]]) -> int\n\
6426 \n\
6427 Like S.rfind() but raise ValueError when the substring is not found.");
6428
6429 static PyObject *
6430 unicode_rindex(PyUnicodeObject *self, PyObject *args)
6431 {
6432     PyObject *substring;
6433     Py_ssize_t start = 0;
6434     Py_ssize_t end = PY_SSIZE_T_MAX;
6435     Py_ssize_t result;
6436
6437     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6438                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6439         return NULL;
6440     substring = PyUnicode_FromObject(substring);
6441     if (!substring)
6442         return NULL;
6443
6444     result = stringlib_rfind_slice(
6445         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6446         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6447         start, end
6448         );
6449
6450     Py_DECREF(substring);
6451
6452     if (result < 0) {
6453         PyErr_SetString(PyExc_ValueError, "substring not found");
6454         return NULL;
6455     }
6456     return PyInt_FromSsize_t(result);
6457 }
6458
6459 PyDoc_STRVAR(rjust__doc__,
6460 "S.rjust(width[, fillchar]) -> unicode\n\
6461 \n\
6462 Return S right justified in a Unicode string of length width. Padding is\n\
6463 done using the specified fill character (default is a space).");
6464
6465 static PyObject *
6466 unicode_rjust(PyUnicodeObject *self, PyObject *args)
6467 {
6468     Py_ssize_t width;
6469     Py_UNICODE fillchar = ' ';
6470
6471     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6472         return NULL;
6473
6474     if (self->length >= width && PyUnicode_CheckExact(self)) {
6475         Py_INCREF(self);
6476         return (PyObject*) self;
6477     }
6478
6479     return (PyObject*) pad(self, width - self->length, 0, fillchar);
6480 }
6481
6482 static PyObject*
6483 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6484 {
6485     /* standard clamping */
6486     if (start < 0)
6487         start = 0;
6488     if (end < 0)
6489         end = 0;
6490     if (end > self->length)
6491         end = self->length;
6492     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6493         /* full slice, return original string */
6494         Py_INCREF(self);
6495         return (PyObject*) self;
6496     }
6497     if (start > end)
6498         start = end;
6499     /* copy slice */
6500     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6501                                              end - start);
6502 }
6503
6504 PyObject *PyUnicode_Split(PyObject *s,
6505                           PyObject *sep,
6506                           Py_ssize_t maxsplit)
6507 {
6508     PyObject *result;
6509
6510     s = PyUnicode_FromObject(s);
6511     if (s == NULL)
6512         return NULL;
6513     if (sep != NULL) {
6514         sep = PyUnicode_FromObject(sep);
6515         if (sep == NULL) {
6516             Py_DECREF(s);
6517             return NULL;
6518         }
6519     }
6520
6521     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6522
6523     Py_DECREF(s);
6524     Py_XDECREF(sep);
6525     return result;
6526 }
6527
6528 PyDoc_STRVAR(split__doc__,
6529 "S.split([sep [,maxsplit]]) -> list of strings\n\
6530 \n\
6531 Return a list of the words in S, using sep as the\n\
6532 delimiter string.  If maxsplit is given, at most maxsplit\n\
6533 splits are done. If sep is not specified or is None,\n\
6534 any whitespace string is a separator.");
6535
6536 static PyObject*
6537 unicode_split(PyUnicodeObject *self, PyObject *args)
6538 {
6539     PyObject *substring = Py_None;
6540     Py_ssize_t maxcount = -1;
6541
6542     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6543         return NULL;
6544
6545     if (substring == Py_None)
6546         return split(self, NULL, maxcount);
6547     else if (PyUnicode_Check(substring))
6548         return split(self, (PyUnicodeObject *)substring, maxcount);
6549     else
6550         return PyUnicode_Split((PyObject *)self, substring, maxcount);
6551 }
6552
6553 PyObject *
6554 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6555 {
6556     PyObject* str_obj;
6557     PyObject* sep_obj;
6558     PyObject* out;
6559
6560     str_obj = PyUnicode_FromObject(str_in);
6561     if (!str_obj)
6562         return NULL;
6563     sep_obj = PyUnicode_FromObject(sep_in);
6564     if (!sep_obj) {
6565         Py_DECREF(str_obj);
6566         return NULL;
6567     }
6568
6569     out = stringlib_partition(
6570         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6571         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6572         );
6573
6574     Py_DECREF(sep_obj);
6575     Py_DECREF(str_obj);
6576
6577     return out;
6578 }
6579
6580
6581 PyObject *
6582 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6583 {
6584     PyObject* str_obj;
6585     PyObject* sep_obj;
6586     PyObject* out;
6587
6588     str_obj = PyUnicode_FromObject(str_in);
6589     if (!str_obj)
6590         return NULL;
6591     sep_obj = PyUnicode_FromObject(sep_in);
6592     if (!sep_obj) {
6593         Py_DECREF(str_obj);
6594         return NULL;
6595     }
6596
6597     out = stringlib_rpartition(
6598         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6599         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6600         );
6601
6602     Py_DECREF(sep_obj);
6603     Py_DECREF(str_obj);
6604
6605     return out;
6606 }
6607
6608 PyDoc_STRVAR(partition__doc__,
6609 "S.partition(sep) -> (head, sep, tail)\n\
6610 \n\
6611 Searches for the separator sep in S, and returns the part before it,\n\
6612 the separator itself, and the part after it.  If the separator is not\n\
6613 found, returns S and two empty strings.");
6614
6615 static PyObject*
6616 unicode_partition(PyUnicodeObject *self, PyObject *separator)
6617 {
6618     return PyUnicode_Partition((PyObject *)self, separator);
6619 }
6620
6621 PyDoc_STRVAR(rpartition__doc__,
6622 "S.rpartition(sep) -> (head, sep, tail)\n\
6623 \n\
6624 Searches for the separator sep in S, starting at the end of S, and returns\n\
6625 the part before it, the separator itself, and the part after it.  If the\n\
6626 separator is not found, returns S and two empty strings.");
6627
6628 static PyObject*
6629 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6630 {
6631     return PyUnicode_RPartition((PyObject *)self, separator);
6632 }
6633
6634 PyObject *PyUnicode_RSplit(PyObject *s,
6635                            PyObject *sep,
6636                            Py_ssize_t maxsplit)
6637 {
6638     PyObject *result;
6639
6640     s = PyUnicode_FromObject(s);
6641     if (s == NULL)
6642         return NULL;
6643     if (sep != NULL) {
6644         sep = PyUnicode_FromObject(sep);
6645         if (sep == NULL) {
6646             Py_DECREF(s);
6647             return NULL;
6648         }
6649     }
6650
6651     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6652
6653     Py_DECREF(s);
6654     Py_XDECREF(sep);
6655     return result;
6656 }
6657
6658 PyDoc_STRVAR(rsplit__doc__,
6659 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6660 \n\
6661 Return a list of the words in S, using sep as the\n\
6662 delimiter string, starting at the end of the string and\n\
6663 working to the front.  If maxsplit is given, at most maxsplit\n\
6664 splits are done. If sep is not specified, any whitespace string\n\
6665 is a separator.");
6666
6667 static PyObject*
6668 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6669 {
6670     PyObject *substring = Py_None;
6671     Py_ssize_t maxcount = -1;
6672
6673     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
6674         return NULL;
6675
6676     if (substring == Py_None)
6677         return rsplit(self, NULL, maxcount);
6678     else if (PyUnicode_Check(substring))
6679         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6680     else
6681         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6682 }
6683
6684 PyDoc_STRVAR(splitlines__doc__,
6685 "S.splitlines([keepends]]) -> list of strings\n\
6686 \n\
6687 Return a list of the lines in S, breaking at line boundaries.\n\
6688 Line breaks are not included in the resulting list unless keepends\n\
6689 is given and true.");
6690
6691 static PyObject*
6692 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6693 {
6694     int keepends = 0;
6695
6696     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6697         return NULL;
6698
6699     return PyUnicode_Splitlines((PyObject *)self, keepends);
6700 }
6701
6702 static
6703 PyObject *unicode_str(PyUnicodeObject *self)
6704 {
6705     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6706 }
6707
6708 PyDoc_STRVAR(swapcase__doc__,
6709 "S.swapcase() -> unicode\n\
6710 \n\
6711 Return a copy of S with uppercase characters converted to lowercase\n\
6712 and vice versa.");
6713
6714 static PyObject*
6715 unicode_swapcase(PyUnicodeObject *self)
6716 {
6717     return fixup(self, fixswapcase);
6718 }
6719
6720 PyDoc_STRVAR(translate__doc__,
6721 "S.translate(table) -> unicode\n\
6722 \n\
6723 Return a copy of the string S, where all characters have been mapped\n\
6724 through the given translation table, which must be a mapping of\n\
6725 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6726 Unmapped characters are left untouched. Characters mapped to None\n\
6727 are deleted.");
6728
6729 static PyObject*
6730 unicode_translate(PyUnicodeObject *self, PyObject *table)
6731 {
6732     return PyUnicode_TranslateCharmap(self->str,
6733                                       self->length,
6734                                       table,
6735                                       "ignore");
6736 }
6737
6738 PyDoc_STRVAR(upper__doc__,
6739 "S.upper() -> unicode\n\
6740 \n\
6741 Return a copy of S converted to uppercase.");
6742
6743 static PyObject*
6744 unicode_upper(PyUnicodeObject *self)
6745 {
6746     return fixup(self, fixupper);
6747 }
6748
6749 PyDoc_STRVAR(zfill__doc__,
6750 "S.zfill(width) -> unicode\n\
6751 \n\
6752 Pad a numeric string x with zeros on the left, to fill a field\n\
6753 of the specified width. The string x is never truncated.");
6754
6755 static PyObject *
6756 unicode_zfill(PyUnicodeObject *self, PyObject *args)
6757 {
6758     Py_ssize_t fill;
6759     PyUnicodeObject *u;
6760
6761     Py_ssize_t width;
6762     if (!PyArg_ParseTuple(args, "n:zfill", &width))
6763         return NULL;
6764
6765     if (self->length >= width) {
6766         if (PyUnicode_CheckExact(self)) {
6767             Py_INCREF(self);
6768             return (PyObject*) self;
6769         }
6770         else
6771             return PyUnicode_FromUnicode(
6772                 PyUnicode_AS_UNICODE(self),
6773                 PyUnicode_GET_SIZE(self)
6774             );
6775     }
6776
6777     fill = width - self->length;
6778
6779     u = pad(self, fill, 0, '0');
6780
6781     if (u == NULL)
6782         return NULL;
6783
6784     if (u->str[fill] == '+' || u->str[fill] == '-') {
6785         /* move sign to beginning of string */
6786         u->str[0] = u->str[fill];
6787         u->str[fill] = '0';
6788     }
6789
6790     return (PyObject*) u;
6791 }
6792
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6800
6801 PyDoc_STRVAR(startswith__doc__,
6802 "S.startswith(prefix[, start[, end]]) -> bool\n\
6803 \n\
6804 Return True if S starts with the specified prefix, False otherwise.\n\
6805 With optional start, test S beginning at that position.\n\
6806 With optional end, stop comparing S at that position.\n\
6807 prefix can also be a tuple of strings to try.");
6808
6809 static PyObject *
6810 unicode_startswith(PyUnicodeObject *self,
6811                    PyObject *args)
6812 {
6813     PyObject *subobj;
6814     PyUnicodeObject *substring;
6815     Py_ssize_t start = 0;
6816     Py_ssize_t end = PY_SSIZE_T_MAX;
6817     int result;
6818
6819     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
6820                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6821         return NULL;
6822     if (PyTuple_Check(subobj)) {
6823         Py_ssize_t i;
6824         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6825             substring = (PyUnicodeObject *)PyUnicode_FromObject(
6826                             PyTuple_GET_ITEM(subobj, i));
6827             if (substring == NULL)
6828                 return NULL;
6829             result = tailmatch(self, substring, start, end, -1);
6830             Py_DECREF(substring);
6831             if (result) {
6832                 Py_RETURN_TRUE;
6833             }
6834         }
6835         /* nothing matched */
6836         Py_RETURN_FALSE;
6837     }
6838     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6839     if (substring == NULL)
6840          return NULL;
6841     result = tailmatch(self, substring, start, end, -1);
6842     Py_DECREF(substring);
6843     return PyBool_FromLong(result);
6844 }
6845
6846
6847 PyDoc_STRVAR(endswith__doc__,
6848 "S.endswith(suffix[, start[, end]]) -> bool\n\
6849 \n\
6850 Return True if S ends with the specified suffix, False otherwise.\n\
6851 With optional start, test S beginning at that position.\n\
6852 With optional end, stop comparing S at that position.\n\
6853 suffix can also be a tuple of strings to try.");
6854
6855 static PyObject *
6856 unicode_endswith(PyUnicodeObject *self,
6857                  PyObject *args)
6858 {
6859     PyObject *subobj;
6860     PyUnicodeObject *substring;
6861     Py_ssize_t start = 0;
6862     Py_ssize_t end = PY_SSIZE_T_MAX;
6863     int result;
6864
6865     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6866         _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6867         return NULL;
6868     if (PyTuple_Check(subobj)) {
6869         Py_ssize_t i;
6870         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6871             substring = (PyUnicodeObject *)PyUnicode_FromObject(
6872                             PyTuple_GET_ITEM(subobj, i));
6873             if (substring == NULL)
6874             return NULL;
6875             result = tailmatch(self, substring, start, end, +1);
6876             Py_DECREF(substring);
6877             if (result) {
6878                 Py_RETURN_TRUE;
6879             }
6880         }
6881         Py_RETURN_FALSE;
6882     }
6883     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6884     if (substring == NULL)
6885     return NULL;
6886
6887     result = tailmatch(self, substring, start, end, +1);
6888     Py_DECREF(substring);
6889     return PyBool_FromLong(result);
6890 }
6891
6892
6893
6894 static PyObject *
6895 unicode_getnewargs(PyUnicodeObject *v)
6896 {
6897         return Py_BuildValue("(u#)", v->str, v->length);
6898 }
6899
6900
6901 static PyMethodDef unicode_methods[] = {
6902
6903     /* Order is according to common usage: often used methods should
6904        appear first, since lookup is done sequentially. */
6905
6906     {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6907     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6908     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
6909     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
6910     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6911     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6912     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6913     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6914     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6915     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6916     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6917     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
6918     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6919     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6920     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
6921     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
6922     {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
6923 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6924     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6925     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6926     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
6927     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
6928     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
6929     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
6930     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
6931     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6932     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6933     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6934     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6935     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6936     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6937     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6938     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6939     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6940     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6941     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6942     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6943     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6944     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
6945     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
6946 #if 0
6947     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
6948 #endif
6949
6950 #if 0
6951     /* This one is just used for debugging the implementation. */
6952     {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
6953 #endif
6954
6955     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
6956     {NULL, NULL}
6957 };
6958
6959 static PyObject *
6960 unicode_mod(PyObject *v, PyObject *w)
6961 {
6962        if (!PyUnicode_Check(v)) {
6963                Py_INCREF(Py_NotImplemented);
6964                return Py_NotImplemented;
6965        }
6966        return PyUnicode_Format(v, w);
6967 }
6968
6969 static PyNumberMethods unicode_as_number = {
6970         0,                              /*nb_add*/
6971         0,                              /*nb_subtract*/
6972         0,                              /*nb_multiply*/
6973         0,                              /*nb_divide*/
6974         unicode_mod,                    /*nb_remainder*/
6975 };
6976
6977 static PySequenceMethods unicode_as_sequence = {
6978     (lenfunc) unicode_length,           /* sq_length */
6979     PyUnicode_Concat,                   /* sq_concat */
6980     (ssizeargfunc) unicode_repeat,      /* sq_repeat */
6981     (ssizeargfunc) unicode_getitem,     /* sq_item */
6982     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
6983     0,                                  /* sq_ass_item */
6984     0,                                  /* sq_ass_slice */
6985     PyUnicode_Contains,                 /* sq_contains */
6986 };
6987
6988 #define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6989
6990 static PyObject*
6991 unicode_subscript(PyUnicodeObject* self, PyObject* item)
6992 {
6993     PyNumberMethods *nb = item->ob_type->tp_as_number;
6994     if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6995         Py_ssize_t i = nb->nb_index(item);
6996         if (i == -1 && PyErr_Occurred())
6997             return NULL;
6998         if (i < 0)
6999             i += PyUnicode_GET_SIZE(self);
7000         return unicode_getitem(self, i);
7001     } else if (PySlice_Check(item)) {
7002         Py_ssize_t start, stop, step, slicelength, cur, i;
7003         Py_UNICODE* source_buf;
7004         Py_UNICODE* result_buf;
7005         PyObject* result;
7006
7007         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7008                                  &start, &stop, &step, &slicelength) < 0) {
7009             return NULL;
7010         }
7011
7012         if (slicelength <= 0) {
7013             return PyUnicode_FromUnicode(NULL, 0);
7014         } else {
7015             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7016             result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7017                                                     sizeof(Py_UNICODE));
7018
7019             if (result_buf == NULL)
7020                     return PyErr_NoMemory();
7021
7022             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7023                 result_buf[i] = source_buf[cur];
7024             }
7025
7026             result = PyUnicode_FromUnicode(result_buf, slicelength);
7027             PyMem_FREE(result_buf);
7028             return result;
7029         }
7030     } else {
7031         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7032         return NULL;
7033     }
7034 }
7035
7036 static PyMappingMethods unicode_as_mapping = {
7037     (lenfunc)unicode_length,            /* mp_length */
7038     (binaryfunc)unicode_subscript,      /* mp_subscript */
7039     (objobjargproc)0,                   /* mp_ass_subscript */
7040 };
7041
7042 static Py_ssize_t
7043 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7044                           Py_ssize_t index,
7045                           const void **ptr)
7046 {
7047     if (index != 0) {
7048         PyErr_SetString(PyExc_SystemError,
7049                         "accessing non-existent unicode segment");
7050         return -1;
7051     }
7052     *ptr = (void *) self->str;
7053     return PyUnicode_GET_DATA_SIZE(self);
7054 }
7055
7056 static Py_ssize_t
7057 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7058                            const void **ptr)
7059 {
7060     PyErr_SetString(PyExc_TypeError,
7061                     "cannot use unicode as modifiable buffer");
7062     return -1;
7063 }
7064
7065 static int
7066 unicode_buffer_getsegcount(PyUnicodeObject *self,
7067                            Py_ssize_t *lenp)
7068 {
7069     if (lenp)
7070         *lenp = PyUnicode_GET_DATA_SIZE(self);
7071     return 1;
7072 }
7073
7074 static Py_ssize_t
7075 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7076                           Py_ssize_t index,
7077                           const void **ptr)
7078 {
7079     PyObject *str;
7080
7081     if (index != 0) {
7082         PyErr_SetString(PyExc_SystemError,
7083                         "accessing non-existent unicode segment");
7084         return -1;
7085     }
7086     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7087     if (str == NULL)
7088         return -1;
7089     *ptr = (void *) PyString_AS_STRING(str);
7090     return PyString_GET_SIZE(str);
7091 }
7092
7093 /* Helpers for PyUnicode_Format() */
7094
7095 static PyObject *
7096 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7097 {
7098     Py_ssize_t argidx = *p_argidx;
7099     if (argidx < arglen) {
7100         (*p_argidx)++;
7101         if (arglen < 0)
7102             return args;
7103         else
7104             return PyTuple_GetItem(args, argidx);
7105     }
7106     PyErr_SetString(PyExc_TypeError,
7107                     "not enough arguments for format string");
7108     return NULL;
7109 }
7110
/* Bit flags for a format specifier.  F_ALT selects the '#' form in
   formatfloat()/formatint(); the others presumably mirror the '-', '+',
   ' ' and '0' conversion flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7116
7117 static Py_ssize_t
7118 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7119 {
7120     register Py_ssize_t i;
7121     Py_ssize_t len = strlen(charbuffer);
7122     for (i = len - 1; i >= 0; i--)
7123         buffer[i] = (Py_UNICODE) charbuffer[i];
7124
7125     return len;
7126 }
7127
7128 static int
7129 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7130 {
7131     Py_ssize_t result;
7132
7133     PyOS_ascii_formatd((char *)buffer, len, format, x);
7134     result = strtounicode(buffer, (char *)buffer);
7135     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7136 }
7137
7138 static int
7139 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7140 {
7141     Py_ssize_t result;
7142
7143     PyOS_snprintf((char *)buffer, len, format, x);
7144     result = strtounicode(buffer, (char *)buffer);
7145     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7146 }
7147
7148 /* XXX To save some code duplication, formatfloat/long/int could have been
7149    shared with stringobject.c, converting from 8-bit to Unicode after the
7150    formatting is done. */
7151
7152 static int
7153 formatfloat(Py_UNICODE *buf,
7154             size_t buflen,
7155             int flags,
7156             int prec,
7157             int type,
7158             PyObject *v)
7159 {
7160     /* fmt = '%#.' + `prec` + `type`
7161        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7162     char fmt[20];
7163     double x;
7164
7165     x = PyFloat_AsDouble(v);
7166     if (x == -1.0 && PyErr_Occurred())
7167         return -1;
7168     if (prec < 0)
7169         prec = 6;
7170     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7171         type = 'g';
7172     /* Worst case length calc to ensure no buffer overrun:
7173
7174        'g' formats:
7175          fmt = %#.<prec>g
7176          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7177             for any double rep.)
7178          len = 1 + prec + 1 + 2 + 5 = 9 + prec
7179
7180        'f' formats:
7181          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7182          len = 1 + 50 + 1 + prec = 52 + prec
7183
7184        If prec=0 the effective precision is 1 (the leading digit is
7185        always given), therefore increase the length by one.
7186
7187     */
7188     if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7189         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7190         PyErr_SetString(PyExc_OverflowError,
7191                         "formatted float is too long (precision too large?)");
7192         return -1;
7193     }
7194     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7195                   (flags&F_ALT) ? "#" : "",
7196                   prec, type);
7197     return doubletounicode(buf, buflen, fmt, x);
7198 }
7199
7200 static PyObject*
7201 formatlong(PyObject *val, int flags, int prec, int type)
7202 {
7203         char *buf;
7204         int i, len;
7205         PyObject *str; /* temporary string object. */
7206         PyUnicodeObject *result;
7207
7208         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7209         if (!str)
7210                 return NULL;
7211         result = _PyUnicode_New(len);
7212         if (!result) {
7213                 Py_DECREF(str);
7214                 return NULL;
7215         }
7216         for (i = 0; i < len; i++)
7217                 result->str[i] = buf[i];
7218         result->str[len] = 0;
7219         Py_DECREF(str);
7220         return (PyObject*)result;
7221 }
7222
7223 static int
7224 formatint(Py_UNICODE *buf,
7225           size_t buflen,
7226           int flags,
7227           int prec,
7228           int type,
7229           PyObject *v)
7230 {
7231     /* fmt = '%#.' + `prec` + 'l' + `type`
7232      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7233      *                     + 1 + 1
7234      *                   = 24
7235      */
7236     char fmt[64]; /* plenty big enough! */
7237     char *sign;
7238     long x;
7239
7240     x = PyInt_AsLong(v);
7241     if (x == -1 && PyErr_Occurred())
7242         return -1;
7243     if (x < 0 && type == 'u') {
7244         type = 'd';
7245     }
7246     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7247         sign = "-";
7248     else
7249         sign = "";
7250     if (prec < 0)
7251         prec = 1;
7252
7253     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7254      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7255      */
7256     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7257         PyErr_SetString(PyExc_OverflowError,
7258                 "formatted integer is too long (precision too large?)");
7259         return -1;
7260     }
7261
7262     if ((flags & F_ALT) &&
7263         (type == 'x' || type == 'X')) {
7264         /* When converting under %#x or %#X, there are a number
7265          * of issues that cause pain:
7266          * - when 0 is being converted, the C standard leaves off
7267          *   the '0x' or '0X', which is inconsistent with other
7268          *   %#x/%#X conversions and inconsistent with Python's
7269          *   hex() function
7270          * - there are platforms that violate the standard and
7271          *   convert 0 with the '0x' or '0X'
7272          *   (Metrowerks, Compaq Tru64)
7273          * - there are platforms that give '0x' when converting
7274          *   under %#X, but convert 0 in accordance with the
7275          *   standard (OS/2 EMX)
7276          *
7277          * We can achieve the desired consistency by inserting our
7278          * own '0x' or '0X' prefix, and substituting %x/%X in place
7279          * of %#x/%#X.
7280          *
7281          * Note that this is the same approach as used in
7282          * formatint() in stringobject.c
7283          */
7284         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7285                       sign, type, prec, type);
7286     }
7287     else {
7288         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7289                       sign, (flags&F_ALT) ? "#" : "",
7290                       prec, type);
7291     }
7292     if (sign[0])
7293         return longtounicode(buf, buflen, fmt, -x);
7294     else
7295         return longtounicode(buf, buflen, fmt, x);
7296 }
7297
7298 static int
7299 formatchar(Py_UNICODE *buf,
7300            size_t buflen,
7301            PyObject *v)
7302 {
7303     /* presume that the buffer is at least 2 characters long */
7304     if (PyUnicode_Check(v)) {
7305         if (PyUnicode_GET_SIZE(v) != 1)
7306             goto onError;
7307         buf[0] = PyUnicode_AS_UNICODE(v)[0];
7308     }
7309
7310     else if (PyString_Check(v)) {
7311         if (PyString_GET_SIZE(v) != 1)
7312             goto onError;
7313         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7314     }
7315
7316     else {
7317         /* Integer input truncated to a character */
7318         long x;
7319         x = PyInt_AsLong(v);
7320         if (x == -1 && PyErr_Occurred())
7321             goto onError;
7322 #ifdef Py_UNICODE_WIDE
7323         if (x < 0 || x > 0x10ffff) {
7324             PyErr_SetString(PyExc_OverflowError,
7325                             "%c arg not in range(0x110000) "
7326                             "(wide Python build)");
7327             return -1;
7328         }
7329 #else
7330         if (x < 0 || x > 0xffff) {
7331             PyErr_SetString(PyExc_OverflowError,
7332                             "%c arg not in range(0x10000) "
7333                             "(narrow Python build)");
7334             return -1;
7335         }
7336 #endif
7337         buf[0] = (Py_UNICODE) x;
7338     }
7339     buf[1] = '\0';
7340     return 1;
7341
7342  onError:
7343     PyErr_SetString(PyExc_TypeError,
7344                     "%c requires int or char");
7345     return -1;
7346 }
7347
7348 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7349
7350    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7351    chars are formatted. XXX This is a magic number. Each formatting
7352    routine does bounds checking to ensure no overflow, but a better
7353    solution may be to malloc a buffer of appropriate size for each
7354    format. For now, the current solution is sufficient.
7355 */
7356 #define FORMATBUFLEN (size_t)120
7357
7358 PyObject *PyUnicode_Format(PyObject *format,
7359                            PyObject *args)
7360 {
7361     Py_UNICODE *fmt, *res;
7362     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7363     int args_owned = 0;
7364     PyUnicodeObject *result = NULL;
7365     PyObject *dict = NULL;
7366     PyObject *uformat;
7367
7368     if (format == NULL || args == NULL) {
7369         PyErr_BadInternalCall();
7370         return NULL;
7371     }
7372     uformat = PyUnicode_FromObject(format);
7373     if (uformat == NULL)
7374         return NULL;
7375     fmt = PyUnicode_AS_UNICODE(uformat);
7376     fmtcnt = PyUnicode_GET_SIZE(uformat);
7377
7378     reslen = rescnt = fmtcnt + 100;
7379     result = _PyUnicode_New(reslen);
7380     if (result == NULL)
7381         goto onError;
7382     res = PyUnicode_AS_UNICODE(result);
7383
7384     if (PyTuple_Check(args)) {
7385         arglen = PyTuple_Size(args);
7386         argidx = 0;
7387     }
7388     else {
7389         arglen = -1;
7390         argidx = -2;
7391     }
7392     if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7393         !PyObject_TypeCheck(args, &PyBaseString_Type))
7394         dict = args;
7395
7396     while (--fmtcnt >= 0) {
7397         if (*fmt != '%') {
7398             if (--rescnt < 0) {
7399                 rescnt = fmtcnt + 100;
7400                 reslen += rescnt;
7401                 if (_PyUnicode_Resize(&result, reslen) < 0)
7402                     goto onError;
7403                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7404                 --rescnt;
7405             }
7406             *res++ = *fmt++;
7407         }
7408         else {
7409             /* Got a format specifier */
7410             int flags = 0;
7411             Py_ssize_t width = -1;
7412             int prec = -1;
7413             Py_UNICODE c = '\0';
7414             Py_UNICODE fill;
7415             PyObject *v = NULL;
7416             PyObject *temp = NULL;
7417             Py_UNICODE *pbuf;
7418             Py_UNICODE sign;
7419             Py_ssize_t len;
7420             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7421
7422             fmt++;
7423             if (*fmt == '(') {
7424                 Py_UNICODE *keystart;
7425                 Py_ssize_t keylen;
7426                 PyObject *key;
7427                 int pcount = 1;
7428
7429                 if (dict == NULL) {
7430                     PyErr_SetString(PyExc_TypeError,
7431                                     "format requires a mapping");
7432                     goto onError;
7433                 }
7434                 ++fmt;
7435                 --fmtcnt;
7436                 keystart = fmt;
7437                 /* Skip over balanced parentheses */
7438                 while (pcount > 0 && --fmtcnt >= 0) {
7439                     if (*fmt == ')')
7440                         --pcount;
7441                     else if (*fmt == '(')
7442                         ++pcount;
7443                     fmt++;
7444                 }
7445                 keylen = fmt - keystart - 1;
7446                 if (fmtcnt < 0 || pcount > 0) {
7447                     PyErr_SetString(PyExc_ValueError,
7448                                     "incomplete format key");
7449                     goto onError;
7450                 }
7451 #if 0
7452                 /* keys are converted to strings using UTF-8 and
7453                    then looked up since Python uses strings to hold
7454                    variables names etc. in its namespaces and we
7455                    wouldn't want to break common idioms. */
7456                 key = PyUnicode_EncodeUTF8(keystart,
7457                                            keylen,
7458                                            NULL);
7459 #else
7460                 key = PyUnicode_FromUnicode(keystart, keylen);
7461 #endif
7462                 if (key == NULL)
7463                     goto onError;
7464                 if (args_owned) {
7465                     Py_DECREF(args);
7466                     args_owned = 0;
7467                 }
7468                 args = PyObject_GetItem(dict, key);
7469                 Py_DECREF(key);
7470                 if (args == NULL) {
7471                     goto onError;
7472                 }
7473                 args_owned = 1;
7474                 arglen = -1;
7475                 argidx = -2;
7476             }
7477             while (--fmtcnt >= 0) {
7478                 switch (c = *fmt++) {
7479                 case '-': flags |= F_LJUST; continue;
7480                 case '+': flags |= F_SIGN; continue;
7481                 case ' ': flags |= F_BLANK; continue;
7482                 case '#': flags |= F_ALT; continue;
7483                 case '0': flags |= F_ZERO; continue;
7484                 }
7485                 break;
7486             }
7487             if (c == '*') {
7488                 v = getnextarg(args, arglen, &argidx);
7489                 if (v == NULL)
7490                     goto onError;
7491                 if (!PyInt_Check(v)) {
7492                     PyErr_SetString(PyExc_TypeError,
7493                                     "* wants int");
7494                     goto onError;
7495                 }
7496                 width = PyInt_AsLong(v);
7497                 if (width < 0) {
7498                     flags |= F_LJUST;
7499                     width = -width;
7500                 }
7501                 if (--fmtcnt >= 0)
7502                     c = *fmt++;
7503             }
7504             else if (c >= '0' && c <= '9') {
7505                 width = c - '0';
7506                 while (--fmtcnt >= 0) {
7507                     c = *fmt++;
7508                     if (c < '0' || c > '9')
7509                         break;
7510                     if ((width*10) / 10 != width) {
7511                         PyErr_SetString(PyExc_ValueError,
7512                                         "width too big");
7513                         goto onError;
7514                     }
7515                     width = width*10 + (c - '0');
7516                 }
7517             }
7518             if (c == '.') {
7519                 prec = 0;
7520                 if (--fmtcnt >= 0)
7521                     c = *fmt++;
7522                 if (c == '*') {
7523                     v = getnextarg(args, arglen, &argidx);
7524                     if (v == NULL)
7525                         goto onError;
7526                     if (!PyInt_Check(v)) {
7527                         PyErr_SetString(PyExc_TypeError,
7528                                         "* wants int");
7529                         goto onError;
7530                     }
7531                     prec = PyInt_AsLong(v);
7532                     if (prec < 0)
7533                         prec = 0;
7534                     if (--fmtcnt >= 0)
7535                         c = *fmt++;
7536                 }
7537                 else if (c >= '0' && c <= '9') {
7538                     prec = c - '0';
7539                     while (--fmtcnt >= 0) {
7540                         c = Py_CHARMASK(*fmt++);
7541                         if (c < '0' || c > '9')
7542                             break;
7543                         if ((prec*10) / 10 != prec) {
7544                             PyErr_SetString(PyExc_ValueError,
7545                                             "prec too big");
7546                             goto onError;
7547                         }
7548                         prec = prec*10 + (c - '0');
7549                     }
7550                 }
7551             } /* prec */
7552             if (fmtcnt >= 0) {
7553                 if (c == 'h' || c == 'l' || c == 'L') {
7554                     if (--fmtcnt >= 0)
7555                         c = *fmt++;
7556                 }
7557             }
7558             if (fmtcnt < 0) {
7559                 PyErr_SetString(PyExc_ValueError,
7560                                 "incomplete format");
7561                 goto onError;
7562             }
7563             if (c != '%') {
7564                 v = getnextarg(args, arglen, &argidx);
7565                 if (v == NULL)
7566                     goto onError;
7567             }
7568             sign = 0;
7569             fill = ' ';
7570             switch (c) {
7571
7572             case '%':
7573                 pbuf = formatbuf;
7574                 /* presume that buffer length is at least 1 */
7575                 pbuf[0] = '%';
7576                 len = 1;
7577                 break;
7578
7579             case 's':
7580             case 'r':
7581                 if (PyUnicode_Check(v) && c == 's') {
7582                     temp = v;
7583                     Py_INCREF(temp);
7584                 }
7585                 else {
7586                     PyObject *unicode;
7587                     if (c == 's')
7588                         temp = PyObject_Unicode(v);
7589                     else
7590                         temp = PyObject_Repr(v);
7591                     if (temp == NULL)
7592                         goto onError;
7593                     if (PyUnicode_Check(temp))
7594                         /* nothing to do */;
7595                     else if (PyString_Check(temp)) {
7596                         /* convert to string to Unicode */
7597                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7598                                                    PyString_GET_SIZE(temp),
7599                                                    NULL,
7600                                                    "strict");
7601                         Py_DECREF(temp);
7602                         temp = unicode;
7603                         if (temp == NULL)
7604                             goto onError;
7605                     }
7606                     else {
7607                         Py_DECREF(temp);
7608                         PyErr_SetString(PyExc_TypeError,
7609                                         "%s argument has non-string str()");
7610                         goto onError;
7611                     }
7612                 }
7613                 pbuf = PyUnicode_AS_UNICODE(temp);
7614                 len = PyUnicode_GET_SIZE(temp);
7615                 if (prec >= 0 && len > prec)
7616                     len = prec;
7617                 break;
7618
7619             case 'i':
7620             case 'd':
7621             case 'u':
7622             case 'o':
7623             case 'x':
7624             case 'X':
7625                 if (c == 'i')
7626                     c = 'd';
7627                 if (PyLong_Check(v)) {
7628                     temp = formatlong(v, flags, prec, c);
7629                     if (!temp)
7630                         goto onError;
7631                     pbuf = PyUnicode_AS_UNICODE(temp);
7632                     len = PyUnicode_GET_SIZE(temp);
7633                     sign = 1;
7634                 }
7635                 else {
7636                     pbuf = formatbuf;
7637                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7638                                     flags, prec, c, v);
7639                     if (len < 0)
7640                         goto onError;
7641                     sign = 1;
7642                 }
7643                 if (flags & F_ZERO)
7644                     fill = '0';
7645                 break;
7646
7647             case 'e':
7648             case 'E':
7649             case 'f':
7650             case 'F':
7651             case 'g':
7652             case 'G':
7653                 if (c == 'F')
7654                         c = 'f';
7655                 pbuf = formatbuf;
7656                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7657                         flags, prec, c, v);
7658                 if (len < 0)
7659                     goto onError;
7660                 sign = 1;
7661                 if (flags & F_ZERO)
7662                     fill = '0';
7663                 break;
7664
7665             case 'c':
7666                 pbuf = formatbuf;
7667                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7668                 if (len < 0)
7669                     goto onError;
7670                 break;
7671
7672             default:
7673                 PyErr_Format(PyExc_ValueError,
7674                              "unsupported format character '%c' (0x%x) "
7675                              "at index %i",
7676                              (31<=c && c<=126) ? (char)c : '?',
7677                              (int)c,
7678                              (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
7679                 goto onError;
7680             }
7681             if (sign) {
7682                 if (*pbuf == '-' || *pbuf == '+') {
7683                     sign = *pbuf++;
7684                     len--;
7685                 }
7686                 else if (flags & F_SIGN)
7687                     sign = '+';
7688                 else if (flags & F_BLANK)
7689                     sign = ' ';
7690                 else
7691                     sign = 0;
7692             }
7693             if (width < len)
7694                 width = len;
7695             if (rescnt - (sign != 0) < width) {
7696                 reslen -= rescnt;
7697                 rescnt = width + fmtcnt + 100;
7698                 reslen += rescnt;
7699                 if (reslen < 0) {
7700                     Py_XDECREF(temp);
7701                     PyErr_NoMemory();
7702                     goto onError;
7703                 }
7704                 if (_PyUnicode_Resize(&result, reslen) < 0) {
7705                     Py_XDECREF(temp);
7706                     goto onError;
7707                 }
7708                 res = PyUnicode_AS_UNICODE(result)
7709                     + reslen - rescnt;
7710             }
7711             if (sign) {
7712                 if (fill != ' ')
7713                     *res++ = sign;
7714                 rescnt--;
7715                 if (width > len)
7716                     width--;
7717             }
7718             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7719                 assert(pbuf[0] == '0');
7720                 assert(pbuf[1] == c);
7721                 if (fill != ' ') {
7722                     *res++ = *pbuf++;
7723                     *res++ = *pbuf++;
7724                 }
7725                 rescnt -= 2;
7726                 width -= 2;
7727                 if (width < 0)
7728                     width = 0;
7729                 len -= 2;
7730             }
7731             if (width > len && !(flags & F_LJUST)) {
7732                 do {
7733                     --rescnt;
7734                     *res++ = fill;
7735                 } while (--width > len);
7736             }
7737             if (fill == ' ') {
7738                 if (sign)
7739                     *res++ = sign;
7740                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7741                     assert(pbuf[0] == '0');
7742                     assert(pbuf[1] == c);
7743                     *res++ = *pbuf++;
7744                     *res++ = *pbuf++;
7745                 }
7746             }
7747             Py_UNICODE_COPY(res, pbuf, len);
7748             res += len;
7749             rescnt -= len;
7750             while (--width >= len) {
7751                 --rescnt;
7752                 *res++ = ' ';
7753             }
7754             if (dict && (argidx < arglen) && c != '%') {
7755                 PyErr_SetString(PyExc_TypeError,
7756                                 "not all arguments converted during string formatting");
7757                 Py_XDECREF(temp);
7758                 goto onError;
7759             }
7760             Py_XDECREF(temp);
7761         } /* '%' */
7762     } /* until end */
7763     if (argidx < arglen && !dict) {
7764         PyErr_SetString(PyExc_TypeError,
7765                         "not all arguments converted during string formatting");
7766         goto onError;
7767     }
7768
7769     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7770         goto onError;
7771     if (args_owned) {
7772         Py_DECREF(args);
7773     }
7774     Py_DECREF(uformat);
7775     return (PyObject *)result;
7776
7777  onError:
7778     Py_XDECREF(result);
7779     Py_DECREF(uformat);
7780     if (args_owned) {
7781         Py_DECREF(args);
7782     }
7783     return NULL;
7784 }
7785
/* Buffer interface table for unicode objects; PyUnicode_Type installs it
   in its tp_as_buffer slot. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* bf_getreadbuffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* bf_getwritebuffer */
    (segcountproc) unicode_buffer_getsegcount,      /* bf_getsegcount */
    (charbufferproc) unicode_buffer_getcharbuf,     /* bf_getcharbuffer */
};
7792
7793 static PyObject *
7794 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7795
7796 static PyObject *
7797 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7798 {
7799         PyObject *x = NULL;
7800         static char *kwlist[] = {"string", "encoding", "errors", 0};
7801         char *encoding = NULL;
7802         char *errors = NULL;
7803
7804         if (type != &PyUnicode_Type)
7805                 return unicode_subtype_new(type, args, kwds);
7806         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7807                                           kwlist, &x, &encoding, &errors))
7808             return NULL;
7809         if (x == NULL)
7810                 return (PyObject *)_PyUnicode_New(0);
7811         if (encoding == NULL && errors == NULL)
7812             return PyObject_Unicode(x);
7813         else
7814         return PyUnicode_FromEncodedObject(x, encoding, errors);
7815 }
7816
7817 static PyObject *
7818 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7819 {
7820         PyUnicodeObject *tmp, *pnew;
7821         Py_ssize_t n;
7822
7823         assert(PyType_IsSubtype(type, &PyUnicode_Type));
7824         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7825         if (tmp == NULL)
7826                 return NULL;
7827         assert(PyUnicode_Check(tmp));
7828         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7829         if (pnew == NULL) {
7830                 Py_DECREF(tmp);
7831                 return NULL;
7832         }
7833         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7834         if (pnew->str == NULL) {
7835                 _Py_ForgetReference((PyObject *)pnew);
7836                 PyObject_Del(pnew);
7837                 Py_DECREF(tmp);
7838                 return PyErr_NoMemory();
7839         }
7840         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7841         pnew->length = n;
7842         pnew->hash = tmp->hash;
7843         Py_DECREF(tmp);
7844         return (PyObject *)pnew;
7845 }
7846
/* Docstring of the unicode type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7853
/* Type object for unicode.  It derives from PyBaseString_Type, may itself
   be subclassed (Py_TPFLAGS_BASETYPE), and creates instances through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
            Py_TPFLAGS_BASETYPE,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,               /* tp_free */
};
7898
7899 /* Initialize the Unicode implementation */
7900
7901 void _PyUnicode_Init(void)
7902 {
7903     int i;
7904
7905     /* XXX - move this array to unicodectype.c ? */
7906     Py_UNICODE linebreak[] = {
7907         0x000A, /* LINE FEED */
7908         0x000D, /* CARRIAGE RETURN */
7909         0x001C, /* FILE SEPARATOR */
7910         0x001D, /* GROUP SEPARATOR */
7911         0x001E, /* RECORD SEPARATOR */
7912         0x0085, /* NEXT LINE */
7913         0x2028, /* LINE SEPARATOR */
7914         0x2029, /* PARAGRAPH SEPARATOR */
7915     };
7916
7917     /* Init the implementation */
7918     unicode_freelist = NULL;
7919     unicode_freelist_size = 0;
7920     unicode_empty = _PyUnicode_New(0);
7921     strcpy(unicode_default_encoding, "ascii");
7922     for (i = 0; i < 256; i++)
7923         unicode_latin1[i] = NULL;
7924     if (PyType_Ready(&PyUnicode_Type) < 0)
7925         Py_FatalError("Can't initialize 'unicode'");
7926
7927     /* initialize the linebreak bloom filter */
7928     bloom_linebreak = make_bloom_mask(
7929         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7930         );
7931
7932     PyType_Ready(&EncodingMapType);
7933 }
7934
7935 /* Finalize the Unicode implementation */
7936
7937 void
7938 _PyUnicode_Fini(void)
7939 {
7940     PyUnicodeObject *u;
7941     int i;
7942
7943     Py_XDECREF(unicode_empty);
7944     unicode_empty = NULL;
7945
7946     for (i = 0; i < 256; i++) {
7947         if (unicode_latin1[i]) {
7948             Py_DECREF(unicode_latin1[i]);
7949             unicode_latin1[i] = NULL;
7950         }
7951     }
7952
7953     for (u = unicode_freelist; u != NULL;) {
7954         PyUnicodeObject *v = u;
7955         u = *(PyUnicodeObject **)u;
7956         if (v->str)
7957             PyMem_DEL(v->str);
7958         Py_XDECREF(v->defenc);
7959         PyObject_Del(v);
7960     }
7961     unicode_freelist = NULL;
7962     unicode_freelist_size = 0;
7963 }
7964
7965 #ifdef __cplusplus
7966 }
7967 #endif
7968
7969
7970 /*
7971 Local variables:
7972 c-basic-offset: 4
7973 indent-tabs-mode: nil
7974 End:
7975 */