Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15     Copyright (c) 1999 by Secret Labs AB
  16     Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define MAX_UNICODE_FREELIST_SIZE       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *unicode_freelist;
  97 static int unicode_freelist_size;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 Py_UNICODE
 116 PyUnicode_GetMax(void)
 117 {
 118 #ifdef Py_UNICODE_WIDE
 119         return 0x10FFFF;
 120 #else
 121         /* This is actually an illegal character, so it should
 122            not be passed to unichr. */
 123         return 0xFFFF;
 124 #endif
 125 }
 126
 127 /* --- Bloom Filters ----------------------------------------------------- */
 128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
 132
 133 /* the linebreak mask is set up by Unicode_Init below */
 134
 135 #define BLOOM_MASK unsigned long
 136
 137 static BLOOM_MASK bloom_linebreak;
 138
 139 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 140
 141 #define BLOOM_LINEBREAK(ch)\
 142     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
 143
 144 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 145 {
 146     /* calculate simple bloom-style bitmask for a given unicode string */
 147
 148     long mask;
 149     Py_ssize_t i;
 150
 151     mask = 0;
 152     for (i = 0; i < len; i++)
 153         mask |= (1 << (ptr[i] & 0x1F));
 154
 155     return mask;
 156 }
 157
 158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 159 {
 160     Py_ssize_t i;
 161
 162     for (i = 0; i < setlen; i++)
 163         if (set[i] == chr)
 164             return 1;
 165
 166     return 0;
 167 }
 168
 169 #define BLOOM_MEMBER(mask, chr, set, setlen)\
 170     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 171
 172 /* --- Unicode Object ----------------------------------------------------- */
 173
 174 static
 175 int unicode_resize(register PyUnicodeObject *unicode,
 176                       Py_ssize_t length)
 177 {
 178     void *oldstr;
 179
 180     /* Shortcut if there's nothing much to do. */
 181     if (unicode->length == length)
 182         goto reset;
 183
 184     /* Resizing shared object (unicode_empty or single character
 185        objects) in-place is not allowed. Use PyUnicode_Resize()
 186        instead ! */
 187
 188     if (unicode == unicode_empty ||
 189         (unicode->length == 1 &&
 190          unicode->str[0] < 256U &&
 191          unicode_latin1[unicode->str[0]] == unicode)) {
 192         PyErr_SetString(PyExc_SystemError,
 193                         "can't resize shared unicode objects");
 194         return -1;
 195     }
 196
 197     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 198        The overallocation is also used by fastsearch, which assumes that it's
 199        safe to look at str[length] (without making any assumptions about what
 200        it contains). */
 201
 202     oldstr = unicode->str;
 203     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 204     if (!unicode->str) {
 205         unicode->str = (Py_UNICODE *)oldstr;
 206         PyErr_NoMemory();
 207         return -1;
 208     }
 209     unicode->str[length] = 0;
 210     unicode->length = length;
 211
 212  reset:
 213     /* Reset the object caches */
 214     if (unicode->defenc) {
 215         Py_DECREF(unicode->defenc);
 216         unicode->defenc = NULL;
 217     }
 218     unicode->hash = -1;
 219
 220     return 0;
 221 }
 222
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
 230
 231 static
 232 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 233 {
 234     register PyUnicodeObject *unicode;
 235
 236     /* Optimization for empty strings */
 237     if (length == 0 && unicode_empty != NULL) {
 238         Py_INCREF(unicode_empty);
 239         return unicode_empty;
 240     }
 241
 242     /* Unicode freelist & memory allocation */
 243     if (unicode_freelist) {
 244         unicode = unicode_freelist;
 245         unicode_freelist = *(PyUnicodeObject **)unicode;
 246         unicode_freelist_size--;
 247         if (unicode->str) {
 248             /* Keep-Alive optimization: we only upsize the buffer,
 249                never downsize it. */
 250             if ((unicode->length < length) &&
 251                 unicode_resize(unicode, length) < 0) {
 252                 PyMem_DEL(unicode->str);
 253                 goto onError;
 254             }
 255         }
 256         else {
 257             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 258         }
 259         PyObject_INIT(unicode, &PyUnicode_Type);
 260     }
 261     else {
 262         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 263         if (unicode == NULL)
 264             return NULL;
 265         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 266     }
 267
 268     if (!unicode->str) {
 269         PyErr_NoMemory();
 270         goto onError;
 271     }
 272     /* Initialize the first element to guard against cases where
 273      * the caller fails before initializing str -- unicode_resize()
 274      * reads str[0], and the Keep-Alive optimization can keep memory
 275      * allocated for str alive across a call to unicode_dealloc(unicode).
 276      * We don't want unicode_resize to read uninitialized memory in
 277      * that case.
 278      */
 279     unicode->str[0] = 0;
 280     unicode->str[length] = 0;
 281     unicode->length = length;
 282     unicode->hash = -1;
 283     unicode->defenc = NULL;
 284     return unicode;
 285
 286  onError:
 287     _Py_ForgetReference((PyObject *)unicode);
 288     PyObject_Del(unicode);
 289     return NULL;
 290 }
 291
 292 static
 293 void unicode_dealloc(register PyUnicodeObject *unicode)
 294 {
 295     if (PyUnicode_CheckExact(unicode) &&
 296         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 297         /* Keep-Alive optimization */
 298         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 299             PyMem_DEL(unicode->str);
 300             unicode->str = NULL;
 301             unicode->length = 0;
 302         }
 303         if (unicode->defenc) {
 304             Py_DECREF(unicode->defenc);
 305             unicode->defenc = NULL;
 306         }
 307         /* Add to free list */
 308         *(PyUnicodeObject **)unicode = unicode_freelist;
 309         unicode_freelist = unicode;
 310         unicode_freelist_size++;
 311     }
 312     else {
 313         PyMem_DEL(unicode->str);
 314         Py_XDECREF(unicode->defenc);
 315         Py_Type(unicode)->tp_free((PyObject *)unicode);
 316     }
 317 }
 318
 319 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 320 {
 321     register PyUnicodeObject *v;
 322
 323     /* Argument checks */
 324     if (unicode == NULL) {
 325         PyErr_BadInternalCall();
 326         return -1;
 327     }
 328     v = (PyUnicodeObject *)*unicode;
 329     if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
 330         PyErr_BadInternalCall();
 331         return -1;
 332     }
 333
 334     /* Resizing unicode_empty and single character objects is not
 335        possible since these are being shared. We simply return a fresh
 336        copy with the same Unicode content. */
 337     if (v->length != length &&
 338         (v == unicode_empty || v->length == 1)) {
 339         PyUnicodeObject *w = _PyUnicode_New(length);
 340         if (w == NULL)
 341             return -1;
 342         Py_UNICODE_COPY(w->str, v->str,
 343                         length < v->length ? length : v->length);
 344         Py_DECREF(*unicode);
 345         *unicode = (PyObject *)w;
 346         return 0;
 347     }
 348
 349     /* Note that we don't have to modify *unicode for unshared Unicode
 350        objects, since we can modify them in-place. */
 351     return unicode_resize(v, length);
 352 }
 353
 354 /* Internal API for use in unicodeobject.c only ! */
 355 #define _PyUnicode_Resize(unicodevar, length) \
 356         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 357
 358 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 359                                 Py_ssize_t size)
 360 {
 361     PyUnicodeObject *unicode;
 362
 363     /* If the Unicode data is known at construction time, we can apply
 364        some optimizations which share commonly used objects. */
 365     if (u != NULL) {
 366
 367         /* Optimization for empty strings */
 368         if (size == 0 && unicode_empty != NULL) {
 369             Py_INCREF(unicode_empty);
 370             return (PyObject *)unicode_empty;
 371         }
 372
 373         /* Single character Unicode objects in the Latin-1 range are
 374            shared when using this constructor */
 375         if (size == 1 && *u < 256) {
 376             unicode = unicode_latin1[*u];
 377             if (!unicode) {
 378                 unicode = _PyUnicode_New(1);
 379                 if (!unicode)
 380                     return NULL;
 381                 unicode->str[0] = *u;
 382                 unicode_latin1[*u] = unicode;
 383             }
 384             Py_INCREF(unicode);
 385             return (PyObject *)unicode;
 386         }
 387     }
 388
 389     unicode = _PyUnicode_New(size);
 390     if (!unicode)
 391         return NULL;
 392
 393     /* Copy the Unicode data into the new object */
 394     if (u != NULL)
 395         Py_UNICODE_COPY(unicode->str, u, size);
 396
 397     return (PyObject *)unicode;
 398 }
 399
 400 #ifdef HAVE_WCHAR_H
 401
 402 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 403                                  Py_ssize_t size)
 404 {
 405     PyUnicodeObject *unicode;
 406
 407     if (w == NULL) {
 408         PyErr_BadInternalCall();
 409         return NULL;
 410     }
 411
 412     unicode = _PyUnicode_New(size);
 413     if (!unicode)
 414         return NULL;
 415
 416     /* Copy the wchar_t data into the new object */
 417 #ifdef HAVE_USABLE_WCHAR_T
 418     memcpy(unicode->str, w, size * sizeof(wchar_t));
 419 #else
 420     {
 421         register Py_UNICODE *u;
 422         register Py_ssize_t i;
 423         u = PyUnicode_AS_UNICODE(unicode);
 424         for (i = size; i > 0; i--)
 425             *u++ = *w++;
 426     }
 427 #endif
 428
 429     return (PyObject *)unicode;
 430 }
 431
 432 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 433                                 wchar_t *w,
 434                                 Py_ssize_t size)
 435 {
 436     if (unicode == NULL) {
 437         PyErr_BadInternalCall();
 438         return -1;
 439     }
 440
 441     /* If possible, try to copy the 0-termination as well */
 442     if (size > PyUnicode_GET_SIZE(unicode))
 443         size = PyUnicode_GET_SIZE(unicode) + 1;
 444
 445 #ifdef HAVE_USABLE_WCHAR_T
 446     memcpy(w, unicode->str, size * sizeof(wchar_t));
 447 #else
 448     {
 449         register Py_UNICODE *u;
 450         register Py_ssize_t i;
 451         u = PyUnicode_AS_UNICODE(unicode);
 452         for (i = size; i > 0; i--)
 453             *w++ = *u++;
 454     }
 455 #endif
 456
 457     if (size > PyUnicode_GET_SIZE(unicode))
 458         return PyUnicode_GET_SIZE(unicode);
 459     else
 460     return size;
 461 }
 462
 463 #endif
 464
 465 PyObject *PyUnicode_FromOrdinal(int ordinal)
 466 {
 467     Py_UNICODE s[1];
 468
 469 #ifdef Py_UNICODE_WIDE
 470     if (ordinal < 0 || ordinal > 0x10ffff) {
 471         PyErr_SetString(PyExc_ValueError,
 472                         "unichr() arg not in range(0x110000) "
 473                         "(wide Python build)");
 474         return NULL;
 475     }
 476 #else
 477     if (ordinal < 0 || ordinal > 0xffff) {
 478         PyErr_SetString(PyExc_ValueError,
 479                         "unichr() arg not in range(0x10000) "
 480                         "(narrow Python build)");
 481         return NULL;
 482     }
 483 #endif
 484
 485     s[0] = (Py_UNICODE)ordinal;
 486     return PyUnicode_FromUnicode(s, 1);
 487 }
 488
 489 PyObject *PyUnicode_FromObject(register PyObject *obj)
 490 {
 491     /* XXX Perhaps we should make this API an alias of
 492            PyObject_Unicode() instead ?! */
 493     if (PyUnicode_CheckExact(obj)) {
 494         Py_INCREF(obj);
 495         return obj;
 496     }
 497     if (PyUnicode_Check(obj)) {
 498         /* For a Unicode subtype that's not a Unicode object,
 499            return a true Unicode object with the same data. */
 500         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 501                                      PyUnicode_GET_SIZE(obj));
 502     }
 503     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 504 }
 505
 506 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 507                                       const char *encoding,
 508                                       const char *errors)
 509 {
 510     const char *s = NULL;
 511     Py_ssize_t len;
 512     PyObject *v;
 513
 514     if (obj == NULL) {
 515         PyErr_BadInternalCall();
 516         return NULL;
 517     }
 518
 519 #if 0
 520     /* For b/w compatibility we also accept Unicode objects provided
 521        that no encodings is given and then redirect to
 522        PyObject_Unicode() which then applies the additional logic for
 523        Unicode subclasses.
 524
 525        NOTE: This API should really only be used for object which
 526              represent *encoded* Unicode !
 527
 528     */
 529         if (PyUnicode_Check(obj)) {
 530             if (encoding) {
 531                 PyErr_SetString(PyExc_TypeError,
 532                                 "decoding Unicode is not supported");
 533             return NULL;
 534             }
 535         return PyObject_Unicode(obj);
 536             }
 537 #else
 538     if (PyUnicode_Check(obj)) {
 539         PyErr_SetString(PyExc_TypeError,
 540                         "decoding Unicode is not supported");
 541         return NULL;
 542         }
 543 #endif
 544
 545     /* Coerce object */
 546     if (PyString_Check(obj)) {
 547             s = PyString_AS_STRING(obj);
 548             len = PyString_GET_SIZE(obj);
 549             }
 550     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 551         /* Overwrite the error message with something more useful in
 552            case of a TypeError. */
 553         if (PyErr_ExceptionMatches(PyExc_TypeError))
 554         PyErr_Format(PyExc_TypeError,
 555                          "coercing to Unicode: need string or buffer, "
 556                          "%.80s found",
 557                      Py_Type(obj)->tp_name);
 558         goto onError;
 559     }
 560
 561     /* Convert to Unicode */
 562     if (len == 0) {
 563         Py_INCREF(unicode_empty);
 564         v = (PyObject *)unicode_empty;
 565     }
 566     else
 567         v = PyUnicode_Decode(s, len, encoding, errors);
 568
 569     return v;
 570
 571  onError:
 572     return NULL;
 573 }
 574
 575 PyObject *PyUnicode_Decode(const char *s,
 576                            Py_ssize_t size,
 577                            const char *encoding,
 578                            const char *errors)
 579 {
 580     PyObject *buffer = NULL, *unicode;
 581
 582     if (encoding == NULL)
 583         encoding = PyUnicode_GetDefaultEncoding();
 584
 585     /* Shortcuts for common default encodings */
 586     if (strcmp(encoding, "utf-8") == 0)
 587         return PyUnicode_DecodeUTF8(s, size, errors);
 588     else if (strcmp(encoding, "latin-1") == 0)
 589         return PyUnicode_DecodeLatin1(s, size, errors);
 590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 591     else if (strcmp(encoding, "mbcs") == 0)
 592         return PyUnicode_DecodeMBCS(s, size, errors);
 593 #endif
 594     else if (strcmp(encoding, "ascii") == 0)
 595         return PyUnicode_DecodeASCII(s, size, errors);
 596
 597     /* Decode via the codec registry */
 598     buffer = PyBuffer_FromMemory((void *)s, size);
 599     if (buffer == NULL)
 600         goto onError;
 601     unicode = PyCodec_Decode(buffer, encoding, errors);
 602     if (unicode == NULL)
 603         goto onError;
 604     if (!PyUnicode_Check(unicode)) {
 605         PyErr_Format(PyExc_TypeError,
 606                      "decoder did not return an unicode object (type=%.400s)",
 607                      Py_Type(unicode)->tp_name);
 608         Py_DECREF(unicode);
 609         goto onError;
 610     }
 611     Py_DECREF(buffer);
 612     return unicode;
 613
 614  onError:
 615     Py_XDECREF(buffer);
 616     return NULL;
 617 }
 618
 619 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
 620                                     const char *encoding,
 621                                     const char *errors)
 622 {
 623     PyObject *v;
 624
 625     if (!PyUnicode_Check(unicode)) {
 626         PyErr_BadArgument();
 627         goto onError;
 628     }
 629
 630     if (encoding == NULL)
 631         encoding = PyUnicode_GetDefaultEncoding();
 632
 633     /* Decode via the codec registry */
 634     v = PyCodec_Decode(unicode, encoding, errors);
 635     if (v == NULL)
 636         goto onError;
 637     return v;
 638
 639  onError:
 640     return NULL;
 641 }
 642
 643 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 644                            Py_ssize_t size,
 645                            const char *encoding,
 646                            const char *errors)
 647 {
 648     PyObject *v, *unicode;
 649
 650     unicode = PyUnicode_FromUnicode(s, size);
 651     if (unicode == NULL)
 652         return NULL;
 653     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 654     Py_DECREF(unicode);
 655     return v;
 656 }
 657
 658 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
 659                                     const char *encoding,
 660                                     const char *errors)
 661 {
 662     PyObject *v;
 663
 664     if (!PyUnicode_Check(unicode)) {
 665         PyErr_BadArgument();
 666         goto onError;
 667     }
 668
 669     if (encoding == NULL)
 670         encoding = PyUnicode_GetDefaultEncoding();
 671
 672     /* Encode via the codec registry */
 673     v = PyCodec_Encode(unicode, encoding, errors);
 674     if (v == NULL)
 675         goto onError;
 676     return v;
 677
 678  onError:
 679     return NULL;
 680 }
 681
 682 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 683                                     const char *encoding,
 684                                     const char *errors)
 685 {
 686     PyObject *v;
 687
 688     if (!PyUnicode_Check(unicode)) {
 689         PyErr_BadArgument();
 690         goto onError;
 691     }
 692
 693     if (encoding == NULL)
 694         encoding = PyUnicode_GetDefaultEncoding();
 695
 696     /* Shortcuts for common default encodings */
 697     if (errors == NULL) {
 698         if (strcmp(encoding, "utf-8") == 0)
 699             return PyUnicode_AsUTF8String(unicode);
 700         else if (strcmp(encoding, "latin-1") == 0)
 701             return PyUnicode_AsLatin1String(unicode);
 702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 703         else if (strcmp(encoding, "mbcs") == 0)
 704             return PyUnicode_AsMBCSString(unicode);
 705 #endif
 706         else if (strcmp(encoding, "ascii") == 0)
 707             return PyUnicode_AsASCIIString(unicode);
 708     }
 709
 710     /* Encode via the codec registry */
 711     v = PyCodec_Encode(unicode, encoding, errors);
 712     if (v == NULL)
 713         goto onError;
 714     if (!PyString_Check(v)) {
 715         PyErr_Format(PyExc_TypeError,
 716                      "encoder did not return a string object (type=%.400s)",
 717                      Py_Type(v)->tp_name);
 718         Py_DECREF(v);
 719         goto onError;
 720     }
 721     return v;
 722
 723  onError:
 724     return NULL;
 725 }
 726
 727 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 728                                             const char *errors)
 729 {
 730     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 731
 732     if (v)
 733         return v;
 734     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 735     if (v && errors == NULL)
 736         ((PyUnicodeObject *)unicode)->defenc = v;
 737     return v;
 738 }
 739
 740 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 741 {
 742     if (!PyUnicode_Check(unicode)) {
 743         PyErr_BadArgument();
 744         goto onError;
 745     }
 746     return PyUnicode_AS_UNICODE(unicode);
 747
 748  onError:
 749     return NULL;
 750 }
 751
 752 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
 753 {
 754     if (!PyUnicode_Check(unicode)) {
 755         PyErr_BadArgument();
 756         goto onError;
 757     }
 758     return PyUnicode_GET_SIZE(unicode);
 759
 760  onError:
 761     return -1;
 762 }
 763
 764 const char *PyUnicode_GetDefaultEncoding(void)
 765 {
 766     return unicode_default_encoding;
 767 }
 768
 769 int PyUnicode_SetDefaultEncoding(const char *encoding)
 770 {
 771     PyObject *v;
 772
 773     /* Make sure the encoding is valid. As side effect, this also
 774        loads the encoding into the codec registry cache. */
 775     v = _PyCodec_Lookup(encoding);
 776     if (v == NULL)
 777         goto onError;
 778     Py_DECREF(v);
 779     strncpy(unicode_default_encoding,
 780             encoding,
 781             sizeof(unicode_default_encoding));
 782     return 0;
 783
 784  onError:
 785     return -1;
 786 }
 787
 788 /* error handling callback helper:
 789    build arguments, call the callback and check the arguments,
 790    if no exception occurred, copy the replacement to the output
 791    and adjust various state variables.
 792    return 0 on success, -1 on error
 793 */
 794
 795 static
 796 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
 797                  const char *encoding, const char *reason,
 798                  const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
 799                  Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
 800                  PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
 801 {
 802     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
 803
 804     PyObject *restuple = NULL;
 805     PyObject *repunicode = NULL;
 806     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
 807     Py_ssize_t requiredsize;
 808     Py_ssize_t newpos;
 809     Py_UNICODE *repptr;
 810     Py_ssize_t repsize;
 811     int res = -1;
 812
 813     if (*errorHandler == NULL) {
 814         *errorHandler = PyCodec_LookupError(errors);
 815         if (*errorHandler == NULL)
 816            goto onError;
 817     }
 818
 819     if (*exceptionObject == NULL) {
 820         *exceptionObject = PyUnicodeDecodeError_Create(
 821             encoding, input, insize, *startinpos, *endinpos, reason);
 822         if (*exceptionObject == NULL)
 823            goto onError;
 824     }
 825     else {
 826         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
 827             goto onError;
 828         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
 829             goto onError;
 830         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
 831             goto onError;
 832     }
 833
 834     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
 835     if (restuple == NULL)
 836         goto onError;
 837     if (!PyTuple_Check(restuple)) {
 838         PyErr_Format(PyExc_TypeError, &argparse[4]);
 839         goto onError;
 840     }
 841     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
 842         goto onError;
 843     if (newpos<0)
 844         newpos = insize+newpos;
 845     if (newpos<0 || newpos>insize) {
 846         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
 847         goto onError;
 848     }
 849
 850     /* need more space? (at least enough for what we
 851        have+the replacement+the rest of the string (starting
 852        at the new input position), so we won't have to check space
 853        when there are no errors in the rest of the string) */
 854     repptr = PyUnicode_AS_UNICODE(repunicode);
 855     repsize = PyUnicode_GET_SIZE(repunicode);
 856     requiredsize = *outpos + repsize + insize-newpos;
 857     if (requiredsize > outsize) {
 858         if (requiredsize<2*outsize)
 859             requiredsize = 2*outsize;
 860         if (PyUnicode_Resize(output, requiredsize) < 0)
 861             goto onError;
 862         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
 863     }
 864     *endinpos = newpos;
 865     *inptr = input + newpos;
 866     Py_UNICODE_COPY(*outptr, repptr, repsize);
 867     *outptr += repsize;
 868     *outpos += repsize;
 869     /* we made it! */
 870     res = 0;
 871
 872     onError:
 873     Py_XDECREF(restuple);
 874     return res;
 875 }
 876
 877 /* --- UTF-7 Codec -------------------------------------------------------- */
 878
 879 /* see RFC2152 for details */
 880
 881 static
 882 char utf7_special[128] = {
 883     /* indicate whether a UTF-7 character is special i.e. cannot be directly
 884        encoded:
 885            0 - not special
 886            1 - special
 887            2 - whitespace (optional)
 888            3 - RFC2152 Set O (optional) */
 889     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
 890     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 891     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
 892     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
 893     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 894     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
 895     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 896     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
 897
 898 };
 899
 900 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
 901    warnings about the comparison always being false; since
 902    utf7_special[0] is 1, we can safely make that one comparison
 903    true  */
 904
 905 #define SPECIAL(c, encodeO, encodeWS) \
 906     ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
 907      (encodeWS && (utf7_special[(c)] == 2)) || \
 908      (encodeO && (utf7_special[(c)] == 3)))
 909
 910 #define B64(n)  \
 911     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
 912 #define B64CHAR(c) \
 913     (isalnum(c) || (c) == '+' || (c) == '/')
 914 #define UB64(c) \
 915     ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
 916      (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
 917
 918 #define ENCODE(out, ch, bits)                   \
 919     while (bits >= 6) {                         \
 920         *out++ = B64(ch >> (bits-6));           \
 921         bits -= 6;                              \
 922     }
 923
 924 #define DECODE(out, ch, bits, surrogate)                                \
 925     while (bits >= 16) {                                                \
 926         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
 927         bits -= 16;                                                     \
 928         if (surrogate) {                                                \
 929             /* We have already generated an error for the high surrogate \
 930                so let's not bother seeing if the low surrogate is correct or not */ \
 931             surrogate = 0;                                              \
 932         } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
 933             /* This is a surrogate pair. Unfortunately we can't represent \
 934                it in a 16-bit character */                              \
 935             surrogate = 1;                                              \
 936             errmsg = "code pairs are not supported";                    \
 937             goto utf7Error;                                             \
 938         } else {                                                        \
 939             *out++ = outCh;                                             \
 940         }                                                               \
 941     }
 942
 943 PyObject *PyUnicode_DecodeUTF7(const char *s,
 944                                Py_ssize_t size,
 945                                const char *errors)
 946 {
 947     const char *starts = s;
 948     Py_ssize_t startinpos;
 949     Py_ssize_t endinpos;
 950     Py_ssize_t outpos;
 951     const char *e;
 952     PyUnicodeObject *unicode;
 953     Py_UNICODE *p;
 954     const char *errmsg = "";
 955     int inShift = 0;
 956     unsigned int bitsleft = 0;
 957     unsigned long charsleft = 0;
 958     int surrogate = 0;
 959     PyObject *errorHandler = NULL;
 960     PyObject *exc = NULL;
 961
 962     unicode = _PyUnicode_New(size);
 963     if (!unicode)
 964         return NULL;
 965     if (size == 0)
 966         return (PyObject *)unicode;
 967
 968     p = unicode->str;
 969     e = s + size;
 970
 971     while (s < e) {
 972         Py_UNICODE ch;
 973         restart:
 974         ch = *s;
 975
 976         if (inShift) {
 977             if ((ch == '-') || !B64CHAR(ch)) {
 978                 inShift = 0;
 979                 s++;
 980
 981                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 982                 if (bitsleft >= 6) {
 983                     /* The shift sequence has a partial character in it. If
 984                        bitsleft < 6 then we could just classify it as padding
 985                        but that is not the case here */
 986
 987                     errmsg = "partial character in shift sequence";
 988                     goto utf7Error;
 989                 }
 990                 /* According to RFC2152 the remaining bits should be zero. We
 991                    choose to signal an error/insert a replacement character
 992                    here so indicate the potential of a misencoded character. */
 993
 994                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 995                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 996                     errmsg = "non-zero padding bits in shift sequence";
 997                     goto utf7Error;
 998                 }
 999
1000                 if (ch == '-') {
1001                     if ((s < e) && (*(s) == '-')) {
1002                         *p++ = '-';
1003                         inShift = 1;
1004                     }
1005                 } else if (SPECIAL(ch,0,0)) {
1006                     errmsg = "unexpected special character";
1007                         goto utf7Error;
1008                 } else  {
1009                     *p++ = ch;
1010                 }
1011             } else {
1012                 charsleft = (charsleft << 6) | UB64(ch);
1013                 bitsleft += 6;
1014                 s++;
1015                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1016             }
1017         }
1018         else if ( ch == '+' ) {
1019             startinpos = s-starts;
1020             s++;
1021             if (s < e && *s == '-') {
1022                 s++;
1023                 *p++ = '+';
1024             } else
1025             {
1026                 inShift = 1;
1027                 bitsleft = 0;
1028             }
1029         }
1030         else if (SPECIAL(ch,0,0)) {
1031             startinpos = s-starts;
1032             errmsg = "unexpected special character";
1033             s++;
1034                 goto utf7Error;
1035         }
1036         else {
1037             *p++ = ch;
1038             s++;
1039         }
1040         continue;
1041     utf7Error:
1042         outpos = p-PyUnicode_AS_UNICODE(unicode);
1043         endinpos = s-starts;
1044         if (unicode_decode_call_errorhandler(
1045              errors, &errorHandler,
1046              "utf7", errmsg,
1047              starts, size, &startinpos, &endinpos, &exc, &s,
1048              (PyObject **)&unicode, &outpos, &p))
1049         goto onError;
1050     }
1051
1052     if (inShift) {
1053         outpos = p-PyUnicode_AS_UNICODE(unicode);
1054         endinpos = size;
1055         if (unicode_decode_call_errorhandler(
1056              errors, &errorHandler,
1057              "utf7", "unterminated shift sequence",
1058              starts, size, &startinpos, &endinpos, &exc, &s,
1059              (PyObject **)&unicode, &outpos, &p))
1060             goto onError;
1061         if (s < e)
1062            goto restart;
1063     }
1064
1065     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1066         goto onError;
1067
1068     Py_XDECREF(errorHandler);
1069     Py_XDECREF(exc);
1070     return (PyObject *)unicode;
1071
1072 onError:
1073     Py_XDECREF(errorHandler);
1074     Py_XDECREF(exc);
1075     Py_DECREF(unicode);
1076     return NULL;
1077 }
1078
1079
1080 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1081                    Py_ssize_t size,
1082                    int encodeSetO,
1083                    int encodeWhiteSpace,
1084                    const char *errors)
1085 {
1086     PyObject *v;
1087     /* It might be possible to tighten this worst case */
1088     Py_ssize_t cbAllocated = 5 * size;
1089     int inShift = 0;
1090     Py_ssize_t i = 0;
1091     unsigned int bitsleft = 0;
1092     unsigned long charsleft = 0;
1093     char * out;
1094     char * start;
1095
1096     if (size == 0)
1097                 return PyString_FromStringAndSize(NULL, 0);
1098
1099     v = PyString_FromStringAndSize(NULL, cbAllocated);
1100     if (v == NULL)
1101         return NULL;
1102
1103     start = out = PyString_AS_STRING(v);
1104     for (;i < size; ++i) {
1105         Py_UNICODE ch = s[i];
1106
1107         if (!inShift) {
1108             if (ch == '+') {
1109                 *out++ = '+';
1110                 *out++ = '-';
1111             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1112                 charsleft = ch;
1113                 bitsleft = 16;
1114                 *out++ = '+';
1115                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1116                 inShift = bitsleft > 0;
1117             } else {
1118                 *out++ = (char) ch;
1119             }
1120         } else {
1121             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1122                 *out++ = B64(charsleft << (6-bitsleft));
1123                 charsleft = 0;
1124                 bitsleft = 0;
1125                 /* Characters not in the BASE64 set implicitly unshift the sequence
1126                    so no '-' is required, except if the character is itself a '-' */
1127                 if (B64CHAR(ch) || ch == '-') {
1128                     *out++ = '-';
1129                 }
1130                 inShift = 0;
1131                 *out++ = (char) ch;
1132             } else {
1133                 bitsleft += 16;
1134                 charsleft = (charsleft << 16) | ch;
1135                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1136
1137                 /* If the next character is special then we dont' need to terminate
1138                    the shift sequence. If the next character is not a BASE64 character
1139                    or '-' then the shift sequence will be terminated implicitly and we
1140                    don't have to insert a '-'. */
1141
1142                 if (bitsleft == 0) {
1143                     if (i + 1 < size) {
1144                         Py_UNICODE ch2 = s[i+1];
1145
1146                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1147
1148                         } else if (B64CHAR(ch2) || ch2 == '-') {
1149                             *out++ = '-';
1150                             inShift = 0;
1151                         } else {
1152                             inShift = 0;
1153                         }
1154
1155                     }
1156                     else {
1157                         *out++ = '-';
1158                         inShift = 0;
1159                     }
1160                 }
1161             }
1162         }
1163     }
1164     if (bitsleft) {
1165         *out++= B64(charsleft << (6-bitsleft) );
1166         *out++ = '-';
1167     }
1168
1169     _PyString_Resize(&v, out - start);
1170     return v;
1171 }
1172
1173 #undef SPECIAL
1174 #undef B64
1175 #undef B64CHAR
1176 #undef UB64
1177 #undef ENCODE
1178 #undef DECODE
1179
1180 /* --- UTF-8 Codec -------------------------------------------------------- */
1181
static char utf8_code_length[256] = {
    /* Lookup keyed by the first byte of a UTF-8 sequence.  The value is
       the total length of the sequence in bytes; zero marks a byte that
       is an illegal prefix.  see RFC 2279 for details */

    /* 0x00 - 0x7F: length 1 */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    /* 0x80 - 0xBF: illegal as a prefix */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    /* 0xC0 - 0xDF: length 2 */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

    /* 0xE0 - 0xEF: length 3 */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,

    /* 0xF0 - 0xF7: 4, 0xF8 - 0xFB: 5, 0xFC - 0xFD: 6, 0xFE - 0xFF: illegal */
    4,4,4,4,4,4,4,4, 5,5,5,5,6,6,0,0
};
1203
1204 PyObject *PyUnicode_DecodeUTF8(const char *s,
1205                                Py_ssize_t size,
1206                                const char *errors)
1207 {
1208     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1209 }
1210
/* Decode size bytes of UTF-8 data starting at s into a new Unicode object.

   errors is handed unchanged to unicode_decode_call_errorhandler(), which
   is invoked for every malformed sequence; when it returns non-zero the
   partially built result is released and NULL is returned.

   consumed selects how a sequence that is cut off by the end of the input
   is treated: if it is NULL this is reported as an "unexpected end of
   data" error; otherwise decoding stops in front of that sequence and
   *consumed receives the number of input bytes that were processed. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                        Py_ssize_t size,
                                        const char *errors,
                                        Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size is an upper bound for the number of Unicode units the
       loop below writes, so one unit per input byte is reserved here and
       the result is trimmed at the end */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: ASCII bytes map to themselves. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length announced by the lead byte (0 = illegal). */
        n = utf8_code_length[ch];

        /* The announced sequence runs past the end of the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            /* Byte that cannot start a sequence. */
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Not expected to happen: the table only yields 1 for bytes
               below 0x80, and those were handled by the fast path. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            /* The trail byte must look like 10xxxxxx. */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject values that would have fit into a single byte. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* Reject values that would have fit into two bytes.

               Note: UTF-8 encodings of surrogates are considered
               legal UTF-8 sequences; they are not rejected here and
               are stored like any other three-byte value.

               XXX For wide builds (UCS-4) we should probably try
                   to recombine the surrogates into a single code
                   unit.
            */
            if (ch < 0x0800) {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            /* Reached for the table's 5 and 6 byte lead bytes. */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        s += n;
        continue;

    /* Shared error exit of the switch above: hand the offending input
       range to the error handler, which receives the addresses of s, p
       and the output object; a non-zero result aborts decoding. */
    utf8Error:
    outpos = p-PyUnicode_AS_UNICODE(unicode);
    if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf8", errmsg,
             starts, size, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1397
1398 /* Allocation strategy:  if the string is short, convert into a stack buffer
1399    and allocate exactly as much space needed at the end.  Else allocate the
1400    maximum possible needed (4 result bytes per Unicode character), and return
1401    the excess memory at the end.
1402 */
1403 PyObject *
1404 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1405                      Py_ssize_t size,
1406                      const char *errors)
1407 {
1408 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1409
1410     Py_ssize_t i;           /* index into s of next input byte */
1411     PyObject *v;        /* result string object */
1412     char *p;            /* next free byte in output buffer */
1413     Py_ssize_t nallocated;  /* number of result bytes allocated */
1414     Py_ssize_t nneeded;        /* number of result bytes needed */
1415     char stackbuf[MAX_SHORT_UNICHARS * 4];
1416
1417     assert(s != NULL);
1418     assert(size >= 0);
1419
1420     if (size <= MAX_SHORT_UNICHARS) {
1421         /* Write into the stack buffer; nallocated can't overflow.
1422          * At the end, we'll allocate exactly as much heap space as it
1423          * turns out we need.
1424          */
1425         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1426         v = NULL;   /* will allocate after we're done */
1427         p = stackbuf;
1428     }
1429     else {
1430         /* Overallocate on the heap, and give the excess back at the end. */
1431         nallocated = size * 4;
1432         if (nallocated / 4 != size)  /* overflow! */
1433             return PyErr_NoMemory();
1434         v = PyString_FromStringAndSize(NULL, nallocated);
1435         if (v == NULL)
1436             return NULL;
1437         p = PyString_AS_STRING(v);
1438     }
1439
1440     for (i = 0; i < size;) {
1441         Py_UCS4 ch = s[i++];
1442
1443         if (ch < 0x80)
1444             /* Encode ASCII */
1445             *p++ = (char) ch;
1446
1447         else if (ch < 0x0800) {
1448             /* Encode Latin-1 */
1449             *p++ = (char)(0xc0 | (ch >> 6));
1450             *p++ = (char)(0x80 | (ch & 0x3f));
1451         }
1452         else {
1453             /* Encode UCS2 Unicode ordinals */
1454             if (ch < 0x10000) {
1455                 /* Special case: check for high surrogate */
1456                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1457                     Py_UCS4 ch2 = s[i];
1458                     /* Check for low surrogate and combine the two to
1459                        form a UCS4 value */
1460                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1461                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1462                         i++;
1463                         goto encodeUCS4;
1464                     }
1465                     /* Fall through: handles isolated high surrogates */
1466                 }
1467                 *p++ = (char)(0xe0 | (ch >> 12));
1468                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1469                 *p++ = (char)(0x80 | (ch & 0x3f));
1470                 continue;
1471             }
1472 encodeUCS4:
1473             /* Encode UCS4 Unicode ordinals */
1474             *p++ = (char)(0xf0 | (ch >> 18));
1475             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1476             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1477             *p++ = (char)(0x80 | (ch & 0x3f));
1478         }
1479     }
1480
1481     if (v == NULL) {
1482         /* This was stack allocated. */
1483         nneeded = p - stackbuf;
1484         assert(nneeded <= nallocated);
1485         v = PyString_FromStringAndSize(stackbuf, nneeded);
1486     }
1487     else {
1488         /* Cut back to size actually needed. */
1489         nneeded = p - PyString_AS_STRING(v);
1490         assert(nneeded <= nallocated);
1491         _PyString_Resize(&v, nneeded);
1492     }
1493     return v;
1494
1495 #undef MAX_SHORT_UNICHARS
1496 }
1497
1498 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1499 {
1500     if (!PyUnicode_Check(unicode)) {
1501         PyErr_BadArgument();
1502         return NULL;
1503     }
1504     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1505                                 PyUnicode_GET_SIZE(unicode),
1506                                 NULL);
1507 }
1508
1509 /* --- UTF-32 Codec ------------------------------------------------------- */
1510
1511 PyObject *
1512 PyUnicode_DecodeUTF32(const char *s,
1513                       Py_ssize_t size,
1514                       const char *errors,
1515                       int *byteorder)
1516 {
1517     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1518 }
1519
1520 PyObject *
1521 PyUnicode_DecodeUTF32Stateful(const char *s,
1522                               Py_ssize_t size,
1523                               const char *errors,
1524                               int *byteorder,
1525                               Py_ssize_t *consumed)
1526 {
1527     const char *starts = s;
1528     Py_ssize_t startinpos;
1529     Py_ssize_t endinpos;
1530     Py_ssize_t outpos;
1531     PyUnicodeObject *unicode;
1532     Py_UNICODE *p;
1533 #ifndef Py_UNICODE_WIDE
1534     int i, pairs;
1535 #else
1536     const int pairs = 0;
1537 #endif
1538     const unsigned char *q, *e;
1539     int bo = 0;       /* assume native ordering by default */
1540     const char *errmsg = "";
1541     /* Offsets from q for retrieving bytes in the right order. */
1542 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1543     int iorder[] = {0, 1, 2, 3};
1544 #else
1545     int iorder[] = {3, 2, 1, 0};
1546 #endif
1547     PyObject *errorHandler = NULL;
1548     PyObject *exc = NULL;
1549     /* On narrow builds we split characters outside the BMP into two
1550        codepoints => count how much extra space we need. */
1551 #ifndef Py_UNICODE_WIDE
1552     for (i = pairs = 0; i < size/4; i++)
1553         if (((Py_UCS4 *)s)[i] >= 0x10000)
1554             pairs++;
1555 #endif
1556
1557     /* This might be one to much, because of a BOM */
1558     unicode = _PyUnicode_New((size+3)/4+pairs);
1559     if (!unicode)
1560         return NULL;
1561     if (size == 0)
1562         return (PyObject *)unicode;
1563
1564     /* Unpack UTF-32 encoded data */
1565     p = unicode->str;
1566     q = (unsigned char *)s;
1567     e = q + size;
1568
1569     if (byteorder)
1570         bo = *byteorder;
1571
1572     /* Check for BOM marks (U+FEFF) in the input and adjust current
1573        byte order setting accordingly. In native mode, the leading BOM
1574        mark is skipped, in all other modes, it is copied to the output
1575        stream as-is (giving a ZWNBSP character). */
1576     if (bo == 0) {
1577         if (size >= 4) {
1578             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1579                                 (q[iorder[1]] << 8) | q[iorder[0]];
1580 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1581             if (bom == 0x0000FEFF) {
1582                 q += 4;
1583                 bo = -1;
1584             }
1585             else if (bom == 0xFFFE0000) {
1586                 q += 4;
1587                 bo = 1;
1588             }
1589 #else
1590             if (bom == 0x0000FEFF) {
1591                 q += 4;
1592                 bo = 1;
1593             }
1594             else if (bom == 0xFFFE0000) {
1595                 q += 4;
1596                 bo = -1;
1597             }
1598 #endif
1599         }
1600     }
1601
1602     if (bo == -1) {
1603         /* force LE */
1604         iorder[0] = 0;
1605         iorder[1] = 1;
1606         iorder[2] = 2;
1607         iorder[3] = 3;
1608     }
1609     else if (bo == 1) {
1610         /* force BE */
1611         iorder[0] = 3;
1612         iorder[1] = 2;
1613         iorder[2] = 1;
1614         iorder[3] = 0;
1615     }
1616
1617     while (q < e) {
1618         Py_UCS4 ch;
1619         /* remaining bytes at the end? (size should be divisible by 4) */
1620         if (e-q<4) {
1621             if (consumed)
1622                 break;
1623             errmsg = "truncated data";
1624             startinpos = ((const char *)q)-starts;
1625             endinpos = ((const char *)e)-starts;
1626             goto utf32Error;
1627             /* The remaining input chars are ignored if the callback
1628                chooses to skip the input */
1629         }
1630         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1631              (q[iorder[1]] << 8) | q[iorder[0]];
1632
1633         if (ch >= 0x110000)
1634         {
1635             errmsg = "codepoint not in range(0x110000)";
1636             startinpos = ((const char *)q)-starts;
1637             endinpos = startinpos+4;
1638             goto utf32Error;
1639         }
1640 #ifndef Py_UNICODE_WIDE
1641         if (ch >= 0x10000)
1642         {
1643             *p++ = 0xD800 | ((ch-0x10000) >> 10);
1644             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
1645         }
1646         else
1647 #endif
1648             *p++ = ch;
1649         q += 4;
1650         continue;
1651     utf32Error:
1652         outpos = p-PyUnicode_AS_UNICODE(unicode);
1653     if (unicode_decode_call_errorhandler(
1654          errors, &errorHandler,
1655          "utf32", errmsg,
1656          starts, size, &startinpos, &endinpos, &exc, &s,
1657          (PyObject **)&unicode, &outpos, &p))
1658             goto onError;
1659     }
1660
1661     if (byteorder)
1662         *byteorder = bo;
1663
1664     if (consumed)
1665         *consumed = (const char *)q-starts;
1666
1667     /* Adjust length */
1668     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1669         goto onError;
1670
1671     Py_XDECREF(errorHandler);
1672     Py_XDECREF(exc);
1673     return (PyObject *)unicode;
1674
1675 onError:
1676     Py_DECREF(unicode);
1677     Py_XDECREF(errorHandler);
1678     Py_XDECREF(exc);
1679     return NULL;
1680 }
1681
1682 PyObject *
1683 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
1684                       Py_ssize_t size,
1685                       const char *errors,
1686                       int byteorder)
1687 {
1688     PyObject *v;
1689     unsigned char *p;
1690 #ifndef Py_UNICODE_WIDE
1691     int i, pairs;
1692 #else
1693     const int pairs = 0;
1694 #endif
1695     /* Offsets from p for storing byte pairs in the right order. */
1696 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1697     int iorder[] = {0, 1, 2, 3};
1698 #else
1699     int iorder[] = {3, 2, 1, 0};
1700 #endif
1701
1702 #define STORECHAR(CH)                       \
1703     do {                                    \
1704         p[iorder[3]] = ((CH) >> 24) & 0xff; \
1705         p[iorder[2]] = ((CH) >> 16) & 0xff; \
1706         p[iorder[1]] = ((CH) >> 8) & 0xff;  \
1707         p[iorder[0]] = (CH) & 0xff;         \
1708         p += 4;                             \
1709     } while(0)
1710
1711     /* In narrow builds we can output surrogate pairs as one codepoint,
1712        so we need less space. */
1713 #ifndef Py_UNICODE_WIDE
1714     for (i = pairs = 0; i < size-1; i++)
1715         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
1716             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
1717             pairs++;
1718 #endif
1719     v = PyString_FromStringAndSize(NULL,
1720                   4 * (size - pairs + (byteorder == 0)));
1721     if (v == NULL)
1722         return NULL;
1723
1724     p = (unsigned char *)PyString_AS_STRING(v);
1725     if (byteorder == 0)
1726         STORECHAR(0xFEFF);
1727     if (size == 0)
1728         return v;
1729
1730     if (byteorder == -1) {
1731         /* force LE */
1732         iorder[0] = 0;
1733         iorder[1] = 1;
1734         iorder[2] = 2;
1735         iorder[3] = 3;
1736     }
1737     else if (byteorder == 1) {
1738         /* force BE */
1739         iorder[0] = 3;
1740         iorder[1] = 2;
1741         iorder[2] = 1;
1742         iorder[3] = 0;
1743     }
1744
1745     while (size-- > 0) {
1746         Py_UCS4 ch = *s++;
1747 #ifndef Py_UNICODE_WIDE
1748         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
1749             Py_UCS4 ch2 = *s;
1750             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1751                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1752                 s++;
1753                 size--;
1754             }
1755         }
1756 #endif
1757         STORECHAR(ch);
1758     }
1759     return v;
1760 #undef STORECHAR
1761 }
1762
1763 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
1764 {
1765     if (!PyUnicode_Check(unicode)) {
1766         PyErr_BadArgument();
1767         return NULL;
1768     }
1769     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
1770                                  PyUnicode_GET_SIZE(unicode),
1771                                  NULL,
1772                                  0);
1773 }
1774
1775 /* --- UTF-16 Codec ------------------------------------------------------- */
1776
1777 PyObject *
1778 PyUnicode_DecodeUTF16(const char *s,
1779                       Py_ssize_t size,
1780                       const char *errors,
1781                       int *byteorder)
1782 {
1783     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1784 }
1785
1786 PyObject *
1787 PyUnicode_DecodeUTF16Stateful(const char *s,
1788                               Py_ssize_t size,
1789                               const char *errors,
1790                               int *byteorder,
1791                               Py_ssize_t *consumed)
1792 {
1793     const char *starts = s;
1794     Py_ssize_t startinpos;
1795     Py_ssize_t endinpos;
1796     Py_ssize_t outpos;
1797     PyUnicodeObject *unicode;
1798     Py_UNICODE *p;
1799     const unsigned char *q, *e;
1800     int bo = 0;       /* assume native ordering by default */
1801     const char *errmsg = "";
1802     /* Offsets from q for retrieving byte pairs in the right order. */
1803 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1804     int ihi = 1, ilo = 0;
1805 #else
1806     int ihi = 0, ilo = 1;
1807 #endif
1808     PyObject *errorHandler = NULL;
1809     PyObject *exc = NULL;
1810
1811     /* Note: size will always be longer than the resulting Unicode
1812        character count */
1813     unicode = _PyUnicode_New(size);
1814     if (!unicode)
1815         return NULL;
1816     if (size == 0)
1817         return (PyObject *)unicode;
1818
1819     /* Unpack UTF-16 encoded data */
1820     p = unicode->str;
1821     q = (unsigned char *)s;
1822     e = q + size;
1823
1824     if (byteorder)
1825         bo = *byteorder;
1826
1827     /* Check for BOM marks (U+FEFF) in the input and adjust current
1828        byte order setting accordingly. In native mode, the leading BOM
1829        mark is skipped, in all other modes, it is copied to the output
1830        stream as-is (giving a ZWNBSP character). */
1831     if (bo == 0) {
1832         if (size >= 2) {
1833             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1834 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1835             if (bom == 0xFEFF) {
1836                 q += 2;
1837                 bo = -1;
1838             }
1839             else if (bom == 0xFFFE) {
1840                 q += 2;
1841                 bo = 1;
1842             }
1843 #else
1844             if (bom == 0xFEFF) {
1845                 q += 2;
1846                 bo = 1;
1847             }
1848             else if (bom == 0xFFFE) {
1849                 q += 2;
1850                 bo = -1;
1851             }
1852 #endif
1853         }
1854     }
1855
1856     if (bo == -1) {
1857         /* force LE */
1858         ihi = 1;
1859         ilo = 0;
1860     }
1861     else if (bo == 1) {
1862         /* force BE */
1863         ihi = 0;
1864         ilo = 1;
1865     }
1866
1867     while (q < e) {
1868         Py_UNICODE ch;
1869         /* remaining bytes at the end? (size should be even) */
1870         if (e-q<2) {
1871             if (consumed)
1872                 break;
1873             errmsg = "truncated data";
1874             startinpos = ((const char *)q)-starts;
1875             endinpos = ((const char *)e)-starts;
1876             goto utf16Error;
1877             /* The remaining input chars are ignored if the callback
1878                chooses to skip the input */
1879         }
1880         ch = (q[ihi] << 8) | q[ilo];
1881
1882         q += 2;
1883
1884         if (ch < 0xD800 || ch > 0xDFFF) {
1885             *p++ = ch;
1886             continue;
1887         }
1888
1889         /* UTF-16 code pair: */
1890         if (q >= e) {
1891             errmsg = "unexpected end of data";
1892             startinpos = (((const char *)q)-2)-starts;
1893             endinpos = ((const char *)e)-starts;
1894             goto utf16Error;
1895         }
1896         if (0xD800 <= ch && ch <= 0xDBFF) {
1897             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1898             q += 2;
1899             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1900 #ifndef Py_UNICODE_WIDE
1901                 *p++ = ch;
1902                 *p++ = ch2;
1903 #else
1904                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1905 #endif
1906                 continue;
1907             }
1908             else {
1909                 errmsg = "illegal UTF-16 surrogate";
1910                 startinpos = (((const char *)q)-4)-starts;
1911                 endinpos = startinpos+2;
1912                 goto utf16Error;
1913             }
1914
1915         }
1916         errmsg = "illegal encoding";
1917         startinpos = (((const char *)q)-2)-starts;
1918         endinpos = startinpos+2;
1919         /* Fall through to report the error */
1920
1921     utf16Error:
1922         outpos = p-PyUnicode_AS_UNICODE(unicode);
1923         if (unicode_decode_call_errorhandler(
1924                  errors, &errorHandler,
1925                  "utf16", errmsg,
1926                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1927                  (PyObject **)&unicode, &outpos, &p))
1928             goto onError;
1929     }
1930
1931     if (byteorder)
1932         *byteorder = bo;
1933
1934     if (consumed)
1935         *consumed = (const char *)q-starts;
1936
1937     /* Adjust length */
1938     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1939         goto onError;
1940
1941     Py_XDECREF(errorHandler);
1942     Py_XDECREF(exc);
1943     return (PyObject *)unicode;
1944
1945 onError:
1946     Py_DECREF(unicode);
1947     Py_XDECREF(errorHandler);
1948     Py_XDECREF(exc);
1949     return NULL;
1950 }
1951
1952 PyObject *
1953 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1954                       Py_ssize_t size,
1955                       const char *errors,
1956                       int byteorder)
1957 {
1958     PyObject *v;
1959     unsigned char *p;
1960 #ifdef Py_UNICODE_WIDE
1961     int i, pairs;
1962 #else
1963     const int pairs = 0;
1964 #endif
1965     /* Offsets from p for storing byte pairs in the right order. */
1966 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1967     int ihi = 1, ilo = 0;
1968 #else
1969     int ihi = 0, ilo = 1;
1970 #endif
1971
1972 #define STORECHAR(CH)                   \
1973     do {                                \
1974         p[ihi] = ((CH) >> 8) & 0xff;    \
1975         p[ilo] = (CH) & 0xff;           \
1976         p += 2;                         \
1977     } while(0)
1978
1979 #ifdef Py_UNICODE_WIDE
1980     for (i = pairs = 0; i < size; i++)
1981         if (s[i] >= 0x10000)
1982             pairs++;
1983 #endif
1984     v = PyString_FromStringAndSize(NULL,
1985                   2 * (size + pairs + (byteorder == 0)));
1986     if (v == NULL)
1987         return NULL;
1988
1989     p = (unsigned char *)PyString_AS_STRING(v);
1990     if (byteorder == 0)
1991         STORECHAR(0xFEFF);
1992     if (size == 0)
1993         return v;
1994
1995     if (byteorder == -1) {
1996         /* force LE */
1997         ihi = 1;
1998         ilo = 0;
1999     }
2000     else if (byteorder == 1) {
2001         /* force BE */
2002         ihi = 0;
2003         ilo = 1;
2004     }
2005
2006     while (size-- > 0) {
2007         Py_UNICODE ch = *s++;
2008         Py_UNICODE ch2 = 0;
2009 #ifdef Py_UNICODE_WIDE
2010         if (ch >= 0x10000) {
2011             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2012             ch  = 0xD800 | ((ch-0x10000) >> 10);
2013         }
2014 #endif
2015         STORECHAR(ch);
2016         if (ch2)
2017             STORECHAR(ch2);
2018     }
2019     return v;
2020 #undef STORECHAR
2021 }
2022
2023 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2024 {
2025     if (!PyUnicode_Check(unicode)) {
2026         PyErr_BadArgument();
2027         return NULL;
2028     }
2029     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2030                                  PyUnicode_GET_SIZE(unicode),
2031                                  NULL,
2032                                  0);
2033 }
2034
2035 /* --- Unicode Escape Codec ----------------------------------------------- */
2036
2037 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2038
2039 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2040                                         Py_ssize_t size,
2041                                         const char *errors)
2042 {
2043     const char *starts = s;
2044     Py_ssize_t startinpos;
2045     Py_ssize_t endinpos;
2046     Py_ssize_t outpos;
2047     int i;
2048     PyUnicodeObject *v;
2049     Py_UNICODE *p;
2050     const char *end;
2051     char* message;
2052     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2053     PyObject *errorHandler = NULL;
2054     PyObject *exc = NULL;
2055
2056     /* Escaped strings will always be longer than the resulting
2057        Unicode string, so we start with size here and then reduce the
2058        length after conversion to the true value.
2059        (but if the error callback returns a long replacement string
2060        we'll have to allocate more space) */
2061     v = _PyUnicode_New(size);
2062     if (v == NULL)
2063         goto onError;
2064     if (size == 0)
2065         return (PyObject *)v;
2066
2067     p = PyUnicode_AS_UNICODE(v);
2068     end = s + size;
2069
2070     while (s < end) {
2071         unsigned char c;
2072         Py_UNICODE x;
2073         int digits;
2074
2075         /* Non-escape characters are interpreted as Unicode ordinals */
2076         if (*s != '\\') {
2077             *p++ = (unsigned char) *s++;
2078             continue;
2079         }
2080
2081         startinpos = s-starts;
2082         /* \ - Escapes */
2083         s++;
2084         switch (*s++) {
2085
2086         /* \x escapes */
2087         case '\n': break;
2088         case '\\': *p++ = '\\'; break;
2089         case '\'': *p++ = '\''; break;
2090         case '\"': *p++ = '\"'; break;
2091         case 'b': *p++ = '\b'; break;
2092         case 'f': *p++ = '\014'; break; /* FF */
2093         case 't': *p++ = '\t'; break;
2094         case 'n': *p++ = '\n'; break;
2095         case 'r': *p++ = '\r'; break;
2096         case 'v': *p++ = '\013'; break; /* VT */
2097         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2098
2099         /* \OOO (octal) escapes */
2100         case '0': case '1': case '2': case '3':
2101         case '4': case '5': case '6': case '7':
2102             x = s[-1] - '0';
2103             if ('0' <= *s && *s <= '7') {
2104                 x = (x<<3) + *s++ - '0';
2105                 if ('0' <= *s && *s <= '7')
2106                     x = (x<<3) + *s++ - '0';
2107             }
2108             *p++ = x;
2109             break;
2110
2111         /* hex escapes */
2112         /* \xXX */
2113         case 'x':
2114             digits = 2;
2115             message = "truncated \\xXX escape";
2116             goto hexescape;
2117
2118         /* \uXXXX */
2119         case 'u':
2120             digits = 4;
2121             message = "truncated \\uXXXX escape";
2122             goto hexescape;
2123
2124         /* \UXXXXXXXX */
2125         case 'U':
2126             digits = 8;
2127             message = "truncated \\UXXXXXXXX escape";
2128         hexescape:
2129             chr = 0;
2130             outpos = p-PyUnicode_AS_UNICODE(v);
2131             if (s+digits>end) {
2132                 endinpos = size;
2133                 if (unicode_decode_call_errorhandler(
2134                     errors, &errorHandler,
2135                     "unicodeescape", "end of string in escape sequence",
2136                     starts, size, &startinpos, &endinpos, &exc, &s,
2137                     (PyObject **)&v, &outpos, &p))
2138                     goto onError;
2139                 goto nextByte;
2140             }
2141             for (i = 0; i < digits; ++i) {
2142                 c = (unsigned char) s[i];
2143                 if (!isxdigit(c)) {
2144                     endinpos = (s+i+1)-starts;
2145                     if (unicode_decode_call_errorhandler(
2146                         errors, &errorHandler,
2147                         "unicodeescape", message,
2148                         starts, size, &startinpos, &endinpos, &exc, &s,
2149                         (PyObject **)&v, &outpos, &p))
2150                         goto onError;
2151                     goto nextByte;
2152                 }
2153                 chr = (chr<<4) & ~0xF;
2154                 if (c >= '0' && c <= '9')
2155                     chr += c - '0';
2156                 else if (c >= 'a' && c <= 'f')
2157                     chr += 10 + c - 'a';
2158                 else
2159                     chr += 10 + c - 'A';
2160             }
2161             s += i;
2162             if (chr == 0xffffffff && PyErr_Occurred())
2163                 /* _decoding_error will have already written into the
2164                    target buffer. */
2165                 break;
2166         store:
2167             /* when we get here, chr is a 32-bit unicode character */
2168             if (chr <= 0xffff)
2169                 /* UCS-2 character */
2170                 *p++ = (Py_UNICODE) chr;
2171             else if (chr <= 0x10ffff) {
2172                 /* UCS-4 character. Either store directly, or as
2173                    surrogate pair. */
2174 #ifdef Py_UNICODE_WIDE
2175                 *p++ = chr;
2176 #else
2177                 chr -= 0x10000L;
2178                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2179                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2180 #endif
2181             } else {
2182                 endinpos = s-starts;
2183                 outpos = p-PyUnicode_AS_UNICODE(v);
2184                 if (unicode_decode_call_errorhandler(
2185                     errors, &errorHandler,
2186                     "unicodeescape", "illegal Unicode character",
2187                     starts, size, &startinpos, &endinpos, &exc, &s,
2188                     (PyObject **)&v, &outpos, &p))
2189                     goto onError;
2190             }
2191             break;
2192
2193         /* \N{name} */
2194         case 'N':
2195             message = "malformed \\N character escape";
2196             if (ucnhash_CAPI == NULL) {
2197                 /* load the unicode data module */
2198                 PyObject *m, *api;
2199                 m = PyImport_ImportModule("unicodedata");
2200                 if (m == NULL)
2201                     goto ucnhashError;
2202                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2203                 Py_DECREF(m);
2204                 if (api == NULL)
2205                     goto ucnhashError;
2206                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2207                 Py_DECREF(api);
2208                 if (ucnhash_CAPI == NULL)
2209                     goto ucnhashError;
2210             }
2211             if (*s == '{') {
2212                 const char *start = s+1;
2213                 /* look for the closing brace */
2214                 while (*s != '}' && s < end)
2215                     s++;
2216                 if (s > start && s < end && *s == '}') {
2217                     /* found a name.  look it up in the unicode database */
2218                     message = "unknown Unicode character name";
2219                     s++;
2220                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2221                         goto store;
2222                 }
2223             }
2224             endinpos = s-starts;
2225             outpos = p-PyUnicode_AS_UNICODE(v);
2226             if (unicode_decode_call_errorhandler(
2227                 errors, &errorHandler,
2228                 "unicodeescape", message,
2229                 starts, size, &startinpos, &endinpos, &exc, &s,
2230                 (PyObject **)&v, &outpos, &p))
2231                 goto onError;
2232             break;
2233
2234         default:
2235             if (s > end) {
2236                 message = "\\ at end of string";
2237                 s--;
2238                 endinpos = s-starts;
2239                 outpos = p-PyUnicode_AS_UNICODE(v);
2240                 if (unicode_decode_call_errorhandler(
2241                     errors, &errorHandler,
2242                     "unicodeescape", message,
2243                     starts, size, &startinpos, &endinpos, &exc, &s,
2244                     (PyObject **)&v, &outpos, &p))
2245                     goto onError;
2246             }
2247             else {
2248                 *p++ = '\\';
2249                 *p++ = (unsigned char)s[-1];
2250             }
2251             break;
2252         }
2253         nextByte:
2254         ;
2255     }
2256     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2257         goto onError;
2258     Py_XDECREF(errorHandler);
2259     Py_XDECREF(exc);
2260     return (PyObject *)v;
2261
2262 ucnhashError:
2263     PyErr_SetString(
2264         PyExc_UnicodeError,
2265         "\\N escapes not supported (can't load unicodedata module)"
2266         );
2267     Py_XDECREF(v);
2268     Py_XDECREF(errorHandler);
2269     Py_XDECREF(exc);
2270     return NULL;
2271
2272 onError:
2273     Py_XDECREF(v);
2274     Py_XDECREF(errorHandler);
2275     Py_XDECREF(exc);
2276     return NULL;
2277 }
2278
2279 /* Return a Unicode-Escape string version of the Unicode object.
2280
2281    If quotes is true, the string is enclosed in u"" or u'' quotes as
2282    appropriate.
2283
2284 */
2285
2286 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2287                                       Py_ssize_t size,
2288                                       Py_UNICODE ch)
2289 {
2290     /* like wcschr, but doesn't stop at NULL characters */
2291
2292     while (size-- > 0) {
2293         if (*s == ch)
2294             return s;
2295         s++;
2296     }
2297
2298     return NULL;
2299 }
2300
2301 static
2302 PyObject *unicodeescape_string(const Py_UNICODE *s,
2303                                Py_ssize_t size,
2304                                int quotes)
2305 {
2306     PyObject *repr;
2307     char *p;
2308
2309     static const char *hexdigit = "0123456789abcdef";
2310
2311     /* XXX(nnorwitz): rather than over-allocating, it would be
2312        better to choose a different scheme.  Perhaps scan the
2313        first N-chars of the string and allocate based on that size.
2314     */
2315     /* Initial allocation is based on the longest-possible unichr
2316        escape.
2317
2318        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2319        unichr, so in this case it's the longest unichr escape. In
2320        narrow (UTF-16) builds this is five chars per source unichr
2321        since there are two unichrs in the surrogate pair, so in narrow
2322        (UTF-16) builds it's not the longest unichr escape.
2323
2324        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2325        so in the narrow (UTF-16) build case it's the longest unichr
2326        escape.
2327     */
2328
2329     repr = PyString_FromStringAndSize(NULL,
2330         2
2331 #ifdef Py_UNICODE_WIDE
2332         + 10*size
2333 #else
2334         + 6*size
2335 #endif
2336         + 1);
2337     if (repr == NULL)
2338         return NULL;
2339
2340     p = PyString_AS_STRING(repr);
2341
2342     if (quotes) {
2343         *p++ = 'u';
2344         *p++ = (findchar(s, size, '\'') &&
2345                 !findchar(s, size, '"')) ? '"' : '\'';
2346     }
2347     while (size-- > 0) {
2348         Py_UNICODE ch = *s++;
2349
2350         /* Escape quotes and backslashes */
2351         if ((quotes &&
2352              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2353             *p++ = '\\';
2354             *p++ = (char) ch;
2355             continue;
2356         }
2357
2358 #ifdef Py_UNICODE_WIDE
2359         /* Map 21-bit characters to '\U00xxxxxx' */
2360         else if (ch >= 0x10000) {
2361             *p++ = '\\';
2362             *p++ = 'U';
2363             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2364             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2365             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2366             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2367             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2368             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2369             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2370             *p++ = hexdigit[ch & 0x0000000F];
2371             continue;
2372         }
2373 #else
2374         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2375         else if (ch >= 0xD800 && ch < 0xDC00) {
2376             Py_UNICODE ch2;
2377             Py_UCS4 ucs;
2378
2379             ch2 = *s++;
2380             size--;
2381             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2382                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2383                 *p++ = '\\';
2384                 *p++ = 'U';
2385                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2386                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2387                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2388                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2389                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2390                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2391                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2392                 *p++ = hexdigit[ucs & 0x0000000F];
2393                 continue;
2394             }
2395             /* Fall through: isolated surrogates are copied as-is */
2396             s--;
2397             size++;
2398         }
2399 #endif
2400
2401         /* Map 16-bit characters to '\uxxxx' */
2402         if (ch >= 256) {
2403             *p++ = '\\';
2404             *p++ = 'u';
2405             *p++ = hexdigit[(ch >> 12) & 0x000F];
2406             *p++ = hexdigit[(ch >> 8) & 0x000F];
2407             *p++ = hexdigit[(ch >> 4) & 0x000F];
2408             *p++ = hexdigit[ch & 0x000F];
2409         }
2410
2411         /* Map special whitespace to '\t', \n', '\r' */
2412         else if (ch == '\t') {
2413             *p++ = '\\';
2414             *p++ = 't';
2415         }
2416         else if (ch == '\n') {
2417             *p++ = '\\';
2418             *p++ = 'n';
2419         }
2420         else if (ch == '\r') {
2421             *p++ = '\\';
2422             *p++ = 'r';
2423         }
2424
2425         /* Map non-printable US ASCII to '\xhh' */
2426         else if (ch < ' ' || ch >= 0x7F) {
2427             *p++ = '\\';
2428             *p++ = 'x';
2429             *p++ = hexdigit[(ch >> 4) & 0x000F];
2430             *p++ = hexdigit[ch & 0x000F];
2431         }
2432
2433         /* Copy everything else as-is */
2434         else
2435             *p++ = (char) ch;
2436     }
2437     if (quotes)
2438         *p++ = PyString_AS_STRING(repr)[1];
2439
2440     *p = '\0';
2441     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2442     return repr;
2443 }
2444
2445 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2446                                         Py_ssize_t size)
2447 {
2448     return unicodeescape_string(s, size, 0);
2449 }
2450
2451 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2452 {
2453     if (!PyUnicode_Check(unicode)) {
2454         PyErr_BadArgument();
2455         return NULL;
2456     }
2457     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2458                                          PyUnicode_GET_SIZE(unicode));
2459 }
2460
2461 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2462
2463 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2464                                            Py_ssize_t size,
2465                                            const char *errors)
2466 {
2467     const char *starts = s;
2468     Py_ssize_t startinpos;
2469     Py_ssize_t endinpos;
2470     Py_ssize_t outpos;
2471     PyUnicodeObject *v;
2472     Py_UNICODE *p;
2473     const char *end;
2474     const char *bs;
2475     PyObject *errorHandler = NULL;
2476     PyObject *exc = NULL;
2477
2478     /* Escaped strings will always be longer than the resulting
2479        Unicode string, so we start with size here and then reduce the
2480        length after conversion to the true value. (But decoding error
2481        handler might have to resize the string) */
2482     v = _PyUnicode_New(size);
2483     if (v == NULL)
2484         goto onError;
2485     if (size == 0)
2486         return (PyObject *)v;
2487     p = PyUnicode_AS_UNICODE(v);
2488     end = s + size;
2489     while (s < end) {
2490         unsigned char c;
2491         Py_UCS4 x;
2492         int i;
2493         int count;
2494
2495         /* Non-escape characters are interpreted as Unicode ordinals */
2496         if (*s != '\\') {
2497             *p++ = (unsigned char)*s++;
2498             continue;
2499         }
2500         startinpos = s-starts;
2501
2502         /* \u-escapes are only interpreted iff the number of leading
2503            backslashes if odd */
2504         bs = s;
2505         for (;s < end;) {
2506             if (*s != '\\')
2507                 break;
2508             *p++ = (unsigned char)*s++;
2509         }
2510         if (((s - bs) & 1) == 0 ||
2511             s >= end ||
2512             (*s != 'u' && *s != 'U')) {
2513             continue;
2514         }
2515         p--;
2516         count = *s=='u' ? 4 : 8;
2517         s++;
2518
2519         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2520         outpos = p-PyUnicode_AS_UNICODE(v);
2521         for (x = 0, i = 0; i < count; ++i, ++s) {
2522             c = (unsigned char)*s;
2523             if (!isxdigit(c)) {
2524                 endinpos = s-starts;
2525                 if (unicode_decode_call_errorhandler(
2526                     errors, &errorHandler,
2527                     "rawunicodeescape", "truncated \\uXXXX",
2528                     starts, size, &startinpos, &endinpos, &exc, &s,
2529                     (PyObject **)&v, &outpos, &p))
2530                     goto onError;
2531                 goto nextByte;
2532             }
2533             x = (x<<4) & ~0xF;
2534             if (c >= '0' && c <= '9')
2535                 x += c - '0';
2536             else if (c >= 'a' && c <= 'f')
2537                 x += 10 + c - 'a';
2538             else
2539                 x += 10 + c - 'A';
2540         }
2541 #ifndef Py_UNICODE_WIDE
2542         if (x > 0x10000) {
2543             if (unicode_decode_call_errorhandler(
2544                     errors, &errorHandler,
2545                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
2546                     starts, size, &startinpos, &endinpos, &exc, &s,
2547                     (PyObject **)&v, &outpos, &p))
2548                     goto onError;
2549         }
2550 #endif
2551         *p++ = x;
2552         nextByte:
2553         ;
2554     }
2555     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2556         goto onError;
2557     Py_XDECREF(errorHandler);
2558     Py_XDECREF(exc);
2559     return (PyObject *)v;
2560
2561  onError:
2562     Py_XDECREF(v);
2563     Py_XDECREF(errorHandler);
2564     Py_XDECREF(exc);
2565     return NULL;
2566 }
2567
2568 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2569                                            Py_ssize_t size)
2570 {
2571     PyObject *repr;
2572     char *p;
2573     char *q;
2574
2575     static const char *hexdigit = "0123456789abcdef";
2576
2577 #ifdef Py_UNICODE_WIDE
2578     repr = PyString_FromStringAndSize(NULL, 10 * size);
2579 #else
2580     repr = PyString_FromStringAndSize(NULL, 6 * size);
2581 #endif
2582     if (repr == NULL)
2583         return NULL;
2584     if (size == 0)
2585         return repr;
2586
2587     p = q = PyString_AS_STRING(repr);
2588     while (size-- > 0) {
2589         Py_UNICODE ch = *s++;
2590 #ifdef Py_UNICODE_WIDE
2591         /* Map 32-bit characters to '\Uxxxxxxxx' */
2592         if (ch >= 0x10000) {
2593             *p++ = '\\';
2594             *p++ = 'U';
2595             *p++ = hexdigit[(ch >> 28) & 0xf];
2596             *p++ = hexdigit[(ch >> 24) & 0xf];
2597             *p++ = hexdigit[(ch >> 20) & 0xf];
2598             *p++ = hexdigit[(ch >> 16) & 0xf];
2599             *p++ = hexdigit[(ch >> 12) & 0xf];
2600             *p++ = hexdigit[(ch >> 8) & 0xf];
2601             *p++ = hexdigit[(ch >> 4) & 0xf];
2602             *p++ = hexdigit[ch & 15];
2603         }
2604         else
2605 #endif
2606         /* Map 16-bit characters to '\uxxxx' */
2607         if (ch >= 256) {
2608             *p++ = '\\';
2609             *p++ = 'u';
2610             *p++ = hexdigit[(ch >> 12) & 0xf];
2611             *p++ = hexdigit[(ch >> 8) & 0xf];
2612             *p++ = hexdigit[(ch >> 4) & 0xf];
2613             *p++ = hexdigit[ch & 15];
2614         }
2615         /* Copy everything else as-is */
2616         else
2617             *p++ = (char) ch;
2618     }
2619     *p = '\0';
2620     _PyString_Resize(&repr, p - q);
2621     return repr;
2622 }
2623
2624 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2625 {
2626     if (!PyUnicode_Check(unicode)) {
2627         PyErr_BadArgument();
2628         return NULL;
2629     }
2630     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2631                                             PyUnicode_GET_SIZE(unicode));
2632 }
2633
2634 /* --- Unicode Internal Codec ------------------------------------------- */
2635
2636 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2637                                            Py_ssize_t size,
2638                                            const char *errors)
2639 {
2640     const char *starts = s;
2641     Py_ssize_t startinpos;
2642     Py_ssize_t endinpos;
2643     Py_ssize_t outpos;
2644     PyUnicodeObject *v;
2645     Py_UNICODE *p;
2646     const char *end;
2647     const char *reason;
2648     PyObject *errorHandler = NULL;
2649     PyObject *exc = NULL;
2650
2651 #ifdef Py_UNICODE_WIDE
2652     Py_UNICODE unimax = PyUnicode_GetMax();
2653 #endif
2654
2655     /* XXX overflow detection missing */
2656     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2657     if (v == NULL)
2658         goto onError;
2659     if (PyUnicode_GetSize((PyObject *)v) == 0)
2660         return (PyObject *)v;
2661     p = PyUnicode_AS_UNICODE(v);
2662     end = s + size;
2663
2664     while (s < end) {
2665         memcpy(p, s, sizeof(Py_UNICODE));
2666         /* We have to sanity check the raw data, otherwise doom looms for
2667            some malformed UCS-4 data. */
2668         if (
2669             #ifdef Py_UNICODE_WIDE
2670             *p > unimax || *p < 0 ||
2671             #endif
2672             end-s < Py_UNICODE_SIZE
2673             )
2674             {
2675             startinpos = s - starts;
2676             if (end-s < Py_UNICODE_SIZE) {
2677                 endinpos = end-starts;
2678                 reason = "truncated input";
2679             }
2680             else {
2681                 endinpos = s - starts + Py_UNICODE_SIZE;
2682                 reason = "illegal code point (> 0x10FFFF)";
2683             }
2684             outpos = p - PyUnicode_AS_UNICODE(v);
2685             if (unicode_decode_call_errorhandler(
2686                     errors, &errorHandler,
2687                     "unicode_internal", reason,
2688                     starts, size, &startinpos, &endinpos, &exc, &s,
2689                     (PyObject **)&v, &outpos, &p)) {
2690                 goto onError;
2691             }
2692         }
2693         else {
2694             p++;
2695             s += Py_UNICODE_SIZE;
2696         }
2697     }
2698
2699     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2700         goto onError;
2701     Py_XDECREF(errorHandler);
2702     Py_XDECREF(exc);
2703     return (PyObject *)v;
2704
2705  onError:
2706     Py_XDECREF(v);
2707     Py_XDECREF(errorHandler);
2708     Py_XDECREF(exc);
2709     return NULL;
2710 }
2711
2712 /* --- Latin-1 Codec ------------------------------------------------------ */
2713
2714 PyObject *PyUnicode_DecodeLatin1(const char *s,
2715                                  Py_ssize_t size,
2716                                  const char *errors)
2717 {
2718     PyUnicodeObject *v;
2719     Py_UNICODE *p;
2720
2721     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2722     if (size == 1) {
2723         Py_UNICODE r = *(unsigned char*)s;
2724         return PyUnicode_FromUnicode(&r, 1);
2725     }
2726
2727     v = _PyUnicode_New(size);
2728     if (v == NULL)
2729         goto onError;
2730     if (size == 0)
2731         return (PyObject *)v;
2732     p = PyUnicode_AS_UNICODE(v);
2733     while (size-- > 0)
2734         *p++ = (unsigned char)*s++;
2735     return (PyObject *)v;
2736
2737  onError:
2738     Py_XDECREF(v);
2739     return NULL;
2740 }
2741
2742 /* create or adjust a UnicodeEncodeError */
2743 static void make_encode_exception(PyObject **exceptionObject,
2744     const char *encoding,
2745     const Py_UNICODE *unicode, Py_ssize_t size,
2746     Py_ssize_t startpos, Py_ssize_t endpos,
2747     const char *reason)
2748 {
2749     if (*exceptionObject == NULL) {
2750         *exceptionObject = PyUnicodeEncodeError_Create(
2751             encoding, unicode, size, startpos, endpos, reason);
2752     }
2753     else {
2754         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2755             goto onError;
2756         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2757             goto onError;
2758         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2759             goto onError;
2760         return;
2761         onError:
2762         Py_DECREF(*exceptionObject);
2763         *exceptionObject = NULL;
2764     }
2765 }
2766
2767 /* raises a UnicodeEncodeError */
2768 static void raise_encode_exception(PyObject **exceptionObject,
2769     const char *encoding,
2770     const Py_UNICODE *unicode, Py_ssize_t size,
2771     Py_ssize_t startpos, Py_ssize_t endpos,
2772     const char *reason)
2773 {
2774     make_encode_exception(exceptionObject,
2775         encoding, unicode, size, startpos, endpos, reason);
2776     if (*exceptionObject != NULL)
2777         PyCodec_StrictErrors(*exceptionObject);
2778 }
2779
2780 /* error handling callback helper:
2781    build arguments, call the callback and check the arguments,
2782    put the result into newpos and return the replacement string, which
2783    has to be freed by the caller */
2784 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2785     PyObject **errorHandler,
2786     const char *encoding, const char *reason,
2787     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2788     Py_ssize_t startpos, Py_ssize_t endpos,
2789     Py_ssize_t *newpos)
2790 {
2791     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2792
2793     PyObject *restuple;
2794     PyObject *resunicode;
2795
2796     if (*errorHandler == NULL) {
2797         *errorHandler = PyCodec_LookupError(errors);
2798         if (*errorHandler == NULL)
2799             return NULL;
2800     }
2801
2802     make_encode_exception(exceptionObject,
2803         encoding, unicode, size, startpos, endpos, reason);
2804     if (*exceptionObject == NULL)
2805         return NULL;
2806
2807     restuple = PyObject_CallFunctionObjArgs(
2808         *errorHandler, *exceptionObject, NULL);
2809     if (restuple == NULL)
2810         return NULL;
2811     if (!PyTuple_Check(restuple)) {
2812         PyErr_Format(PyExc_TypeError, &argparse[4]);
2813         Py_DECREF(restuple);
2814         return NULL;
2815     }
2816     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2817         &resunicode, newpos)) {
2818         Py_DECREF(restuple);
2819         return NULL;
2820     }
2821     if (*newpos<0)
2822         *newpos = size+*newpos;
2823     if (*newpos<0 || *newpos>size) {
2824         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2825         Py_DECREF(restuple);
2826         return NULL;
2827     }
2828     Py_INCREF(resunicode);
2829     Py_DECREF(restuple);
2830     return resunicode;
2831 }
2832
2833 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2834                                  Py_ssize_t size,
2835                                  const char *errors,
2836                                  int limit)
2837 {
2838     /* output object */
2839     PyObject *res;
2840     /* pointers to the beginning and end+1 of input */
2841     const Py_UNICODE *startp = p;
2842     const Py_UNICODE *endp = p + size;
2843     /* pointer to the beginning of the unencodable characters */
2844     /* const Py_UNICODE *badp = NULL; */
2845     /* pointer into the output */
2846     char *str;
2847     /* current output position */
2848     Py_ssize_t respos = 0;
2849     Py_ssize_t ressize;
2850     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2851     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2852     PyObject *errorHandler = NULL;
2853     PyObject *exc = NULL;
2854     /* the following variable is used for caching string comparisons
2855      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2856     int known_errorHandler = -1;
2857
2858     /* allocate enough for a simple encoding without
2859        replacements, if we need more, we'll resize */
2860     res = PyString_FromStringAndSize(NULL, size);
2861     if (res == NULL)
2862         goto onError;
2863     if (size == 0)
2864         return res;
2865     str = PyString_AS_STRING(res);
2866     ressize = size;
2867
2868     while (p<endp) {
2869         Py_UNICODE c = *p;
2870
2871         /* can we encode this? */
2872         if (c<limit) {
2873             /* no overflow check, because we know that the space is enough */
2874             *str++ = (char)c;
2875             ++p;
2876         }
2877         else {
2878             Py_ssize_t unicodepos = p-startp;
2879             Py_ssize_t requiredsize;
2880             PyObject *repunicode;
2881             Py_ssize_t repsize;
2882             Py_ssize_t newpos;
2883             Py_ssize_t respos;
2884             Py_UNICODE *uni2;
2885             /* startpos for collecting unencodable chars */
2886             const Py_UNICODE *collstart = p;
2887             const Py_UNICODE *collend = p;
2888             /* find all unecodable characters */
2889             while ((collend < endp) && ((*collend)>=limit))
2890                 ++collend;
2891             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2892             if (known_errorHandler==-1) {
2893                 if ((errors==NULL) || (!strcmp(errors, "strict")))
2894                     known_errorHandler = 1;
2895                 else if (!strcmp(errors, "replace"))
2896                     known_errorHandler = 2;
2897                 else if (!strcmp(errors, "ignore"))
2898                     known_errorHandler = 3;
2899                 else if (!strcmp(errors, "xmlcharrefreplace"))
2900                     known_errorHandler = 4;
2901                 else
2902                     known_errorHandler = 0;
2903             }
2904             switch (known_errorHandler) {
2905                 case 1: /* strict */
2906                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2907                     goto onError;
2908                 case 2: /* replace */
2909                     while (collstart++<collend)
2910                         *str++ = '?'; /* fall through */
2911                 case 3: /* ignore */
2912                     p = collend;
2913                     break;
2914                 case 4: /* xmlcharrefreplace */
2915                     respos = str-PyString_AS_STRING(res);
2916                     /* determine replacement size (temporarily (mis)uses p) */
2917                     for (p = collstart, repsize = 0; p < collend; ++p) {
2918                         if (*p<10)
2919                             repsize += 2+1+1;
2920                         else if (*p<100)
2921                             repsize += 2+2+1;
2922                         else if (*p<1000)
2923                             repsize += 2+3+1;
2924                         else if (*p<10000)
2925                             repsize += 2+4+1;
2926 #ifndef Py_UNICODE_WIDE
2927                         else
2928                             repsize += 2+5+1;
2929 #else
2930                         else if (*p<100000)
2931                             repsize += 2+5+1;
2932                         else if (*p<1000000)
2933                             repsize += 2+6+1;
2934                         else
2935                             repsize += 2+7+1;
2936 #endif
2937                     }
2938                     requiredsize = respos+repsize+(endp-collend);
2939                     if (requiredsize > ressize) {
2940                         if (requiredsize<2*ressize)
2941                             requiredsize = 2*ressize;
2942                         if (_PyString_Resize(&res, requiredsize))
2943                             goto onError;
2944                         str = PyString_AS_STRING(res) + respos;
2945                         ressize = requiredsize;
2946                     }
2947                     /* generate replacement (temporarily (mis)uses p) */
2948                     for (p = collstart; p < collend; ++p) {
2949                         str += sprintf(str, "&#%d;", (int)*p);
2950                     }
2951                     p = collend;
2952                     break;
2953                 default:
2954                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2955                         encoding, reason, startp, size, &exc,
2956                         collstart-startp, collend-startp, &newpos);
2957                     if (repunicode == NULL)
2958                         goto onError;
2959                     /* need more space? (at least enough for what we
2960                        have+the replacement+the rest of the string, so
2961                        we won't have to check space for encodable characters) */
2962                     respos = str-PyString_AS_STRING(res);
2963                     repsize = PyUnicode_GET_SIZE(repunicode);
2964                     requiredsize = respos+repsize+(endp-collend);
2965                     if (requiredsize > ressize) {
2966                         if (requiredsize<2*ressize)
2967                             requiredsize = 2*ressize;
2968                         if (_PyString_Resize(&res, requiredsize)) {
2969                             Py_DECREF(repunicode);
2970                             goto onError;
2971                         }
2972                         str = PyString_AS_STRING(res) + respos;
2973                         ressize = requiredsize;
2974                     }
2975                     /* check if there is anything unencodable in the replacement
2976                        and copy it to the output */
2977                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2978                         c = *uni2;
2979                         if (c >= limit) {
2980                             raise_encode_exception(&exc, encoding, startp, size,
2981                                 unicodepos, unicodepos+1, reason);
2982                             Py_DECREF(repunicode);
2983                             goto onError;
2984                         }
2985                         *str = (char)c;
2986                     }
2987                     p = startp + newpos;
2988                     Py_DECREF(repunicode);
2989             }
2990         }
2991     }
2992     /* Resize if we allocated to much */
2993     respos = str-PyString_AS_STRING(res);
2994     if (respos<ressize)
2995        /* If this falls res will be NULL */
2996         _PyString_Resize(&res, respos);
2997     Py_XDECREF(errorHandler);
2998     Py_XDECREF(exc);
2999     return res;
3000
3001     onError:
3002     Py_XDECREF(res);
3003     Py_XDECREF(errorHandler);
3004     Py_XDECREF(exc);
3005     return NULL;
3006 }
3007
3008 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3009                                  Py_ssize_t size,
3010                                  const char *errors)
3011 {
3012     return unicode_encode_ucs1(p, size, errors, 256);
3013 }
3014
3015 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3016 {
3017     if (!PyUnicode_Check(unicode)) {
3018         PyErr_BadArgument();
3019         return NULL;
3020     }
3021     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3022                                   PyUnicode_GET_SIZE(unicode),
3023                                   NULL);
3024 }
3025
3026 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3027
3028 PyObject *PyUnicode_DecodeASCII(const char *s,
3029                                 Py_ssize_t size,
3030                                 const char *errors)
3031 {
3032     const char *starts = s;
3033     PyUnicodeObject *v;
3034     Py_UNICODE *p;
3035     Py_ssize_t startinpos;
3036     Py_ssize_t endinpos;
3037     Py_ssize_t outpos;
3038     const char *e;
3039     PyObject *errorHandler = NULL;
3040     PyObject *exc = NULL;
3041
3042     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3043     if (size == 1 && *(unsigned char*)s < 128) {
3044         Py_UNICODE r = *(unsigned char*)s;
3045         return PyUnicode_FromUnicode(&r, 1);
3046     }
3047
3048     v = _PyUnicode_New(size);
3049     if (v == NULL)
3050         goto onError;
3051     if (size == 0)
3052         return (PyObject *)v;
3053     p = PyUnicode_AS_UNICODE(v);
3054     e = s + size;
3055     while (s < e) {
3056         register unsigned char c = (unsigned char)*s;
3057         if (c < 128) {
3058             *p++ = c;
3059             ++s;
3060         }
3061         else {
3062             startinpos = s-starts;
3063             endinpos = startinpos + 1;
3064             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3065             if (unicode_decode_call_errorhandler(
3066                  errors, &errorHandler,
3067                  "ascii", "ordinal not in range(128)",
3068                  starts, size, &startinpos, &endinpos, &exc, &s,
3069                  (PyObject **)&v, &outpos, &p))
3070                 goto onError;
3071         }
3072     }
3073     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3074         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3075             goto onError;
3076     Py_XDECREF(errorHandler);
3077     Py_XDECREF(exc);
3078     return (PyObject *)v;
3079
3080  onError:
3081     Py_XDECREF(v);
3082     Py_XDECREF(errorHandler);
3083     Py_XDECREF(exc);
3084     return NULL;
3085 }
3086
3087 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3088                                 Py_ssize_t size,
3089                                 const char *errors)
3090 {
3091     return unicode_encode_ucs1(p, size, errors, 128);
3092 }
3093
3094 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3095 {
3096     if (!PyUnicode_Check(unicode)) {
3097         PyErr_BadArgument();
3098         return NULL;
3099     }
3100     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3101                                  PyUnicode_GET_SIZE(unicode),
3102                                  NULL);
3103 }
3104
3105 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3106
3107 /* --- MBCS codecs for Windows -------------------------------------------- */
3108
3109 #if SIZEOF_INT < SIZEOF_SSIZE_T
3110 #define NEED_RETRY
3111 #endif
3112
3113 /* XXX This code is limited to "true" double-byte encodings, as
3114    a) it assumes an incomplete character consists of a single byte, and
3115    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3116       encodings, see IsDBCSLeadByteEx documentation. */
3117
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when the character
   that CharPrev() finds before it does not claim it: i.e. there is no
   previous character, the previous character does not start with a lead
   byte, or the previous character starts two bytes back. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);
    if (before == cur)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (cur - before == 2);
}
3128
3129 /*
3130  * Decode MBCS string into unicode object. If 'final' is set, converts
3131  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3132  */
3133 static int decode_mbcs(PyUnicodeObject **v,
3134                         const char *s, /* MBCS string */
3135                         int size, /* sizeof MBCS string */
3136                         int final)
3137 {
3138     Py_UNICODE *p;
3139     Py_ssize_t n = 0;
3140     int usize = 0;
3141
3142     assert(size >= 0);
3143
3144     /* Skip trailing lead-byte unless 'final' is set */
3145     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3146         --size;
3147
3148     /* First get the size of the result */
3149     if (size > 0) {
3150         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3151         if (usize == 0) {
3152             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3153             return -1;
3154         }
3155     }
3156
3157     if (*v == NULL) {
3158         /* Create unicode object */
3159         *v = _PyUnicode_New(usize);
3160         if (*v == NULL)
3161             return -1;
3162     }
3163     else {
3164         /* Extend unicode object */
3165         n = PyUnicode_GET_SIZE(*v);
3166         if (_PyUnicode_Resize(v, n + usize) < 0)
3167             return -1;
3168     }
3169
3170     /* Do the conversion */
3171     if (size > 0) {
3172         p = PyUnicode_AS_UNICODE(*v) + n;
3173         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3174             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3175             return -1;
3176         }
3177     }
3178
3179     return size;
3180 }
3181
3182 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3183                                         Py_ssize_t size,
3184                                         const char *errors,
3185                                         Py_ssize_t *consumed)
3186 {
3187     PyUnicodeObject *v = NULL;
3188     int done;
3189
3190     if (consumed)
3191         *consumed = 0;
3192
3193 #ifdef NEED_RETRY
3194   retry:
3195     if (size > INT_MAX)
3196         done = decode_mbcs(&v, s, INT_MAX, 0);
3197     else
3198 #endif
3199         done = decode_mbcs(&v, s, (int)size, !consumed);
3200
3201     if (done < 0) {
3202         Py_XDECREF(v);
3203         return NULL;
3204     }
3205
3206     if (consumed)
3207         *consumed += done;
3208
3209 #ifdef NEED_RETRY
3210     if (size > INT_MAX) {
3211         s += done;
3212         size -= done;
3213         goto retry;
3214     }
3215 #endif
3216
3217     return (PyObject *)v;
3218 }
3219
3220 PyObject *PyUnicode_DecodeMBCS(const char *s,
3221                                 Py_ssize_t size,
3222                                 const char *errors)
3223 {
3224     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3225 }
3226
3227 /*
3228  * Convert unicode into string object (MBCS).
3229  * Returns 0 if succeed, -1 otherwise.
3230  */
3231 static int encode_mbcs(PyObject **repr,
3232                         const Py_UNICODE *p, /* unicode */
3233                         int size) /* size of unicode */
3234 {
3235     int mbcssize = 0;
3236     Py_ssize_t n = 0;
3237
3238     assert(size >= 0);
3239
3240     /* First get the size of the result */
3241     if (size > 0) {
3242         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3243         if (mbcssize == 0) {
3244             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3245             return -1;
3246         }
3247     }
3248
3249     if (*repr == NULL) {
3250         /* Create string object */
3251         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3252         if (*repr == NULL)
3253             return -1;
3254     }
3255     else {
3256         /* Extend string object */
3257         n = PyString_Size(*repr);
3258         if (_PyString_Resize(repr, n + mbcssize) < 0)
3259             return -1;
3260     }
3261
3262     /* Do the conversion */
3263     if (size > 0) {
3264         char *s = PyString_AS_STRING(*repr) + n;
3265         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3266             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3267             return -1;
3268         }
3269     }
3270
3271     return 0;
3272 }
3273
3274 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3275                                 Py_ssize_t size,
3276                                 const char *errors)
3277 {
3278     PyObject *repr = NULL;
3279     int ret;
3280
3281 #ifdef NEED_RETRY
3282  retry:
3283     if (size > INT_MAX)
3284         ret = encode_mbcs(&repr, p, INT_MAX);
3285     else
3286 #endif
3287         ret = encode_mbcs(&repr, p, (int)size);
3288
3289     if (ret < 0) {
3290         Py_XDECREF(repr);
3291         return NULL;
3292     }
3293
3294 #ifdef NEED_RETRY
3295     if (size > INT_MAX) {
3296         p += INT_MAX;
3297         size -= INT_MAX;
3298         goto retry;
3299     }
3300 #endif
3301
3302     return repr;
3303 }
3304
3305 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3306 {
3307     if (!PyUnicode_Check(unicode)) {
3308         PyErr_BadArgument();
3309         return NULL;
3310     }
3311     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3312                                 PyUnicode_GET_SIZE(unicode),
3313                                 NULL);
3314 }
3315
3316 #undef NEED_RETRY
3317
3318 #endif /* MS_WINDOWS */
3319
3320 /* --- Character Mapping Codec -------------------------------------------- */
3321
3322 PyObject *PyUnicode_DecodeCharmap(const char *s,
3323                                   Py_ssize_t size,
3324                                   PyObject *mapping,
3325                                   const char *errors)
3326 {
3327     const char *starts = s;
3328     Py_ssize_t startinpos;
3329     Py_ssize_t endinpos;
3330     Py_ssize_t outpos;
3331     const char *e;
3332     PyUnicodeObject *v;
3333     Py_UNICODE *p;
3334     Py_ssize_t extrachars = 0;
3335     PyObject *errorHandler = NULL;
3336     PyObject *exc = NULL;
3337     Py_UNICODE *mapstring = NULL;
3338     Py_ssize_t maplen = 0;
3339
3340     /* Default to Latin-1 */
3341     if (mapping == NULL)
3342         return PyUnicode_DecodeLatin1(s, size, errors);
3343
3344     v = _PyUnicode_New(size);
3345     if (v == NULL)
3346         goto onError;
3347     if (size == 0)
3348         return (PyObject *)v;
3349     p = PyUnicode_AS_UNICODE(v);
3350     e = s + size;
3351     if (PyUnicode_CheckExact(mapping)) {
3352         mapstring = PyUnicode_AS_UNICODE(mapping);
3353         maplen = PyUnicode_GET_SIZE(mapping);
3354         while (s < e) {
3355             unsigned char ch = *s;
3356             Py_UNICODE x = 0xfffe; /* illegal value */
3357
3358             if (ch < maplen)
3359                 x = mapstring[ch];
3360
3361             if (x == 0xfffe) {
3362                 /* undefined mapping */
3363                 outpos = p-PyUnicode_AS_UNICODE(v);
3364                 startinpos = s-starts;
3365                 endinpos = startinpos+1;
3366                 if (unicode_decode_call_errorhandler(
3367                      errors, &errorHandler,
3368                      "charmap", "character maps to <undefined>",
3369                      starts, size, &startinpos, &endinpos, &exc, &s,
3370                      (PyObject **)&v, &outpos, &p)) {
3371                     goto onError;
3372                 }
3373                 continue;
3374             }
3375             *p++ = x;
3376             ++s;
3377         }
3378     }
3379     else {
3380         while (s < e) {
3381             unsigned char ch = *s;
3382             PyObject *w, *x;
3383
3384             /* Get mapping (char ordinal -> integer, Unicode char or None) */
3385             w = PyInt_FromLong((long)ch);
3386             if (w == NULL)
3387                 goto onError;
3388             x = PyObject_GetItem(mapping, w);
3389             Py_DECREF(w);
3390             if (x == NULL) {
3391                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3392                     /* No mapping found means: mapping is undefined. */
3393                     PyErr_Clear();
3394                     x = Py_None;
3395                     Py_INCREF(x);
3396                 } else
3397                     goto onError;
3398             }
3399
3400             /* Apply mapping */
3401             if (PyInt_Check(x)) {
3402                 long value = PyInt_AS_LONG(x);
3403                 if (value < 0 || value > 65535) {
3404                     PyErr_SetString(PyExc_TypeError,
3405                                     "character mapping must be in range(65536)");
3406                     Py_DECREF(x);
3407                     goto onError;
3408                 }
3409                 *p++ = (Py_UNICODE)value;
3410             }
3411             else if (x == Py_None) {
3412                 /* undefined mapping */
3413                 outpos = p-PyUnicode_AS_UNICODE(v);
3414                 startinpos = s-starts;
3415                 endinpos = startinpos+1;
3416                 if (unicode_decode_call_errorhandler(
3417                      errors, &errorHandler,
3418                      "charmap", "character maps to <undefined>",
3419                      starts, size, &startinpos, &endinpos, &exc, &s,
3420                      (PyObject **)&v, &outpos, &p)) {
3421                     Py_DECREF(x);
3422                     goto onError;
3423                 }
3424                 Py_DECREF(x);
3425                 continue;
3426             }
3427             else if (PyUnicode_Check(x)) {
3428                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3429
3430                 if (targetsize == 1)
3431                     /* 1-1 mapping */
3432                     *p++ = *PyUnicode_AS_UNICODE(x);
3433
3434                 else if (targetsize > 1) {
3435                     /* 1-n mapping */
3436                     if (targetsize > extrachars) {
3437                         /* resize first */
3438                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3439                         Py_ssize_t needed = (targetsize - extrachars) + \
3440                                      (targetsize << 2);
3441                         extrachars += needed;
3442                         /* XXX overflow detection missing */
3443                         if (_PyUnicode_Resize(&v,
3444                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
3445                             Py_DECREF(x);
3446                             goto onError;
3447                         }
3448                         p = PyUnicode_AS_UNICODE(v) + oldpos;
3449                     }
3450                     Py_UNICODE_COPY(p,
3451                                     PyUnicode_AS_UNICODE(x),
3452                                     targetsize);
3453                     p += targetsize;
3454                     extrachars -= targetsize;
3455                 }
3456                 /* 1-0 mapping: skip the character */
3457             }
3458             else {
3459                 /* wrong return value */
3460                 PyErr_SetString(PyExc_TypeError,
3461                       "character mapping must return integer, None or unicode");
3462                 Py_DECREF(x);
3463                 goto onError;
3464             }
3465             Py_DECREF(x);
3466             ++s;
3467         }
3468     }
3469     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3470         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3471             goto onError;
3472     Py_XDECREF(errorHandler);
3473     Py_XDECREF(exc);
3474     return (PyObject *)v;
3475
3476  onError:
3477     Py_XDECREF(errorHandler);
3478     Py_XDECREF(exc);
3479     Py_XDECREF(v);
3480     return NULL;
3481 }
3482
3483 /* Charmap encoding: the lookup table */
3484
3485 struct encoding_map{
3486   PyObject_HEAD
3487   unsigned char level1[32];
3488   int count2, count3;
3489   unsigned char level23[1];
3490 };
3491
3492 static PyObject*
3493 encoding_map_size(PyObject *obj, PyObject* args)
3494 {
3495     struct encoding_map *map = (struct encoding_map*)obj;
3496     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3497                           128*map->count3);
3498 }
3499
3500 static PyMethodDef encoding_map_methods[] = {
3501         {"size", encoding_map_size, METH_NOARGS,
3502          PyDoc_STR("Return the size (in bytes) of this object") },
3503         { 0 }
3504 };
3505
3506 static void
3507 encoding_map_dealloc(PyObject* o)
3508 {
3509         PyObject_FREE(o);
3510 }
3511
3512 static PyTypeObject EncodingMapType = {
3513         PyVarObject_HEAD_INIT(NULL, 0)
3514         "EncodingMap",          /*tp_name*/
3515         sizeof(struct encoding_map),   /*tp_basicsize*/
3516         0,                      /*tp_itemsize*/
3517         /* methods */
3518         encoding_map_dealloc,   /*tp_dealloc*/
3519         0,                      /*tp_print*/
3520         0,                      /*tp_getattr*/
3521         0,                      /*tp_setattr*/
3522         0,                      /*tp_compare*/
3523         0,                      /*tp_repr*/
3524         0,                      /*tp_as_number*/
3525         0,                      /*tp_as_sequence*/
3526         0,                      /*tp_as_mapping*/
3527         0,                      /*tp_hash*/
3528         0,                      /*tp_call*/
3529         0,                      /*tp_str*/
3530         0,                      /*tp_getattro*/
3531         0,                      /*tp_setattro*/
3532         0,                      /*tp_as_buffer*/
3533         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
3534         0,                      /*tp_doc*/
3535         0,                      /*tp_traverse*/
3536         0,                      /*tp_clear*/
3537         0,                      /*tp_richcompare*/
3538         0,                      /*tp_weaklistoffset*/
3539         0,                      /*tp_iter*/
3540         0,                      /*tp_iternext*/
3541         encoding_map_methods,   /*tp_methods*/
3542         0,                      /*tp_members*/
3543         0,                      /*tp_getset*/
3544         0,                      /*tp_base*/
3545         0,                      /*tp_dict*/
3546         0,                      /*tp_descr_get*/
3547         0,                      /*tp_descr_set*/
3548         0,                      /*tp_dictoffset*/
3549         0,                      /*tp_init*/
3550         0,                      /*tp_alloc*/
3551         0,                      /*tp_new*/
3552         0,                      /*tp_free*/
3553         0,                      /*tp_is_gc*/
3554 };
3555
3556 PyObject*
3557 PyUnicode_BuildEncodingMap(PyObject* string)
3558 {
3559     Py_UNICODE *decode;
3560     PyObject *result;
3561     struct encoding_map *mresult;
3562     int i;
3563     int need_dict = 0;
3564     unsigned char level1[32];
3565     unsigned char level2[512];
3566     unsigned char *mlevel1, *mlevel2, *mlevel3;
3567     int count2 = 0, count3 = 0;
3568
3569     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3570         PyErr_BadArgument();
3571         return NULL;
3572     }
3573     decode = PyUnicode_AS_UNICODE(string);
3574     memset(level1, 0xFF, sizeof level1);
3575     memset(level2, 0xFF, sizeof level2);
3576
3577     /* If there isn't a one-to-one mapping of NULL to \0,
3578        or if there are non-BMP characters, we need to use
3579        a mapping dictionary. */
3580     if (decode[0] != 0)
3581         need_dict = 1;
3582     for (i = 1; i < 256; i++) {
3583         int l1, l2;
3584         if (decode[i] == 0
3585             #ifdef Py_UNICODE_WIDE
3586             || decode[i] > 0xFFFF
3587             #endif
3588         ) {
3589             need_dict = 1;
3590             break;
3591         }
3592         if (decode[i] == 0xFFFE)
3593             /* unmapped character */
3594             continue;
3595         l1 = decode[i] >> 11;
3596         l2 = decode[i] >> 7;
3597         if (level1[l1] == 0xFF)
3598             level1[l1] = count2++;
3599         if (level2[l2] == 0xFF)
3600             level2[l2] = count3++;
3601     }
3602
3603     if (count2 >= 0xFF || count3 >= 0xFF)
3604         need_dict = 1;
3605
3606     if (need_dict) {
3607         PyObject *result = PyDict_New();
3608         PyObject *key, *value;
3609         if (!result)
3610             return NULL;
3611         for (i = 0; i < 256; i++) {
3612             key = value = NULL;
3613             key = PyInt_FromLong(decode[i]);
3614             value = PyInt_FromLong(i);
3615             if (!key || !value)
3616                 goto failed1;
3617             if (PyDict_SetItem(result, key, value) == -1)
3618                 goto failed1;
3619             Py_DECREF(key);
3620             Py_DECREF(value);
3621         }
3622         return result;
3623       failed1:
3624         Py_XDECREF(key);
3625         Py_XDECREF(value);
3626         Py_DECREF(result);
3627         return NULL;
3628     }
3629
3630     /* Create a three-level trie */
3631     result = PyObject_MALLOC(sizeof(struct encoding_map) +
3632                              16*count2 + 128*count3 - 1);
3633     if (!result)
3634         return PyErr_NoMemory();
3635     PyObject_Init(result, &EncodingMapType);
3636     mresult = (struct encoding_map*)result;
3637     mresult->count2 = count2;
3638     mresult->count3 = count3;
3639     mlevel1 = mresult->level1;
3640     mlevel2 = mresult->level23;
3641     mlevel3 = mresult->level23 + 16*count2;
3642     memcpy(mlevel1, level1, 32);
3643     memset(mlevel2, 0xFF, 16*count2);
3644     memset(mlevel3, 0, 128*count3);
3645     count3 = 0;
3646     for (i = 1; i < 256; i++) {
3647         int o1, o2, o3, i2, i3;
3648         if (decode[i] == 0xFFFE)
3649             /* unmapped character */
3650             continue;
3651         o1 = decode[i]>>11;
3652         o2 = (decode[i]>>7) & 0xF;
3653         i2 = 16*mlevel1[o1] + o2;
3654         if (mlevel2[i2] == 0xFF)
3655             mlevel2[i2] = count3++;
3656         o3 = decode[i] & 0x7F;
3657         i3 = 128*mlevel2[i2] + o3;
3658         mlevel3[i3] = i;
3659     }
3660     return result;
3661 }
3662
3663 static int
3664 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3665 {
3666     struct encoding_map *map = (struct encoding_map*)mapping;
3667     int l1 = c>>11;
3668     int l2 = (c>>7) & 0xF;
3669     int l3 = c & 0x7F;
3670     int i;
3671
3672 #ifdef Py_UNICODE_WIDE
3673     if (c > 0xFFFF) {
3674         return -1;
3675     }
3676 #endif
3677     if (c == 0)
3678         return 0;
3679     /* level 1*/
3680     i = map->level1[l1];
3681     if (i == 0xFF) {
3682         return -1;
3683     }
3684     /* level 2*/
3685     i = map->level23[16*i+l2];
3686     if (i == 0xFF) {
3687         return -1;
3688     }
3689     /* level 3 */
3690     i = map->level23[16*map->count2 + 128*i + l3];
3691     if (i == 0) {
3692         return -1;
3693     }
3694     return i;
3695 }
3696
3697 /* Lookup the character ch in the mapping. If the character
3698    can't be found, Py_None is returned (or NULL, if another
3699    error occurred). */
3700 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3701 {
3702     PyObject *w = PyInt_FromLong((long)c);
3703     PyObject *x;
3704
3705     if (w == NULL)
3706          return NULL;
3707     x = PyObject_GetItem(mapping, w);
3708     Py_DECREF(w);
3709     if (x == NULL) {
3710         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3711             /* No mapping found means: mapping is undefined. */
3712             PyErr_Clear();
3713             x = Py_None;
3714             Py_INCREF(x);
3715             return x;
3716         } else
3717             return NULL;
3718     }
3719     else if (x == Py_None)
3720         return x;
3721     else if (PyInt_Check(x)) {
3722         long value = PyInt_AS_LONG(x);
3723         if (value < 0 || value > 255) {
3724             PyErr_SetString(PyExc_TypeError,
3725                              "character mapping must be in range(256)");
3726             Py_DECREF(x);
3727             return NULL;
3728         }
3729         return x;
3730     }
3731     else if (PyString_Check(x))
3732         return x;
3733     else {
3734         /* wrong return value */
3735         PyErr_SetString(PyExc_TypeError,
3736               "character mapping must return integer, None or str");
3737         Py_DECREF(x);
3738         return NULL;
3739     }
3740 }
3741
3742 static int
3743 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3744 {
3745         Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3746         /* exponentially overallocate to minimize reallocations */
3747         if (requiredsize < 2*outsize)
3748             requiredsize = 2*outsize;
3749         if (_PyString_Resize(outobj, requiredsize)) {
3750             return 0;
3751         }
3752         return 1;
3753 }
3754
/* Outcome of encoding a single character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
3758 /* lookup the character, put the result in the output string and adjust
3759    various state variables. Reallocate the output string if not enough
3760    space is available. Return a new reference to the object that
3761    was put in the output buffer, or Py_None, if the mapping was undefined
3762    (in which case no character was written) or NULL, if a
3763    reallocation error occurred. The caller must decref the result */
3764 static
3765 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3766     PyObject **outobj, Py_ssize_t *outpos)
3767 {
3768     PyObject *rep;
3769     char *outstart;
3770     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3771
3772     if (Py_Type(mapping) == &EncodingMapType) {
3773         int res = encoding_map_lookup(c, mapping);
3774         Py_ssize_t requiredsize = *outpos+1;
3775         if (res == -1)
3776             return enc_FAILED;
3777         if (outsize<requiredsize)
3778             if (!charmapencode_resize(outobj, outpos, requiredsize))
3779                 return enc_EXCEPTION;
3780         outstart = PyString_AS_STRING(*outobj);
3781         outstart[(*outpos)++] = (char)res;
3782         return enc_SUCCESS;
3783     }
3784
3785     rep = charmapencode_lookup(c, mapping);
3786     if (rep==NULL)
3787         return enc_EXCEPTION;
3788     else if (rep==Py_None) {
3789         Py_DECREF(rep);
3790         return enc_FAILED;
3791     } else {
3792         if (PyInt_Check(rep)) {
3793             Py_ssize_t requiredsize = *outpos+1;
3794             if (outsize<requiredsize)
3795                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3796                     Py_DECREF(rep);
3797                     return enc_EXCEPTION;
3798                 }
3799             outstart = PyString_AS_STRING(*outobj);
3800             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3801         }
3802         else {
3803             const char *repchars = PyString_AS_STRING(rep);
3804             Py_ssize_t repsize = PyString_GET_SIZE(rep);
3805             Py_ssize_t requiredsize = *outpos+repsize;
3806             if (outsize<requiredsize)
3807                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3808                     Py_DECREF(rep);
3809                     return enc_EXCEPTION;
3810                 }
3811             outstart = PyString_AS_STRING(*outobj);
3812             memcpy(outstart + *outpos, repchars, repsize);
3813             *outpos += repsize;
3814         }
3815     }
3816     Py_DECREF(rep);
3817     return enc_SUCCESS;
3818 }
3819
3820 /* handle an error in PyUnicode_EncodeCharmap
3821    Return 0 on success, -1 on error */
3822 static
3823 int charmap_encoding_error(
3824     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3825     PyObject **exceptionObject,
3826     int *known_errorHandler, PyObject **errorHandler, const char *errors,
3827     PyObject **res, Py_ssize_t *respos)
3828 {
3829     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3830     Py_ssize_t repsize;
3831     Py_ssize_t newpos;
3832     Py_UNICODE *uni2;
3833     /* startpos for collecting unencodable chars */
3834     Py_ssize_t collstartpos = *inpos;
3835     Py_ssize_t collendpos = *inpos+1;
3836     Py_ssize_t collpos;
3837     char *encoding = "charmap";
3838     char *reason = "character maps to <undefined>";
3839     charmapencode_result x;
3840
3841     /* find all unencodable characters */
3842     while (collendpos < size) {
3843         PyObject *rep;
3844         if (Py_Type(mapping) == &EncodingMapType) {
3845             int res = encoding_map_lookup(p[collendpos], mapping);
3846             if (res != -1)
3847                 break;
3848             ++collendpos;
3849             continue;
3850         }
3851
3852         rep = charmapencode_lookup(p[collendpos], mapping);
3853         if (rep==NULL)
3854             return -1;
3855         else if (rep!=Py_None) {
3856             Py_DECREF(rep);
3857             break;
3858         }
3859         Py_DECREF(rep);
3860         ++collendpos;
3861     }
3862     /* cache callback name lookup
3863      * (if not done yet, i.e. it's the first error) */
3864     if (*known_errorHandler==-1) {
3865         if ((errors==NULL) || (!strcmp(errors, "strict")))
3866             *known_errorHandler = 1;
3867         else if (!strcmp(errors, "replace"))
3868             *known_errorHandler = 2;
3869         else if (!strcmp(errors, "ignore"))
3870             *known_errorHandler = 3;
3871         else if (!strcmp(errors, "xmlcharrefreplace"))
3872             *known_errorHandler = 4;
3873         else
3874             *known_errorHandler = 0;
3875     }
3876     switch (*known_errorHandler) {
3877         case 1: /* strict */
3878             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3879             return -1;
3880         case 2: /* replace */
3881             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3882                 x = charmapencode_output('?', mapping, res, respos);
3883                 if (x==enc_EXCEPTION) {
3884                     return -1;
3885                 }
3886                 else if (x==enc_FAILED) {
3887                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3888                     return -1;
3889                 }
3890             }
3891             /* fall through */
3892         case 3: /* ignore */
3893             *inpos = collendpos;
3894             break;
3895         case 4: /* xmlcharrefreplace */
3896             /* generate replacement (temporarily (mis)uses p) */
3897             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3898                 char buffer[2+29+1+1];
3899                 char *cp;
3900                 sprintf(buffer, "&#%d;", (int)p[collpos]);
3901                 for (cp = buffer; *cp; ++cp) {
3902                     x = charmapencode_output(*cp, mapping, res, respos);
3903                     if (x==enc_EXCEPTION)
3904                         return -1;
3905                     else if (x==enc_FAILED) {
3906                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3907                         return -1;
3908                     }
3909                 }
3910             }
3911             *inpos = collendpos;
3912             break;
3913         default:
3914             repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3915                 encoding, reason, p, size, exceptionObject,
3916                 collstartpos, collendpos, &newpos);
3917             if (repunicode == NULL)
3918                 return -1;
3919             /* generate replacement  */
3920             repsize = PyUnicode_GET_SIZE(repunicode);
3921             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3922                 x = charmapencode_output(*uni2, mapping, res, respos);
3923                 if (x==enc_EXCEPTION) {
3924                     return -1;
3925                 }
3926                 else if (x==enc_FAILED) {
3927                     Py_DECREF(repunicode);
3928                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3929                     return -1;
3930                 }
3931             }
3932             *inpos = newpos;
3933             Py_DECREF(repunicode);
3934     }
3935     return 0;
3936 }
3937
3938 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3939                                   Py_ssize_t size,
3940                                   PyObject *mapping,
3941                                   const char *errors)
3942 {
3943     /* output object */
3944     PyObject *res = NULL;
3945     /* current input position */
3946     Py_ssize_t inpos = 0;
3947     /* current output position */
3948     Py_ssize_t respos = 0;
3949     PyObject *errorHandler = NULL;
3950     PyObject *exc = NULL;
3951     /* the following variable is used for caching string comparisons
3952      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3953      * 3=ignore, 4=xmlcharrefreplace */
3954     int known_errorHandler = -1;
3955
3956     /* Default to Latin-1 */
3957     if (mapping == NULL)
3958         return PyUnicode_EncodeLatin1(p, size, errors);
3959
3960     /* allocate enough for a simple encoding without
3961        replacements, if we need more, we'll resize */
3962     res = PyString_FromStringAndSize(NULL, size);
3963     if (res == NULL)
3964         goto onError;
3965     if (size == 0)
3966         return res;
3967
3968     while (inpos<size) {
3969         /* try to encode it */
3970         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3971         if (x==enc_EXCEPTION) /* error */
3972             goto onError;
3973         if (x==enc_FAILED) { /* unencodable character */
3974             if (charmap_encoding_error(p, size, &inpos, mapping,
3975                 &exc,
3976                 &known_errorHandler, &errorHandler, errors,
3977                 &res, &respos)) {
3978                 goto onError;
3979             }
3980         }
3981         else
3982             /* done with this character => adjust input position */
3983             ++inpos;
3984     }
3985
3986     /* Resize if we allocated to much */
3987     if (respos<PyString_GET_SIZE(res)) {
3988         if (_PyString_Resize(&res, respos))
3989             goto onError;
3990     }
3991     Py_XDECREF(exc);
3992     Py_XDECREF(errorHandler);
3993     return res;
3994
3995     onError:
3996     Py_XDECREF(res);
3997     Py_XDECREF(exc);
3998     Py_XDECREF(errorHandler);
3999     return NULL;
4000 }
4001
4002 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4003                                     PyObject *mapping)
4004 {
4005     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4006         PyErr_BadArgument();
4007         return NULL;
4008     }
4009     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4010                                    PyUnicode_GET_SIZE(unicode),
4011                                    mapping,
4012                                    NULL);
4013 }
4014
4015 /* create or adjust a UnicodeTranslateError */
4016 static void make_translate_exception(PyObject **exceptionObject,
4017     const Py_UNICODE *unicode, Py_ssize_t size,
4018     Py_ssize_t startpos, Py_ssize_t endpos,
4019     const char *reason)
4020 {
4021     if (*exceptionObject == NULL) {
4022         *exceptionObject = PyUnicodeTranslateError_Create(
4023             unicode, size, startpos, endpos, reason);
4024     }
4025     else {
4026         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4027             goto onError;
4028         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4029             goto onError;
4030         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4031             goto onError;
4032         return;
4033         onError:
4034         Py_DECREF(*exceptionObject);
4035         *exceptionObject = NULL;
4036     }
4037 }
4038
4039 /* raises a UnicodeTranslateError */
4040 static void raise_translate_exception(PyObject **exceptionObject,
4041     const Py_UNICODE *unicode, Py_ssize_t size,
4042     Py_ssize_t startpos, Py_ssize_t endpos,
4043     const char *reason)
4044 {
4045     make_translate_exception(exceptionObject,
4046         unicode, size, startpos, endpos, reason);
4047     if (*exceptionObject != NULL)
4048         PyCodec_StrictErrors(*exceptionObject);
4049 }
4050
4051 /* error handling callback helper:
4052    build arguments, call the callback and check the arguments,
4053    put the result into newpos and return the replacement string, which
4054    has to be freed by the caller */
4055 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4056     PyObject **errorHandler,
4057     const char *reason,
4058     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4059     Py_ssize_t startpos, Py_ssize_t endpos,
4060     Py_ssize_t *newpos)
4061 {
4062     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4063
4064     Py_ssize_t i_newpos;
4065     PyObject *restuple;
4066     PyObject *resunicode;
4067
4068     if (*errorHandler == NULL) {
4069         *errorHandler = PyCodec_LookupError(errors);
4070         if (*errorHandler == NULL)
4071             return NULL;
4072     }
4073
4074     make_translate_exception(exceptionObject,
4075         unicode, size, startpos, endpos, reason);
4076     if (*exceptionObject == NULL)
4077         return NULL;
4078
4079     restuple = PyObject_CallFunctionObjArgs(
4080         *errorHandler, *exceptionObject, NULL);
4081     if (restuple == NULL)
4082         return NULL;
4083     if (!PyTuple_Check(restuple)) {
4084         PyErr_Format(PyExc_TypeError, &argparse[4]);
4085         Py_DECREF(restuple);
4086         return NULL;
4087     }
4088     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4089         &resunicode, &i_newpos)) {
4090         Py_DECREF(restuple);
4091         return NULL;
4092     }
4093     if (i_newpos<0)
4094         *newpos = size+i_newpos;
4095     else
4096         *newpos = i_newpos;
4097     if (*newpos<0 || *newpos>size) {
4098         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4099         Py_DECREF(restuple);
4100         return NULL;
4101     }
4102     Py_INCREF(resunicode);
4103     Py_DECREF(restuple);
4104     return resunicode;
4105 }
4106
4107 /* Lookup the character ch in the mapping and put the result in result,
4108    which must be decrefed by the caller.
4109    Return 0 on success, -1 on error */
4110 static
4111 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4112 {
4113     PyObject *w = PyInt_FromLong((long)c);
4114     PyObject *x;
4115
4116     if (w == NULL)
4117          return -1;
4118     x = PyObject_GetItem(mapping, w);
4119     Py_DECREF(w);
4120     if (x == NULL) {
4121         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4122             /* No mapping found means: use 1:1 mapping. */
4123             PyErr_Clear();
4124             *result = NULL;
4125             return 0;
4126         } else
4127             return -1;
4128     }
4129     else if (x == Py_None) {
4130         *result = x;
4131         return 0;
4132     }
4133     else if (PyInt_Check(x)) {
4134         long value = PyInt_AS_LONG(x);
4135         long max = PyUnicode_GetMax();
4136         if (value < 0 || value > max) {
4137             PyErr_Format(PyExc_TypeError,
4138                              "character mapping must be in range(0x%lx)", max+1);
4139             Py_DECREF(x);
4140             return -1;
4141         }
4142         *result = x;
4143         return 0;
4144     }
4145     else if (PyUnicode_Check(x)) {
4146         *result = x;
4147         return 0;
4148     }
4149     else {
4150         /* wrong return value */
4151         PyErr_SetString(PyExc_TypeError,
4152               "character mapping must return integer, None or unicode");
4153         Py_DECREF(x);
4154         return -1;
4155     }
4156 }
4157 /* ensure that *outobj is at least requiredsize characters long,
4158 if not reallocate and adjust various state variables.
4159 Return 0 on success, -1 on error */
4160 static
4161 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4162     Py_ssize_t requiredsize)
4163 {
4164     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4165     if (requiredsize > oldsize) {
4166         /* remember old output position */
4167         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4168         /* exponentially overallocate to minimize reallocations */
4169         if (requiredsize < 2 * oldsize)
4170             requiredsize = 2 * oldsize;
4171         if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4172             return -1;
4173         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4174     }
4175     return 0;
4176 }
4177 /* lookup the character, put the result in the output string and adjust
4178    various state variables. Return a new reference to the object that
4179    was put in the output buffer in *result, or Py_None, if the mapping was
4180    undefined (in which case no character was written).
4181    The called must decref result.
4182    Return 0 on success, -1 on error. */
4183 static
4184 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4185     Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4186     PyObject **res)
4187 {
4188     if (charmaptranslate_lookup(*curinp, mapping, res))
4189         return -1;
4190     if (*res==NULL) {
4191         /* not found => default to 1:1 mapping */
4192         *(*outp)++ = *curinp;
4193     }
4194     else if (*res==Py_None)
4195         ;
4196     else if (PyInt_Check(*res)) {
4197         /* no overflow check, because we know that the space is enough */
4198         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4199     }
4200     else if (PyUnicode_Check(*res)) {
4201         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4202         if (repsize==1) {
4203             /* no overflow check, because we know that the space is enough */
4204             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4205         }
4206         else if (repsize!=0) {
4207             /* more than one character */
4208             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4209                 (insize - (curinp-startinp)) +
4210                 repsize - 1;
4211             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4212                 return -1;
4213             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4214             *outp += repsize;
4215         }
4216     }
4217     else
4218         return -1;
4219     return 0;
4220 }
4221
4222 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4223                                      Py_ssize_t size,
4224                                      PyObject *mapping,
4225                                      const char *errors)
4226 {
4227     /* output object */
4228     PyObject *res = NULL;
4229     /* pointers to the beginning and end+1 of input */
4230     const Py_UNICODE *startp = p;
4231     const Py_UNICODE *endp = p + size;
4232     /* pointer into the output */
4233     Py_UNICODE *str;
4234     /* current output position */
4235     Py_ssize_t respos = 0;
4236     char *reason = "character maps to <undefined>";
4237     PyObject *errorHandler = NULL;
4238     PyObject *exc = NULL;
4239     /* the following variable is used for caching string comparisons
4240      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4241      * 3=ignore, 4=xmlcharrefreplace */
4242     int known_errorHandler = -1;
4243
4244     if (mapping == NULL) {
4245         PyErr_BadArgument();
4246         return NULL;
4247     }
4248
4249     /* allocate enough for a simple 1:1 translation without
4250        replacements, if we need more, we'll resize */
4251     res = PyUnicode_FromUnicode(NULL, size);
4252     if (res == NULL)
4253         goto onError;
4254     if (size == 0)
4255         return res;
4256     str = PyUnicode_AS_UNICODE(res);
4257
4258     while (p<endp) {
4259         /* try to encode it */
4260         PyObject *x = NULL;
4261         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4262             Py_XDECREF(x);
4263             goto onError;
4264         }
4265         Py_XDECREF(x);
4266         if (x!=Py_None) /* it worked => adjust input pointer */
4267             ++p;
4268         else { /* untranslatable character */
4269             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4270             Py_ssize_t repsize;
4271             Py_ssize_t newpos;
4272             Py_UNICODE *uni2;
4273             /* startpos for collecting untranslatable chars */
4274             const Py_UNICODE *collstart = p;
4275             const Py_UNICODE *collend = p+1;
4276             const Py_UNICODE *coll;
4277
4278             /* find all untranslatable characters */
4279             while (collend < endp) {
4280                 if (charmaptranslate_lookup(*collend, mapping, &x))
4281                     goto onError;
4282                 Py_XDECREF(x);
4283                 if (x!=Py_None)
4284                     break;
4285                 ++collend;
4286             }
4287             /* cache callback name lookup
4288              * (if not done yet, i.e. it's the first error) */
4289             if (known_errorHandler==-1) {
4290                 if ((errors==NULL) || (!strcmp(errors, "strict")))
4291                     known_errorHandler = 1;
4292                 else if (!strcmp(errors, "replace"))
4293                     known_errorHandler = 2;
4294                 else if (!strcmp(errors, "ignore"))
4295                     known_errorHandler = 3;
4296                 else if (!strcmp(errors, "xmlcharrefreplace"))
4297                     known_errorHandler = 4;
4298                 else
4299                     known_errorHandler = 0;
4300             }
4301             switch (known_errorHandler) {
4302                 case 1: /* strict */
4303                     raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4304                     goto onError;
4305                 case 2: /* replace */
4306                     /* No need to check for space, this is a 1:1 replacement */
4307                     for (coll = collstart; coll<collend; ++coll)
4308                         *str++ = '?';
4309                     /* fall through */
4310                 case 3: /* ignore */
4311                     p = collend;
4312                     break;
4313                 case 4: /* xmlcharrefreplace */
4314                     /* generate replacement (temporarily (mis)uses p) */
4315                     for (p = collstart; p < collend; ++p) {
4316                         char buffer[2+29+1+1];
4317                         char *cp;
4318                         sprintf(buffer, "&#%d;", (int)*p);
4319                         if (charmaptranslate_makespace(&res, &str,
4320                             (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4321                             goto onError;
4322                         for (cp = buffer; *cp; ++cp)
4323                             *str++ = *cp;
4324                     }
4325                     p = collend;
4326                     break;
4327                 default:
4328                     repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4329                         reason, startp, size, &exc,
4330                         collstart-startp, collend-startp, &newpos);
4331                     if (repunicode == NULL)
4332                         goto onError;
4333                     /* generate replacement  */
4334                     repsize = PyUnicode_GET_SIZE(repunicode);
4335                     if (charmaptranslate_makespace(&res, &str,
4336                         (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4337                         Py_DECREF(repunicode);
4338                         goto onError;
4339                     }
4340                     for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4341                         *str++ = *uni2;
4342                     p = startp + newpos;
4343                     Py_DECREF(repunicode);
4344             }
4345         }
4346     }
4347     /* Resize if we allocated to much */
4348     respos = str-PyUnicode_AS_UNICODE(res);
4349     if (respos<PyUnicode_GET_SIZE(res)) {
4350         if (_PyUnicode_Resize(&res, respos) < 0)
4351             goto onError;
4352     }
4353     Py_XDECREF(exc);
4354     Py_XDECREF(errorHandler);
4355     return res;
4356
4357     onError:
4358     Py_XDECREF(res);
4359     Py_XDECREF(exc);
4360     Py_XDECREF(errorHandler);
4361     return NULL;
4362 }
4363
4364 PyObject *PyUnicode_Translate(PyObject *str,
4365                               PyObject *mapping,
4366                               const char *errors)
4367 {
4368     PyObject *result;
4369
4370     str = PyUnicode_FromObject(str);
4371     if (str == NULL)
4372         goto onError;
4373     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4374                                         PyUnicode_GET_SIZE(str),
4375                                         mapping,
4376                                         errors);
4377     Py_DECREF(str);
4378     return result;
4379
4380  onError:
4381     Py_XDECREF(str);
4382     return NULL;
4383 }
4384
4385 /* --- Decimal Encoder ---------------------------------------------------- */
4386
4387 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4388                             Py_ssize_t length,
4389                             char *output,
4390                             const char *errors)
4391 {
4392     Py_UNICODE *p, *end;
4393     PyObject *errorHandler = NULL;
4394     PyObject *exc = NULL;
4395     const char *encoding = "decimal";
4396     const char *reason = "invalid decimal Unicode string";
4397     /* the following variable is used for caching string comparisons
4398      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4399     int known_errorHandler = -1;
4400
4401     if (output == NULL) {
4402         PyErr_BadArgument();
4403         return -1;
4404     }
4405
4406     p = s;
4407     end = s + length;
4408     while (p < end) {
4409         register Py_UNICODE ch = *p;
4410         int decimal;
4411         PyObject *repunicode;
4412         Py_ssize_t repsize;
4413         Py_ssize_t newpos;
4414         Py_UNICODE *uni2;
4415         Py_UNICODE *collstart;
4416         Py_UNICODE *collend;
4417
4418         if (Py_UNICODE_ISSPACE(ch)) {
4419             *output++ = ' ';
4420             ++p;
4421             continue;
4422         }
4423         decimal = Py_UNICODE_TODECIMAL(ch);
4424         if (decimal >= 0) {
4425             *output++ = '0' + decimal;
4426             ++p;
4427             continue;
4428         }
4429         if (0 < ch && ch < 256) {
4430             *output++ = (char)ch;
4431             ++p;
4432             continue;
4433         }
4434         /* All other characters are considered unencodable */
4435         collstart = p;
4436         collend = p+1;
4437         while (collend < end) {
4438             if ((0 < *collend && *collend < 256) ||
4439                 !Py_UNICODE_ISSPACE(*collend) ||
4440                 Py_UNICODE_TODECIMAL(*collend))
4441                 break;
4442         }
4443         /* cache callback name lookup
4444          * (if not done yet, i.e. it's the first error) */
4445         if (known_errorHandler==-1) {
4446             if ((errors==NULL) || (!strcmp(errors, "strict")))
4447                 known_errorHandler = 1;
4448             else if (!strcmp(errors, "replace"))
4449                 known_errorHandler = 2;
4450             else if (!strcmp(errors, "ignore"))
4451                 known_errorHandler = 3;
4452             else if (!strcmp(errors, "xmlcharrefreplace"))
4453                 known_errorHandler = 4;
4454             else
4455                 known_errorHandler = 0;
4456         }
4457         switch (known_errorHandler) {
4458             case 1: /* strict */
4459                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4460                 goto onError;
4461             case 2: /* replace */
4462                 for (p = collstart; p < collend; ++p)
4463                     *output++ = '?';
4464                 /* fall through */
4465             case 3: /* ignore */
4466                 p = collend;
4467                 break;
4468             case 4: /* xmlcharrefreplace */
4469                 /* generate replacement (temporarily (mis)uses p) */
4470                 for (p = collstart; p < collend; ++p)
4471                     output += sprintf(output, "&#%d;", (int)*p);
4472                 p = collend;
4473                 break;
4474             default:
4475                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4476                     encoding, reason, s, length, &exc,
4477                     collstart-s, collend-s, &newpos);
4478                 if (repunicode == NULL)
4479                     goto onError;
4480                 /* generate replacement  */
4481                 repsize = PyUnicode_GET_SIZE(repunicode);
4482                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4483                     Py_UNICODE ch = *uni2;
4484                     if (Py_UNICODE_ISSPACE(ch))
4485                         *output++ = ' ';
4486                     else {
4487                         decimal = Py_UNICODE_TODECIMAL(ch);
4488                         if (decimal >= 0)
4489                             *output++ = '0' + decimal;
4490                         else if (0 < ch && ch < 256)
4491                             *output++ = (char)ch;
4492                         else {
4493                             Py_DECREF(repunicode);
4494                             raise_encode_exception(&exc, encoding,
4495                                 s, length, collstart-s, collend-s, reason);
4496                             goto onError;
4497                         }
4498                     }
4499                 }
4500                 p = s + newpos;
4501                 Py_DECREF(repunicode);
4502         }
4503     }
4504     /* 0-terminate the output string */
4505     *output++ = '\0';
4506     Py_XDECREF(exc);
4507     Py_XDECREF(errorHandler);
4508     return 0;
4509
4510  onError:
4511     Py_XDECREF(exc);
4512     Py_XDECREF(errorHandler);
4513     return -1;
4514 }
4515
4516 /* --- Helpers ------------------------------------------------------------ */
4517
4518 #define STRINGLIB_CHAR Py_UNICODE
4519
4520 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4521 #define STRINGLIB_NEW PyUnicode_FromUnicode
4522 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4523
4524 Py_LOCAL_INLINE(int)
4525 STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4526 {
4527     if (str[0] != other[0])
4528         return 1;
4529     return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4530 }
4531
4532 #define STRINGLIB_EMPTY unicode_empty
4533
4534 #include "stringlib/fastsearch.h"
4535
4536 #include "stringlib/count.h"
4537 #include "stringlib/find.h"
4538 #include "stringlib/partition.h"
4539
/* helper macro to fixup start/end slice values.
 *
 * Operates on the caller's local variables `start` and `end`:
 *   - a negative `start` is taken relative to (obj)->length and then
 *     clamped at 0;
 *   - `end` is capped at (obj)->length; a negative `end` is taken relative
 *     to (obj)->length and then clamped at 0.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement; this keeps `if (x) FIX_START_END(o); else ...` well formed.
 * Invoke it as a statement followed by a semicolon.
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4552
4553 Py_ssize_t PyUnicode_Count(PyObject *str,
4554                            PyObject *substr,
4555                            Py_ssize_t start,
4556                            Py_ssize_t end)
4557 {
4558     Py_ssize_t result;
4559     PyUnicodeObject* str_obj;
4560     PyUnicodeObject* sub_obj;
4561
4562     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4563     if (!str_obj)
4564         return -1;
4565     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4566     if (!sub_obj) {
4567         Py_DECREF(str_obj);
4568         return -1;
4569     }
4570
4571     FIX_START_END(str_obj);
4572
4573     result = stringlib_count(
4574         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4575         );
4576
4577     Py_DECREF(sub_obj);
4578     Py_DECREF(str_obj);
4579
4580     return result;
4581 }
4582
4583 Py_ssize_t PyUnicode_Find(PyObject *str,
4584                           PyObject *sub,
4585                           Py_ssize_t start,
4586                           Py_ssize_t end,
4587                           int direction)
4588 {
4589     Py_ssize_t result;
4590
4591     str = PyUnicode_FromObject(str);
4592     if (!str)
4593         return -2;
4594     sub = PyUnicode_FromObject(sub);
4595     if (!sub) {
4596         Py_DECREF(str);
4597         return -2;
4598     }
4599
4600     if (direction > 0)
4601         result = stringlib_find_slice(
4602             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4603             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4604             start, end
4605             );
4606     else
4607         result = stringlib_rfind_slice(
4608             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4609             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4610             start, end
4611             );
4612
4613     Py_DECREF(str);
4614     Py_DECREF(sub);
4615
4616     return result;
4617 }
4618
4619 static
4620 int tailmatch(PyUnicodeObject *self,
4621               PyUnicodeObject *substring,
4622               Py_ssize_t start,
4623               Py_ssize_t end,
4624               int direction)
4625 {
4626     if (substring->length == 0)
4627         return 1;
4628
4629     FIX_START_END(self);
4630
4631     end -= substring->length;
4632     if (end < start)
4633         return 0;
4634
4635     if (direction > 0) {
4636         if (Py_UNICODE_MATCH(self, end, substring))
4637             return 1;
4638     } else {
4639         if (Py_UNICODE_MATCH(self, start, substring))
4640             return 1;
4641     }
4642
4643     return 0;
4644 }
4645
4646 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4647                         PyObject *substr,
4648                         Py_ssize_t start,
4649                         Py_ssize_t end,
4650                         int direction)
4651 {
4652     Py_ssize_t result;
4653
4654     str = PyUnicode_FromObject(str);
4655     if (str == NULL)
4656         return -1;
4657     substr = PyUnicode_FromObject(substr);
4658     if (substr == NULL) {
4659         Py_DECREF(str);
4660         return -1;
4661     }
4662
4663     result = tailmatch((PyUnicodeObject *)str,
4664                        (PyUnicodeObject *)substr,
4665                        start, end, direction);
4666     Py_DECREF(str);
4667     Py_DECREF(substr);
4668     return result;
4669 }
4670
4671 /* Apply fixfct filter to the Unicode object self and return a
4672    reference to the modified object */
4673
4674 static
4675 PyObject *fixup(PyUnicodeObject *self,
4676                 int (*fixfct)(PyUnicodeObject *s))
4677 {
4678
4679     PyUnicodeObject *u;
4680
4681     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4682     if (u == NULL)
4683         return NULL;
4684
4685     Py_UNICODE_COPY(u->str, self->str, self->length);
4686
4687     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4688         /* fixfct should return TRUE if it modified the buffer. If
4689            FALSE, return a reference to the original buffer instead
4690            (to save space, not time) */
4691         Py_INCREF(self);
4692         Py_DECREF(u);
4693         return (PyObject*) self;
4694     }
4695     return (PyObject*) u;
4696 }
4697
4698 static
4699 int fixupper(PyUnicodeObject *self)
4700 {
4701     Py_ssize_t len = self->length;
4702     Py_UNICODE *s = self->str;
4703     int status = 0;
4704
4705     while (len-- > 0) {
4706         register Py_UNICODE ch;
4707
4708         ch = Py_UNICODE_TOUPPER(*s);
4709         if (ch != *s) {
4710             status = 1;
4711             *s = ch;
4712         }
4713         s++;
4714     }
4715
4716     return status;
4717 }
4718
4719 static
4720 int fixlower(PyUnicodeObject *self)
4721 {
4722     Py_ssize_t len = self->length;
4723     Py_UNICODE *s = self->str;
4724     int status = 0;
4725
4726     while (len-- > 0) {
4727         register Py_UNICODE ch;
4728
4729         ch = Py_UNICODE_TOLOWER(*s);
4730         if (ch != *s) {
4731             status = 1;
4732             *s = ch;
4733         }
4734         s++;
4735     }
4736
4737     return status;
4738 }
4739
4740 static
4741 int fixswapcase(PyUnicodeObject *self)
4742 {
4743     Py_ssize_t len = self->length;
4744     Py_UNICODE *s = self->str;
4745     int status = 0;
4746
4747     while (len-- > 0) {
4748         if (Py_UNICODE_ISUPPER(*s)) {
4749             *s = Py_UNICODE_TOLOWER(*s);
4750             status = 1;
4751         } else if (Py_UNICODE_ISLOWER(*s)) {
4752             *s = Py_UNICODE_TOUPPER(*s);
4753             status = 1;
4754         }
4755         s++;
4756     }
4757
4758     return status;
4759 }
4760
4761 static
4762 int fixcapitalize(PyUnicodeObject *self)
4763 {
4764     Py_ssize_t len = self->length;
4765     Py_UNICODE *s = self->str;
4766     int status = 0;
4767
4768     if (len == 0)
4769         return 0;
4770     if (Py_UNICODE_ISLOWER(*s)) {
4771         *s = Py_UNICODE_TOUPPER(*s);
4772         status = 1;
4773     }
4774     s++;
4775     while (--len > 0) {
4776         if (Py_UNICODE_ISUPPER(*s)) {
4777             *s = Py_UNICODE_TOLOWER(*s);
4778             status = 1;
4779         }
4780         s++;
4781     }
4782     return status;
4783 }
4784
4785 static
4786 int fixtitle(PyUnicodeObject *self)
4787 {
4788     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4789     register Py_UNICODE *e;
4790     int previous_is_cased;
4791
4792     /* Shortcut for single character strings */
4793     if (PyUnicode_GET_SIZE(self) == 1) {
4794         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4795         if (*p != ch) {
4796             *p = ch;
4797             return 1;
4798         }
4799         else
4800             return 0;
4801     }
4802
4803     e = p + PyUnicode_GET_SIZE(self);
4804     previous_is_cased = 0;
4805     for (; p < e; p++) {
4806         register const Py_UNICODE ch = *p;
4807
4808         if (previous_is_cased)
4809             *p = Py_UNICODE_TOLOWER(ch);
4810         else
4811             *p = Py_UNICODE_TOTITLE(ch);
4812
4813         if (Py_UNICODE_ISLOWER(ch) ||
4814             Py_UNICODE_ISUPPER(ch) ||
4815             Py_UNICODE_ISTITLE(ch))
4816             previous_is_cased = 1;
4817         else
4818             previous_is_cased = 0;
4819     }
4820     return 1;
4821 }
4822
4823 PyObject *
4824 PyUnicode_Join(PyObject *separator, PyObject *seq)
4825 {
4826     PyObject *internal_separator = NULL;
4827     const Py_UNICODE blank = ' ';
4828     const Py_UNICODE *sep = &blank;
4829     Py_ssize_t seplen = 1;
4830     PyUnicodeObject *res = NULL; /* the result */
4831     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
4832     Py_ssize_t res_used;         /* # used bytes */
4833     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
4834     PyObject *fseq;          /* PySequence_Fast(seq) */
4835     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
4836     PyObject *item;
4837     Py_ssize_t i;
4838
4839     fseq = PySequence_Fast(seq, "");
4840     if (fseq == NULL) {
4841         return NULL;
4842     }
4843
4844     /* Grrrr.  A codec may be invoked to convert str objects to
4845      * Unicode, and so it's possible to call back into Python code
4846      * during PyUnicode_FromObject(), and so it's possible for a sick
4847      * codec to change the size of fseq (if seq is a list).  Therefore
4848      * we have to keep refetching the size -- can't assume seqlen
4849      * is invariant.
4850      */
4851     seqlen = PySequence_Fast_GET_SIZE(fseq);
4852     /* If empty sequence, return u"". */
4853     if (seqlen == 0) {
4854         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
4855         goto Done;
4856     }
4857     /* If singleton sequence with an exact Unicode, return that. */
4858     if (seqlen == 1) {
4859         item = PySequence_Fast_GET_ITEM(fseq, 0);
4860         if (PyUnicode_CheckExact(item)) {
4861             Py_INCREF(item);
4862             res = (PyUnicodeObject *)item;
4863             goto Done;
4864         }
4865     }
4866
4867     /* At least two items to join, or one that isn't exact Unicode. */
4868     if (seqlen > 1) {
4869         /* Set up sep and seplen -- they're needed. */
4870         if (separator == NULL) {
4871             sep = &blank;
4872             seplen = 1;
4873         }
4874         else {
4875             internal_separator = PyUnicode_FromObject(separator);
4876             if (internal_separator == NULL)
4877                 goto onError;
4878             sep = PyUnicode_AS_UNICODE(internal_separator);
4879             seplen = PyUnicode_GET_SIZE(internal_separator);
4880             /* In case PyUnicode_FromObject() mutated seq. */
4881             seqlen = PySequence_Fast_GET_SIZE(fseq);
4882         }
4883     }
4884
4885     /* Get space. */
4886     res = _PyUnicode_New(res_alloc);
4887     if (res == NULL)
4888         goto onError;
4889     res_p = PyUnicode_AS_UNICODE(res);
4890     res_used = 0;
4891
4892     for (i = 0; i < seqlen; ++i) {
4893         Py_ssize_t itemlen;
4894         Py_ssize_t new_res_used;
4895
4896         item = PySequence_Fast_GET_ITEM(fseq, i);
4897         /* Convert item to Unicode. */
4898         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4899             PyErr_Format(PyExc_TypeError,
4900                          "sequence item %zd: expected string or Unicode,"
4901                          " %.80s found",
4902                          i, Py_Type(item)->tp_name);
4903             goto onError;
4904         }
4905         item = PyUnicode_FromObject(item);
4906         if (item == NULL)
4907             goto onError;
4908         /* We own a reference to item from here on. */
4909
4910         /* In case PyUnicode_FromObject() mutated seq. */
4911         seqlen = PySequence_Fast_GET_SIZE(fseq);
4912
4913         /* Make sure we have enough space for the separator and the item. */
4914         itemlen = PyUnicode_GET_SIZE(item);
4915         new_res_used = res_used + itemlen;
4916         if (new_res_used < 0)
4917             goto Overflow;
4918         if (i < seqlen - 1) {
4919             new_res_used += seplen;
4920             if (new_res_used < 0)
4921                 goto Overflow;
4922         }
4923         if (new_res_used > res_alloc) {
4924             /* double allocated size until it's big enough */
4925             do {
4926                 res_alloc += res_alloc;
4927                 if (res_alloc <= 0)
4928                     goto Overflow;
4929             } while (new_res_used > res_alloc);
4930             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4931                 Py_DECREF(item);
4932                 goto onError;
4933             }
4934             res_p = PyUnicode_AS_UNICODE(res) + res_used;
4935         }
4936
4937         /* Copy item, and maybe the separator. */
4938         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4939         res_p += itemlen;
4940         if (i < seqlen - 1) {
4941             Py_UNICODE_COPY(res_p, sep, seplen);
4942             res_p += seplen;
4943         }
4944         Py_DECREF(item);
4945         res_used = new_res_used;
4946     }
4947
4948     /* Shrink res to match the used area; this probably can't fail,
4949      * but it's cheap to check.
4950      */
4951     if (_PyUnicode_Resize(&res, res_used) < 0)
4952         goto onError;
4953
4954  Done:
4955     Py_XDECREF(internal_separator);
4956     Py_DECREF(fseq);
4957     return (PyObject *)res;
4958
4959  Overflow:
4960     PyErr_SetString(PyExc_OverflowError,
4961                     "join() result is too long for a Python string");
4962     Py_DECREF(item);
4963     /* fall through */
4964
4965  onError:
4966     Py_XDECREF(internal_separator);
4967     Py_DECREF(fseq);
4968     Py_XDECREF(res);
4969     return NULL;
4970 }
4971
4972 static
4973 PyUnicodeObject *pad(PyUnicodeObject *self,
4974                      Py_ssize_t left,
4975                      Py_ssize_t right,
4976                      Py_UNICODE fill)
4977 {
4978     PyUnicodeObject *u;
4979
4980     if (left < 0)
4981         left = 0;
4982     if (right < 0)
4983         right = 0;
4984
4985     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4986         Py_INCREF(self);
4987         return self;
4988     }
4989
4990     u = _PyUnicode_New(left + self->length + right);
4991     if (u) {
4992         if (left)
4993             Py_UNICODE_FILL(u->str, fill, left);
4994         Py_UNICODE_COPY(u->str + left, self->str, self->length);
4995         if (right)
4996             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4997     }
4998
4999     return u;
5000 }
5001
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * Relies on names from the expanding function: the local `str` (scratch
 * PyObject pointer), the local `list`, and the label `onError`, which is
 * jumped to when either the object creation or the append fails.  The
 * reference held in `str` is released in both the success and the
 * append-failure case.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement and stays correct as the body of an unbraced if/else.
 * Invoke it as a statement followed by a semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                     \
    do {                                                                    \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
        if (!str)                                                           \
            goto onError;                                                   \
        if (PyList_Append(list, str)) {                                     \
            Py_DECREF(str);                                                 \
            goto onError;                                                   \
        }                                                                   \
        Py_DECREF(str);                                                     \
    } while (0)
5012
5013 static
5014 PyObject *split_whitespace(PyUnicodeObject *self,
5015                            PyObject *list,
5016                            Py_ssize_t maxcount)
5017 {
5018     register Py_ssize_t i;
5019     register Py_ssize_t j;
5020     Py_ssize_t len = self->length;
5021     PyObject *str;
5022
5023     for (i = j = 0; i < len; ) {
5024         /* find a token */
5025         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5026             i++;
5027         j = i;
5028         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5029             i++;
5030         if (j < i) {
5031             if (maxcount-- <= 0)
5032                 break;
5033             SPLIT_APPEND(self->str, j, i);
5034             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5035                 i++;
5036             j = i;
5037         }
5038     }
5039     if (j < len) {
5040         SPLIT_APPEND(self->str, j, len);
5041     }
5042     return list;
5043
5044  onError:
5045     Py_DECREF(list);
5046     return NULL;
5047 }
5048
5049 PyObject *PyUnicode_Splitlines(PyObject *string,
5050                                int keepends)
5051 {
5052     register Py_ssize_t i;
5053     register Py_ssize_t j;
5054     Py_ssize_t len;
5055     PyObject *list;
5056     PyObject *str;
5057     Py_UNICODE *data;
5058
5059     string = PyUnicode_FromObject(string);
5060     if (string == NULL)
5061         return NULL;
5062     data = PyUnicode_AS_UNICODE(string);
5063     len = PyUnicode_GET_SIZE(string);
5064
5065     list = PyList_New(0);
5066     if (!list)
5067         goto onError;
5068
5069     for (i = j = 0; i < len; ) {
5070         Py_ssize_t eol;
5071
5072         /* Find a line and append it */
5073         while (i < len && !BLOOM_LINEBREAK(data[i]))
5074             i++;
5075
5076         /* Skip the line break reading CRLF as one line break */
5077         eol = i;
5078         if (i < len) {
5079             if (data[i] == '\r' && i + 1 < len &&
5080                 data[i+1] == '\n')
5081                 i += 2;
5082             else
5083                 i++;
5084             if (keepends)
5085                 eol = i;
5086         }
5087         SPLIT_APPEND(data, j, eol);
5088         j = i;
5089     }
5090     if (j < len) {
5091         SPLIT_APPEND(data, j, len);
5092     }
5093
5094     Py_DECREF(string);
5095     return list;
5096
5097  onError:
5098     Py_XDECREF(list);
5099     Py_DECREF(string);
5100     return NULL;
5101 }
5102
5103 static
5104 PyObject *split_char(PyUnicodeObject *self,
5105                      PyObject *list,
5106                      Py_UNICODE ch,
5107                      Py_ssize_t maxcount)
5108 {
5109     register Py_ssize_t i;
5110     register Py_ssize_t j;
5111     Py_ssize_t len = self->length;
5112     PyObject *str;
5113
5114     for (i = j = 0; i < len; ) {
5115         if (self->str[i] == ch) {
5116             if (maxcount-- <= 0)
5117                 break;
5118             SPLIT_APPEND(self->str, j, i);
5119             i = j = i + 1;
5120         } else
5121             i++;
5122     }
5123     if (j <= len) {
5124         SPLIT_APPEND(self->str, j, len);
5125     }
5126     return list;
5127
5128  onError:
5129     Py_DECREF(list);
5130     return NULL;
5131 }
5132
5133 static
5134 PyObject *split_substring(PyUnicodeObject *self,
5135                           PyObject *list,
5136                           PyUnicodeObject *substring,
5137                           Py_ssize_t maxcount)
5138 {
5139     register Py_ssize_t i;
5140     register Py_ssize_t j;
5141     Py_ssize_t len = self->length;
5142     Py_ssize_t sublen = substring->length;
5143     PyObject *str;
5144
5145     for (i = j = 0; i <= len - sublen; ) {
5146         if (Py_UNICODE_MATCH(self, i, substring)) {
5147             if (maxcount-- <= 0)
5148                 break;
5149             SPLIT_APPEND(self->str, j, i);
5150             i = j = i + sublen;
5151         } else
5152             i++;
5153     }
5154     if (j <= len) {
5155         SPLIT_APPEND(self->str, j, len);
5156     }
5157     return list;
5158
5159  onError:
5160     Py_DECREF(list);
5161     return NULL;
5162 }
5163
5164 static
5165 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5166                             PyObject *list,
5167                             Py_ssize_t maxcount)
5168 {
5169     register Py_ssize_t i;
5170     register Py_ssize_t j;
5171     Py_ssize_t len = self->length;
5172     PyObject *str;
5173
5174     for (i = j = len - 1; i >= 0; ) {
5175         /* find a token */
5176         while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5177             i--;
5178         j = i;
5179         while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5180             i--;
5181         if (j > i) {
5182             if (maxcount-- <= 0)
5183                 break;
5184             SPLIT_APPEND(self->str, i + 1, j + 1);
5185             while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5186                 i--;
5187             j = i;
5188         }
5189     }
5190     if (j >= 0) {
5191         SPLIT_APPEND(self->str, 0, j + 1);
5192     }
5193     if (PyList_Reverse(list) < 0)
5194         goto onError;
5195     return list;
5196
5197  onError:
5198     Py_DECREF(list);
5199     return NULL;
5200 }
5201
5202 static
5203 PyObject *rsplit_char(PyUnicodeObject *self,
5204                       PyObject *list,
5205                       Py_UNICODE ch,
5206                       Py_ssize_t maxcount)
5207 {
5208     register Py_ssize_t i;
5209     register Py_ssize_t j;
5210     Py_ssize_t len = self->length;
5211     PyObject *str;
5212
5213     for (i = j = len - 1; i >= 0; ) {
5214         if (self->str[i] == ch) {
5215             if (maxcount-- <= 0)
5216                 break;
5217             SPLIT_APPEND(self->str, i + 1, j + 1);
5218             j = i = i - 1;
5219         } else
5220             i--;
5221     }
5222     if (j >= -1) {
5223         SPLIT_APPEND(self->str, 0, j + 1);
5224     }
5225     if (PyList_Reverse(list) < 0)
5226         goto onError;
5227     return list;
5228
5229  onError:
5230     Py_DECREF(list);
5231     return NULL;
5232 }
5233
5234 static
5235 PyObject *rsplit_substring(PyUnicodeObject *self,
5236                            PyObject *list,
5237                            PyUnicodeObject *substring,
5238                            Py_ssize_t maxcount)
5239 {
5240     register Py_ssize_t i;
5241     register Py_ssize_t j;
5242     Py_ssize_t len = self->length;
5243     Py_ssize_t sublen = substring->length;
5244     PyObject *str;
5245
5246     for (i = len - sublen, j = len; i >= 0; ) {
5247         if (Py_UNICODE_MATCH(self, i, substring)) {
5248             if (maxcount-- <= 0)
5249                 break;
5250             SPLIT_APPEND(self->str, i + sublen, j);
5251             j = i;
5252             i -= sublen;
5253         } else
5254             i--;
5255     }
5256     if (j >= 0) {
5257         SPLIT_APPEND(self->str, 0, j);
5258     }
5259     if (PyList_Reverse(list) < 0)
5260         goto onError;
5261     return list;
5262
5263  onError:
5264     Py_DECREF(list);
5265     return NULL;
5266 }
5267
5268 #undef SPLIT_APPEND
5269
5270 static
5271 PyObject *split(PyUnicodeObject *self,
5272                 PyUnicodeObject *substring,
5273                 Py_ssize_t maxcount)
5274 {
5275     PyObject *list;
5276
5277     if (maxcount < 0)
5278         maxcount = PY_SSIZE_T_MAX;
5279
5280     list = PyList_New(0);
5281     if (!list)
5282         return NULL;
5283
5284     if (substring == NULL)
5285         return split_whitespace(self,list,maxcount);
5286
5287     else if (substring->length == 1)
5288         return split_char(self,list,substring->str[0],maxcount);
5289
5290     else if (substring->length == 0) {
5291         Py_DECREF(list);
5292         PyErr_SetString(PyExc_ValueError, "empty separator");
5293         return NULL;
5294     }
5295     else
5296         return split_substring(self,list,substring,maxcount);
5297 }
5298
5299 static
5300 PyObject *rsplit(PyUnicodeObject *self,
5301                  PyUnicodeObject *substring,
5302                  Py_ssize_t maxcount)
5303 {
5304     PyObject *list;
5305
5306     if (maxcount < 0)
5307         maxcount = PY_SSIZE_T_MAX;
5308
5309     list = PyList_New(0);
5310     if (!list)
5311         return NULL;
5312
5313     if (substring == NULL)
5314         return rsplit_whitespace(self,list,maxcount);
5315
5316     else if (substring->length == 1)
5317         return rsplit_char(self,list,substring->str[0],maxcount);
5318
5319     else if (substring->length == 0) {
5320         Py_DECREF(list);
5321         PyErr_SetString(PyExc_ValueError, "empty separator");
5322         return NULL;
5323     }
5324     else
5325         return rsplit_substring(self,list,substring,maxcount);
5326 }
5327
5328 static
5329 PyObject *replace(PyUnicodeObject *self,
5330                   PyUnicodeObject *str1,
5331                   PyUnicodeObject *str2,
5332                   Py_ssize_t maxcount)
5333 {
5334     PyUnicodeObject *u;
5335
5336     if (maxcount < 0)
5337         maxcount = PY_SSIZE_T_MAX;
5338
5339     if (str1->length == str2->length) {
5340         /* same length */
5341         Py_ssize_t i;
5342         if (str1->length == 1) {
5343             /* replace characters */
5344             Py_UNICODE u1, u2;
5345             if (!findchar(self->str, self->length, str1->str[0]))
5346                 goto nothing;
5347             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5348             if (!u)
5349                 return NULL;
5350             Py_UNICODE_COPY(u->str, self->str, self->length);
5351             u1 = str1->str[0];
5352             u2 = str2->str[0];
5353             for (i = 0; i < u->length; i++)
5354                 if (u->str[i] == u1) {
5355                     if (--maxcount < 0)
5356                         break;
5357                     u->str[i] = u2;
5358                 }
5359         } else {
5360             i = fastsearch(
5361                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5362                 );
5363             if (i < 0)
5364                 goto nothing;
5365             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5366             if (!u)
5367                 return NULL;
5368             Py_UNICODE_COPY(u->str, self->str, self->length);
5369             while (i <= self->length - str1->length)
5370                 if (Py_UNICODE_MATCH(self, i, str1)) {
5371                     if (--maxcount < 0)
5372                         break;
5373                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5374                     i += str1->length;
5375                 } else
5376                     i++;
5377         }
5378     } else {
5379
5380         Py_ssize_t n, i, j, e;
5381         Py_ssize_t product, new_size, delta;
5382         Py_UNICODE *p;
5383
5384         /* replace strings */
5385         n = stringlib_count(self->str, self->length, str1->str, str1->length);
5386         if (n > maxcount)
5387             n = maxcount;
5388         if (n == 0)
5389             goto nothing;
5390         /* new_size = self->length + n * (str2->length - str1->length)); */
5391         delta = (str2->length - str1->length);
5392         if (delta == 0) {
5393             new_size = self->length;
5394         } else {
5395             product = n * (str2->length - str1->length);
5396             if ((product / (str2->length - str1->length)) != n) {
5397                 PyErr_SetString(PyExc_OverflowError,
5398                                 "replace string is too long");
5399                 return NULL;
5400             }
5401             new_size = self->length + product;
5402             if (new_size < 0) {
5403                 PyErr_SetString(PyExc_OverflowError,
5404                                 "replace string is too long");
5405                 return NULL;
5406             }
5407         }
5408         u = _PyUnicode_New(new_size);
5409         if (!u)
5410             return NULL;
5411         i = 0;
5412         p = u->str;
5413         e = self->length - str1->length;
5414         if (str1->length > 0) {
5415             while (n-- > 0) {
5416                 /* look for next match */
5417                 j = i;
5418                 while (j <= e) {
5419                     if (Py_UNICODE_MATCH(self, j, str1))
5420                         break;
5421                     j++;
5422                 }
5423                 if (j > i) {
5424                     if (j > e)
5425                         break;
5426                     /* copy unchanged part [i:j] */
5427                     Py_UNICODE_COPY(p, self->str+i, j-i);
5428                     p += j - i;
5429                 }
5430                 /* copy substitution string */
5431                 if (str2->length > 0) {
5432                     Py_UNICODE_COPY(p, str2->str, str2->length);
5433                     p += str2->length;
5434                 }
5435                 i = j + str1->length;
5436             }
5437             if (i < self->length)
5438                 /* copy tail [i:] */
5439                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5440         } else {
5441             /* interleave */
5442             while (n > 0) {
5443                 Py_UNICODE_COPY(p, str2->str, str2->length);
5444                 p += str2->length;
5445                 if (--n <= 0)
5446                     break;
5447                 *p++ = self->str[i++];
5448             }
5449             Py_UNICODE_COPY(p, self->str+i, self->length-i);
5450         }
5451     }
5452     return (PyObject *) u;
5453
5454 nothing:
5455     /* nothing to replace; return original string (when possible) */
5456     if (PyUnicode_CheckExact(self)) {
5457         Py_INCREF(self);
5458         return (PyObject *) self;
5459     }
5460     return PyUnicode_FromUnicode(self->str, self->length);
5461 }
5462
5463 /* --- Unicode Object Methods --------------------------------------------- */
5464
5465 PyDoc_STRVAR(title__doc__,
5466 "S.title() -> unicode\n\
5467 \n\
5468 Return a titlecased version of S, i.e. words start with title case\n\
5469 characters, all remaining cased characters have lower case.");
5470
5471 static PyObject*
5472 unicode_title(PyUnicodeObject *self)
5473 {
5474     return fixup(self, fixtitle);
5475 }
5476
5477 PyDoc_STRVAR(capitalize__doc__,
5478 "S.capitalize() -> unicode\n\
5479 \n\
5480 Return a capitalized version of S, i.e. make the first character\n\
5481 have upper case.");
5482
5483 static PyObject*
5484 unicode_capitalize(PyUnicodeObject *self)
5485 {
5486     return fixup(self, fixcapitalize);
5487 }
5488
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords().
   Splits self into a list of words, replaces every list entry by its
   capitalized form and joins the entries again.  Returns a new
   reference, or NULL if splitting, capitalizing or joining failed. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t pos = 0;

    /* Break the string up into its words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart, in place */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5526
5527 /* Argument converter.  Coerces to a single unicode character */
5528
5529 static int
5530 convert_uc(PyObject *obj, void *addr)
5531 {
5532         Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5533         PyObject *uniobj;
5534         Py_UNICODE *unistr;
5535
5536         uniobj = PyUnicode_FromObject(obj);
5537         if (uniobj == NULL) {
5538                 PyErr_SetString(PyExc_TypeError,
5539                         "The fill character cannot be converted to Unicode");
5540                 return 0;
5541         }
5542         if (PyUnicode_GET_SIZE(uniobj) != 1) {
5543                 PyErr_SetString(PyExc_TypeError,
5544                         "The fill character must be exactly one character long");
5545                 Py_DECREF(uniobj);
5546                 return 0;
5547         }
5548         unistr = PyUnicode_AS_UNICODE(uniobj);
5549         *fillcharloc = unistr[0];
5550         Py_DECREF(uniobj);
5551         return 1;
5552 }
5553
5554 PyDoc_STRVAR(center__doc__,
5555 "S.center(width[, fillchar]) -> unicode\n\
5556 \n\
5557 Return S centered in a Unicode string of length width. Padding is\n\
5558 done using the specified fill character (default is a space)");
5559
5560 static PyObject *
5561 unicode_center(PyUnicodeObject *self, PyObject *args)
5562 {
5563     Py_ssize_t marg, left;
5564     Py_ssize_t width;
5565     Py_UNICODE fillchar = ' ';
5566
5567     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5568         return NULL;
5569
5570     if (self->length >= width && PyUnicode_CheckExact(self)) {
5571         Py_INCREF(self);
5572         return (PyObject*) self;
5573     }
5574
5575     marg = width - self->length;
5576     left = marg / 2 + (marg & width & 1);
5577
5578     return (PyObject*) pad(self, left, marg - left, fillchar);
5579 }
5580
5581 #if 0
5582
5583 /* This code should go into some future Unicode collation support
5584    module. The basic comparison should compare ordinals on a naive
5585    basis (this is what Java does and thus JPython too). */
5586
5587 /* speedy UTF-16 code point order comparison */
5588 /* gleaned from: */
5589 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5590
5591 static short utf16Fixup[32] =
5592 {
5593     0, 0, 0, 0, 0, 0, 0, 0,
5594     0, 0, 0, 0, 0, 0, 0, 0,
5595     0, 0, 0, 0, 0, 0, 0, 0,
5596     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5597 };
5598
5599 static int
5600 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5601 {
5602     Py_ssize_t len1, len2;
5603
5604     Py_UNICODE *s1 = str1->str;
5605     Py_UNICODE *s2 = str2->str;
5606
5607     len1 = str1->length;
5608     len2 = str2->length;
5609
5610     while (len1 > 0 && len2 > 0) {
5611         Py_UNICODE c1, c2;
5612
5613         c1 = *s1++;
5614         c2 = *s2++;
5615
5616         if (c1 > (1<<11) * 26)
5617             c1 += utf16Fixup[c1>>11];
5618         if (c2 > (1<<11) * 26)
5619             c2 += utf16Fixup[c2>>11];
5620         /* now c1 and c2 are in UTF-32-compatible order */
5621
5622         if (c1 != c2)
5623             return (c1 < c2) ? -1 : 1;
5624
5625         len1--; len2--;
5626     }
5627
5628     return (len1 < len2) ? -1 : (len1 != len2);
5629 }
5630
5631 #else
5632
5633 static int
5634 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5635 {
5636     register Py_ssize_t len1, len2;
5637
5638     Py_UNICODE *s1 = str1->str;
5639     Py_UNICODE *s2 = str2->str;
5640
5641     len1 = str1->length;
5642     len2 = str2->length;
5643
5644     while (len1 > 0 && len2 > 0) {
5645         Py_UNICODE c1, c2;
5646
5647         c1 = *s1++;
5648         c2 = *s2++;
5649
5650         if (c1 != c2)
5651             return (c1 < c2) ? -1 : 1;
5652
5653         len1--; len2--;
5654     }
5655
5656     return (len1 < len2) ? -1 : (len1 != len2);
5657 }
5658
5659 #endif
5660
5661 int PyUnicode_Compare(PyObject *left,
5662                       PyObject *right)
5663 {
5664     PyUnicodeObject *u = NULL, *v = NULL;
5665     int result;
5666
5667     /* Coerce the two arguments */
5668     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5669     if (u == NULL)
5670         goto onError;
5671     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5672     if (v == NULL)
5673         goto onError;
5674
5675     /* Shortcut for empty or interned objects */
5676     if (v == u) {
5677         Py_DECREF(u);
5678         Py_DECREF(v);
5679         return 0;
5680     }
5681
5682     result = unicode_compare(u, v);
5683
5684     Py_DECREF(u);
5685     Py_DECREF(v);
5686     return result;
5687
5688 onError:
5689     Py_XDECREF(u);
5690     Py_XDECREF(v);
5691     return -1;
5692 }
5693
5694 PyObject *PyUnicode_RichCompare(PyObject *left,
5695                                 PyObject *right,
5696                                 int op)
5697 {
5698     int result;
5699
5700     result = PyUnicode_Compare(left, right);
5701     if (result == -1 && PyErr_Occurred())
5702         goto onError;
5703
5704     /* Convert the return value to a Boolean */
5705     switch (op) {
5706     case Py_EQ:
5707         result = (result == 0);
5708         break;
5709     case Py_NE:
5710         result = (result != 0);
5711         break;
5712     case Py_LE:
5713         result = (result <= 0);
5714         break;
5715     case Py_GE:
5716         result = (result >= 0);
5717         break;
5718     case Py_LT:
5719         result = (result == -1);
5720         break;
5721     case Py_GT:
5722         result = (result == 1);
5723         break;
5724     }
5725     return PyBool_FromLong(result);
5726
5727  onError:
5728
5729     /* Standard case
5730
5731        Type errors mean that PyUnicode_FromObject() could not convert
5732        one of the arguments (usually the right hand side) to Unicode,
5733        ie. we can't handle the comparison request. However, it is
5734        possible that the other object knows a comparison method, which
5735        is why we return Py_NotImplemented to give the other object a
5736        chance.
5737
5738     */
5739     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5740         PyErr_Clear();
5741         Py_INCREF(Py_NotImplemented);
5742         return Py_NotImplemented;
5743     }
5744     if (op != Py_EQ && op != Py_NE)
5745         return NULL;
5746
5747     /* Equality comparison.
5748
5749        This is a special case: we silence any PyExc_UnicodeDecodeError
5750        and instead turn it into a PyErr_UnicodeWarning.
5751
5752     */
5753     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5754         return NULL;
5755     PyErr_Clear();
5756     if (PyErr_Warn(PyExc_UnicodeWarning,
5757                    (op == Py_EQ) ?
5758                    "Unicode equal comparison "
5759                    "failed to convert both arguments to Unicode - "
5760                    "interpreting them as being unequal" :
5761                    "Unicode unequal comparison "
5762                    "failed to convert both arguments to Unicode - "
5763                    "interpreting them as being unequal"
5764                    ) < 0)
5765         return NULL;
5766     result = (op == Py_NE);
5767     return PyBool_FromLong(result);
5768 }
5769
5770 int PyUnicode_Contains(PyObject *container,
5771                        PyObject *element)
5772 {
5773     PyObject *str, *sub;
5774     int result;
5775
5776     /* Coerce the two arguments */
5777     sub = PyUnicode_FromObject(element);
5778     if (!sub) {
5779         PyErr_SetString(PyExc_TypeError,
5780             "'in <string>' requires string as left operand");
5781         return -1;
5782     }
5783
5784     str = PyUnicode_FromObject(container);
5785     if (!str) {
5786         Py_DECREF(sub);
5787         return -1;
5788     }
5789
5790     result = stringlib_contains_obj(str, sub);
5791
5792     Py_DECREF(str);
5793     Py_DECREF(sub);
5794
5795     return result;
5796 }
5797
5798 /* Concat to string or Unicode object giving a new Unicode object. */
5799
5800 PyObject *PyUnicode_Concat(PyObject *left,
5801                            PyObject *right)
5802 {
5803     PyUnicodeObject *u = NULL, *v = NULL, *w;
5804
5805     /* Coerce the two arguments */
5806     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5807     if (u == NULL)
5808         goto onError;
5809     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5810     if (v == NULL)
5811         goto onError;
5812
5813     /* Shortcuts */
5814     if (v == unicode_empty) {
5815         Py_DECREF(v);
5816         return (PyObject *)u;
5817     }
5818     if (u == unicode_empty) {
5819         Py_DECREF(u);
5820         return (PyObject *)v;
5821     }
5822
5823     /* Concat the two Unicode strings */
5824     w = _PyUnicode_New(u->length + v->length);
5825     if (w == NULL)
5826         goto onError;
5827     Py_UNICODE_COPY(w->str, u->str, u->length);
5828     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5829
5830     Py_DECREF(u);
5831     Py_DECREF(v);
5832     return (PyObject *)w;
5833
5834 onError:
5835     Py_XDECREF(u);
5836     Py_XDECREF(v);
5837     return NULL;
5838 }
5839
5840 PyDoc_STRVAR(count__doc__,
5841 "S.count(sub[, start[, end]]) -> int\n\
5842 \n\
5843 Return the number of non-overlapping occurrences of substring sub in\n\
5844 Unicode string S[start:end].  Optional arguments start and end are\n\
5845 interpreted as in slice notation.");
5846
5847 static PyObject *
5848 unicode_count(PyUnicodeObject *self, PyObject *args)
5849 {
5850     PyUnicodeObject *substring;
5851     Py_ssize_t start = 0;
5852     Py_ssize_t end = PY_SSIZE_T_MAX;
5853     PyObject *result;
5854
5855     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5856                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5857         return NULL;
5858
5859     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5860         (PyObject *)substring);
5861     if (substring == NULL)
5862         return NULL;
5863
5864     FIX_START_END(self);
5865
5866     result = PyInt_FromSsize_t(
5867         stringlib_count(self->str + start, end - start,
5868                         substring->str, substring->length)
5869         );
5870
5871     Py_DECREF(substring);
5872
5873     return result;
5874 }
5875
5876 PyDoc_STRVAR(encode__doc__,
5877 "S.encode([encoding[,errors]]) -> string or unicode\n\
5878 \n\
5879 Encodes S using the codec registered for encoding. encoding defaults\n\
5880 to the default encoding. errors may be given to set a different error\n\
5881 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5882 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5883 'xmlcharrefreplace' as well as any other name registered with\n\
5884 codecs.register_error that can handle UnicodeEncodeErrors.");
5885
5886 static PyObject *
5887 unicode_encode(PyUnicodeObject *self, PyObject *args)
5888 {
5889     char *encoding = NULL;
5890     char *errors = NULL;
5891     PyObject *v;
5892
5893     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5894         return NULL;
5895     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5896     if (v == NULL)
5897         goto onError;
5898     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5899         PyErr_Format(PyExc_TypeError,
5900                      "encoder did not return a string/unicode object "
5901                      "(type=%.400s)",
5902                      Py_Type(v)->tp_name);
5903         Py_DECREF(v);
5904         return NULL;
5905     }
5906     return v;
5907
5908  onError:
5909     return NULL;
5910 }
5911
5912 PyDoc_STRVAR(decode__doc__,
5913 "S.decode([encoding[,errors]]) -> string or unicode\n\
5914 \n\
5915 Decodes S using the codec registered for encoding. encoding defaults\n\
5916 to the default encoding. errors may be given to set a different error\n\
5917 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5918 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5919 as well as any other name registerd with codecs.register_error that is\n\
5920 able to handle UnicodeDecodeErrors.");
5921
5922 static PyObject *
5923 unicode_decode(PyUnicodeObject *self, PyObject *args)
5924 {
5925     char *encoding = NULL;
5926     char *errors = NULL;
5927     PyObject *v;
5928
5929     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5930         return NULL;
5931     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5932     if (v == NULL)
5933         goto onError;
5934     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5935         PyErr_Format(PyExc_TypeError,
5936                      "decoder did not return a string/unicode object "
5937                      "(type=%.400s)",
5938                      Py_Type(v)->tp_name);
5939         Py_DECREF(v);
5940         return NULL;
5941     }
5942     return v;
5943
5944  onError:
5945     return NULL;
5946 }
5947
5948 PyDoc_STRVAR(expandtabs__doc__,
5949 "S.expandtabs([tabsize]) -> unicode\n\
5950 \n\
5951 Return a copy of S where all tab characters are expanded using spaces.\n\
5952 If tabsize is not given, a tab size of 8 characters is assumed.");
5953
5954 static PyObject*
5955 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5956 {
5957     Py_UNICODE *e;
5958     Py_UNICODE *p;
5959     Py_UNICODE *q;
5960     Py_ssize_t i, j, old_j;
5961     PyUnicodeObject *u;
5962     int tabsize = 8;
5963
5964     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5965         return NULL;
5966
5967     /* First pass: determine size of output string */
5968     i = j = old_j = 0;
5969     e = self->str + self->length;
5970     for (p = self->str; p < e; p++)
5971         if (*p == '\t') {
5972             if (tabsize > 0) {
5973                 j += tabsize - (j % tabsize);
5974                 if (old_j > j) {
5975                     PyErr_SetString(PyExc_OverflowError,
5976                                     "new string is too long");
5977                     return NULL;
5978                 }
5979                 old_j = j;
5980             }
5981         }
5982         else {
5983             j++;
5984             if (*p == '\n' || *p == '\r') {
5985                 i += j;
5986                 old_j = j = 0;
5987                 if (i < 0) {
5988                     PyErr_SetString(PyExc_OverflowError,
5989                                     "new string is too long");
5990                     return NULL;
5991                 }
5992             }
5993         }
5994
5995     if ((i + j) < 0) {
5996         PyErr_SetString(PyExc_OverflowError, "new string is too long");
5997         return NULL;
5998     }
5999
6000     /* Second pass: create output string and fill it */
6001     u = _PyUnicode_New(i + j);
6002     if (!u)
6003         return NULL;
6004
6005     j = 0;
6006     q = u->str;
6007
6008     for (p = self->str; p < e; p++)
6009         if (*p == '\t') {
6010             if (tabsize > 0) {
6011                 i = tabsize - (j % tabsize);
6012                 j += i;
6013                 while (i--)
6014                     *q++ = ' ';
6015             }
6016         }
6017         else {
6018             j++;
6019             *q++ = *p;
6020             if (*p == '\n' || *p == '\r')
6021                 j = 0;
6022         }
6023
6024     return (PyObject*) u;
6025 }
6026
6027 PyDoc_STRVAR(find__doc__,
6028 "S.find(sub [,start [,end]]) -> int\n\
6029 \n\
6030 Return the lowest index in S where substring sub is found,\n\
6031 such that sub is contained within s[start:end].  Optional\n\
6032 arguments start and end are interpreted as in slice notation.\n\
6033 \n\
6034 Return -1 on failure.");
6035
6036 static PyObject *
6037 unicode_find(PyUnicodeObject *self, PyObject *args)
6038 {
6039     PyObject *substring;
6040     Py_ssize_t start = 0;
6041     Py_ssize_t end = PY_SSIZE_T_MAX;
6042     Py_ssize_t result;
6043
6044     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6045                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6046         return NULL;
6047     substring = PyUnicode_FromObject(substring);
6048     if (!substring)
6049         return NULL;
6050
6051     result = stringlib_find_slice(
6052         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6053         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6054         start, end
6055         );
6056
6057     Py_DECREF(substring);
6058
6059     return PyInt_FromSsize_t(result);
6060 }
6061
6062 static PyObject *
6063 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6064 {
6065     if (index < 0 || index >= self->length) {
6066         PyErr_SetString(PyExc_IndexError, "string index out of range");
6067         return NULL;
6068     }
6069
6070     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6071 }
6072
6073 static long
6074 unicode_hash(PyUnicodeObject *self)
6075 {
6076     /* Since Unicode objects compare equal to their ASCII string
6077        counterparts, they should use the individual character values
6078        as basis for their hash value.  This is needed to assure that
6079        strings and Unicode objects behave in the same way as
6080        dictionary keys. */
6081
6082     register Py_ssize_t len;
6083     register Py_UNICODE *p;
6084     register long x;
6085
6086     if (self->hash != -1)
6087         return self->hash;
6088     len = PyUnicode_GET_SIZE(self);
6089     p = PyUnicode_AS_UNICODE(self);
6090     x = *p << 7;
6091     while (--len >= 0)
6092         x = (1000003*x) ^ *p++;
6093     x ^= PyUnicode_GET_SIZE(self);
6094     if (x == -1)
6095         x = -2;
6096     self->hash = x;
6097     return x;
6098 }
6099
6100 PyDoc_STRVAR(index__doc__,
6101 "S.index(sub [,start [,end]]) -> int\n\
6102 \n\
6103 Like S.find() but raise ValueError when the substring is not found.");
6104
6105 static PyObject *
6106 unicode_index(PyUnicodeObject *self, PyObject *args)
6107 {
6108     Py_ssize_t result;
6109     PyObject *substring;
6110     Py_ssize_t start = 0;
6111     Py_ssize_t end = PY_SSIZE_T_MAX;
6112
6113     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6114                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6115         return NULL;
6116     substring = PyUnicode_FromObject(substring);
6117     if (!substring)
6118         return NULL;
6119
6120     result = stringlib_find_slice(
6121         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6122         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6123         start, end
6124         );
6125
6126     Py_DECREF(substring);
6127
6128     if (result < 0) {
6129         PyErr_SetString(PyExc_ValueError, "substring not found");
6130         return NULL;
6131     }
6132
6133     return PyInt_FromSsize_t(result);
6134 }
6135
6136 PyDoc_STRVAR(islower__doc__,
6137 "S.islower() -> bool\n\
6138 \n\
6139 Return True if all cased characters in S are lowercase and there is\n\
6140 at least one cased character in S, False otherwise.");
6141
6142 static PyObject*
6143 unicode_islower(PyUnicodeObject *self)
6144 {
6145     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6146     register const Py_UNICODE *e;
6147     int cased;
6148
6149     /* Shortcut for single character strings */
6150     if (PyUnicode_GET_SIZE(self) == 1)
6151         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6152
6153     /* Special case for empty strings */
6154     if (PyUnicode_GET_SIZE(self) == 0)
6155         return PyBool_FromLong(0);
6156
6157     e = p + PyUnicode_GET_SIZE(self);
6158     cased = 0;
6159     for (; p < e; p++) {
6160         register const Py_UNICODE ch = *p;
6161
6162         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6163             return PyBool_FromLong(0);
6164         else if (!cased && Py_UNICODE_ISLOWER(ch))
6165             cased = 1;
6166     }
6167     return PyBool_FromLong(cased);
6168 }
6169
6170 PyDoc_STRVAR(isupper__doc__,
6171 "S.isupper() -> bool\n\
6172 \n\
6173 Return True if all cased characters in S are uppercase and there is\n\
6174 at least one cased character in S, False otherwise.");
6175
6176 static PyObject*
6177 unicode_isupper(PyUnicodeObject *self)
6178 {
6179     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6180     register const Py_UNICODE *e;
6181     int cased;
6182
6183     /* Shortcut for single character strings */
6184     if (PyUnicode_GET_SIZE(self) == 1)
6185         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6186
6187     /* Special case for empty strings */
6188     if (PyUnicode_GET_SIZE(self) == 0)
6189         return PyBool_FromLong(0);
6190
6191     e = p + PyUnicode_GET_SIZE(self);
6192     cased = 0;
6193     for (; p < e; p++) {
6194         register const Py_UNICODE ch = *p;
6195
6196         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6197             return PyBool_FromLong(0);
6198         else if (!cased && Py_UNICODE_ISUPPER(ch))
6199             cased = 1;
6200     }
6201     return PyBool_FromLong(cased);
6202 }
6203
6204 PyDoc_STRVAR(istitle__doc__,
6205 "S.istitle() -> bool\n\
6206 \n\
6207 Return True if S is a titlecased string and there is at least one\n\
6208 character in S, i.e. upper- and titlecase characters may only\n\
6209 follow uncased characters and lowercase characters only cased ones.\n\
6210 Return False otherwise.");
6211
6212 static PyObject*
6213 unicode_istitle(PyUnicodeObject *self)
6214 {
6215     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6216     register const Py_UNICODE *e;
6217     int cased, previous_is_cased;
6218
6219     /* Shortcut for single character strings */
6220     if (PyUnicode_GET_SIZE(self) == 1)
6221         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6222                                (Py_UNICODE_ISUPPER(*p) != 0));
6223
6224     /* Special case for empty strings */
6225     if (PyUnicode_GET_SIZE(self) == 0)
6226         return PyBool_FromLong(0);
6227
6228     e = p + PyUnicode_GET_SIZE(self);
6229     cased = 0;
6230     previous_is_cased = 0;
6231     for (; p < e; p++) {
6232         register const Py_UNICODE ch = *p;
6233
6234         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6235             if (previous_is_cased)
6236                 return PyBool_FromLong(0);
6237             previous_is_cased = 1;
6238             cased = 1;
6239         }
6240         else if (Py_UNICODE_ISLOWER(ch)) {
6241             if (!previous_is_cased)
6242                 return PyBool_FromLong(0);
6243             previous_is_cased = 1;
6244             cased = 1;
6245         }
6246         else
6247             previous_is_cased = 0;
6248     }
6249     return PyBool_FromLong(cased);
6250 }
6251
6252 PyDoc_STRVAR(isspace__doc__,
6253 "S.isspace() -> bool\n\
6254 \n\
6255 Return True if all characters in S are whitespace\n\
6256 and there is at least one character in S, False otherwise.");
6257
6258 static PyObject*
6259 unicode_isspace(PyUnicodeObject *self)
6260 {
6261     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6262     register const Py_UNICODE *e;
6263
6264     /* Shortcut for single character strings */
6265     if (PyUnicode_GET_SIZE(self) == 1 &&
6266         Py_UNICODE_ISSPACE(*p))
6267         return PyBool_FromLong(1);
6268
6269     /* Special case for empty strings */
6270     if (PyUnicode_GET_SIZE(self) == 0)
6271         return PyBool_FromLong(0);
6272
6273     e = p + PyUnicode_GET_SIZE(self);
6274     for (; p < e; p++) {
6275         if (!Py_UNICODE_ISSPACE(*p))
6276             return PyBool_FromLong(0);
6277     }
6278     return PyBool_FromLong(1);
6279 }
6280
6281 PyDoc_STRVAR(isalpha__doc__,
6282 "S.isalpha() -> bool\n\
6283 \n\
6284 Return True if all characters in S are alphabetic\n\
6285 and there is at least one character in S, False otherwise.");
6286
6287 static PyObject*
6288 unicode_isalpha(PyUnicodeObject *self)
6289 {
6290     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6291     register const Py_UNICODE *e;
6292
6293     /* Shortcut for single character strings */
6294     if (PyUnicode_GET_SIZE(self) == 1 &&
6295         Py_UNICODE_ISALPHA(*p))
6296         return PyBool_FromLong(1);
6297
6298     /* Special case for empty strings */
6299     if (PyUnicode_GET_SIZE(self) == 0)
6300         return PyBool_FromLong(0);
6301
6302     e = p + PyUnicode_GET_SIZE(self);
6303     for (; p < e; p++) {
6304         if (!Py_UNICODE_ISALPHA(*p))
6305             return PyBool_FromLong(0);
6306     }
6307     return PyBool_FromLong(1);
6308 }
6309
6310 PyDoc_STRVAR(isalnum__doc__,
6311 "S.isalnum() -> bool\n\
6312 \n\
6313 Return True if all characters in S are alphanumeric\n\
6314 and there is at least one character in S, False otherwise.");
6315
6316 static PyObject*
6317 unicode_isalnum(PyUnicodeObject *self)
6318 {
6319     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6320     register const Py_UNICODE *e;
6321
6322     /* Shortcut for single character strings */
6323     if (PyUnicode_GET_SIZE(self) == 1 &&
6324         Py_UNICODE_ISALNUM(*p))
6325         return PyBool_FromLong(1);
6326
6327     /* Special case for empty strings */
6328     if (PyUnicode_GET_SIZE(self) == 0)
6329         return PyBool_FromLong(0);
6330
6331     e = p + PyUnicode_GET_SIZE(self);
6332     for (; p < e; p++) {
6333         if (!Py_UNICODE_ISALNUM(*p))
6334             return PyBool_FromLong(0);
6335     }
6336     return PyBool_FromLong(1);
6337 }
6338
6339 PyDoc_STRVAR(isdecimal__doc__,
6340 "S.isdecimal() -> bool\n\
6341 \n\
6342 Return True if there are only decimal characters in S,\n\
6343 False otherwise.");
6344
6345 static PyObject*
6346 unicode_isdecimal(PyUnicodeObject *self)
6347 {
6348     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6349     register const Py_UNICODE *e;
6350
6351     /* Shortcut for single character strings */
6352     if (PyUnicode_GET_SIZE(self) == 1 &&
6353         Py_UNICODE_ISDECIMAL(*p))
6354         return PyBool_FromLong(1);
6355
6356     /* Special case for empty strings */
6357     if (PyUnicode_GET_SIZE(self) == 0)
6358         return PyBool_FromLong(0);
6359
6360     e = p + PyUnicode_GET_SIZE(self);
6361     for (; p < e; p++) {
6362         if (!Py_UNICODE_ISDECIMAL(*p))
6363             return PyBool_FromLong(0);
6364     }
6365     return PyBool_FromLong(1);
6366 }
6367
6368 PyDoc_STRVAR(isdigit__doc__,
6369 "S.isdigit() -> bool\n\
6370 \n\
6371 Return True if all characters in S are digits\n\
6372 and there is at least one character in S, False otherwise.");
6373
6374 static PyObject*
6375 unicode_isdigit(PyUnicodeObject *self)
6376 {
6377     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6378     register const Py_UNICODE *e;
6379
6380     /* Shortcut for single character strings */
6381     if (PyUnicode_GET_SIZE(self) == 1 &&
6382         Py_UNICODE_ISDIGIT(*p))
6383         return PyBool_FromLong(1);
6384
6385     /* Special case for empty strings */
6386     if (PyUnicode_GET_SIZE(self) == 0)
6387         return PyBool_FromLong(0);
6388
6389     e = p + PyUnicode_GET_SIZE(self);
6390     for (; p < e; p++) {
6391         if (!Py_UNICODE_ISDIGIT(*p))
6392             return PyBool_FromLong(0);
6393     }
6394     return PyBool_FromLong(1);
6395 }
6396
6397 PyDoc_STRVAR(isnumeric__doc__,
6398 "S.isnumeric() -> bool\n\
6399 \n\
6400 Return True if there are only numeric characters in S,\n\
6401 False otherwise.");
6402
6403 static PyObject*
6404 unicode_isnumeric(PyUnicodeObject *self)
6405 {
6406     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6407     register const Py_UNICODE *e;
6408
6409     /* Shortcut for single character strings */
6410     if (PyUnicode_GET_SIZE(self) == 1 &&
6411         Py_UNICODE_ISNUMERIC(*p))
6412         return PyBool_FromLong(1);
6413
6414     /* Special case for empty strings */
6415     if (PyUnicode_GET_SIZE(self) == 0)
6416         return PyBool_FromLong(0);
6417
6418     e = p + PyUnicode_GET_SIZE(self);
6419     for (; p < e; p++) {
6420         if (!Py_UNICODE_ISNUMERIC(*p))
6421             return PyBool_FromLong(0);
6422     }
6423     return PyBool_FromLong(1);
6424 }
6425
6426 PyDoc_STRVAR(join__doc__,
6427 "S.join(sequence) -> unicode\n\
6428 \n\
6429 Return a string which is the concatenation of the strings in the\n\
6430 sequence.  The separator between elements is S.");
6431
6432 static PyObject*
6433 unicode_join(PyObject *self, PyObject *data)
6434 {
6435     return PyUnicode_Join(self, data);
6436 }
6437
6438 static Py_ssize_t
6439 unicode_length(PyUnicodeObject *self)
6440 {
6441     return self->length;
6442 }
6443
6444 PyDoc_STRVAR(ljust__doc__,
6445 "S.ljust(width[, fillchar]) -> int\n\
6446 \n\
6447 Return S left justified in a Unicode string of length width. Padding is\n\
6448 done using the specified fill character (default is a space).");
6449
6450 static PyObject *
6451 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6452 {
6453     Py_ssize_t width;
6454     Py_UNICODE fillchar = ' ';
6455
6456     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6457         return NULL;
6458
6459     if (self->length >= width && PyUnicode_CheckExact(self)) {
6460         Py_INCREF(self);
6461         return (PyObject*) self;
6462     }
6463
6464     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6465 }
6466
6467 PyDoc_STRVAR(lower__doc__,
6468 "S.lower() -> unicode\n\
6469 \n\
6470 Return a copy of the string S converted to lowercase.");
6471
6472 static PyObject*
6473 unicode_lower(PyUnicodeObject *self)
6474 {
6475     return fixup(self, fixlower);
6476 }
6477
/* Selectors for the strip helpers: which end(s) of the string to strip.
   The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above.  Both the
   strings and the table slots are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)]+3)
6486
6487 /* externally visible for str.strip(unicode) */
6488 PyObject *
6489 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6490 {
6491         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6492         Py_ssize_t len = PyUnicode_GET_SIZE(self);
6493         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6494         Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6495         Py_ssize_t i, j;
6496
6497         BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6498
6499         i = 0;
6500         if (striptype != RIGHTSTRIP) {
6501             while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6502                 i++;
6503             }
6504         }
6505
6506         j = len;
6507         if (striptype != LEFTSTRIP) {
6508             do {
6509                 j--;
6510             } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6511             j++;
6512         }
6513
6514         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6515             Py_INCREF(self);
6516             return (PyObject*)self;
6517         }
6518         else
6519             return PyUnicode_FromUnicode(s+i, j-i);
6520 }
6521
6522
6523 static PyObject *
6524 do_strip(PyUnicodeObject *self, int striptype)
6525 {
6526         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6527         Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6528
6529         i = 0;
6530         if (striptype != RIGHTSTRIP) {
6531                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6532                         i++;
6533                 }
6534         }
6535
6536         j = len;
6537         if (striptype != LEFTSTRIP) {
6538                 do {
6539                         j--;
6540                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6541                 j++;
6542         }
6543
6544         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6545                 Py_INCREF(self);
6546                 return (PyObject*)self;
6547         }
6548         else
6549                 return PyUnicode_FromUnicode(s+i, j-i);
6550 }
6551
6552
6553 static PyObject *
6554 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6555 {
6556         PyObject *sep = NULL;
6557
6558         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6559                 return NULL;
6560
6561         if (sep != NULL && sep != Py_None) {
6562                 if (PyUnicode_Check(sep))
6563                         return _PyUnicode_XStrip(self, striptype, sep);
6564                 else if (PyString_Check(sep)) {
6565                         PyObject *res;
6566                         sep = PyUnicode_FromObject(sep);
6567                         if (sep==NULL)
6568                                 return NULL;
6569                         res = _PyUnicode_XStrip(self, striptype, sep);
6570                         Py_DECREF(sep);
6571                         return res;
6572                 }
6573                 else {
6574                         PyErr_Format(PyExc_TypeError,
6575                                      "%s arg must be None, unicode or str",
6576                                      STRIPNAME(striptype));
6577                         return NULL;
6578                 }
6579         }
6580
6581         return do_strip(self, striptype);
6582 }
6583
6584
6585 PyDoc_STRVAR(strip__doc__,
6586 "S.strip([chars]) -> unicode\n\
6587 \n\
6588 Return a copy of the string S with leading and trailing\n\
6589 whitespace removed.\n\
6590 If chars is given and not None, remove characters in chars instead.\n\
6591 If chars is a str, it will be converted to unicode before stripping");
6592
6593 static PyObject *
6594 unicode_strip(PyUnicodeObject *self, PyObject *args)
6595 {
6596         if (PyTuple_GET_SIZE(args) == 0)
6597                 return do_strip(self, BOTHSTRIP); /* Common case */
6598         else
6599                 return do_argstrip(self, BOTHSTRIP, args);
6600 }
6601
6602
6603 PyDoc_STRVAR(lstrip__doc__,
6604 "S.lstrip([chars]) -> unicode\n\
6605 \n\
6606 Return a copy of the string S with leading whitespace removed.\n\
6607 If chars is given and not None, remove characters in chars instead.\n\
6608 If chars is a str, it will be converted to unicode before stripping");
6609
6610 static PyObject *
6611 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6612 {
6613         if (PyTuple_GET_SIZE(args) == 0)
6614                 return do_strip(self, LEFTSTRIP); /* Common case */
6615         else
6616                 return do_argstrip(self, LEFTSTRIP, args);
6617 }
6618
6619
6620 PyDoc_STRVAR(rstrip__doc__,
6621 "S.rstrip([chars]) -> unicode\n\
6622 \n\
6623 Return a copy of the string S with trailing whitespace removed.\n\
6624 If chars is given and not None, remove characters in chars instead.\n\
6625 If chars is a str, it will be converted to unicode before stripping");
6626
6627 static PyObject *
6628 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6629 {
6630         if (PyTuple_GET_SIZE(args) == 0)
6631                 return do_strip(self, RIGHTSTRIP); /* Common case */
6632         else
6633                 return do_argstrip(self, RIGHTSTRIP, args);
6634 }
6635
6636
6637 static PyObject*
6638 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6639 {
6640     PyUnicodeObject *u;
6641     Py_UNICODE *p;
6642     Py_ssize_t nchars;
6643     size_t nbytes;
6644
6645     if (len < 0)
6646         len = 0;
6647
6648     if (len == 1 && PyUnicode_CheckExact(str)) {
6649         /* no repeat, return original string */
6650         Py_INCREF(str);
6651         return (PyObject*) str;
6652     }
6653
6654     /* ensure # of chars needed doesn't overflow int and # of bytes
6655      * needed doesn't overflow size_t
6656      */
6657     nchars = len * str->length;
6658     if (len && nchars / len != str->length) {
6659         PyErr_SetString(PyExc_OverflowError,
6660                         "repeated string is too long");
6661         return NULL;
6662     }
6663     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6664     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6665         PyErr_SetString(PyExc_OverflowError,
6666                         "repeated string is too long");
6667         return NULL;
6668     }
6669     u = _PyUnicode_New(nchars);
6670     if (!u)
6671         return NULL;
6672
6673     p = u->str;
6674
6675     if (str->length == 1 && len > 0) {
6676         Py_UNICODE_FILL(p, str->str[0], len);
6677     } else {
6678         Py_ssize_t done = 0; /* number of characters copied this far */
6679         if (done < nchars) {
6680             Py_UNICODE_COPY(p, str->str, str->length);
6681             done = str->length;
6682         }
6683         while (done < nchars) {
6684             int n = (done <= nchars-done) ? done : nchars-done;
6685             Py_UNICODE_COPY(p+done, p, n);
6686             done += n;
6687         }
6688     }
6689
6690     return (PyObject*) u;
6691 }
6692
6693 PyObject *PyUnicode_Replace(PyObject *obj,
6694                             PyObject *subobj,
6695                             PyObject *replobj,
6696                             Py_ssize_t maxcount)
6697 {
6698     PyObject *self;
6699     PyObject *str1;
6700     PyObject *str2;
6701     PyObject *result;
6702
6703     self = PyUnicode_FromObject(obj);
6704     if (self == NULL)
6705         return NULL;
6706     str1 = PyUnicode_FromObject(subobj);
6707     if (str1 == NULL) {
6708         Py_DECREF(self);
6709         return NULL;
6710     }
6711     str2 = PyUnicode_FromObject(replobj);
6712     if (str2 == NULL) {
6713         Py_DECREF(self);
6714         Py_DECREF(str1);
6715         return NULL;
6716     }
6717     result = replace((PyUnicodeObject *)self,
6718                      (PyUnicodeObject *)str1,
6719                      (PyUnicodeObject *)str2,
6720                      maxcount);
6721     Py_DECREF(self);
6722     Py_DECREF(str1);
6723     Py_DECREF(str2);
6724     return result;
6725 }
6726
6727 PyDoc_STRVAR(replace__doc__,
6728 "S.replace (old, new[, maxsplit]) -> unicode\n\
6729 \n\
6730 Return a copy of S with all occurrences of substring\n\
6731 old replaced by new.  If the optional argument maxsplit is\n\
6732 given, only the first maxsplit occurrences are replaced.");
6733
6734 static PyObject*
6735 unicode_replace(PyUnicodeObject *self, PyObject *args)
6736 {
6737     PyUnicodeObject *str1;
6738     PyUnicodeObject *str2;
6739     Py_ssize_t maxcount = -1;
6740     PyObject *result;
6741
6742     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6743         return NULL;
6744     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6745     if (str1 == NULL)
6746         return NULL;
6747     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6748     if (str2 == NULL) {
6749         Py_DECREF(str1);
6750         return NULL;
6751     }
6752
6753     result = replace(self, str1, str2, maxcount);
6754
6755     Py_DECREF(str1);
6756     Py_DECREF(str2);
6757     return result;
6758 }
6759
6760 static
6761 PyObject *unicode_repr(PyObject *unicode)
6762 {
6763     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6764                                 PyUnicode_GET_SIZE(unicode),
6765                                 1);
6766 }
6767
6768 PyDoc_STRVAR(rfind__doc__,
6769 "S.rfind(sub [,start [,end]]) -> int\n\
6770 \n\
6771 Return the highest index in S where substring sub is found,\n\
6772 such that sub is contained within s[start:end].  Optional\n\
6773 arguments start and end are interpreted as in slice notation.\n\
6774 \n\
6775 Return -1 on failure.");
6776
6777 static PyObject *
6778 unicode_rfind(PyUnicodeObject *self, PyObject *args)
6779 {
6780     PyObject *substring;
6781     Py_ssize_t start = 0;
6782     Py_ssize_t end = PY_SSIZE_T_MAX;
6783     Py_ssize_t result;
6784
6785     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6786                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6787         return NULL;
6788     substring = PyUnicode_FromObject(substring);
6789     if (!substring)
6790         return NULL;
6791
6792     result = stringlib_rfind_slice(
6793         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6794         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6795         start, end
6796         );
6797
6798     Py_DECREF(substring);
6799
6800     return PyInt_FromSsize_t(result);
6801 }
6802
6803 PyDoc_STRVAR(rindex__doc__,
6804 "S.rindex(sub [,start [,end]]) -> int\n\
6805 \n\
6806 Like S.rfind() but raise ValueError when the substring is not found.");
6807
6808 static PyObject *
6809 unicode_rindex(PyUnicodeObject *self, PyObject *args)
6810 {
6811     PyObject *substring;
6812     Py_ssize_t start = 0;
6813     Py_ssize_t end = PY_SSIZE_T_MAX;
6814     Py_ssize_t result;
6815
6816     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6817                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6818         return NULL;
6819     substring = PyUnicode_FromObject(substring);
6820     if (!substring)
6821         return NULL;
6822
6823     result = stringlib_rfind_slice(
6824         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6825         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6826         start, end
6827         );
6828
6829     Py_DECREF(substring);
6830
6831     if (result < 0) {
6832         PyErr_SetString(PyExc_ValueError, "substring not found");
6833         return NULL;
6834     }
6835     return PyInt_FromSsize_t(result);
6836 }
6837
6838 PyDoc_STRVAR(rjust__doc__,
6839 "S.rjust(width[, fillchar]) -> unicode\n\
6840 \n\
6841 Return S right justified in a Unicode string of length width. Padding is\n\
6842 done using the specified fill character (default is a space).");
6843
6844 static PyObject *
6845 unicode_rjust(PyUnicodeObject *self, PyObject *args)
6846 {
6847     Py_ssize_t width;
6848     Py_UNICODE fillchar = ' ';
6849
6850     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6851         return NULL;
6852
6853     if (self->length >= width && PyUnicode_CheckExact(self)) {
6854         Py_INCREF(self);
6855         return (PyObject*) self;
6856     }
6857
6858     return (PyObject*) pad(self, width - self->length, 0, fillchar);
6859 }
6860
6861 static PyObject*
6862 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6863 {
6864     /* standard clamping */
6865     if (start < 0)
6866         start = 0;
6867     if (end < 0)
6868         end = 0;
6869     if (end > self->length)
6870         end = self->length;
6871     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6872         /* full slice, return original string */
6873         Py_INCREF(self);
6874         return (PyObject*) self;
6875     }
6876     if (start > end)
6877         start = end;
6878     /* copy slice */
6879     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6880                                              end - start);
6881 }
6882
6883 PyObject *PyUnicode_Split(PyObject *s,
6884                           PyObject *sep,
6885                           Py_ssize_t maxsplit)
6886 {
6887     PyObject *result;
6888
6889     s = PyUnicode_FromObject(s);
6890     if (s == NULL)
6891         return NULL;
6892     if (sep != NULL) {
6893         sep = PyUnicode_FromObject(sep);
6894         if (sep == NULL) {
6895             Py_DECREF(s);
6896             return NULL;
6897         }
6898     }
6899
6900     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6901
6902     Py_DECREF(s);
6903     Py_XDECREF(sep);
6904     return result;
6905 }
6906
6907 PyDoc_STRVAR(split__doc__,
6908 "S.split([sep [,maxsplit]]) -> list of strings\n\
6909 \n\
6910 Return a list of the words in S, using sep as the\n\
6911 delimiter string.  If maxsplit is given, at most maxsplit\n\
6912 splits are done. If sep is not specified or is None,\n\
6913 any whitespace string is a separator.");
6914
6915 static PyObject*
6916 unicode_split(PyUnicodeObject *self, PyObject *args)
6917 {
6918     PyObject *substring = Py_None;
6919     Py_ssize_t maxcount = -1;
6920
6921     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6922         return NULL;
6923
6924     if (substring == Py_None)
6925         return split(self, NULL, maxcount);
6926     else if (PyUnicode_Check(substring))
6927         return split(self, (PyUnicodeObject *)substring, maxcount);
6928     else
6929         return PyUnicode_Split((PyObject *)self, substring, maxcount);
6930 }
6931
6932 PyObject *
6933 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6934 {
6935     PyObject* str_obj;
6936     PyObject* sep_obj;
6937     PyObject* out;
6938
6939     str_obj = PyUnicode_FromObject(str_in);
6940     if (!str_obj)
6941         return NULL;
6942     sep_obj = PyUnicode_FromObject(sep_in);
6943     if (!sep_obj) {
6944         Py_DECREF(str_obj);
6945         return NULL;
6946     }
6947
6948     out = stringlib_partition(
6949         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6950         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6951         );
6952
6953     Py_DECREF(sep_obj);
6954     Py_DECREF(str_obj);
6955
6956     return out;
6957 }
6958
6959
6960 PyObject *
6961 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6962 {
6963     PyObject* str_obj;
6964     PyObject* sep_obj;
6965     PyObject* out;
6966
6967     str_obj = PyUnicode_FromObject(str_in);
6968     if (!str_obj)
6969         return NULL;
6970     sep_obj = PyUnicode_FromObject(sep_in);
6971     if (!sep_obj) {
6972         Py_DECREF(str_obj);
6973         return NULL;
6974     }
6975
6976     out = stringlib_rpartition(
6977         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6978         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6979         );
6980
6981     Py_DECREF(sep_obj);
6982     Py_DECREF(str_obj);
6983
6984     return out;
6985 }
6986
6987 PyDoc_STRVAR(partition__doc__,
6988 "S.partition(sep) -> (head, sep, tail)\n\
6989 \n\
6990 Searches for the separator sep in S, and returns the part before it,\n\
6991 the separator itself, and the part after it.  If the separator is not\n\
6992 found, returns S and two empty strings.");
6993
6994 static PyObject*
6995 unicode_partition(PyUnicodeObject *self, PyObject *separator)
6996 {
6997     return PyUnicode_Partition((PyObject *)self, separator);
6998 }
6999
7000 PyDoc_STRVAR(rpartition__doc__,
7001 "S.rpartition(sep) -> (tail, sep, head)\n\
7002 \n\
7003 Searches for the separator sep in S, starting at the end of S, and returns\n\
7004 the part before it, the separator itself, and the part after it.  If the\n\
7005 separator is not found, returns two empty strings and S.");
7006
7007 static PyObject*
7008 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7009 {
7010     return PyUnicode_RPartition((PyObject *)self, separator);
7011 }
7012
7013 PyObject *PyUnicode_RSplit(PyObject *s,
7014                            PyObject *sep,
7015                            Py_ssize_t maxsplit)
7016 {
7017     PyObject *result;
7018
7019     s = PyUnicode_FromObject(s);
7020     if (s == NULL)
7021         return NULL;
7022     if (sep != NULL) {
7023         sep = PyUnicode_FromObject(sep);
7024         if (sep == NULL) {
7025             Py_DECREF(s);
7026             return NULL;
7027         }
7028     }
7029
7030     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7031
7032     Py_DECREF(s);
7033     Py_XDECREF(sep);
7034     return result;
7035 }
7036
7037 PyDoc_STRVAR(rsplit__doc__,
7038 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7039 \n\
7040 Return a list of the words in S, using sep as the\n\
7041 delimiter string, starting at the end of the string and\n\
7042 working to the front.  If maxsplit is given, at most maxsplit\n\
7043 splits are done. If sep is not specified, any whitespace string\n\
7044 is a separator.");
7045
7046 static PyObject*
7047 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7048 {
7049     PyObject *substring = Py_None;
7050     Py_ssize_t maxcount = -1;
7051
7052     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7053         return NULL;
7054
7055     if (substring == Py_None)
7056         return rsplit(self, NULL, maxcount);
7057     else if (PyUnicode_Check(substring))
7058         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7059     else
7060         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7061 }
7062
7063 PyDoc_STRVAR(splitlines__doc__,
7064 "S.splitlines([keepends]]) -> list of strings\n\
7065 \n\
7066 Return a list of the lines in S, breaking at line boundaries.\n\
7067 Line breaks are not included in the resulting list unless keepends\n\
7068 is given and true.");
7069
7070 static PyObject*
7071 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7072 {
7073     int keepends = 0;
7074
7075     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7076         return NULL;
7077
7078     return PyUnicode_Splitlines((PyObject *)self, keepends);
7079 }
7080
7081 static
7082 PyObject *unicode_str(PyUnicodeObject *self)
7083 {
7084     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7085 }
7086
7087 PyDoc_STRVAR(swapcase__doc__,
7088 "S.swapcase() -> unicode\n\
7089 \n\
7090 Return a copy of S with uppercase characters converted to lowercase\n\
7091 and vice versa.");
7092
7093 static PyObject*
7094 unicode_swapcase(PyUnicodeObject *self)
7095 {
7096     return fixup(self, fixswapcase);
7097 }
7098
7099 PyDoc_STRVAR(translate__doc__,
7100 "S.translate(table) -> unicode\n\
7101 \n\
7102 Return a copy of the string S, where all characters have been mapped\n\
7103 through the given translation table, which must be a mapping of\n\
7104 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7105 Unmapped characters are left untouched. Characters mapped to None\n\
7106 are deleted.");
7107
7108 static PyObject*
7109 unicode_translate(PyUnicodeObject *self, PyObject *table)
7110 {
7111     return PyUnicode_TranslateCharmap(self->str,
7112                                       self->length,
7113                                       table,
7114                                       "ignore");
7115 }
7116
7117 PyDoc_STRVAR(upper__doc__,
7118 "S.upper() -> unicode\n\
7119 \n\
7120 Return a copy of S converted to uppercase.");
7121
7122 static PyObject*
7123 unicode_upper(PyUnicodeObject *self)
7124 {
7125     return fixup(self, fixupper);
7126 }
7127
7128 PyDoc_STRVAR(zfill__doc__,
7129 "S.zfill(width) -> unicode\n\
7130 \n\
7131 Pad a numeric string x with zeros on the left, to fill a field\n\
7132 of the specified width. The string x is never truncated.");
7133
7134 static PyObject *
7135 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7136 {
7137     Py_ssize_t fill;
7138     PyUnicodeObject *u;
7139
7140     Py_ssize_t width;
7141     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7142         return NULL;
7143
7144     if (self->length >= width) {
7145         if (PyUnicode_CheckExact(self)) {
7146             Py_INCREF(self);
7147             return (PyObject*) self;
7148         }
7149         else
7150             return PyUnicode_FromUnicode(
7151                 PyUnicode_AS_UNICODE(self),
7152                 PyUnicode_GET_SIZE(self)
7153             );
7154     }
7155
7156     fill = width - self->length;
7157
7158     u = pad(self, fill, 0, '0');
7159
7160     if (u == NULL)
7161         return NULL;
7162
7163     if (u->str[fill] == '+' || u->str[fill] == '-') {
7164         /* move sign to beginning of string */
7165         u->str[0] = u->str[fill];
7166         u->str[fill] = '0';
7167     }
7168
7169     return (PyObject*) u;
7170 }
7171
/* Compiled out: debugging helper that returns the value of
   unicode_freelist_size as a Python int. */
#if 0
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7179
7180 PyDoc_STRVAR(startswith__doc__,
7181 "S.startswith(prefix[, start[, end]]) -> bool\n\
7182 \n\
7183 Return True if S starts with the specified prefix, False otherwise.\n\
7184 With optional start, test S beginning at that position.\n\
7185 With optional end, stop comparing S at that position.\n\
7186 prefix can also be a tuple of strings to try.");
7187
7188 static PyObject *
7189 unicode_startswith(PyUnicodeObject *self,
7190                    PyObject *args)
7191 {
7192     PyObject *subobj;
7193     PyUnicodeObject *substring;
7194     Py_ssize_t start = 0;
7195     Py_ssize_t end = PY_SSIZE_T_MAX;
7196     int result;
7197
7198     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7199                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7200         return NULL;
7201     if (PyTuple_Check(subobj)) {
7202         Py_ssize_t i;
7203         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7204             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7205                             PyTuple_GET_ITEM(subobj, i));
7206             if (substring == NULL)
7207                 return NULL;
7208             result = tailmatch(self, substring, start, end, -1);
7209             Py_DECREF(substring);
7210             if (result) {
7211                 Py_RETURN_TRUE;
7212             }
7213         }
7214         /* nothing matched */
7215         Py_RETURN_FALSE;
7216     }
7217     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7218     if (substring == NULL)
7219          return NULL;
7220     result = tailmatch(self, substring, start, end, -1);
7221     Py_DECREF(substring);
7222     return PyBool_FromLong(result);
7223 }
7224
7225
7226 PyDoc_STRVAR(endswith__doc__,
7227 "S.endswith(suffix[, start[, end]]) -> bool\n\
7228 \n\
7229 Return True if S ends with the specified suffix, False otherwise.\n\
7230 With optional start, test S beginning at that position.\n\
7231 With optional end, stop comparing S at that position.\n\
7232 suffix can also be a tuple of strings to try.");
7233
7234 static PyObject *
7235 unicode_endswith(PyUnicodeObject *self,
7236                  PyObject *args)
7237 {
7238     PyObject *subobj;
7239     PyUnicodeObject *substring;
7240     Py_ssize_t start = 0;
7241     Py_ssize_t end = PY_SSIZE_T_MAX;
7242     int result;
7243
7244     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7245         _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7246         return NULL;
7247     if (PyTuple_Check(subobj)) {
7248         Py_ssize_t i;
7249         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7250             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7251                             PyTuple_GET_ITEM(subobj, i));
7252             if (substring == NULL)
7253             return NULL;
7254             result = tailmatch(self, substring, start, end, +1);
7255             Py_DECREF(substring);
7256             if (result) {
7257                 Py_RETURN_TRUE;
7258             }
7259         }
7260         Py_RETURN_FALSE;
7261     }
7262     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7263     if (substring == NULL)
7264     return NULL;
7265
7266     result = tailmatch(self, substring, start, end, +1);
7267     Py_DECREF(substring);
7268     return PyBool_FromLong(result);
7269 }
7270
7271
7272
7273 static PyObject *
7274 unicode_getnewargs(PyUnicodeObject *v)
7275 {
7276         return Py_BuildValue("(u#)", v->str, v->length);
7277 }
7278
7279
/* Method table of the unicode type.  Each entry gives the Python-level
   method name, the C function implementing it, the calling-convention
   flags and (where present) the docstring.  The table ends with a
   {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* Disabled entry: capwords is not exposed as a method. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7337
7338 static PyObject *
7339 unicode_mod(PyObject *v, PyObject *w)
7340 {
7341        if (!PyUnicode_Check(v)) {
7342                Py_INCREF(Py_NotImplemented);
7343                return Py_NotImplemented;
7344        }
7345        return PyUnicode_Format(v, w);
7346 }
7347
/* Number protocol table for unicode objects.  Only nb_remainder is set
   (to unicode_mod); the listed slots before it are 0 and the remaining
   ones are left to default initialization. */
static PyNumberMethods unicode_as_number = {
        0,                              /*nb_add*/
        0,                              /*nb_subtract*/
        0,                              /*nb_multiply*/
        0,                              /*nb_divide*/
        unicode_mod,                    /*nb_remainder*/
};
7355
/* Sequence protocol table for unicode objects.  The item and slice
   assignment slots are 0; every other listed slot points at a function
   from this file. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7366
7367 static PyObject*
7368 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7369 {
7370     if (PyIndex_Check(item)) {
7371         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7372         if (i == -1 && PyErr_Occurred())
7373             return NULL;
7374         if (i < 0)
7375             i += PyUnicode_GET_SIZE(self);
7376         return unicode_getitem(self, i);
7377     } else if (PySlice_Check(item)) {
7378         Py_ssize_t start, stop, step, slicelength, cur, i;
7379         Py_UNICODE* source_buf;
7380         Py_UNICODE* result_buf;
7381         PyObject* result;
7382
7383         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7384                                  &start, &stop, &step, &slicelength) < 0) {
7385             return NULL;
7386         }
7387
7388         if (slicelength <= 0) {
7389             return PyUnicode_FromUnicode(NULL, 0);
7390         } else if (start == 0 && step == 1 && slicelength == self->length &&
7391                    PyUnicode_CheckExact(self)) {
7392             Py_INCREF(self);
7393             return (PyObject *)self;
7394         } else if (step == 1) {
7395             return PyUnicode_FromUnicode(self->str + start, slicelength);
7396         } else {
7397             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7398             result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7399                                                     sizeof(Py_UNICODE));
7400
7401             if (result_buf == NULL)
7402                     return PyErr_NoMemory();
7403
7404             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7405                 result_buf[i] = source_buf[cur];
7406             }
7407
7408             result = PyUnicode_FromUnicode(result_buf, slicelength);
7409             PyMem_FREE(result_buf);
7410             return result;
7411         }
7412     } else {
7413         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7414         return NULL;
7415     }
7416 }
7417
7418 static PyMappingMethods unicode_as_mapping = {
7419     (lenfunc)unicode_length,            /* mp_length */
7420     (binaryfunc)unicode_subscript,      /* mp_subscript */
7421     (objobjargproc)0,                   /* mp_ass_subscript */
7422 };
7423
7424 static Py_ssize_t
7425 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7426                           Py_ssize_t index,
7427                           const void **ptr)
7428 {
7429     if (index != 0) {
7430         PyErr_SetString(PyExc_SystemError,
7431                         "accessing non-existent unicode segment");
7432         return -1;
7433     }
7434     *ptr = (void *) self->str;
7435     return PyUnicode_GET_DATA_SIZE(self);
7436 }
7437
7438 static Py_ssize_t
7439 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7440                            const void **ptr)
7441 {
7442     PyErr_SetString(PyExc_TypeError,
7443                     "cannot use unicode as modifiable buffer");
7444     return -1;
7445 }
7446
7447 static int
7448 unicode_buffer_getsegcount(PyUnicodeObject *self,
7449                            Py_ssize_t *lenp)
7450 {
7451     if (lenp)
7452         *lenp = PyUnicode_GET_DATA_SIZE(self);
7453     return 1;
7454 }
7455
7456 static Py_ssize_t
7457 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7458                           Py_ssize_t index,
7459                           const void **ptr)
7460 {
7461     PyObject *str;
7462
7463     if (index != 0) {
7464         PyErr_SetString(PyExc_SystemError,
7465                         "accessing non-existent unicode segment");
7466         return -1;
7467     }
7468     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7469     if (str == NULL)
7470         return -1;
7471     *ptr = (void *) PyString_AS_STRING(str);
7472     return PyString_GET_SIZE(str);
7473 }
7474
7475 /* Helpers for PyUnicode_Format() */
7476
7477 static PyObject *
7478 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7479 {
7480     Py_ssize_t argidx = *p_argidx;
7481     if (argidx < arglen) {
7482         (*p_argidx)++;
7483         if (arglen < 0)
7484             return args;
7485         else
7486             return PyTuple_GetItem(args, argidx);
7487     }
7488     PyErr_SetString(PyExc_TypeError,
7489                     "not enough arguments for format string");
7490     return NULL;
7491 }
7492
/* Bit flags recording which flag characters followed '%' in a format
   specifier; PyUnicode_Format() sets them while scanning the format. */
#define F_LJUST (1<<0)  /* '-' seen (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' seen */
#define F_BLANK (1<<2)  /* ' ' seen */
#define F_ALT   (1<<3)  /* '#' seen */
#define F_ZERO  (1<<4)  /* '0' seen */
7498
7499 static Py_ssize_t
7500 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7501 {
7502     register Py_ssize_t i;
7503     Py_ssize_t len = strlen(charbuffer);
7504     for (i = len - 1; i >= 0; i--)
7505         buffer[i] = (Py_UNICODE) charbuffer[i];
7506
7507     return len;
7508 }
7509
7510 static int
7511 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7512 {
7513     Py_ssize_t result;
7514
7515     PyOS_ascii_formatd((char *)buffer, len, format, x);
7516     result = strtounicode(buffer, (char *)buffer);
7517     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7518 }
7519
7520 static int
7521 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7522 {
7523     Py_ssize_t result;
7524
7525     PyOS_snprintf((char *)buffer, len, format, x);
7526     result = strtounicode(buffer, (char *)buffer);
7527     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7528 }
7529
7530 /* XXX To save some code duplication, formatfloat/long/int could have been
7531    shared with stringobject.c, converting from 8-bit to Unicode after the
7532    formatting is done. */
7533
7534 static int
7535 formatfloat(Py_UNICODE *buf,
7536             size_t buflen,
7537             int flags,
7538             int prec,
7539             int type,
7540             PyObject *v)
7541 {
7542     /* fmt = '%#.' + `prec` + `type`
7543        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7544     char fmt[20];
7545     double x;
7546
7547     x = PyFloat_AsDouble(v);
7548     if (x == -1.0 && PyErr_Occurred())
7549         return -1;
7550     if (prec < 0)
7551         prec = 6;
7552     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7553         type = 'g';
7554     /* Worst case length calc to ensure no buffer overrun:
7555
7556        'g' formats:
7557          fmt = %#.<prec>g
7558          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7559             for any double rep.)
7560          len = 1 + prec + 1 + 2 + 5 = 9 + prec
7561
7562        'f' formats:
7563          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7564          len = 1 + 50 + 1 + prec = 52 + prec
7565
7566        If prec=0 the effective precision is 1 (the leading digit is
7567        always given), therefore increase the length by one.
7568
7569     */
7570     if (((type == 'g' || type == 'G') &&
7571           buflen <= (size_t)10 + (size_t)prec) ||
7572         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7573         PyErr_SetString(PyExc_OverflowError,
7574                         "formatted float is too long (precision too large?)");
7575         return -1;
7576     }
7577     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7578                   (flags&F_ALT) ? "#" : "",
7579                   prec, type);
7580     return doubletounicode(buf, buflen, fmt, x);
7581 }
7582
7583 static PyObject*
7584 formatlong(PyObject *val, int flags, int prec, int type)
7585 {
7586         char *buf;
7587         int i, len;
7588         PyObject *str; /* temporary string object. */
7589         PyUnicodeObject *result;
7590
7591         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7592         if (!str)
7593                 return NULL;
7594         result = _PyUnicode_New(len);
7595         if (!result) {
7596                 Py_DECREF(str);
7597                 return NULL;
7598         }
7599         for (i = 0; i < len; i++)
7600                 result->str[i] = buf[i];
7601         result->str[len] = 0;
7602         Py_DECREF(str);
7603         return (PyObject*)result;
7604 }
7605
7606 static int
7607 formatint(Py_UNICODE *buf,
7608           size_t buflen,
7609           int flags,
7610           int prec,
7611           int type,
7612           PyObject *v)
7613 {
7614     /* fmt = '%#.' + `prec` + 'l' + `type`
7615      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7616      *                     + 1 + 1
7617      *                   = 24
7618      */
7619     char fmt[64]; /* plenty big enough! */
7620     char *sign;
7621     long x;
7622
7623     x = PyInt_AsLong(v);
7624     if (x == -1 && PyErr_Occurred())
7625         return -1;
7626     if (x < 0 && type == 'u') {
7627         type = 'd';
7628     }
7629     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7630         sign = "-";
7631     else
7632         sign = "";
7633     if (prec < 0)
7634         prec = 1;
7635
7636     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7637      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7638      */
7639     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7640         PyErr_SetString(PyExc_OverflowError,
7641                 "formatted integer is too long (precision too large?)");
7642         return -1;
7643     }
7644
7645     if ((flags & F_ALT) &&
7646         (type == 'x' || type == 'X')) {
7647         /* When converting under %#x or %#X, there are a number
7648          * of issues that cause pain:
7649          * - when 0 is being converted, the C standard leaves off
7650          *   the '0x' or '0X', which is inconsistent with other
7651          *   %#x/%#X conversions and inconsistent with Python's
7652          *   hex() function
7653          * - there are platforms that violate the standard and
7654          *   convert 0 with the '0x' or '0X'
7655          *   (Metrowerks, Compaq Tru64)
7656          * - there are platforms that give '0x' when converting
7657          *   under %#X, but convert 0 in accordance with the
7658          *   standard (OS/2 EMX)
7659          *
7660          * We can achieve the desired consistency by inserting our
7661          * own '0x' or '0X' prefix, and substituting %x/%X in place
7662          * of %#x/%#X.
7663          *
7664          * Note that this is the same approach as used in
7665          * formatint() in stringobject.c
7666          */
7667         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7668                       sign, type, prec, type);
7669     }
7670     else {
7671         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7672                       sign, (flags&F_ALT) ? "#" : "",
7673                       prec, type);
7674     }
7675     if (sign[0])
7676         return longtounicode(buf, buflen, fmt, -x);
7677     else
7678         return longtounicode(buf, buflen, fmt, x);
7679 }
7680
7681 static int
7682 formatchar(Py_UNICODE *buf,
7683            size_t buflen,
7684            PyObject *v)
7685 {
7686     /* presume that the buffer is at least 2 characters long */
7687     if (PyUnicode_Check(v)) {
7688         if (PyUnicode_GET_SIZE(v) != 1)
7689             goto onError;
7690         buf[0] = PyUnicode_AS_UNICODE(v)[0];
7691     }
7692
7693     else if (PyString_Check(v)) {
7694         if (PyString_GET_SIZE(v) != 1)
7695             goto onError;
7696         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7697     }
7698
7699     else {
7700         /* Integer input truncated to a character */
7701         long x;
7702         x = PyInt_AsLong(v);
7703         if (x == -1 && PyErr_Occurred())
7704             goto onError;
7705 #ifdef Py_UNICODE_WIDE
7706         if (x < 0 || x > 0x10ffff) {
7707             PyErr_SetString(PyExc_OverflowError,
7708                             "%c arg not in range(0x110000) "
7709                             "(wide Python build)");
7710             return -1;
7711         }
7712 #else
7713         if (x < 0 || x > 0xffff) {
7714             PyErr_SetString(PyExc_OverflowError,
7715                             "%c arg not in range(0x10000) "
7716                             "(narrow Python build)");
7717             return -1;
7718         }
7719 #endif
7720         buf[0] = (Py_UNICODE) x;
7721     }
7722     buf[1] = '\0';
7723     return 1;
7724
7725  onError:
7726     PyErr_SetString(PyExc_TypeError,
7727                     "%c requires int or char");
7728     return -1;
7729 }
7730
7731 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7732
7733    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7734    chars are formatted. XXX This is a magic number. Each formatting
7735    routine does bounds checking to ensure no overflow, but a better
7736    solution may be to malloc a buffer of appropriate size for each
7737    format. For now, the current solution is sufficient.
7738 */
7739 #define FORMATBUFLEN (size_t)120
7740
7741 PyObject *PyUnicode_Format(PyObject *format,
7742                            PyObject *args)
7743 {
7744     Py_UNICODE *fmt, *res;
7745     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7746     int args_owned = 0;
7747     PyUnicodeObject *result = NULL;
7748     PyObject *dict = NULL;
7749     PyObject *uformat;
7750
7751     if (format == NULL || args == NULL) {
7752         PyErr_BadInternalCall();
7753         return NULL;
7754     }
7755     uformat = PyUnicode_FromObject(format);
7756     if (uformat == NULL)
7757         return NULL;
7758     fmt = PyUnicode_AS_UNICODE(uformat);
7759     fmtcnt = PyUnicode_GET_SIZE(uformat);
7760
7761     reslen = rescnt = fmtcnt + 100;
7762     result = _PyUnicode_New(reslen);
7763     if (result == NULL)
7764         goto onError;
7765     res = PyUnicode_AS_UNICODE(result);
7766
7767     if (PyTuple_Check(args)) {
7768         arglen = PyTuple_Size(args);
7769         argidx = 0;
7770     }
7771     else {
7772         arglen = -1;
7773         argidx = -2;
7774     }
7775     if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
7776         !PyObject_TypeCheck(args, &PyBaseString_Type))
7777         dict = args;
7778
7779     while (--fmtcnt >= 0) {
7780         if (*fmt != '%') {
7781             if (--rescnt < 0) {
7782                 rescnt = fmtcnt + 100;
7783                 reslen += rescnt;
7784                 if (_PyUnicode_Resize(&result, reslen) < 0)
7785                     goto onError;
7786                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7787                 --rescnt;
7788             }
7789             *res++ = *fmt++;
7790         }
7791         else {
7792             /* Got a format specifier */
7793             int flags = 0;
7794             Py_ssize_t width = -1;
7795             int prec = -1;
7796             Py_UNICODE c = '\0';
7797             Py_UNICODE fill;
7798             PyObject *v = NULL;
7799             PyObject *temp = NULL;
7800             Py_UNICODE *pbuf;
7801             Py_UNICODE sign;
7802             Py_ssize_t len;
7803             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7804
7805             fmt++;
7806             if (*fmt == '(') {
7807                 Py_UNICODE *keystart;
7808                 Py_ssize_t keylen;
7809                 PyObject *key;
7810                 int pcount = 1;
7811
7812                 if (dict == NULL) {
7813                     PyErr_SetString(PyExc_TypeError,
7814                                     "format requires a mapping");
7815                     goto onError;
7816                 }
7817                 ++fmt;
7818                 --fmtcnt;
7819                 keystart = fmt;
7820                 /* Skip over balanced parentheses */
7821                 while (pcount > 0 && --fmtcnt >= 0) {
7822                     if (*fmt == ')')
7823                         --pcount;
7824                     else if (*fmt == '(')
7825                         ++pcount;
7826                     fmt++;
7827                 }
7828                 keylen = fmt - keystart - 1;
7829                 if (fmtcnt < 0 || pcount > 0) {
7830                     PyErr_SetString(PyExc_ValueError,
7831                                     "incomplete format key");
7832                     goto onError;
7833                 }
7834 #if 0
7835                 /* keys are converted to strings using UTF-8 and
7836                    then looked up since Python uses strings to hold
7837                    variables names etc. in its namespaces and we
7838                    wouldn't want to break common idioms. */
7839                 key = PyUnicode_EncodeUTF8(keystart,
7840                                            keylen,
7841                                            NULL);
7842 #else
7843                 key = PyUnicode_FromUnicode(keystart, keylen);
7844 #endif
7845                 if (key == NULL)
7846                     goto onError;
7847                 if (args_owned) {
7848                     Py_DECREF(args);
7849                     args_owned = 0;
7850                 }
7851                 args = PyObject_GetItem(dict, key);
7852                 Py_DECREF(key);
7853                 if (args == NULL) {
7854                     goto onError;
7855                 }
7856                 args_owned = 1;
7857                 arglen = -1;
7858                 argidx = -2;
7859             }
7860             while (--fmtcnt >= 0) {
7861                 switch (c = *fmt++) {
7862                 case '-': flags |= F_LJUST; continue;
7863                 case '+': flags |= F_SIGN; continue;
7864                 case ' ': flags |= F_BLANK; continue;
7865                 case '#': flags |= F_ALT; continue;
7866                 case '0': flags |= F_ZERO; continue;
7867                 }
7868                 break;
7869             }
7870             if (c == '*') {
7871                 v = getnextarg(args, arglen, &argidx);
7872                 if (v == NULL)
7873                     goto onError;
7874                 if (!PyInt_Check(v)) {
7875                     PyErr_SetString(PyExc_TypeError,
7876                                     "* wants int");
7877                     goto onError;
7878                 }
7879                 width = PyInt_AsLong(v);
7880                 if (width < 0) {
7881                     flags |= F_LJUST;
7882                     width = -width;
7883                 }
7884                 if (--fmtcnt >= 0)
7885                     c = *fmt++;
7886             }
7887             else if (c >= '0' && c <= '9') {
7888                 width = c - '0';
7889                 while (--fmtcnt >= 0) {
7890                     c = *fmt++;
7891                     if (c < '0' || c > '9')
7892                         break;
7893                     if ((width*10) / 10 != width) {
7894                         PyErr_SetString(PyExc_ValueError,
7895                                         "width too big");
7896                         goto onError;
7897                     }
7898                     width = width*10 + (c - '0');
7899                 }
7900             }
7901             if (c == '.') {
7902                 prec = 0;
7903                 if (--fmtcnt >= 0)
7904                     c = *fmt++;
7905                 if (c == '*') {
7906                     v = getnextarg(args, arglen, &argidx);
7907                     if (v == NULL)
7908                         goto onError;
7909                     if (!PyInt_Check(v)) {
7910                         PyErr_SetString(PyExc_TypeError,
7911                                         "* wants int");
7912                         goto onError;
7913                     }
7914                     prec = PyInt_AsLong(v);
7915                     if (prec < 0)
7916                         prec = 0;
7917                     if (--fmtcnt >= 0)
7918                         c = *fmt++;
7919                 }
7920                 else if (c >= '0' && c <= '9') {
7921                     prec = c - '0';
7922                     while (--fmtcnt >= 0) {
7923                         c = Py_CHARMASK(*fmt++);
7924                         if (c < '0' || c > '9')
7925                             break;
7926                         if ((prec*10) / 10 != prec) {
7927                             PyErr_SetString(PyExc_ValueError,
7928                                             "prec too big");
7929                             goto onError;
7930                         }
7931                         prec = prec*10 + (c - '0');
7932                     }
7933                 }
7934             } /* prec */
7935             if (fmtcnt >= 0) {
7936                 if (c == 'h' || c == 'l' || c == 'L') {
7937                     if (--fmtcnt >= 0)
7938                         c = *fmt++;
7939                 }
7940             }
7941             if (fmtcnt < 0) {
7942                 PyErr_SetString(PyExc_ValueError,
7943                                 "incomplete format");
7944                 goto onError;
7945             }
7946             if (c != '%') {
7947                 v = getnextarg(args, arglen, &argidx);
7948                 if (v == NULL)
7949                     goto onError;
7950             }
7951             sign = 0;
7952             fill = ' ';
7953             switch (c) {
7954
7955             case '%':
7956                 pbuf = formatbuf;
7957                 /* presume that buffer length is at least 1 */
7958                 pbuf[0] = '%';
7959                 len = 1;
7960                 break;
7961
7962             case 's':
7963             case 'r':
7964                 if (PyUnicode_Check(v) && c == 's') {
7965                     temp = v;
7966                     Py_INCREF(temp);
7967                 }
7968                 else {
7969                     PyObject *unicode;
7970                     if (c == 's')
7971                         temp = PyObject_Unicode(v);
7972                     else
7973                         temp = PyObject_Repr(v);
7974                     if (temp == NULL)
7975                         goto onError;
7976                     if (PyUnicode_Check(temp))
7977                         /* nothing to do */;
7978                     else if (PyString_Check(temp)) {
7979                         /* convert to string to Unicode */
7980                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7981                                                    PyString_GET_SIZE(temp),
7982                                                    NULL,
7983                                                    "strict");
7984                         Py_DECREF(temp);
7985                         temp = unicode;
7986                         if (temp == NULL)
7987                             goto onError;
7988                     }
7989                     else {
7990                         Py_DECREF(temp);
7991                         PyErr_SetString(PyExc_TypeError,
7992                                         "%s argument has non-string str()");
7993                         goto onError;
7994                     }
7995                 }
7996                 pbuf = PyUnicode_AS_UNICODE(temp);
7997                 len = PyUnicode_GET_SIZE(temp);
7998                 if (prec >= 0 && len > prec)
7999                     len = prec;
8000                 break;
8001
8002             case 'i':
8003             case 'd':
8004             case 'u':
8005             case 'o':
8006             case 'x':
8007             case 'X':
8008                 if (c == 'i')
8009                     c = 'd';
8010                 if (PyLong_Check(v)) {
8011                     temp = formatlong(v, flags, prec, c);
8012                     if (!temp)
8013                         goto onError;
8014                     pbuf = PyUnicode_AS_UNICODE(temp);
8015                     len = PyUnicode_GET_SIZE(temp);
8016                     sign = 1;
8017                 }
8018                 else {
8019                     pbuf = formatbuf;
8020                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8021                                     flags, prec, c, v);
8022                     if (len < 0)
8023                         goto onError;
8024                     sign = 1;
8025                 }
8026                 if (flags & F_ZERO)
8027                     fill = '0';
8028                 break;
8029
8030             case 'e':
8031             case 'E':
8032             case 'f':
8033             case 'F':
8034             case 'g':
8035             case 'G':
8036                 if (c == 'F')
8037                         c = 'f';
8038                 pbuf = formatbuf;
8039                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8040                         flags, prec, c, v);
8041                 if (len < 0)
8042                     goto onError;
8043                 sign = 1;
8044                 if (flags & F_ZERO)
8045                     fill = '0';
8046                 break;
8047
8048             case 'c':
8049                 pbuf = formatbuf;
8050                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8051                 if (len < 0)
8052                     goto onError;
8053                 break;
8054
8055             default:
8056                 PyErr_Format(PyExc_ValueError,
8057                              "unsupported format character '%c' (0x%x) "
8058                              "at index %zd",
8059                              (31<=c && c<=126) ? (char)c : '?',
8060                              (int)c,
8061                              (Py_ssize_t)(fmt - 1 -
8062                                           PyUnicode_AS_UNICODE(uformat)));
8063                 goto onError;
8064             }
8065             if (sign) {
8066                 if (*pbuf == '-' || *pbuf == '+') {
8067                     sign = *pbuf++;
8068                     len--;
8069                 }
8070                 else if (flags & F_SIGN)
8071                     sign = '+';
8072                 else if (flags & F_BLANK)
8073                     sign = ' ';
8074                 else
8075                     sign = 0;
8076             }
8077             if (width < len)
8078                 width = len;
8079             if (rescnt - (sign != 0) < width) {
8080                 reslen -= rescnt;
8081                 rescnt = width + fmtcnt + 100;
8082                 reslen += rescnt;
8083                 if (reslen < 0) {
8084                     Py_XDECREF(temp);
8085                     PyErr_NoMemory();
8086                     goto onError;
8087                 }
8088                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8089                     Py_XDECREF(temp);
8090                     goto onError;
8091                 }
8092                 res = PyUnicode_AS_UNICODE(result)
8093                     + reslen - rescnt;
8094             }
8095             if (sign) {
8096                 if (fill != ' ')
8097                     *res++ = sign;
8098                 rescnt--;
8099                 if (width > len)
8100                     width--;
8101             }
8102             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8103                 assert(pbuf[0] == '0');
8104                 assert(pbuf[1] == c);
8105                 if (fill != ' ') {
8106                     *res++ = *pbuf++;
8107                     *res++ = *pbuf++;
8108                 }
8109                 rescnt -= 2;
8110                 width -= 2;
8111                 if (width < 0)
8112                     width = 0;
8113                 len -= 2;
8114             }
8115             if (width > len && !(flags & F_LJUST)) {
8116                 do {
8117                     --rescnt;
8118                     *res++ = fill;
8119                 } while (--width > len);
8120             }
8121             if (fill == ' ') {
8122                 if (sign)
8123                     *res++ = sign;
8124                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8125                     assert(pbuf[0] == '0');
8126                     assert(pbuf[1] == c);
8127                     *res++ = *pbuf++;
8128                     *res++ = *pbuf++;
8129                 }
8130             }
8131             Py_UNICODE_COPY(res, pbuf, len);
8132             res += len;
8133             rescnt -= len;
8134             while (--width >= len) {
8135                 --rescnt;
8136                 *res++ = ' ';
8137             }
8138             if (dict && (argidx < arglen) && c != '%') {
8139                 PyErr_SetString(PyExc_TypeError,
8140                                 "not all arguments converted during string formatting");
8141                 Py_XDECREF(temp);
8142                 goto onError;
8143             }
8144             Py_XDECREF(temp);
8145         } /* '%' */
8146     } /* until end */
8147     if (argidx < arglen && !dict) {
8148         PyErr_SetString(PyExc_TypeError,
8149                         "not all arguments converted during string formatting");
8150         goto onError;
8151     }
8152
8153     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8154         goto onError;
8155     if (args_owned) {
8156         Py_DECREF(args);
8157     }
8158     Py_DECREF(uformat);
8159     return (PyObject *)result;
8160
8161  onError:
8162     Py_XDECREF(result);
8163     Py_DECREF(uformat);
8164     if (args_owned) {
8165         Py_DECREF(args);
8166     }
8167     return NULL;
8168 }
8169
8170 static PyBufferProcs unicode_as_buffer = {
8171     (readbufferproc) unicode_buffer_getreadbuf,
8172     (writebufferproc) unicode_buffer_getwritebuf,
8173     (segcountproc) unicode_buffer_getsegcount,
8174     (charbufferproc) unicode_buffer_getcharbuf,
8175 };
8176
8177 static PyObject *
8178 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8179
8180 static PyObject *
8181 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8182 {
8183         PyObject *x = NULL;
8184         static char *kwlist[] = {"string", "encoding", "errors", 0};
8185         char *encoding = NULL;
8186         char *errors = NULL;
8187
8188         if (type != &PyUnicode_Type)
8189                 return unicode_subtype_new(type, args, kwds);
8190         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8191                                           kwlist, &x, &encoding, &errors))
8192             return NULL;
8193         if (x == NULL)
8194                 return (PyObject *)_PyUnicode_New(0);
8195         if (encoding == NULL && errors == NULL)
8196             return PyObject_Unicode(x);
8197         else
8198         return PyUnicode_FromEncodedObject(x, encoding, errors);
8199 }
8200
8201 static PyObject *
8202 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8203 {
8204         PyUnicodeObject *tmp, *pnew;
8205         Py_ssize_t n;
8206
8207         assert(PyType_IsSubtype(type, &PyUnicode_Type));
8208         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8209         if (tmp == NULL)
8210                 return NULL;
8211         assert(PyUnicode_Check(tmp));
8212         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8213         if (pnew == NULL) {
8214                 Py_DECREF(tmp);
8215                 return NULL;
8216         }
8217         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8218         if (pnew->str == NULL) {
8219                 _Py_ForgetReference((PyObject *)pnew);
8220                 PyObject_Del(pnew);
8221                 Py_DECREF(tmp);
8222                 return PyErr_NoMemory();
8223         }
8224         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8225         pnew->length = n;
8226         pnew->hash = tmp->hash;
8227         Py_DECREF(tmp);
8228         return (PyObject *)pnew;
8229 }
8230
8231 PyDoc_STRVAR(unicode_doc,
8232 "unicode(string [, encoding[, errors]]) -> object\n\
8233 \n\
8234 Create a new Unicode object from the given encoded string.\n\
8235 encoding defaults to the current default string encoding.\n\
8236 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8237
8238 PyTypeObject PyUnicode_Type = {
8239     PyVarObject_HEAD_INIT(&PyType_Type, 0)
8240     "unicode",                          /* tp_name */
8241     sizeof(PyUnicodeObject),            /* tp_size */
8242     0,                                  /* tp_itemsize */
8243     /* Slots */
8244     (destructor)unicode_dealloc,        /* tp_dealloc */
8245     0,                                  /* tp_print */
8246     0,                                  /* tp_getattr */
8247     0,                                  /* tp_setattr */
8248     0,                                  /* tp_compare */
8249     unicode_repr,                       /* tp_repr */
8250     &unicode_as_number,                 /* tp_as_number */
8251     &unicode_as_sequence,               /* tp_as_sequence */
8252     &unicode_as_mapping,                /* tp_as_mapping */
8253     (hashfunc) unicode_hash,            /* tp_hash*/
8254     0,                                  /* tp_call*/
8255     (reprfunc) unicode_str,             /* tp_str */
8256     PyObject_GenericGetAttr,            /* tp_getattro */
8257     0,                                  /* tp_setattro */
8258     &unicode_as_buffer,                 /* tp_as_buffer */
8259     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8260             Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
8261     unicode_doc,                        /* tp_doc */
8262     0,                                  /* tp_traverse */
8263     0,                                  /* tp_clear */
8264     PyUnicode_RichCompare,              /* tp_richcompare */
8265     0,                                  /* tp_weaklistoffset */
8266     0,                                  /* tp_iter */
8267     0,                                  /* tp_iternext */
8268     unicode_methods,                    /* tp_methods */
8269     0,                                  /* tp_members */
8270     0,                                  /* tp_getset */
8271     &PyBaseString_Type,                 /* tp_base */
8272     0,                                  /* tp_dict */
8273     0,                                  /* tp_descr_get */
8274     0,                                  /* tp_descr_set */
8275     0,                                  /* tp_dictoffset */
8276     0,                                  /* tp_init */
8277     0,                                  /* tp_alloc */
8278     unicode_new,                        /* tp_new */
8279     PyObject_Del,               /* tp_free */
8280 };
8281
8282 /* Initialize the Unicode implementation */
8283
8284 void _PyUnicode_Init(void)
8285 {
8286     int i;
8287
8288     /* XXX - move this array to unicodectype.c ? */
8289     Py_UNICODE linebreak[] = {
8290         0x000A, /* LINE FEED */
8291         0x000D, /* CARRIAGE RETURN */
8292         0x001C, /* FILE SEPARATOR */
8293         0x001D, /* GROUP SEPARATOR */
8294         0x001E, /* RECORD SEPARATOR */
8295         0x0085, /* NEXT LINE */
8296         0x2028, /* LINE SEPARATOR */
8297         0x2029, /* PARAGRAPH SEPARATOR */
8298     };
8299
8300     /* Init the implementation */
8301     unicode_freelist = NULL;
8302     unicode_freelist_size = 0;
8303     unicode_empty = _PyUnicode_New(0);
8304     if (!unicode_empty)
8305         return;
8306
8307     strcpy(unicode_default_encoding, "ascii");
8308     for (i = 0; i < 256; i++)
8309         unicode_latin1[i] = NULL;
8310     if (PyType_Ready(&PyUnicode_Type) < 0)
8311         Py_FatalError("Can't initialize 'unicode'");
8312
8313     /* initialize the linebreak bloom filter */
8314     bloom_linebreak = make_bloom_mask(
8315         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8316         );
8317
8318     PyType_Ready(&EncodingMapType);
8319 }
8320
8321 /* Finalize the Unicode implementation */
8322
8323 void
8324 _PyUnicode_Fini(void)
8325 {
8326     PyUnicodeObject *u;
8327     int i;
8328
8329     Py_XDECREF(unicode_empty);
8330     unicode_empty = NULL;
8331
8332     for (i = 0; i < 256; i++) {
8333         if (unicode_latin1[i]) {
8334             Py_DECREF(unicode_latin1[i]);
8335             unicode_latin1[i] = NULL;
8336         }
8337     }
8338
8339     for (u = unicode_freelist; u != NULL;) {
8340         PyUnicodeObject *v = u;
8341         u = *(PyUnicodeObject **)u;
8342         if (v->str)
8343             PyMem_DEL(v->str);
8344         Py_XDECREF(v->defenc);
8345         PyObject_Del(v);
8346     }
8347     unicode_freelist = NULL;
8348     unicode_freelist_size = 0;
8349 }
8350
8351 #ifdef __cplusplus
8352 }
8353 #endif
8354
8355
8356 /*
8357 Local variables:
8358 c-basic-offset: 4
8359 indent-tabs-mode: nil
8360 End:
8361 */