Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15     Copyright (c) 1999 by Secret Labs AB
  16     Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define MAX_UNICODE_FREELIST_SIZE       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *unicode_freelist;
  97 static int unicode_freelist_size;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 Py_UNICODE
 116 PyUnicode_GetMax(void)
 117 {
 118 #ifdef Py_UNICODE_WIDE
 119         return 0x10FFFF;
 120 #else
 121         /* This is actually an illegal character, so it should
 122            not be passed to unichr. */
 123         return 0xFFFF;
 124 #endif
 125 }
 126
 127 /* --- Bloom Filters ----------------------------------------------------- */
 128
 129 /* stuff to implement simple "bloom filters" for Unicode characters.
 130    to keep things simple, we use a single bitmask, using the least 5
 131    bits from each unicode characters as the bit index. */
 132
 133 /* the linebreak mask is set up by Unicode_Init below */
 134
 135 #define BLOOM_MASK unsigned long
 136
 137 static BLOOM_MASK bloom_linebreak;
 138
 139 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 140
 141 #define BLOOM_LINEBREAK(ch)\
 142     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
 143
 144 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 145 {
 146     /* calculate simple bloom-style bitmask for a given unicode string */
 147
 148     long mask;
 149     Py_ssize_t i;
 150
 151     mask = 0;
 152     for (i = 0; i < len; i++)
 153         mask |= (1 << (ptr[i] & 0x1F));
 154
 155     return mask;
 156 }
 157
 158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 159 {
 160     Py_ssize_t i;
 161
 162     for (i = 0; i < setlen; i++)
 163         if (set[i] == chr)
 164             return 1;
 165
 166     return 0;
 167 }
 168
 169 #define BLOOM_MEMBER(mask, chr, set, setlen)\
 170     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 171
 172 /* --- Unicode Object ----------------------------------------------------- */
 173
 174 static
 175 int unicode_resize(register PyUnicodeObject *unicode,
 176                       Py_ssize_t length)
 177 {
 178     void *oldstr;
 179
 180     /* Shortcut if there's nothing much to do. */
 181     if (unicode->length == length)
 182         goto reset;
 183
 184     /* Resizing shared object (unicode_empty or single character
 185        objects) in-place is not allowed. Use PyUnicode_Resize()
 186        instead ! */
 187
 188     if (unicode == unicode_empty ||
 189         (unicode->length == 1 &&
 190          unicode->str[0] < 256U &&
 191          unicode_latin1[unicode->str[0]] == unicode)) {
 192         PyErr_SetString(PyExc_SystemError,
 193                         "can't resize shared unicode objects");
 194         return -1;
 195     }
 196
 197     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 198        The overallocation is also used by fastsearch, which assumes that it's
 199        safe to look at str[length] (without making any assumptions about what
 200        it contains). */
 201
 202     oldstr = unicode->str;
 203     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 204     if (!unicode->str) {
 205         unicode->str = (Py_UNICODE *)oldstr;
 206         PyErr_NoMemory();
 207         return -1;
 208     }
 209     unicode->str[length] = 0;
 210     unicode->length = length;
 211
 212  reset:
 213     /* Reset the object caches */
 214     if (unicode->defenc) {
 215         Py_DECREF(unicode->defenc);
 216         unicode->defenc = NULL;
 217     }
 218     unicode->hash = -1;
 219
 220     return 0;
 221 }
 222
 223 /* We allocate one more byte to make sure the string is
 224    Ux0000 terminated -- XXX is this needed ?
 225
 226    XXX This allocator could further be enhanced by assuring that the
 227        free list never reduces its size below 1.
 228
 229 */
 230
 231 static
 232 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 233 {
 234     register PyUnicodeObject *unicode;
 235
 236     /* Optimization for empty strings */
 237     if (length == 0 && unicode_empty != NULL) {
 238         Py_INCREF(unicode_empty);
 239         return unicode_empty;
 240     }
 241
 242     /* Unicode freelist & memory allocation */
 243     if (unicode_freelist) {
 244         unicode = unicode_freelist;
 245         unicode_freelist = *(PyUnicodeObject **)unicode;
 246         unicode_freelist_size--;
 247         if (unicode->str) {
 248             /* Keep-Alive optimization: we only upsize the buffer,
 249                never downsize it. */
 250             if ((unicode->length < length) &&
 251                 unicode_resize(unicode, length) < 0) {
 252                 PyMem_DEL(unicode->str);
 253                 goto onError;
 254             }
 255         }
 256         else {
 257             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 258         }
 259         PyObject_INIT(unicode, &PyUnicode_Type);
 260     }
 261     else {
 262         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 263         if (unicode == NULL)
 264             return NULL;
 265         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 266     }
 267
 268     if (!unicode->str) {
 269         PyErr_NoMemory();
 270         goto onError;
 271     }
 272     /* Initialize the first element to guard against cases where
 273      * the caller fails before initializing str -- unicode_resize()
 274      * reads str[0], and the Keep-Alive optimization can keep memory
 275      * allocated for str alive across a call to unicode_dealloc(unicode).
 276      * We don't want unicode_resize to read uninitialized memory in
 277      * that case.
 278      */
 279     unicode->str[0] = 0;
 280     unicode->str[length] = 0;
 281     unicode->length = length;
 282     unicode->hash = -1;
 283     unicode->defenc = NULL;
 284     return unicode;
 285
 286  onError:
 287     _Py_ForgetReference((PyObject *)unicode);
 288     PyObject_Del(unicode);
 289     return NULL;
 290 }
 291
 292 static
 293 void unicode_dealloc(register PyUnicodeObject *unicode)
 294 {
 295     if (PyUnicode_CheckExact(unicode) &&
 296         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 297         /* Keep-Alive optimization */
 298         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 299             PyMem_DEL(unicode->str);
 300             unicode->str = NULL;
 301             unicode->length = 0;
 302         }
 303         if (unicode->defenc) {
 304             Py_DECREF(unicode->defenc);
 305             unicode->defenc = NULL;
 306         }
 307         /* Add to free list */
 308         *(PyUnicodeObject **)unicode = unicode_freelist;
 309         unicode_freelist = unicode;
 310         unicode_freelist_size++;
 311     }
 312     else {
 313         PyMem_DEL(unicode->str);
 314         Py_XDECREF(unicode->defenc);
 315         unicode->ob_type->tp_free((PyObject *)unicode);
 316     }
 317 }
 318
 319 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 320 {
 321     register PyUnicodeObject *v;
 322
 323     /* Argument checks */
 324     if (unicode == NULL) {
 325         PyErr_BadInternalCall();
 326         return -1;
 327     }
 328     v = (PyUnicodeObject *)*unicode;
 329     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
 330         PyErr_BadInternalCall();
 331         return -1;
 332     }
 333
 334     /* Resizing unicode_empty and single character objects is not
 335        possible since these are being shared. We simply return a fresh
 336        copy with the same Unicode content. */
 337     if (v->length != length &&
 338         (v == unicode_empty || v->length == 1)) {
 339         PyUnicodeObject *w = _PyUnicode_New(length);
 340         if (w == NULL)
 341             return -1;
 342         Py_UNICODE_COPY(w->str, v->str,
 343                         length < v->length ? length : v->length);
 344         Py_DECREF(*unicode);
 345         *unicode = (PyObject *)w;
 346         return 0;
 347     }
 348
 349     /* Note that we don't have to modify *unicode for unshared Unicode
 350        objects, since we can modify them in-place. */
 351     return unicode_resize(v, length);
 352 }
 353
 354 /* Internal API for use in unicodeobject.c only ! */
 355 #define _PyUnicode_Resize(unicodevar, length) \
 356         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 357
 358 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 359                                 Py_ssize_t size)
 360 {
 361     PyUnicodeObject *unicode;
 362
 363     /* If the Unicode data is known at construction time, we can apply
 364        some optimizations which share commonly used objects. */
 365     if (u != NULL) {
 366
 367         /* Optimization for empty strings */
 368         if (size == 0 && unicode_empty != NULL) {
 369             Py_INCREF(unicode_empty);
 370             return (PyObject *)unicode_empty;
 371         }
 372
 373         /* Single character Unicode objects in the Latin-1 range are
 374            shared when using this constructor */
 375         if (size == 1 && *u < 256) {
 376             unicode = unicode_latin1[*u];
 377             if (!unicode) {
 378                 unicode = _PyUnicode_New(1);
 379                 if (!unicode)
 380                     return NULL;
 381                 unicode->str[0] = *u;
 382                 unicode_latin1[*u] = unicode;
 383             }
 384             Py_INCREF(unicode);
 385             return (PyObject *)unicode;
 386         }
 387     }
 388
 389     unicode = _PyUnicode_New(size);
 390     if (!unicode)
 391         return NULL;
 392
 393     /* Copy the Unicode data into the new object */
 394     if (u != NULL)
 395         Py_UNICODE_COPY(unicode->str, u, size);
 396
 397     return (PyObject *)unicode;
 398 }
 399
 400 #ifdef HAVE_WCHAR_H
 401
 402 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 403                                  Py_ssize_t size)
 404 {
 405     PyUnicodeObject *unicode;
 406
 407     if (w == NULL) {
 408         PyErr_BadInternalCall();
 409         return NULL;
 410     }
 411
 412     unicode = _PyUnicode_New(size);
 413     if (!unicode)
 414         return NULL;
 415
 416     /* Copy the wchar_t data into the new object */
 417 #ifdef HAVE_USABLE_WCHAR_T
 418     memcpy(unicode->str, w, size * sizeof(wchar_t));
 419 #else
 420     {
 421         register Py_UNICODE *u;
 422         register Py_ssize_t i;
 423         u = PyUnicode_AS_UNICODE(unicode);
 424         for (i = size; i > 0; i--)
 425             *u++ = *w++;
 426     }
 427 #endif
 428
 429     return (PyObject *)unicode;
 430 }
 431
 432 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 433                                 wchar_t *w,
 434                                 Py_ssize_t size)
 435 {
 436     if (unicode == NULL) {
 437         PyErr_BadInternalCall();
 438         return -1;
 439     }
 440
 441     /* If possible, try to copy the 0-termination as well */
 442     if (size > PyUnicode_GET_SIZE(unicode))
 443         size = PyUnicode_GET_SIZE(unicode) + 1;
 444
 445 #ifdef HAVE_USABLE_WCHAR_T
 446     memcpy(w, unicode->str, size * sizeof(wchar_t));
 447 #else
 448     {
 449         register Py_UNICODE *u;
 450         register Py_ssize_t i;
 451         u = PyUnicode_AS_UNICODE(unicode);
 452         for (i = size; i > 0; i--)
 453             *w++ = *u++;
 454     }
 455 #endif
 456
 457     if (size > PyUnicode_GET_SIZE(unicode))
 458         return PyUnicode_GET_SIZE(unicode);
 459     else
 460     return size;
 461 }
 462
 463 #endif
 464
 465 PyObject *PyUnicode_FromOrdinal(int ordinal)
 466 {
 467     Py_UNICODE s[1];
 468
 469 #ifdef Py_UNICODE_WIDE
 470     if (ordinal < 0 || ordinal > 0x10ffff) {
 471         PyErr_SetString(PyExc_ValueError,
 472                         "unichr() arg not in range(0x110000) "
 473                         "(wide Python build)");
 474         return NULL;
 475     }
 476 #else
 477     if (ordinal < 0 || ordinal > 0xffff) {
 478         PyErr_SetString(PyExc_ValueError,
 479                         "unichr() arg not in range(0x10000) "
 480                         "(narrow Python build)");
 481         return NULL;
 482     }
 483 #endif
 484
 485     s[0] = (Py_UNICODE)ordinal;
 486     return PyUnicode_FromUnicode(s, 1);
 487 }
 488
 489 PyObject *PyUnicode_FromObject(register PyObject *obj)
 490 {
 491     /* XXX Perhaps we should make this API an alias of
 492            PyObject_Unicode() instead ?! */
 493     if (PyUnicode_CheckExact(obj)) {
 494         Py_INCREF(obj);
 495         return obj;
 496     }
 497     if (PyUnicode_Check(obj)) {
 498         /* For a Unicode subtype that's not a Unicode object,
 499            return a true Unicode object with the same data. */
 500         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 501                                      PyUnicode_GET_SIZE(obj));
 502     }
 503     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 504 }
 505
 506 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 507                                       const char *encoding,
 508                                       const char *errors)
 509 {
 510     const char *s = NULL;
 511     Py_ssize_t len;
 512     PyObject *v;
 513
 514     if (obj == NULL) {
 515         PyErr_BadInternalCall();
 516         return NULL;
 517     }
 518
 519 #if 0
 520     /* For b/w compatibility we also accept Unicode objects provided
 521        that no encodings is given and then redirect to
 522        PyObject_Unicode() which then applies the additional logic for
 523        Unicode subclasses.
 524
 525        NOTE: This API should really only be used for object which
 526              represent *encoded* Unicode !
 527
 528     */
 529         if (PyUnicode_Check(obj)) {
 530             if (encoding) {
 531                 PyErr_SetString(PyExc_TypeError,
 532                                 "decoding Unicode is not supported");
 533             return NULL;
 534             }
 535         return PyObject_Unicode(obj);
 536             }
 537 #else
 538     if (PyUnicode_Check(obj)) {
 539         PyErr_SetString(PyExc_TypeError,
 540                         "decoding Unicode is not supported");
 541         return NULL;
 542         }
 543 #endif
 544
 545     /* Coerce object */
 546     if (PyString_Check(obj)) {
 547             s = PyString_AS_STRING(obj);
 548             len = PyString_GET_SIZE(obj);
 549             }
 550     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 551         /* Overwrite the error message with something more useful in
 552            case of a TypeError. */
 553         if (PyErr_ExceptionMatches(PyExc_TypeError))
 554         PyErr_Format(PyExc_TypeError,
 555                          "coercing to Unicode: need string or buffer, "
 556                          "%.80s found",
 557                      obj->ob_type->tp_name);
 558         goto onError;
 559     }
 560
 561     /* Convert to Unicode */
 562     if (len == 0) {
 563         Py_INCREF(unicode_empty);
 564         v = (PyObject *)unicode_empty;
 565     }
 566     else
 567         v = PyUnicode_Decode(s, len, encoding, errors);
 568
 569     return v;
 570
 571  onError:
 572     return NULL;
 573 }
 574
 575 PyObject *PyUnicode_Decode(const char *s,
 576                            Py_ssize_t size,
 577                            const char *encoding,
 578                            const char *errors)
 579 {
 580     PyObject *buffer = NULL, *unicode;
 581
 582     if (encoding == NULL)
 583         encoding = PyUnicode_GetDefaultEncoding();
 584
 585     /* Shortcuts for common default encodings */
 586     if (strcmp(encoding, "utf-8") == 0)
 587         return PyUnicode_DecodeUTF8(s, size, errors);
 588     else if (strcmp(encoding, "latin-1") == 0)
 589         return PyUnicode_DecodeLatin1(s, size, errors);
 590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 591     else if (strcmp(encoding, "mbcs") == 0)
 592         return PyUnicode_DecodeMBCS(s, size, errors);
 593 #endif
 594     else if (strcmp(encoding, "ascii") == 0)
 595         return PyUnicode_DecodeASCII(s, size, errors);
 596
 597     /* Decode via the codec registry */
 598     buffer = PyBuffer_FromMemory((void *)s, size);
 599     if (buffer == NULL)
 600         goto onError;
 601     unicode = PyCodec_Decode(buffer, encoding, errors);
 602     if (unicode == NULL)
 603         goto onError;
 604     if (!PyUnicode_Check(unicode)) {
 605         PyErr_Format(PyExc_TypeError,
 606                      "decoder did not return an unicode object (type=%.400s)",
 607                      unicode->ob_type->tp_name);
 608         Py_DECREF(unicode);
 609         goto onError;
 610     }
 611     Py_DECREF(buffer);
 612     return unicode;
 613
 614  onError:
 615     Py_XDECREF(buffer);
 616     return NULL;
 617 }
 618
 619 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
 620                                     const char *encoding,
 621                                     const char *errors)
 622 {
 623     PyObject *v;
 624
 625     if (!PyUnicode_Check(unicode)) {
 626         PyErr_BadArgument();
 627         goto onError;
 628     }
 629
 630     if (encoding == NULL)
 631         encoding = PyUnicode_GetDefaultEncoding();
 632
 633     /* Decode via the codec registry */
 634     v = PyCodec_Decode(unicode, encoding, errors);
 635     if (v == NULL)
 636         goto onError;
 637     return v;
 638
 639  onError:
 640     return NULL;
 641 }
 642
 643 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 644                            Py_ssize_t size,
 645                            const char *encoding,
 646                            const char *errors)
 647 {
 648     PyObject *v, *unicode;
 649
 650     unicode = PyUnicode_FromUnicode(s, size);
 651     if (unicode == NULL)
 652         return NULL;
 653     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 654     Py_DECREF(unicode);
 655     return v;
 656 }
 657
 658 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
 659                                     const char *encoding,
 660                                     const char *errors)
 661 {
 662     PyObject *v;
 663
 664     if (!PyUnicode_Check(unicode)) {
 665         PyErr_BadArgument();
 666         goto onError;
 667     }
 668
 669     if (encoding == NULL)
 670         encoding = PyUnicode_GetDefaultEncoding();
 671
 672     /* Encode via the codec registry */
 673     v = PyCodec_Encode(unicode, encoding, errors);
 674     if (v == NULL)
 675         goto onError;
 676     return v;
 677
 678  onError:
 679     return NULL;
 680 }
 681
 682 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 683                                     const char *encoding,
 684                                     const char *errors)
 685 {
 686     PyObject *v;
 687
 688     if (!PyUnicode_Check(unicode)) {
 689         PyErr_BadArgument();
 690         goto onError;
 691     }
 692
 693     if (encoding == NULL)
 694         encoding = PyUnicode_GetDefaultEncoding();
 695
 696     /* Shortcuts for common default encodings */
 697     if (errors == NULL) {
 698         if (strcmp(encoding, "utf-8") == 0)
 699             return PyUnicode_AsUTF8String(unicode);
 700         else if (strcmp(encoding, "latin-1") == 0)
 701             return PyUnicode_AsLatin1String(unicode);
 702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 703         else if (strcmp(encoding, "mbcs") == 0)
 704             return PyUnicode_AsMBCSString(unicode);
 705 #endif
 706         else if (strcmp(encoding, "ascii") == 0)
 707             return PyUnicode_AsASCIIString(unicode);
 708     }
 709
 710     /* Encode via the codec registry */
 711     v = PyCodec_Encode(unicode, encoding, errors);
 712     if (v == NULL)
 713         goto onError;
 714     if (!PyString_Check(v)) {
 715         PyErr_Format(PyExc_TypeError,
 716                      "encoder did not return a string object (type=%.400s)",
 717                      v->ob_type->tp_name);
 718         Py_DECREF(v);
 719         goto onError;
 720     }
 721     return v;
 722
 723  onError:
 724     return NULL;
 725 }
 726
 727 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 728                                             const char *errors)
 729 {
 730     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 731
 732     if (v)
 733         return v;
 734     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 735     if (v && errors == NULL)
 736         ((PyUnicodeObject *)unicode)->defenc = v;
 737     return v;
 738 }
 739
 740 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 741 {
 742     if (!PyUnicode_Check(unicode)) {
 743         PyErr_BadArgument();
 744         goto onError;
 745     }
 746     return PyUnicode_AS_UNICODE(unicode);
 747
 748  onError:
 749     return NULL;
 750 }
 751
 752 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
 753 {
 754     if (!PyUnicode_Check(unicode)) {
 755         PyErr_BadArgument();
 756         goto onError;
 757     }
 758     return PyUnicode_GET_SIZE(unicode);
 759
 760  onError:
 761     return -1;
 762 }
 763
 764 const char *PyUnicode_GetDefaultEncoding(void)
 765 {
 766     return unicode_default_encoding;
 767 }
 768
 769 int PyUnicode_SetDefaultEncoding(const char *encoding)
 770 {
 771     PyObject *v;
 772
 773     /* Make sure the encoding is valid. As side effect, this also
 774        loads the encoding into the codec registry cache. */
 775     v = _PyCodec_Lookup(encoding);
 776     if (v == NULL)
 777         goto onError;
 778     Py_DECREF(v);
 779     strncpy(unicode_default_encoding,
 780             encoding,
 781             sizeof(unicode_default_encoding));
 782     return 0;
 783
 784  onError:
 785     return -1;
 786 }
 787
 788 /* error handling callback helper:
 789    build arguments, call the callback and check the arguments,
 790    if no exception occurred, copy the replacement to the output
 791    and adjust various state variables.
 792    return 0 on success, -1 on error
 793 */
 794
 795 static
 796 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
 797                  const char *encoding, const char *reason,
 798                  const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
 799                  PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
 800 {
 801     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
 802
 803     PyObject *restuple = NULL;
 804     PyObject *repunicode = NULL;
 805     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
 806     Py_ssize_t requiredsize;
 807     Py_ssize_t newpos;
 808     Py_UNICODE *repptr;
 809     Py_ssize_t repsize;
 810     int res = -1;
 811
 812     if (*errorHandler == NULL) {
 813         *errorHandler = PyCodec_LookupError(errors);
 814         if (*errorHandler == NULL)
 815            goto onError;
 816     }
 817
 818     if (*exceptionObject == NULL) {
 819         *exceptionObject = PyUnicodeDecodeError_Create(
 820             encoding, input, insize, *startinpos, *endinpos, reason);
 821         if (*exceptionObject == NULL)
 822            goto onError;
 823     }
 824     else {
 825         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
 826             goto onError;
 827         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
 828             goto onError;
 829         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
 830             goto onError;
 831     }
 832
 833     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
 834     if (restuple == NULL)
 835         goto onError;
 836     if (!PyTuple_Check(restuple)) {
 837         PyErr_Format(PyExc_TypeError, &argparse[4]);
 838         goto onError;
 839     }
 840     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
 841         goto onError;
 842     if (newpos<0)
 843         newpos = insize+newpos;
 844     if (newpos<0 || newpos>insize) {
 845         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
 846         goto onError;
 847     }
 848
 849     /* need more space? (at least enough for what we
 850        have+the replacement+the rest of the string (starting
 851        at the new input position), so we won't have to check space
 852        when there are no errors in the rest of the string) */
 853     repptr = PyUnicode_AS_UNICODE(repunicode);
 854     repsize = PyUnicode_GET_SIZE(repunicode);
 855     requiredsize = *outpos + repsize + insize-newpos;
 856     if (requiredsize > outsize) {
 857         if (requiredsize<2*outsize)
 858             requiredsize = 2*outsize;
 859         if (PyUnicode_Resize(output, requiredsize) < 0)
 860             goto onError;
 861         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
 862     }
 863     *endinpos = newpos;
 864     *inptr = input + newpos;
 865     Py_UNICODE_COPY(*outptr, repptr, repsize);
 866     *outptr += repsize;
 867     *outpos += repsize;
 868     /* we made it! */
 869     res = 0;
 870
 871     onError:
 872     Py_XDECREF(restuple);
 873     return res;
 874 }
 875
 876 /* --- UTF-7 Codec -------------------------------------------------------- */
 877
 878 /* see RFC2152 for details */
 879
 880 static
 881 char utf7_special[128] = {
 882     /* indicate whether a UTF-7 character is special i.e. cannot be directly
 883        encoded:
 884            0 - not special
 885            1 - special
 886            2 - whitespace (optional)
 887            3 - RFC2152 Set O (optional) */
 888     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
 889     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 890     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
 891     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
 892     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 893     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
 894     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 895     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
 896
 897 };
 898
 899 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
 900    warnings about the comparison always being false; since
 901    utf7_special[0] is 1, we can safely make that one comparison
 902    true  */
 903
 904 #define SPECIAL(c, encodeO, encodeWS) \
 905     ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
 906      (encodeWS && (utf7_special[(c)] == 2)) || \
 907      (encodeO && (utf7_special[(c)] == 3)))
 908
 909 #define B64(n)  \
 910     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
 911 #define B64CHAR(c) \
 912     (isalnum(c) || (c) == '+' || (c) == '/')
 913 #define UB64(c) \
 914     ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
 915      (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
 916
 917 #define ENCODE(out, ch, bits)                   \
 918     while (bits >= 6) {                         \
 919         *out++ = B64(ch >> (bits-6));           \
 920         bits -= 6;                              \
 921     }
 922
 923 #define DECODE(out, ch, bits, surrogate)                                \
 924     while (bits >= 16) {                                                \
 925         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
 926         bits -= 16;                                                     \
 927         if (surrogate) {                                                \
 928             /* We have already generated an error for the high surrogate \
 929                so let's not bother seeing if the low surrogate is correct or not */ \
 930             surrogate = 0;                                              \
 931         } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
 932             /* This is a surrogate pair. Unfortunately we can't represent \
 933                it in a 16-bit character */                              \
 934             surrogate = 1;                                              \
 935             errmsg = "code pairs are not supported";                    \
 936             goto utf7Error;                                             \
 937         } else {                                                        \
 938             *out++ = outCh;                                             \
 939         }                                                               \
 940     }
 941
 942 PyObject *PyUnicode_DecodeUTF7(const char *s,
 943                                Py_ssize_t size,
 944                                const char *errors)
 945 {
 946     const char *starts = s;
 947     Py_ssize_t startinpos;
 948     Py_ssize_t endinpos;
 949     Py_ssize_t outpos;
 950     const char *e;
 951     PyUnicodeObject *unicode;
 952     Py_UNICODE *p;
 953     const char *errmsg = "";
 954     int inShift = 0;
 955     unsigned int bitsleft = 0;
 956     unsigned long charsleft = 0;
 957     int surrogate = 0;
 958     PyObject *errorHandler = NULL;
 959     PyObject *exc = NULL;
 960
 961     unicode = _PyUnicode_New(size);
 962     if (!unicode)
 963         return NULL;
 964     if (size == 0)
 965         return (PyObject *)unicode;
 966
 967     p = unicode->str;
 968     e = s + size;
 969
 970     while (s < e) {
 971         Py_UNICODE ch;
 972         restart:
 973         ch = *s;
 974
 975         if (inShift) {
 976             if ((ch == '-') || !B64CHAR(ch)) {
 977                 inShift = 0;
 978                 s++;
 979
 980                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 981                 if (bitsleft >= 6) {
 982                     /* The shift sequence has a partial character in it. If
 983                        bitsleft < 6 then we could just classify it as padding
 984                        but that is not the case here */
 985
 986                     errmsg = "partial character in shift sequence";
 987                     goto utf7Error;
 988                 }
 989                 /* According to RFC2152 the remaining bits should be zero. We
 990                    choose to signal an error/insert a replacement character
 991                    here so indicate the potential of a misencoded character. */
 992
 993                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 994                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 995                     errmsg = "non-zero padding bits in shift sequence";
 996                     goto utf7Error;
 997                 }
 998
 999                 if (ch == '-') {
1000                     if ((s < e) && (*(s) == '-')) {
1001                         *p++ = '-';
1002                         inShift = 1;
1003                     }
1004                 } else if (SPECIAL(ch,0,0)) {
1005                     errmsg = "unexpected special character";
1006                         goto utf7Error;
1007                 } else  {
1008                     *p++ = ch;
1009                 }
1010             } else {
1011                 charsleft = (charsleft << 6) | UB64(ch);
1012                 bitsleft += 6;
1013                 s++;
1014                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015             }
1016         }
1017         else if ( ch == '+' ) {
1018             startinpos = s-starts;
1019             s++;
1020             if (s < e && *s == '-') {
1021                 s++;
1022                 *p++ = '+';
1023             } else
1024             {
1025                 inShift = 1;
1026                 bitsleft = 0;
1027             }
1028         }
1029         else if (SPECIAL(ch,0,0)) {
1030             errmsg = "unexpected special character";
1031             s++;
1032                 goto utf7Error;
1033         }
1034         else {
1035             *p++ = ch;
1036             s++;
1037         }
1038         continue;
1039     utf7Error:
1040         outpos = p-PyUnicode_AS_UNICODE(unicode);
1041         endinpos = s-starts;
1042         if (unicode_decode_call_errorhandler(
1043              errors, &errorHandler,
1044              "utf7", errmsg,
1045              starts, size, &startinpos, &endinpos, &exc, &s,
1046              (PyObject **)&unicode, &outpos, &p))
1047         goto onError;
1048     }
1049
1050     if (inShift) {
1051         outpos = p-PyUnicode_AS_UNICODE(unicode);
1052         endinpos = size;
1053         if (unicode_decode_call_errorhandler(
1054              errors, &errorHandler,
1055              "utf7", "unterminated shift sequence",
1056              starts, size, &startinpos, &endinpos, &exc, &s,
1057              (PyObject **)&unicode, &outpos, &p))
1058             goto onError;
1059         if (s < e)
1060            goto restart;
1061     }
1062
1063     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1064         goto onError;
1065
1066     Py_XDECREF(errorHandler);
1067     Py_XDECREF(exc);
1068     return (PyObject *)unicode;
1069
1070 onError:
1071     Py_XDECREF(errorHandler);
1072     Py_XDECREF(exc);
1073     Py_DECREF(unicode);
1074     return NULL;
1075 }
1076
1077
1078 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1079                    Py_ssize_t size,
1080                    int encodeSetO,
1081                    int encodeWhiteSpace,
1082                    const char *errors)
1083 {
1084     PyObject *v;
1085     /* It might be possible to tighten this worst case */
1086     Py_ssize_t cbAllocated = 5 * size;
1087     int inShift = 0;
1088     Py_ssize_t i = 0;
1089     unsigned int bitsleft = 0;
1090     unsigned long charsleft = 0;
1091     char * out;
1092     char * start;
1093
1094     if (size == 0)
1095                 return PyString_FromStringAndSize(NULL, 0);
1096
1097     v = PyString_FromStringAndSize(NULL, cbAllocated);
1098     if (v == NULL)
1099         return NULL;
1100
1101     start = out = PyString_AS_STRING(v);
1102     for (;i < size; ++i) {
1103         Py_UNICODE ch = s[i];
1104
1105         if (!inShift) {
1106             if (ch == '+') {
1107                 *out++ = '+';
1108                 *out++ = '-';
1109             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110                 charsleft = ch;
1111                 bitsleft = 16;
1112                 *out++ = '+';
1113                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1114                 inShift = bitsleft > 0;
1115             } else {
1116                 *out++ = (char) ch;
1117             }
1118         } else {
1119             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120                 *out++ = B64(charsleft << (6-bitsleft));
1121                 charsleft = 0;
1122                 bitsleft = 0;
1123                 /* Characters not in the BASE64 set implicitly unshift the sequence
1124                    so no '-' is required, except if the character is itself a '-' */
1125                 if (B64CHAR(ch) || ch == '-') {
1126                     *out++ = '-';
1127                 }
1128                 inShift = 0;
1129                 *out++ = (char) ch;
1130             } else {
1131                 bitsleft += 16;
1132                 charsleft = (charsleft << 16) | ch;
1133                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135                 /* If the next character is special then we dont' need to terminate
1136                    the shift sequence. If the next character is not a BASE64 character
1137                    or '-' then the shift sequence will be terminated implicitly and we
1138                    don't have to insert a '-'. */
1139
1140                 if (bitsleft == 0) {
1141                     if (i + 1 < size) {
1142                         Py_UNICODE ch2 = s[i+1];
1143
1144                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1145
1146                         } else if (B64CHAR(ch2) || ch2 == '-') {
1147                             *out++ = '-';
1148                             inShift = 0;
1149                         } else {
1150                             inShift = 0;
1151                         }
1152
1153                     }
1154                     else {
1155                         *out++ = '-';
1156                         inShift = 0;
1157                     }
1158                 }
1159             }
1160         }
1161     }
1162     if (bitsleft) {
1163         *out++= B64(charsleft << (6-bitsleft) );
1164         *out++ = '-';
1165     }
1166
1167     _PyString_Resize(&v, out - start);
1168     return v;
1169 }
1170
/* The UTF-7 helper macros are only needed by the two UTF-7 functions
   above; remove them so their short names are not visible to the code
   that follows. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1177
1178 /* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map a UTF-8 encoded prefix (first) byte to the length in bytes of the
   sequence it introduces.  Zero means the byte is an illegal prefix.
   See RFC 2279 for details.  The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
1202 PyObject *PyUnicode_DecodeUTF8(const char *s,
1203                                Py_ssize_t size,
1204                                const char *errors)
1205 {
1206     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207 }
1208
1209 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1210                                         Py_ssize_t size,
1211                                         const char *errors,
1212                                         Py_ssize_t *consumed)
1213 {
1214     const char *starts = s;
1215     int n;
1216     Py_ssize_t startinpos;
1217     Py_ssize_t endinpos;
1218     Py_ssize_t outpos;
1219     const char *e;
1220     PyUnicodeObject *unicode;
1221     Py_UNICODE *p;
1222     const char *errmsg = "";
1223     PyObject *errorHandler = NULL;
1224     PyObject *exc = NULL;
1225
1226     /* Note: size will always be longer than the resulting Unicode
1227        character count */
1228     unicode = _PyUnicode_New(size);
1229     if (!unicode)
1230         return NULL;
1231     if (size == 0) {
1232         if (consumed)
1233             *consumed = 0;
1234         return (PyObject *)unicode;
1235     }
1236
1237     /* Unpack UTF-8 encoded data */
1238     p = unicode->str;
1239     e = s + size;
1240
1241     while (s < e) {
1242         Py_UCS4 ch = (unsigned char)*s;
1243
1244         if (ch < 0x80) {
1245             *p++ = (Py_UNICODE)ch;
1246             s++;
1247             continue;
1248         }
1249
1250         n = utf8_code_length[ch];
1251
1252         if (s + n > e) {
1253             if (consumed)
1254                 break;
1255             else {
1256                 errmsg = "unexpected end of data";
1257                 startinpos = s-starts;
1258                 endinpos = size;
1259                 goto utf8Error;
1260             }
1261         }
1262
1263         switch (n) {
1264
1265         case 0:
1266             errmsg = "unexpected code byte";
1267             startinpos = s-starts;
1268             endinpos = startinpos+1;
1269             goto utf8Error;
1270
1271         case 1:
1272             errmsg = "internal error";
1273             startinpos = s-starts;
1274             endinpos = startinpos+1;
1275             goto utf8Error;
1276
1277         case 2:
1278             if ((s[1] & 0xc0) != 0x80) {
1279                 errmsg = "invalid data";
1280                 startinpos = s-starts;
1281                 endinpos = startinpos+2;
1282                 goto utf8Error;
1283             }
1284             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1285             if (ch < 0x80) {
1286                 startinpos = s-starts;
1287                 endinpos = startinpos+2;
1288                 errmsg = "illegal encoding";
1289                 goto utf8Error;
1290             }
1291             else
1292                 *p++ = (Py_UNICODE)ch;
1293             break;
1294
1295         case 3:
1296             if ((s[1] & 0xc0) != 0x80 ||
1297                 (s[2] & 0xc0) != 0x80) {
1298                 errmsg = "invalid data";
1299                 startinpos = s-starts;
1300                 endinpos = startinpos+3;
1301                 goto utf8Error;
1302             }
1303             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1304             if (ch < 0x0800) {
1305                 /* Note: UTF-8 encodings of surrogates are considered
1306                    legal UTF-8 sequences;
1307
1308                    XXX For wide builds (UCS-4) we should probably try
1309                        to recombine the surrogates into a single code
1310                        unit.
1311                 */
1312                 errmsg = "illegal encoding";
1313                 startinpos = s-starts;
1314                 endinpos = startinpos+3;
1315                 goto utf8Error;
1316             }
1317             else
1318                 *p++ = (Py_UNICODE)ch;
1319             break;
1320
1321         case 4:
1322             if ((s[1] & 0xc0) != 0x80 ||
1323                 (s[2] & 0xc0) != 0x80 ||
1324                 (s[3] & 0xc0) != 0x80) {
1325                 errmsg = "invalid data";
1326                 startinpos = s-starts;
1327                 endinpos = startinpos+4;
1328                 goto utf8Error;
1329             }
1330             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332             /* validate and convert to UTF-16 */
1333             if ((ch < 0x10000)        /* minimum value allowed for 4
1334                                          byte encoding */
1335                 || (ch > 0x10ffff))   /* maximum value allowed for
1336                                          UTF-16 */
1337             {
1338                 errmsg = "illegal encoding";
1339                 startinpos = s-starts;
1340                 endinpos = startinpos+4;
1341                 goto utf8Error;
1342             }
1343 #ifdef Py_UNICODE_WIDE
1344             *p++ = (Py_UNICODE)ch;
1345 #else
1346             /*  compute and append the two surrogates: */
1347
1348             /*  translate from 10000..10FFFF to 0..FFFF */
1349             ch -= 0x10000;
1350
1351             /*  high surrogate = top 10 bits added to D800 */
1352             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1353
1354             /*  low surrogate = bottom 10 bits added to DC00 */
1355             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1356 #endif
1357             break;
1358
1359         default:
1360             /* Other sizes are only needed for UCS-4 */
1361             errmsg = "unsupported Unicode code range";
1362             startinpos = s-starts;
1363             endinpos = startinpos+n;
1364             goto utf8Error;
1365         }
1366         s += n;
1367         continue;
1368
1369     utf8Error:
1370     outpos = p-PyUnicode_AS_UNICODE(unicode);
1371     if (unicode_decode_call_errorhandler(
1372              errors, &errorHandler,
1373              "utf8", errmsg,
1374              starts, size, &startinpos, &endinpos, &exc, &s,
1375              (PyObject **)&unicode, &outpos, &p))
1376         goto onError;
1377     }
1378     if (consumed)
1379         *consumed = s-starts;
1380
1381     /* Adjust length */
1382     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1383         goto onError;
1384
1385     Py_XDECREF(errorHandler);
1386     Py_XDECREF(exc);
1387     return (PyObject *)unicode;
1388
1389 onError:
1390     Py_XDECREF(errorHandler);
1391     Py_XDECREF(exc);
1392     Py_DECREF(unicode);
1393     return NULL;
1394 }
1395
1396 /* Allocation strategy:  if the string is short, convert into a stack buffer
1397    and allocate exactly as much space needed at the end.  Else allocate the
1398    maximum possible needed (4 result bytes per Unicode character), and return
1399    the excess memory at the end.
1400 */
1401 PyObject *
1402 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1403                      Py_ssize_t size,
1404                      const char *errors)
1405 {
1406 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1407
1408     Py_ssize_t i;           /* index into s of next input byte */
1409     PyObject *v;        /* result string object */
1410     char *p;            /* next free byte in output buffer */
1411     Py_ssize_t nallocated;  /* number of result bytes allocated */
1412     Py_ssize_t nneeded;        /* number of result bytes needed */
1413     char stackbuf[MAX_SHORT_UNICHARS * 4];
1414
1415     assert(s != NULL);
1416     assert(size >= 0);
1417
1418     if (size <= MAX_SHORT_UNICHARS) {
1419         /* Write into the stack buffer; nallocated can't overflow.
1420          * At the end, we'll allocate exactly as much heap space as it
1421          * turns out we need.
1422          */
1423         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424         v = NULL;   /* will allocate after we're done */
1425         p = stackbuf;
1426     }
1427     else {
1428         /* Overallocate on the heap, and give the excess back at the end. */
1429         nallocated = size * 4;
1430         if (nallocated / 4 != size)  /* overflow! */
1431             return PyErr_NoMemory();
1432         v = PyString_FromStringAndSize(NULL, nallocated);
1433         if (v == NULL)
1434             return NULL;
1435         p = PyString_AS_STRING(v);
1436     }
1437
1438     for (i = 0; i < size;) {
1439         Py_UCS4 ch = s[i++];
1440
1441         if (ch < 0x80)
1442             /* Encode ASCII */
1443             *p++ = (char) ch;
1444
1445         else if (ch < 0x0800) {
1446             /* Encode Latin-1 */
1447             *p++ = (char)(0xc0 | (ch >> 6));
1448             *p++ = (char)(0x80 | (ch & 0x3f));
1449         }
1450         else {
1451             /* Encode UCS2 Unicode ordinals */
1452             if (ch < 0x10000) {
1453                 /* Special case: check for high surrogate */
1454                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455                     Py_UCS4 ch2 = s[i];
1456                     /* Check for low surrogate and combine the two to
1457                        form a UCS4 value */
1458                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1459                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1460                         i++;
1461                         goto encodeUCS4;
1462                     }
1463                     /* Fall through: handles isolated high surrogates */
1464                 }
1465                 *p++ = (char)(0xe0 | (ch >> 12));
1466                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467                 *p++ = (char)(0x80 | (ch & 0x3f));
1468                 continue;
1469             }
1470 encodeUCS4:
1471             /* Encode UCS4 Unicode ordinals */
1472             *p++ = (char)(0xf0 | (ch >> 18));
1473             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475             *p++ = (char)(0x80 | (ch & 0x3f));
1476         }
1477     }
1478
1479     if (v == NULL) {
1480         /* This was stack allocated. */
1481         nneeded = p - stackbuf;
1482         assert(nneeded <= nallocated);
1483         v = PyString_FromStringAndSize(stackbuf, nneeded);
1484     }
1485     else {
1486         /* Cut back to size actually needed. */
1487         nneeded = p - PyString_AS_STRING(v);
1488         assert(nneeded <= nallocated);
1489         _PyString_Resize(&v, nneeded);
1490     }
1491     return v;
1492
1493 #undef MAX_SHORT_UNICHARS
1494 }
1495
1496 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497 {
1498     if (!PyUnicode_Check(unicode)) {
1499         PyErr_BadArgument();
1500         return NULL;
1501     }
1502     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503                                 PyUnicode_GET_SIZE(unicode),
1504                                 NULL);
1505 }
1506
1507 /* --- UTF-16 Codec ------------------------------------------------------- */
1508
1509 PyObject *
1510 PyUnicode_DecodeUTF16(const char *s,
1511                       Py_ssize_t size,
1512                       const char *errors,
1513                       int *byteorder)
1514 {
1515     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516 }
1517
1518 PyObject *
1519 PyUnicode_DecodeUTF16Stateful(const char *s,
1520                               Py_ssize_t size,
1521                               const char *errors,
1522                               int *byteorder,
1523                               Py_ssize_t *consumed)
1524 {
1525     const char *starts = s;
1526     Py_ssize_t startinpos;
1527     Py_ssize_t endinpos;
1528     Py_ssize_t outpos;
1529     PyUnicodeObject *unicode;
1530     Py_UNICODE *p;
1531     const unsigned char *q, *e;
1532     int bo = 0;       /* assume native ordering by default */
1533     const char *errmsg = "";
1534     /* Offsets from q for retrieving byte pairs in the right order. */
1535 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536     int ihi = 1, ilo = 0;
1537 #else
1538     int ihi = 0, ilo = 1;
1539 #endif
1540     PyObject *errorHandler = NULL;
1541     PyObject *exc = NULL;
1542
1543     /* Note: size will always be longer than the resulting Unicode
1544        character count */
1545     unicode = _PyUnicode_New(size);
1546     if (!unicode)
1547         return NULL;
1548     if (size == 0)
1549         return (PyObject *)unicode;
1550
1551     /* Unpack UTF-16 encoded data */
1552     p = unicode->str;
1553     q = (unsigned char *)s;
1554     e = q + size;
1555
1556     if (byteorder)
1557         bo = *byteorder;
1558
1559     /* Check for BOM marks (U+FEFF) in the input and adjust current
1560        byte order setting accordingly. In native mode, the leading BOM
1561        mark is skipped, in all other modes, it is copied to the output
1562        stream as-is (giving a ZWNBSP character). */
1563     if (bo == 0) {
1564         if (size >= 2) {
1565             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1566 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567             if (bom == 0xFEFF) {
1568                 q += 2;
1569                 bo = -1;
1570             }
1571             else if (bom == 0xFFFE) {
1572                 q += 2;
1573                 bo = 1;
1574             }
1575 #else
1576             if (bom == 0xFEFF) {
1577                 q += 2;
1578                 bo = 1;
1579             }
1580             else if (bom == 0xFFFE) {
1581                 q += 2;
1582                 bo = -1;
1583             }
1584 #endif
1585         }
1586     }
1587
1588     if (bo == -1) {
1589         /* force LE */
1590         ihi = 1;
1591         ilo = 0;
1592     }
1593     else if (bo == 1) {
1594         /* force BE */
1595         ihi = 0;
1596         ilo = 1;
1597     }
1598
1599     while (q < e) {
1600         Py_UNICODE ch;
1601         /* remaining bytes at the end? (size should be even) */
1602         if (e-q<2) {
1603             if (consumed)
1604                 break;
1605             errmsg = "truncated data";
1606             startinpos = ((const char *)q)-starts;
1607             endinpos = ((const char *)e)-starts;
1608             goto utf16Error;
1609             /* The remaining input chars are ignored if the callback
1610                chooses to skip the input */
1611         }
1612         ch = (q[ihi] << 8) | q[ilo];
1613
1614         q += 2;
1615
1616         if (ch < 0xD800 || ch > 0xDFFF) {
1617             *p++ = ch;
1618             continue;
1619         }
1620
1621         /* UTF-16 code pair: */
1622         if (q >= e) {
1623             errmsg = "unexpected end of data";
1624             startinpos = (((const char *)q)-2)-starts;
1625             endinpos = ((const char *)e)-starts;
1626             goto utf16Error;
1627         }
1628         if (0xD800 <= ch && ch <= 0xDBFF) {
1629             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630             q += 2;
1631             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1632 #ifndef Py_UNICODE_WIDE
1633                 *p++ = ch;
1634                 *p++ = ch2;
1635 #else
1636                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1637 #endif
1638                 continue;
1639             }
1640             else {
1641                 errmsg = "illegal UTF-16 surrogate";
1642                 startinpos = (((const char *)q)-4)-starts;
1643                 endinpos = startinpos+2;
1644                 goto utf16Error;
1645             }
1646
1647         }
1648         errmsg = "illegal encoding";
1649         startinpos = (((const char *)q)-2)-starts;
1650         endinpos = startinpos+2;
1651         /* Fall through to report the error */
1652
1653     utf16Error:
1654         outpos = p-PyUnicode_AS_UNICODE(unicode);
1655         if (unicode_decode_call_errorhandler(
1656                  errors, &errorHandler,
1657                  "utf16", errmsg,
1658                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659                  (PyObject **)&unicode, &outpos, &p))
1660             goto onError;
1661     }
1662
1663     if (byteorder)
1664         *byteorder = bo;
1665
1666     if (consumed)
1667         *consumed = (const char *)q-starts;
1668
1669     /* Adjust length */
1670     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1671         goto onError;
1672
1673     Py_XDECREF(errorHandler);
1674     Py_XDECREF(exc);
1675     return (PyObject *)unicode;
1676
1677 onError:
1678     Py_DECREF(unicode);
1679     Py_XDECREF(errorHandler);
1680     Py_XDECREF(exc);
1681     return NULL;
1682 }
1683
1684 PyObject *
1685 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1686                       Py_ssize_t size,
1687                       const char *errors,
1688                       int byteorder)
1689 {
1690     PyObject *v;
1691     unsigned char *p;
1692 #ifdef Py_UNICODE_WIDE
1693     int i, pairs;
1694 #else
1695     const int pairs = 0;
1696 #endif
1697     /* Offsets from p for storing byte pairs in the right order. */
1698 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699     int ihi = 1, ilo = 0;
1700 #else
1701     int ihi = 0, ilo = 1;
1702 #endif
1703
1704 #define STORECHAR(CH)                   \
1705     do {                                \
1706         p[ihi] = ((CH) >> 8) & 0xff;    \
1707         p[ilo] = (CH) & 0xff;           \
1708         p += 2;                         \
1709     } while(0)
1710
1711 #ifdef Py_UNICODE_WIDE
1712     for (i = pairs = 0; i < size; i++)
1713         if (s[i] >= 0x10000)
1714             pairs++;
1715 #endif
1716     v = PyString_FromStringAndSize(NULL,
1717                   2 * (size + pairs + (byteorder == 0)));
1718     if (v == NULL)
1719         return NULL;
1720
1721     p = (unsigned char *)PyString_AS_STRING(v);
1722     if (byteorder == 0)
1723         STORECHAR(0xFEFF);
1724     if (size == 0)
1725         return v;
1726
1727     if (byteorder == -1) {
1728         /* force LE */
1729         ihi = 1;
1730         ilo = 0;
1731     }
1732     else if (byteorder == 1) {
1733         /* force BE */
1734         ihi = 0;
1735         ilo = 1;
1736     }
1737
1738     while (size-- > 0) {
1739         Py_UNICODE ch = *s++;
1740         Py_UNICODE ch2 = 0;
1741 #ifdef Py_UNICODE_WIDE
1742         if (ch >= 0x10000) {
1743             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744             ch  = 0xD800 | ((ch-0x10000) >> 10);
1745         }
1746 #endif
1747         STORECHAR(ch);
1748         if (ch2)
1749             STORECHAR(ch2);
1750     }
1751     return v;
1752 #undef STORECHAR
1753 }
1754
1755 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756 {
1757     if (!PyUnicode_Check(unicode)) {
1758         PyErr_BadArgument();
1759         return NULL;
1760     }
1761     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762                                  PyUnicode_GET_SIZE(unicode),
1763                                  NULL,
1764                                  0);
1765 }
1766
1767 /* --- Unicode Escape Codec ----------------------------------------------- */
1768
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   below to resolve character names -- confirm where it is assigned. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1770
1771 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1772                                         Py_ssize_t size,
1773                                         const char *errors)
1774 {
1775     const char *starts = s;
1776     Py_ssize_t startinpos;
1777     Py_ssize_t endinpos;
1778     Py_ssize_t outpos;
1779     int i;
1780     PyUnicodeObject *v;
1781     Py_UNICODE *p;
1782     const char *end;
1783     char* message;
1784     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1785     PyObject *errorHandler = NULL;
1786     PyObject *exc = NULL;
1787
1788     /* Escaped strings will always be longer than the resulting
1789        Unicode string, so we start with size here and then reduce the
1790        length after conversion to the true value.
1791        (but if the error callback returns a long replacement string
1792        we'll have to allocate more space) */
1793     v = _PyUnicode_New(size);
1794     if (v == NULL)
1795         goto onError;
1796     if (size == 0)
1797         return (PyObject *)v;
1798
1799     p = PyUnicode_AS_UNICODE(v);
1800     end = s + size;
1801
1802     while (s < end) {
1803         unsigned char c;
1804         Py_UNICODE x;
1805         int digits;
1806
1807         /* Non-escape characters are interpreted as Unicode ordinals */
1808         if (*s != '\\') {
1809             *p++ = (unsigned char) *s++;
1810             continue;
1811         }
1812
1813         startinpos = s-starts;
1814         /* \ - Escapes */
1815         s++;
1816         switch (*s++) {
1817
1818         /* \x escapes */
1819         case '\n': break;
1820         case '\\': *p++ = '\\'; break;
1821         case '\'': *p++ = '\''; break;
1822         case '\"': *p++ = '\"'; break;
1823         case 'b': *p++ = '\b'; break;
1824         case 'f': *p++ = '\014'; break; /* FF */
1825         case 't': *p++ = '\t'; break;
1826         case 'n': *p++ = '\n'; break;
1827         case 'r': *p++ = '\r'; break;
1828         case 'v': *p++ = '\013'; break; /* VT */
1829         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831         /* \OOO (octal) escapes */
1832         case '0': case '1': case '2': case '3':
1833         case '4': case '5': case '6': case '7':
1834             x = s[-1] - '0';
1835             if ('0' <= *s && *s <= '7') {
1836                 x = (x<<3) + *s++ - '0';
1837                 if ('0' <= *s && *s <= '7')
1838                     x = (x<<3) + *s++ - '0';
1839             }
1840             *p++ = x;
1841             break;
1842
1843         /* hex escapes */
1844         /* \xXX */
1845         case 'x':
1846             digits = 2;
1847             message = "truncated \\xXX escape";
1848             goto hexescape;
1849
1850         /* \uXXXX */
1851         case 'u':
1852             digits = 4;
1853             message = "truncated \\uXXXX escape";
1854             goto hexescape;
1855
1856         /* \UXXXXXXXX */
1857         case 'U':
1858             digits = 8;
1859             message = "truncated \\UXXXXXXXX escape";
1860         hexescape:
1861             chr = 0;
1862             outpos = p-PyUnicode_AS_UNICODE(v);
1863             if (s+digits>end) {
1864                 endinpos = size;
1865                 if (unicode_decode_call_errorhandler(
1866                     errors, &errorHandler,
1867                     "unicodeescape", "end of string in escape sequence",
1868                     starts, size, &startinpos, &endinpos, &exc, &s,
1869                     (PyObject **)&v, &outpos, &p))
1870                     goto onError;
1871                 goto nextByte;
1872             }
1873             for (i = 0; i < digits; ++i) {
1874                 c = (unsigned char) s[i];
1875                 if (!isxdigit(c)) {
1876                     endinpos = (s+i+1)-starts;
1877                     if (unicode_decode_call_errorhandler(
1878                         errors, &errorHandler,
1879                         "unicodeescape", message,
1880                         starts, size, &startinpos, &endinpos, &exc, &s,
1881                         (PyObject **)&v, &outpos, &p))
1882                         goto onError;
1883                     goto nextByte;
1884                 }
1885                 chr = (chr<<4) & ~0xF;
1886                 if (c >= '0' && c <= '9')
1887                     chr += c - '0';
1888                 else if (c >= 'a' && c <= 'f')
1889                     chr += 10 + c - 'a';
1890                 else
1891                     chr += 10 + c - 'A';
1892             }
1893             s += i;
1894             if (chr == 0xffffffff && PyErr_Occurred())
1895                 /* _decoding_error will have already written into the
1896                    target buffer. */
1897                 break;
1898         store:
1899             /* when we get here, chr is a 32-bit unicode character */
1900             if (chr <= 0xffff)
1901                 /* UCS-2 character */
1902                 *p++ = (Py_UNICODE) chr;
1903             else if (chr <= 0x10ffff) {
1904                 /* UCS-4 character. Either store directly, or as
1905                    surrogate pair. */
1906 #ifdef Py_UNICODE_WIDE
1907                 *p++ = chr;
1908 #else
1909                 chr -= 0x10000L;
1910                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1911                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1912 #endif
1913             } else {
1914                 endinpos = s-starts;
1915                 outpos = p-PyUnicode_AS_UNICODE(v);
1916                 if (unicode_decode_call_errorhandler(
1917                     errors, &errorHandler,
1918                     "unicodeescape", "illegal Unicode character",
1919                     starts, size, &startinpos, &endinpos, &exc, &s,
1920                     (PyObject **)&v, &outpos, &p))
1921                     goto onError;
1922             }
1923             break;
1924
1925         /* \N{name} */
1926         case 'N':
1927             message = "malformed \\N character escape";
1928             if (ucnhash_CAPI == NULL) {
1929                 /* load the unicode data module */
1930                 PyObject *m, *api;
1931                 m = PyImport_ImportModule("unicodedata");
1932                 if (m == NULL)
1933                     goto ucnhashError;
1934                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
1935                 Py_DECREF(m);
1936                 if (api == NULL)
1937                     goto ucnhashError;
1938                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
1939                 Py_DECREF(api);
1940                 if (ucnhash_CAPI == NULL)
1941                     goto ucnhashError;
1942             }
1943             if (*s == '{') {
1944                 const char *start = s+1;
1945                 /* look for the closing brace */
1946                 while (*s != '}' && s < end)
1947                     s++;
1948                 if (s > start && s < end && *s == '}') {
1949                     /* found a name.  look it up in the unicode database */
1950                     message = "unknown Unicode character name";
1951                     s++;
1952                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
1953                         goto store;
1954                 }
1955             }
1956             endinpos = s-starts;
1957             outpos = p-PyUnicode_AS_UNICODE(v);
1958             if (unicode_decode_call_errorhandler(
1959                 errors, &errorHandler,
1960                 "unicodeescape", message,
1961                 starts, size, &startinpos, &endinpos, &exc, &s,
1962                 (PyObject **)&v, &outpos, &p))
1963                 goto onError;
1964             break;
1965
1966         default:
1967             if (s > end) {
1968                 message = "\\ at end of string";
1969                 s--;
1970                 endinpos = s-starts;
1971                 outpos = p-PyUnicode_AS_UNICODE(v);
1972                 if (unicode_decode_call_errorhandler(
1973                     errors, &errorHandler,
1974                     "unicodeescape", message,
1975                     starts, size, &startinpos, &endinpos, &exc, &s,
1976                     (PyObject **)&v, &outpos, &p))
1977                     goto onError;
1978             }
1979             else {
1980                 *p++ = '\\';
1981                 *p++ = (unsigned char)s[-1];
1982             }
1983             break;
1984         }
1985         nextByte:
1986         ;
1987     }
1988     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
1989         goto onError;
1990     Py_XDECREF(errorHandler);
1991     Py_XDECREF(exc);
1992     return (PyObject *)v;
1993
1994 ucnhashError:
1995     PyErr_SetString(
1996         PyExc_UnicodeError,
1997         "\\N escapes not supported (can't load unicodedata module)"
1998         );
1999     Py_XDECREF(v);
2000     Py_XDECREF(errorHandler);
2001     Py_XDECREF(exc);
2002     return NULL;
2003
2004 onError:
2005     Py_XDECREF(v);
2006     Py_XDECREF(errorHandler);
2007     Py_XDECREF(exc);
2008     return NULL;
2009 }
2010
/* unicodeescape_string() (defined below, after the findchar() helper)
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2017
2018 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019                                       Py_ssize_t size,
2020                                       Py_UNICODE ch)
2021 {
2022     /* like wcschr, but doesn't stop at NULL characters */
2023
2024     while (size-- > 0) {
2025         if (*s == ch)
2026             return s;
2027         s++;
2028     }
2029
2030     return NULL;
2031 }
2032
2033 static
2034 PyObject *unicodeescape_string(const Py_UNICODE *s,
2035                                Py_ssize_t size,
2036                                int quotes)
2037 {
2038     PyObject *repr;
2039     char *p;
2040
2041     static const char *hexdigit = "0123456789abcdef";
2042
2043     /* XXX(nnorwitz): rather than over-allocating, it would be
2044        better to choose a different scheme.  Perhaps scan the
2045        first N-chars of the string and allocate based on that size.
2046     */
2047     /* Initial allocation is based on the longest-possible unichr
2048        escape.
2049
2050        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2051        unichr, so in this case it's the longest unichr escape. In
2052        narrow (UTF-16) builds this is five chars per source unichr
2053        since there are two unichrs in the surrogate pair, so in narrow
2054        (UTF-16) builds it's not the longest unichr escape.
2055
2056        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2057        so in the narrow (UTF-16) build case it's the longest unichr
2058        escape.
2059     */
2060
2061     repr = PyString_FromStringAndSize(NULL,
2062         2
2063 #ifdef Py_UNICODE_WIDE
2064         + 10*size
2065 #else
2066         + 6*size
2067 #endif
2068         + 1);
2069     if (repr == NULL)
2070         return NULL;
2071
2072     p = PyString_AS_STRING(repr);
2073
2074     if (quotes) {
2075         *p++ = 'u';
2076         *p++ = (findchar(s, size, '\'') &&
2077                 !findchar(s, size, '"')) ? '"' : '\'';
2078     }
2079     while (size-- > 0) {
2080         Py_UNICODE ch = *s++;
2081
2082         /* Escape quotes and backslashes */
2083         if ((quotes &&
2084              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2085             *p++ = '\\';
2086             *p++ = (char) ch;
2087             continue;
2088         }
2089
2090 #ifdef Py_UNICODE_WIDE
2091         /* Map 21-bit characters to '\U00xxxxxx' */
2092         else if (ch >= 0x10000) {
2093             *p++ = '\\';
2094             *p++ = 'U';
2095             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2096             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2097             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2098             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2099             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2100             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2101             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2102             *p++ = hexdigit[ch & 0x0000000F];
2103             continue;
2104         }
2105 #else
2106         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2107         else if (ch >= 0xD800 && ch < 0xDC00) {
2108             Py_UNICODE ch2;
2109             Py_UCS4 ucs;
2110
2111             ch2 = *s++;
2112             size--;
2113             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2114                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2115                 *p++ = '\\';
2116                 *p++ = 'U';
2117                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2118                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2119                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2120                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2121                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2122                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2123                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2124                 *p++ = hexdigit[ucs & 0x0000000F];
2125                 continue;
2126             }
2127             /* Fall through: isolated surrogates are copied as-is */
2128             s--;
2129             size++;
2130         }
2131 #endif
2132
2133         /* Map 16-bit characters to '\uxxxx' */
2134         if (ch >= 256) {
2135             *p++ = '\\';
2136             *p++ = 'u';
2137             *p++ = hexdigit[(ch >> 12) & 0x000F];
2138             *p++ = hexdigit[(ch >> 8) & 0x000F];
2139             *p++ = hexdigit[(ch >> 4) & 0x000F];
2140             *p++ = hexdigit[ch & 0x000F];
2141         }
2142
2143         /* Map special whitespace to '\t', \n', '\r' */
2144         else if (ch == '\t') {
2145             *p++ = '\\';
2146             *p++ = 't';
2147         }
2148         else if (ch == '\n') {
2149             *p++ = '\\';
2150             *p++ = 'n';
2151         }
2152         else if (ch == '\r') {
2153             *p++ = '\\';
2154             *p++ = 'r';
2155         }
2156
2157         /* Map non-printable US ASCII to '\xhh' */
2158         else if (ch < ' ' || ch >= 0x7F) {
2159             *p++ = '\\';
2160             *p++ = 'x';
2161             *p++ = hexdigit[(ch >> 4) & 0x000F];
2162             *p++ = hexdigit[ch & 0x000F];
2163         }
2164
2165         /* Copy everything else as-is */
2166         else
2167             *p++ = (char) ch;
2168     }
2169     if (quotes)
2170         *p++ = PyString_AS_STRING(repr)[1];
2171
2172     *p = '\0';
2173     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2174     return repr;
2175 }
2176
2177 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2178                                         Py_ssize_t size)
2179 {
2180     return unicodeescape_string(s, size, 0);
2181 }
2182
2183 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2184 {
2185     if (!PyUnicode_Check(unicode)) {
2186         PyErr_BadArgument();
2187         return NULL;
2188     }
2189     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2190                                          PyUnicode_GET_SIZE(unicode));
2191 }
2192
2193 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2194
2195 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2196                                            Py_ssize_t size,
2197                                            const char *errors)
2198 {
2199     const char *starts = s;
2200     Py_ssize_t startinpos;
2201     Py_ssize_t endinpos;
2202     Py_ssize_t outpos;
2203     PyUnicodeObject *v;
2204     Py_UNICODE *p;
2205     const char *end;
2206     const char *bs;
2207     PyObject *errorHandler = NULL;
2208     PyObject *exc = NULL;
2209
2210     /* Escaped strings will always be longer than the resulting
2211        Unicode string, so we start with size here and then reduce the
2212        length after conversion to the true value. (But decoding error
2213        handler might have to resize the string) */
2214     v = _PyUnicode_New(size);
2215     if (v == NULL)
2216         goto onError;
2217     if (size == 0)
2218         return (PyObject *)v;
2219     p = PyUnicode_AS_UNICODE(v);
2220     end = s + size;
2221     while (s < end) {
2222         unsigned char c;
2223         Py_UCS4 x;
2224         int i;
2225         int count;
2226
2227         /* Non-escape characters are interpreted as Unicode ordinals */
2228         if (*s != '\\') {
2229             *p++ = (unsigned char)*s++;
2230             continue;
2231         }
2232         startinpos = s-starts;
2233
2234         /* \u-escapes are only interpreted iff the number of leading
2235            backslashes if odd */
2236         bs = s;
2237         for (;s < end;) {
2238             if (*s != '\\')
2239                 break;
2240             *p++ = (unsigned char)*s++;
2241         }
2242         if (((s - bs) & 1) == 0 ||
2243             s >= end ||
2244             (*s != 'u' && *s != 'U')) {
2245             continue;
2246         }
2247         p--;
2248         count = *s=='u' ? 4 : 8;
2249         s++;
2250
2251         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2252         outpos = p-PyUnicode_AS_UNICODE(v);
2253         for (x = 0, i = 0; i < count; ++i, ++s) {
2254             c = (unsigned char)*s;
2255             if (!isxdigit(c)) {
2256                 endinpos = s-starts;
2257                 if (unicode_decode_call_errorhandler(
2258                     errors, &errorHandler,
2259                     "rawunicodeescape", "truncated \\uXXXX",
2260                     starts, size, &startinpos, &endinpos, &exc, &s,
2261                     (PyObject **)&v, &outpos, &p))
2262                     goto onError;
2263                 goto nextByte;
2264             }
2265             x = (x<<4) & ~0xF;
2266             if (c >= '0' && c <= '9')
2267                 x += c - '0';
2268             else if (c >= 'a' && c <= 'f')
2269                 x += 10 + c - 'a';
2270             else
2271                 x += 10 + c - 'A';
2272         }
2273 #ifndef Py_UNICODE_WIDE
2274         if (x > 0x10000) {
2275             if (unicode_decode_call_errorhandler(
2276                     errors, &errorHandler,
2277                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
2278                     starts, size, &startinpos, &endinpos, &exc, &s,
2279                     (PyObject **)&v, &outpos, &p))
2280                     goto onError;
2281         }
2282 #endif
2283         *p++ = x;
2284         nextByte:
2285         ;
2286     }
2287     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2288         goto onError;
2289     Py_XDECREF(errorHandler);
2290     Py_XDECREF(exc);
2291     return (PyObject *)v;
2292
2293  onError:
2294     Py_XDECREF(v);
2295     Py_XDECREF(errorHandler);
2296     Py_XDECREF(exc);
2297     return NULL;
2298 }
2299
2300 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2301                                            Py_ssize_t size)
2302 {
2303     PyObject *repr;
2304     char *p;
2305     char *q;
2306
2307     static const char *hexdigit = "0123456789abcdef";
2308
2309 #ifdef Py_UNICODE_WIDE
2310     repr = PyString_FromStringAndSize(NULL, 10 * size);
2311 #else
2312     repr = PyString_FromStringAndSize(NULL, 6 * size);
2313 #endif
2314     if (repr == NULL)
2315         return NULL;
2316     if (size == 0)
2317         return repr;
2318
2319     p = q = PyString_AS_STRING(repr);
2320     while (size-- > 0) {
2321         Py_UNICODE ch = *s++;
2322 #ifdef Py_UNICODE_WIDE
2323         /* Map 32-bit characters to '\Uxxxxxxxx' */
2324         if (ch >= 0x10000) {
2325             *p++ = '\\';
2326             *p++ = 'U';
2327             *p++ = hexdigit[(ch >> 28) & 0xf];
2328             *p++ = hexdigit[(ch >> 24) & 0xf];
2329             *p++ = hexdigit[(ch >> 20) & 0xf];
2330             *p++ = hexdigit[(ch >> 16) & 0xf];
2331             *p++ = hexdigit[(ch >> 12) & 0xf];
2332             *p++ = hexdigit[(ch >> 8) & 0xf];
2333             *p++ = hexdigit[(ch >> 4) & 0xf];
2334             *p++ = hexdigit[ch & 15];
2335         }
2336         else
2337 #endif
2338         /* Map 16-bit characters to '\uxxxx' */
2339         if (ch >= 256) {
2340             *p++ = '\\';
2341             *p++ = 'u';
2342             *p++ = hexdigit[(ch >> 12) & 0xf];
2343             *p++ = hexdigit[(ch >> 8) & 0xf];
2344             *p++ = hexdigit[(ch >> 4) & 0xf];
2345             *p++ = hexdigit[ch & 15];
2346         }
2347         /* Copy everything else as-is */
2348         else
2349             *p++ = (char) ch;
2350     }
2351     *p = '\0';
2352     _PyString_Resize(&repr, p - q);
2353     return repr;
2354 }
2355
2356 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2357 {
2358     if (!PyUnicode_Check(unicode)) {
2359         PyErr_BadArgument();
2360         return NULL;
2361     }
2362     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2363                                             PyUnicode_GET_SIZE(unicode));
2364 }
2365
2366 /* --- Unicode Internal Codec ------------------------------------------- */
2367
2368 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2369                                            Py_ssize_t size,
2370                                            const char *errors)
2371 {
2372     const char *starts = s;
2373     Py_ssize_t startinpos;
2374     Py_ssize_t endinpos;
2375     Py_ssize_t outpos;
2376     PyUnicodeObject *v;
2377     Py_UNICODE *p;
2378     const char *end;
2379     const char *reason;
2380     PyObject *errorHandler = NULL;
2381     PyObject *exc = NULL;
2382
2383 #ifdef Py_UNICODE_WIDE
2384     Py_UNICODE unimax = PyUnicode_GetMax();
2385 #endif
2386
2387     /* XXX overflow detection missing */
2388     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2389     if (v == NULL)
2390         goto onError;
2391     if (PyUnicode_GetSize((PyObject *)v) == 0)
2392         return (PyObject *)v;
2393     p = PyUnicode_AS_UNICODE(v);
2394     end = s + size;
2395
2396     while (s < end) {
2397         memcpy(p, s, sizeof(Py_UNICODE));
2398         /* We have to sanity check the raw data, otherwise doom looms for
2399            some malformed UCS-4 data. */
2400         if (
2401             #ifdef Py_UNICODE_WIDE
2402             *p > unimax || *p < 0 ||
2403             #endif
2404             end-s < Py_UNICODE_SIZE
2405             )
2406             {
2407             startinpos = s - starts;
2408             if (end-s < Py_UNICODE_SIZE) {
2409                 endinpos = end-starts;
2410                 reason = "truncated input";
2411             }
2412             else {
2413                 endinpos = s - starts + Py_UNICODE_SIZE;
2414                 reason = "illegal code point (> 0x10FFFF)";
2415             }
2416             outpos = p - PyUnicode_AS_UNICODE(v);
2417             if (unicode_decode_call_errorhandler(
2418                     errors, &errorHandler,
2419                     "unicode_internal", reason,
2420                     starts, size, &startinpos, &endinpos, &exc, &s,
2421                     (PyObject **)&v, &outpos, &p)) {
2422                 goto onError;
2423             }
2424         }
2425         else {
2426             p++;
2427             s += Py_UNICODE_SIZE;
2428         }
2429     }
2430
2431     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2432         goto onError;
2433     Py_XDECREF(errorHandler);
2434     Py_XDECREF(exc);
2435     return (PyObject *)v;
2436
2437  onError:
2438     Py_XDECREF(v);
2439     Py_XDECREF(errorHandler);
2440     Py_XDECREF(exc);
2441     return NULL;
2442 }
2443
2444 /* --- Latin-1 Codec ------------------------------------------------------ */
2445
2446 PyObject *PyUnicode_DecodeLatin1(const char *s,
2447                                  Py_ssize_t size,
2448                                  const char *errors)
2449 {
2450     PyUnicodeObject *v;
2451     Py_UNICODE *p;
2452
2453     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2454     if (size == 1) {
2455         Py_UNICODE r = *(unsigned char*)s;
2456         return PyUnicode_FromUnicode(&r, 1);
2457     }
2458
2459     v = _PyUnicode_New(size);
2460     if (v == NULL)
2461         goto onError;
2462     if (size == 0)
2463         return (PyObject *)v;
2464     p = PyUnicode_AS_UNICODE(v);
2465     while (size-- > 0)
2466         *p++ = (unsigned char)*s++;
2467     return (PyObject *)v;
2468
2469  onError:
2470     Py_XDECREF(v);
2471     return NULL;
2472 }
2473
2474 /* create or adjust a UnicodeEncodeError */
2475 static void make_encode_exception(PyObject **exceptionObject,
2476     const char *encoding,
2477     const Py_UNICODE *unicode, Py_ssize_t size,
2478     Py_ssize_t startpos, Py_ssize_t endpos,
2479     const char *reason)
2480 {
2481     if (*exceptionObject == NULL) {
2482         *exceptionObject = PyUnicodeEncodeError_Create(
2483             encoding, unicode, size, startpos, endpos, reason);
2484     }
2485     else {
2486         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2487             goto onError;
2488         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2489             goto onError;
2490         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2491             goto onError;
2492         return;
2493         onError:
2494         Py_DECREF(*exceptionObject);
2495         *exceptionObject = NULL;
2496     }
2497 }
2498
2499 /* raises a UnicodeEncodeError */
2500 static void raise_encode_exception(PyObject **exceptionObject,
2501     const char *encoding,
2502     const Py_UNICODE *unicode, Py_ssize_t size,
2503     Py_ssize_t startpos, Py_ssize_t endpos,
2504     const char *reason)
2505 {
2506     make_encode_exception(exceptionObject,
2507         encoding, unicode, size, startpos, endpos, reason);
2508     if (*exceptionObject != NULL)
2509         PyCodec_StrictErrors(*exceptionObject);
2510 }
2511
2512 /* error handling callback helper:
2513    build arguments, call the callback and check the arguments,
2514    put the result into newpos and return the replacement string, which
2515    has to be freed by the caller */
2516 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2517     PyObject **errorHandler,
2518     const char *encoding, const char *reason,
2519     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2520     Py_ssize_t startpos, Py_ssize_t endpos,
2521     Py_ssize_t *newpos)
2522 {
2523     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2524
2525     PyObject *restuple;
2526     PyObject *resunicode;
2527
2528     if (*errorHandler == NULL) {
2529         *errorHandler = PyCodec_LookupError(errors);
2530         if (*errorHandler == NULL)
2531             return NULL;
2532     }
2533
2534     make_encode_exception(exceptionObject,
2535         encoding, unicode, size, startpos, endpos, reason);
2536     if (*exceptionObject == NULL)
2537         return NULL;
2538
2539     restuple = PyObject_CallFunctionObjArgs(
2540         *errorHandler, *exceptionObject, NULL);
2541     if (restuple == NULL)
2542         return NULL;
2543     if (!PyTuple_Check(restuple)) {
2544         PyErr_Format(PyExc_TypeError, &argparse[4]);
2545         Py_DECREF(restuple);
2546         return NULL;
2547     }
2548     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2549         &resunicode, newpos)) {
2550         Py_DECREF(restuple);
2551         return NULL;
2552     }
2553     if (*newpos<0)
2554         *newpos = size+*newpos;
2555     if (*newpos<0 || *newpos>size) {
2556         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2557         Py_DECREF(restuple);
2558         return NULL;
2559     }
2560     Py_INCREF(resunicode);
2561     Py_DECREF(restuple);
2562     return resunicode;
2563 }
2564
2565 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2566                                  Py_ssize_t size,
2567                                  const char *errors,
2568                                  int limit)
2569 {
2570     /* output object */
2571     PyObject *res;
2572     /* pointers to the beginning and end+1 of input */
2573     const Py_UNICODE *startp = p;
2574     const Py_UNICODE *endp = p + size;
2575     /* pointer to the beginning of the unencodable characters */
2576     /* const Py_UNICODE *badp = NULL; */
2577     /* pointer into the output */
2578     char *str;
2579     /* current output position */
2580     Py_ssize_t respos = 0;
2581     Py_ssize_t ressize;
2582     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2583     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2584     PyObject *errorHandler = NULL;
2585     PyObject *exc = NULL;
2586     /* the following variable is used for caching string comparisons
2587      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2588     int known_errorHandler = -1;
2589
2590     /* allocate enough for a simple encoding without
2591        replacements, if we need more, we'll resize */
2592     res = PyString_FromStringAndSize(NULL, size);
2593     if (res == NULL)
2594         goto onError;
2595     if (size == 0)
2596         return res;
2597     str = PyString_AS_STRING(res);
2598     ressize = size;
2599
2600     while (p<endp) {
2601         Py_UNICODE c = *p;
2602
2603         /* can we encode this? */
2604         if (c<limit) {
2605             /* no overflow check, because we know that the space is enough */
2606             *str++ = (char)c;
2607             ++p;
2608         }
2609         else {
2610             Py_ssize_t unicodepos = p-startp;
2611             Py_ssize_t requiredsize;
2612             PyObject *repunicode;
2613             Py_ssize_t repsize;
2614             Py_ssize_t newpos;
2615             Py_ssize_t respos;
2616             Py_UNICODE *uni2;
2617             /* startpos for collecting unencodable chars */
2618             const Py_UNICODE *collstart = p;
2619             const Py_UNICODE *collend = p;
2620             /* find all unecodable characters */
2621             while ((collend < endp) && ((*collend)>=limit))
2622                 ++collend;
2623             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2624             if (known_errorHandler==-1) {
2625                 if ((errors==NULL) || (!strcmp(errors, "strict")))
2626                     known_errorHandler = 1;
2627                 else if (!strcmp(errors, "replace"))
2628                     known_errorHandler = 2;
2629                 else if (!strcmp(errors, "ignore"))
2630                     known_errorHandler = 3;
2631                 else if (!strcmp(errors, "xmlcharrefreplace"))
2632                     known_errorHandler = 4;
2633                 else
2634                     known_errorHandler = 0;
2635             }
2636             switch (known_errorHandler) {
2637                 case 1: /* strict */
2638                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2639                     goto onError;
2640                 case 2: /* replace */
2641                     while (collstart++<collend)
2642                         *str++ = '?'; /* fall through */
2643                 case 3: /* ignore */
2644                     p = collend;
2645                     break;
2646                 case 4: /* xmlcharrefreplace */
2647                     respos = str-PyString_AS_STRING(res);
2648                     /* determine replacement size (temporarily (mis)uses p) */
2649                     for (p = collstart, repsize = 0; p < collend; ++p) {
2650                         if (*p<10)
2651                             repsize += 2+1+1;
2652                         else if (*p<100)
2653                             repsize += 2+2+1;
2654                         else if (*p<1000)
2655                             repsize += 2+3+1;
2656                         else if (*p<10000)
2657                             repsize += 2+4+1;
2658 #ifndef Py_UNICODE_WIDE
2659                         else
2660                             repsize += 2+5+1;
2661 #else
2662                         else if (*p<100000)
2663                             repsize += 2+5+1;
2664                         else if (*p<1000000)
2665                             repsize += 2+6+1;
2666                         else
2667                             repsize += 2+7+1;
2668 #endif
2669                     }
2670                     requiredsize = respos+repsize+(endp-collend);
2671                     if (requiredsize > ressize) {
2672                         if (requiredsize<2*ressize)
2673                             requiredsize = 2*ressize;
2674                         if (_PyString_Resize(&res, requiredsize))
2675                             goto onError;
2676                         str = PyString_AS_STRING(res) + respos;
2677                         ressize = requiredsize;
2678                     }
2679                     /* generate replacement (temporarily (mis)uses p) */
2680                     for (p = collstart; p < collend; ++p) {
2681                         str += sprintf(str, "&#%d;", (int)*p);
2682                     }
2683                     p = collend;
2684                     break;
2685                 default:
2686                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2687                         encoding, reason, startp, size, &exc,
2688                         collstart-startp, collend-startp, &newpos);
2689                     if (repunicode == NULL)
2690                         goto onError;
2691                     /* need more space? (at least enough for what we
2692                        have+the replacement+the rest of the string, so
2693                        we won't have to check space for encodable characters) */
2694                     respos = str-PyString_AS_STRING(res);
2695                     repsize = PyUnicode_GET_SIZE(repunicode);
2696                     requiredsize = respos+repsize+(endp-collend);
2697                     if (requiredsize > ressize) {
2698                         if (requiredsize<2*ressize)
2699                             requiredsize = 2*ressize;
2700                         if (_PyString_Resize(&res, requiredsize)) {
2701                             Py_DECREF(repunicode);
2702                             goto onError;
2703                         }
2704                         str = PyString_AS_STRING(res) + respos;
2705                         ressize = requiredsize;
2706                     }
2707                     /* check if there is anything unencodable in the replacement
2708                        and copy it to the output */
2709                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2710                         c = *uni2;
2711                         if (c >= limit) {
2712                             raise_encode_exception(&exc, encoding, startp, size,
2713                                 unicodepos, unicodepos+1, reason);
2714                             Py_DECREF(repunicode);
2715                             goto onError;
2716                         }
2717                         *str = (char)c;
2718                     }
2719                     p = startp + newpos;
2720                     Py_DECREF(repunicode);
2721             }
2722         }
2723     }
2724     /* Resize if we allocated to much */
2725     respos = str-PyString_AS_STRING(res);
2726     if (respos<ressize)
2727        /* If this falls res will be NULL */
2728         _PyString_Resize(&res, respos);
2729     Py_XDECREF(errorHandler);
2730     Py_XDECREF(exc);
2731     return res;
2732
2733     onError:
2734     Py_XDECREF(res);
2735     Py_XDECREF(errorHandler);
2736     Py_XDECREF(exc);
2737     return NULL;
2738 }
2739
2740 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2741                                  Py_ssize_t size,
2742                                  const char *errors)
2743 {
2744     return unicode_encode_ucs1(p, size, errors, 256);
2745 }
2746
2747 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2748 {
2749     if (!PyUnicode_Check(unicode)) {
2750         PyErr_BadArgument();
2751         return NULL;
2752     }
2753     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2754                                   PyUnicode_GET_SIZE(unicode),
2755                                   NULL);
2756 }
2757
2758 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2759
2760 PyObject *PyUnicode_DecodeASCII(const char *s,
2761                                 Py_ssize_t size,
2762                                 const char *errors)
2763 {
2764     const char *starts = s;
2765     PyUnicodeObject *v;
2766     Py_UNICODE *p;
2767     Py_ssize_t startinpos;
2768     Py_ssize_t endinpos;
2769     Py_ssize_t outpos;
2770     const char *e;
2771     PyObject *errorHandler = NULL;
2772     PyObject *exc = NULL;
2773
2774     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2775     if (size == 1 && *(unsigned char*)s < 128) {
2776         Py_UNICODE r = *(unsigned char*)s;
2777         return PyUnicode_FromUnicode(&r, 1);
2778     }
2779
2780     v = _PyUnicode_New(size);
2781     if (v == NULL)
2782         goto onError;
2783     if (size == 0)
2784         return (PyObject *)v;
2785     p = PyUnicode_AS_UNICODE(v);
2786     e = s + size;
2787     while (s < e) {
2788         register unsigned char c = (unsigned char)*s;
2789         if (c < 128) {
2790             *p++ = c;
2791             ++s;
2792         }
2793         else {
2794             startinpos = s-starts;
2795             endinpos = startinpos + 1;
2796             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2797             if (unicode_decode_call_errorhandler(
2798                  errors, &errorHandler,
2799                  "ascii", "ordinal not in range(128)",
2800                  starts, size, &startinpos, &endinpos, &exc, &s,
2801                  (PyObject **)&v, &outpos, &p))
2802                 goto onError;
2803         }
2804     }
2805     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2806         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2807             goto onError;
2808     Py_XDECREF(errorHandler);
2809     Py_XDECREF(exc);
2810     return (PyObject *)v;
2811
2812  onError:
2813     Py_XDECREF(v);
2814     Py_XDECREF(errorHandler);
2815     Py_XDECREF(exc);
2816     return NULL;
2817 }
2818
2819 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2820                                 Py_ssize_t size,
2821                                 const char *errors)
2822 {
2823     return unicode_encode_ucs1(p, size, errors, 128);
2824 }
2825
2826 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2827 {
2828     if (!PyUnicode_Check(unicode)) {
2829         PyErr_BadArgument();
2830         return NULL;
2831     }
2832     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2833                                  PyUnicode_GET_SIZE(unicode),
2834                                  NULL);
2835 }
2836
2837 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2838
2839 /* --- MBCS codecs for Windows -------------------------------------------- */
2840
2841 #if SIZEOF_INT < SIZEOF_SSIZE_T
2842 #define NEED_RETRY
2843 #endif
2844
2845 /* XXX This code is limited to "true" double-byte encodings, as
2846    a) it assumes an incomplete character consists of a single byte, and
2847    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2848       encodings, see IsDBCSLeadByteEx documentation. */
2849
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts when
   CharPrev() cannot step back from it, when the previous character does
   not itself start with a lead byte, or when the previous character is
   exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2860
2861 /*
2862  * Decode MBCS string into unicode object. If 'final' is set, converts
2863  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2864  */
2865 static int decode_mbcs(PyUnicodeObject **v,
2866                         const char *s, /* MBCS string */
2867                         int size, /* sizeof MBCS string */
2868                         int final)
2869 {
2870     Py_UNICODE *p;
2871     Py_ssize_t n = 0;
2872     int usize = 0;
2873
2874     assert(size >= 0);
2875
2876     /* Skip trailing lead-byte unless 'final' is set */
2877     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2878         --size;
2879
2880     /* First get the size of the result */
2881     if (size > 0) {
2882         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2883         if (usize == 0) {
2884             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2885             return -1;
2886         }
2887     }
2888
2889     if (*v == NULL) {
2890         /* Create unicode object */
2891         *v = _PyUnicode_New(usize);
2892         if (*v == NULL)
2893             return -1;
2894     }
2895     else {
2896         /* Extend unicode object */
2897         n = PyUnicode_GET_SIZE(*v);
2898         if (_PyUnicode_Resize(v, n + usize) < 0)
2899             return -1;
2900     }
2901
2902     /* Do the conversion */
2903     if (size > 0) {
2904         p = PyUnicode_AS_UNICODE(*v) + n;
2905         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2906             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2907             return -1;
2908         }
2909     }
2910
2911     return size;
2912 }
2913
2914 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2915                                         Py_ssize_t size,
2916                                         const char *errors,
2917                                         Py_ssize_t *consumed)
2918 {
2919     PyUnicodeObject *v = NULL;
2920     int done;
2921
2922     if (consumed)
2923         *consumed = 0;
2924
2925 #ifdef NEED_RETRY
2926   retry:
2927     if (size > INT_MAX)
2928         done = decode_mbcs(&v, s, INT_MAX, 0);
2929     else
2930 #endif
2931         done = decode_mbcs(&v, s, (int)size, !consumed);
2932
2933     if (done < 0) {
2934         Py_XDECREF(v);
2935         return NULL;
2936     }
2937
2938     if (consumed)
2939         *consumed += done;
2940
2941 #ifdef NEED_RETRY
2942     if (size > INT_MAX) {
2943         s += done;
2944         size -= done;
2945         goto retry;
2946     }
2947 #endif
2948
2949     return (PyObject *)v;
2950 }
2951
2952 PyObject *PyUnicode_DecodeMBCS(const char *s,
2953                                 Py_ssize_t size,
2954                                 const char *errors)
2955 {
2956     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2957 }
2958
2959 /*
2960  * Convert unicode into string object (MBCS).
2961  * Returns 0 if succeed, -1 otherwise.
2962  */
2963 static int encode_mbcs(PyObject **repr,
2964                         const Py_UNICODE *p, /* unicode */
2965                         int size) /* size of unicode */
2966 {
2967     int mbcssize = 0;
2968     Py_ssize_t n = 0;
2969
2970     assert(size >= 0);
2971
2972     /* First get the size of the result */
2973     if (size > 0) {
2974         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2975         if (mbcssize == 0) {
2976             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2977             return -1;
2978         }
2979     }
2980
2981     if (*repr == NULL) {
2982         /* Create string object */
2983         *repr = PyString_FromStringAndSize(NULL, mbcssize);
2984         if (*repr == NULL)
2985             return -1;
2986     }
2987     else {
2988         /* Extend string object */
2989         n = PyString_Size(*repr);
2990         if (_PyString_Resize(repr, n + mbcssize) < 0)
2991             return -1;
2992     }
2993
2994     /* Do the conversion */
2995     if (size > 0) {
2996         char *s = PyString_AS_STRING(*repr) + n;
2997         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2998             PyErr_SetFromWindowsErrWithFilename(0, NULL);
2999             return -1;
3000         }
3001     }
3002
3003     return 0;
3004 }
3005
3006 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3007                                 Py_ssize_t size,
3008                                 const char *errors)
3009 {
3010     PyObject *repr = NULL;
3011     int ret;
3012
3013 #ifdef NEED_RETRY
3014  retry:
3015     if (size > INT_MAX)
3016         ret = encode_mbcs(&repr, p, INT_MAX);
3017     else
3018 #endif
3019         ret = encode_mbcs(&repr, p, (int)size);
3020
3021     if (ret < 0) {
3022         Py_XDECREF(repr);
3023         return NULL;
3024     }
3025
3026 #ifdef NEED_RETRY
3027     if (size > INT_MAX) {
3028         p += INT_MAX;
3029         size -= INT_MAX;
3030         goto retry;
3031     }
3032 #endif
3033
3034     return repr;
3035 }
3036
3037 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3038 {
3039     if (!PyUnicode_Check(unicode)) {
3040         PyErr_BadArgument();
3041         return NULL;
3042     }
3043     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3044                                 PyUnicode_GET_SIZE(unicode),
3045                                 NULL);
3046 }
3047
3048 #undef NEED_RETRY
3049
3050 #endif /* MS_WINDOWS */
3051
3052 /* --- Character Mapping Codec -------------------------------------------- */
3053
3054 PyObject *PyUnicode_DecodeCharmap(const char *s,
3055                                   Py_ssize_t size,
3056                                   PyObject *mapping,
3057                                   const char *errors)
3058 {
3059     const char *starts = s;
3060     Py_ssize_t startinpos;
3061     Py_ssize_t endinpos;
3062     Py_ssize_t outpos;
3063     const char *e;
3064     PyUnicodeObject *v;
3065     Py_UNICODE *p;
3066     Py_ssize_t extrachars = 0;
3067     PyObject *errorHandler = NULL;
3068     PyObject *exc = NULL;
3069     Py_UNICODE *mapstring = NULL;
3070     Py_ssize_t maplen = 0;
3071
3072     /* Default to Latin-1 */
3073     if (mapping == NULL)
3074         return PyUnicode_DecodeLatin1(s, size, errors);
3075
3076     v = _PyUnicode_New(size);
3077     if (v == NULL)
3078         goto onError;
3079     if (size == 0)
3080         return (PyObject *)v;
3081     p = PyUnicode_AS_UNICODE(v);
3082     e = s + size;
3083     if (PyUnicode_CheckExact(mapping)) {
3084         mapstring = PyUnicode_AS_UNICODE(mapping);
3085         maplen = PyUnicode_GET_SIZE(mapping);
3086         while (s < e) {
3087             unsigned char ch = *s;
3088             Py_UNICODE x = 0xfffe; /* illegal value */
3089
3090             if (ch < maplen)
3091                 x = mapstring[ch];
3092
3093             if (x == 0xfffe) {
3094                 /* undefined mapping */
3095                 outpos = p-PyUnicode_AS_UNICODE(v);
3096                 startinpos = s-starts;
3097                 endinpos = startinpos+1;
3098                 if (unicode_decode_call_errorhandler(
3099                      errors, &errorHandler,
3100                      "charmap", "character maps to <undefined>",
3101                      starts, size, &startinpos, &endinpos, &exc, &s,
3102                      (PyObject **)&v, &outpos, &p)) {
3103                     goto onError;
3104                 }
3105                 continue;
3106             }
3107             *p++ = x;
3108             ++s;
3109         }
3110     }
3111     else {
3112         while (s < e) {
3113             unsigned char ch = *s;
3114             PyObject *w, *x;
3115
3116             /* Get mapping (char ordinal -> integer, Unicode char or None) */
3117             w = PyInt_FromLong((long)ch);
3118             if (w == NULL)
3119                 goto onError;
3120             x = PyObject_GetItem(mapping, w);
3121             Py_DECREF(w);
3122             if (x == NULL) {
3123                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3124                     /* No mapping found means: mapping is undefined. */
3125                     PyErr_Clear();
3126                     x = Py_None;
3127                     Py_INCREF(x);
3128                 } else
3129                     goto onError;
3130             }
3131
3132             /* Apply mapping */
3133             if (PyInt_Check(x)) {
3134                 long value = PyInt_AS_LONG(x);
3135                 if (value < 0 || value > 65535) {
3136                     PyErr_SetString(PyExc_TypeError,
3137                                     "character mapping must be in range(65536)");
3138                     Py_DECREF(x);
3139                     goto onError;
3140                 }
3141                 *p++ = (Py_UNICODE)value;
3142             }
3143             else if (x == Py_None) {
3144                 /* undefined mapping */
3145                 outpos = p-PyUnicode_AS_UNICODE(v);
3146                 startinpos = s-starts;
3147                 endinpos = startinpos+1;
3148                 if (unicode_decode_call_errorhandler(
3149                      errors, &errorHandler,
3150                      "charmap", "character maps to <undefined>",
3151                      starts, size, &startinpos, &endinpos, &exc, &s,
3152                      (PyObject **)&v, &outpos, &p)) {
3153                     Py_DECREF(x);
3154                     goto onError;
3155                 }
3156                 Py_DECREF(x);
3157                 continue;
3158             }
3159             else if (PyUnicode_Check(x)) {
3160                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3161
3162                 if (targetsize == 1)
3163                     /* 1-1 mapping */
3164                     *p++ = *PyUnicode_AS_UNICODE(x);
3165
3166                 else if (targetsize > 1) {
3167                     /* 1-n mapping */
3168                     if (targetsize > extrachars) {
3169                         /* resize first */
3170                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3171                         Py_ssize_t needed = (targetsize - extrachars) + \
3172                                      (targetsize << 2);
3173                         extrachars += needed;
3174                         /* XXX overflow detection missing */
3175                         if (_PyUnicode_Resize(&v,
3176                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
3177                             Py_DECREF(x);
3178                             goto onError;
3179                         }
3180                         p = PyUnicode_AS_UNICODE(v) + oldpos;
3181                     }
3182                     Py_UNICODE_COPY(p,
3183                                     PyUnicode_AS_UNICODE(x),
3184                                     targetsize);
3185                     p += targetsize;
3186                     extrachars -= targetsize;
3187                 }
3188                 /* 1-0 mapping: skip the character */
3189             }
3190             else {
3191                 /* wrong return value */
3192                 PyErr_SetString(PyExc_TypeError,
3193                       "character mapping must return integer, None or unicode");
3194                 Py_DECREF(x);
3195                 goto onError;
3196             }
3197             Py_DECREF(x);
3198             ++s;
3199         }
3200     }
3201     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3202         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3203             goto onError;
3204     Py_XDECREF(errorHandler);
3205     Py_XDECREF(exc);
3206     return (PyObject *)v;
3207
3208  onError:
3209     Py_XDECREF(errorHandler);
3210     Py_XDECREF(exc);
3211     Py_XDECREF(v);
3212     return NULL;
3213 }
3214
3215 /* Charmap encoding: the lookup table */
3216
3217 struct encoding_map{
3218   PyObject_HEAD
3219   unsigned char level1[32];
3220   int count2, count3;
3221   unsigned char level23[1];
3222 };
3223
3224 static PyObject*
3225 encoding_map_size(PyObject *obj, PyObject* args)
3226 {
3227     struct encoding_map *map = (struct encoding_map*)obj;
3228     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3229                           128*map->count3);
3230 }
3231
3232 static PyMethodDef encoding_map_methods[] = {
3233         {"size", encoding_map_size, METH_NOARGS,
3234          PyDoc_STR("Return the size (in bytes) of this object") },
3235         { 0 }
3236 };
3237
3238 static void
3239 encoding_map_dealloc(PyObject* o)
3240 {
3241         PyObject_FREE(o);
3242 }
3243
3244 static PyTypeObject EncodingMapType = {
3245         PyObject_HEAD_INIT(NULL)
3246         0,                      /*ob_size*/
3247         "EncodingMap",          /*tp_name*/
3248         sizeof(struct encoding_map),   /*tp_basicsize*/
3249         0,                      /*tp_itemsize*/
3250         /* methods */
3251         encoding_map_dealloc,   /*tp_dealloc*/
3252         0,                      /*tp_print*/
3253         0,                      /*tp_getattr*/
3254         0,                      /*tp_setattr*/
3255         0,                      /*tp_compare*/
3256         0,                      /*tp_repr*/
3257         0,                      /*tp_as_number*/
3258         0,                      /*tp_as_sequence*/
3259         0,                      /*tp_as_mapping*/
3260         0,                      /*tp_hash*/
3261         0,                      /*tp_call*/
3262         0,                      /*tp_str*/
3263         0,                      /*tp_getattro*/
3264         0,                      /*tp_setattro*/
3265         0,                      /*tp_as_buffer*/
3266         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
3267         0,                      /*tp_doc*/
3268         0,                      /*tp_traverse*/
3269         0,                      /*tp_clear*/
3270         0,                      /*tp_richcompare*/
3271         0,                      /*tp_weaklistoffset*/
3272         0,                      /*tp_iter*/
3273         0,                      /*tp_iternext*/
3274         encoding_map_methods,   /*tp_methods*/
3275         0,                      /*tp_members*/
3276         0,                      /*tp_getset*/
3277         0,                      /*tp_base*/
3278         0,                      /*tp_dict*/
3279         0,                      /*tp_descr_get*/
3280         0,                      /*tp_descr_set*/
3281         0,                      /*tp_dictoffset*/
3282         0,                      /*tp_init*/
3283         0,                      /*tp_alloc*/
3284         0,                      /*tp_new*/
3285         0,                      /*tp_free*/
3286         0,                      /*tp_is_gc*/
3287 };
3288
3289 PyObject*
3290 PyUnicode_BuildEncodingMap(PyObject* string)
3291 {
3292     Py_UNICODE *decode;
3293     PyObject *result;
3294     struct encoding_map *mresult;
3295     int i;
3296     int need_dict = 0;
3297     unsigned char level1[32];
3298     unsigned char level2[512];
3299     unsigned char *mlevel1, *mlevel2, *mlevel3;
3300     int count2 = 0, count3 = 0;
3301
3302     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3303         PyErr_BadArgument();
3304         return NULL;
3305     }
3306     decode = PyUnicode_AS_UNICODE(string);
3307     memset(level1, 0xFF, sizeof level1);
3308     memset(level2, 0xFF, sizeof level2);
3309
3310     /* If there isn't a one-to-one mapping of NULL to \0,
3311        or if there are non-BMP characters, we need to use
3312        a mapping dictionary. */
3313     if (decode[0] != 0)
3314         need_dict = 1;
3315     for (i = 1; i < 256; i++) {
3316         int l1, l2;
3317         if (decode[i] == 0
3318             #ifdef Py_UNICODE_WIDE
3319             || decode[i] > 0xFFFF
3320             #endif
3321         ) {
3322             need_dict = 1;
3323             break;
3324         }
3325         if (decode[i] == 0xFFFE)
3326             /* unmapped character */
3327             continue;
3328         l1 = decode[i] >> 11;
3329         l2 = decode[i] >> 7;
3330         if (level1[l1] == 0xFF)
3331             level1[l1] = count2++;
3332         if (level2[l2] == 0xFF)
3333             level2[l2] = count3++;
3334     }
3335
3336     if (count2 >= 0xFF || count3 >= 0xFF)
3337         need_dict = 1;
3338
3339     if (need_dict) {
3340         PyObject *result = PyDict_New();
3341         PyObject *key, *value;
3342         if (!result)
3343             return NULL;
3344         for (i = 0; i < 256; i++) {
3345             key = value = NULL;
3346             key = PyInt_FromLong(decode[i]);
3347             value = PyInt_FromLong(i);
3348             if (!key || !value)
3349                 goto failed1;
3350             if (PyDict_SetItem(result, key, value) == -1)
3351                 goto failed1;
3352             Py_DECREF(key);
3353             Py_DECREF(value);
3354         }
3355         return result;
3356       failed1:
3357         Py_XDECREF(key);
3358         Py_XDECREF(value);
3359         Py_DECREF(result);
3360         return NULL;
3361     }
3362
3363     /* Create a three-level trie */
3364     result = PyObject_MALLOC(sizeof(struct encoding_map) +
3365                              16*count2 + 128*count3 - 1);
3366     if (!result)
3367         return PyErr_NoMemory();
3368     PyObject_Init(result, &EncodingMapType);
3369     mresult = (struct encoding_map*)result;
3370     mresult->count2 = count2;
3371     mresult->count3 = count3;
3372     mlevel1 = mresult->level1;
3373     mlevel2 = mresult->level23;
3374     mlevel3 = mresult->level23 + 16*count2;
3375     memcpy(mlevel1, level1, 32);
3376     memset(mlevel2, 0xFF, 16*count2);
3377     memset(mlevel3, 0, 128*count3);
3378     count3 = 0;
3379     for (i = 1; i < 256; i++) {
3380         int o1, o2, o3, i2, i3;
3381         if (decode[i] == 0xFFFE)
3382             /* unmapped character */
3383             continue;
3384         o1 = decode[i]>>11;
3385         o2 = (decode[i]>>7) & 0xF;
3386         i2 = 16*mlevel1[o1] + o2;
3387         if (mlevel2[i2] == 0xFF)
3388             mlevel2[i2] = count3++;
3389         o3 = decode[i] & 0x7F;
3390         i3 = 128*mlevel2[i2] + o3;
3391         mlevel3[i3] = i;
3392     }
3393     return result;
3394 }
3395
3396 static int
3397 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3398 {
3399     struct encoding_map *map = (struct encoding_map*)mapping;
3400     int l1 = c>>11;
3401     int l2 = (c>>7) & 0xF;
3402     int l3 = c & 0x7F;
3403     int i;
3404
3405 #ifdef Py_UNICODE_WIDE
3406     if (c > 0xFFFF) {
3407         return -1;
3408     }
3409 #endif
3410     if (c == 0)
3411         return 0;
3412     /* level 1*/
3413     i = map->level1[l1];
3414     if (i == 0xFF) {
3415         return -1;
3416     }
3417     /* level 2*/
3418     i = map->level23[16*i+l2];
3419     if (i == 0xFF) {
3420         return -1;
3421     }
3422     /* level 3 */
3423     i = map->level23[16*map->count2 + 128*i + l3];
3424     if (i == 0) {
3425         return -1;
3426     }
3427     return i;
3428 }
3429
3430 /* Lookup the character ch in the mapping. If the character
3431    can't be found, Py_None is returned (or NULL, if another
3432    error occurred). */
3433 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3434 {
3435     PyObject *w = PyInt_FromLong((long)c);
3436     PyObject *x;
3437
3438     if (w == NULL)
3439          return NULL;
3440     x = PyObject_GetItem(mapping, w);
3441     Py_DECREF(w);
3442     if (x == NULL) {
3443         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3444             /* No mapping found means: mapping is undefined. */
3445             PyErr_Clear();
3446             x = Py_None;
3447             Py_INCREF(x);
3448             return x;
3449         } else
3450             return NULL;
3451     }
3452     else if (x == Py_None)
3453         return x;
3454     else if (PyInt_Check(x)) {
3455         long value = PyInt_AS_LONG(x);
3456         if (value < 0 || value > 255) {
3457             PyErr_SetString(PyExc_TypeError,
3458                              "character mapping must be in range(256)");
3459             Py_DECREF(x);
3460             return NULL;
3461         }
3462         return x;
3463     }
3464     else if (PyString_Check(x))
3465         return x;
3466     else {
3467         /* wrong return value */
3468         PyErr_SetString(PyExc_TypeError,
3469               "character mapping must return integer, None or str");
3470         Py_DECREF(x);
3471         return NULL;
3472     }
3473 }
3474
3475 static int
3476 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3477 {
3478         Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3479         /* exponentially overallocate to minimize reallocations */
3480         if (requiredsize < 2*outsize)
3481             requiredsize = 2*outsize;
3482         if (_PyString_Resize(outobj, requiredsize)) {
3483             return 0;
3484         }
3485         return 1;
3486 }
3487
/* Outcome of writing one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement was written to the output */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* the lookup or a reallocation failed */
} charmapencode_result;
3491 /* lookup the character, put the result in the output string and adjust
3492    various state variables. Reallocate the output string if not enough
3493    space is available. Return a new reference to the object that
3494    was put in the output buffer, or Py_None, if the mapping was undefined
3495    (in which case no character was written) or NULL, if a
3496    reallocation error occurred. The caller must decref the result */
3497 static
3498 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3499     PyObject **outobj, Py_ssize_t *outpos)
3500 {
3501     PyObject *rep;
3502     char *outstart;
3503     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3504
3505     if (mapping->ob_type == &EncodingMapType) {
3506         int res = encoding_map_lookup(c, mapping);
3507         Py_ssize_t requiredsize = *outpos+1;
3508         if (res == -1)
3509             return enc_FAILED;
3510         if (outsize<requiredsize)
3511             if (!charmapencode_resize(outobj, outpos, requiredsize))
3512                 return enc_EXCEPTION;
3513         outstart = PyString_AS_STRING(*outobj);
3514         outstart[(*outpos)++] = (char)res;
3515         return enc_SUCCESS;
3516     }
3517
3518     rep = charmapencode_lookup(c, mapping);
3519     if (rep==NULL)
3520         return enc_EXCEPTION;
3521     else if (rep==Py_None) {
3522         Py_DECREF(rep);
3523         return enc_FAILED;
3524     } else {
3525         if (PyInt_Check(rep)) {
3526             Py_ssize_t requiredsize = *outpos+1;
3527             if (outsize<requiredsize)
3528                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3529                     Py_DECREF(rep);
3530                     return enc_EXCEPTION;
3531                 }
3532             outstart = PyString_AS_STRING(*outobj);
3533             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3534         }
3535         else {
3536             const char *repchars = PyString_AS_STRING(rep);
3537             Py_ssize_t repsize = PyString_GET_SIZE(rep);
3538             Py_ssize_t requiredsize = *outpos+repsize;
3539             if (outsize<requiredsize)
3540                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3541                     Py_DECREF(rep);
3542                     return enc_EXCEPTION;
3543                 }
3544             outstart = PyString_AS_STRING(*outobj);
3545             memcpy(outstart + *outpos, repchars, repsize);
3546             *outpos += repsize;
3547         }
3548     }
3549     Py_DECREF(rep);
3550     return enc_SUCCESS;
3551 }
3552
3553 /* handle an error in PyUnicode_EncodeCharmap
3554    Return 0 on success, -1 on error */
3555 static
3556 int charmap_encoding_error(
3557     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3558     PyObject **exceptionObject,
3559     int *known_errorHandler, PyObject **errorHandler, const char *errors,
3560     PyObject **res, Py_ssize_t *respos)
3561 {
3562     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3563     Py_ssize_t repsize;
3564     Py_ssize_t newpos;
3565     Py_UNICODE *uni2;
3566     /* startpos for collecting unencodable chars */
3567     Py_ssize_t collstartpos = *inpos;
3568     Py_ssize_t collendpos = *inpos+1;
3569     Py_ssize_t collpos;
3570     char *encoding = "charmap";
3571     char *reason = "character maps to <undefined>";
3572     charmapencode_result x;
3573
3574     /* find all unencodable characters */
3575     while (collendpos < size) {
3576         PyObject *rep;
3577         if (mapping->ob_type == &EncodingMapType) {
3578             int res = encoding_map_lookup(p[collendpos], mapping);
3579             if (res != -1)
3580                 break;
3581             ++collendpos;
3582             continue;
3583         }
3584
3585         rep = charmapencode_lookup(p[collendpos], mapping);
3586         if (rep==NULL)
3587             return -1;
3588         else if (rep!=Py_None) {
3589             Py_DECREF(rep);
3590             break;
3591         }
3592         Py_DECREF(rep);
3593         ++collendpos;
3594     }
3595     /* cache callback name lookup
3596      * (if not done yet, i.e. it's the first error) */
3597     if (*known_errorHandler==-1) {
3598         if ((errors==NULL) || (!strcmp(errors, "strict")))
3599             *known_errorHandler = 1;
3600         else if (!strcmp(errors, "replace"))
3601             *known_errorHandler = 2;
3602         else if (!strcmp(errors, "ignore"))
3603             *known_errorHandler = 3;
3604         else if (!strcmp(errors, "xmlcharrefreplace"))
3605             *known_errorHandler = 4;
3606         else
3607             *known_errorHandler = 0;
3608     }
3609     switch (*known_errorHandler) {
3610         case 1: /* strict */
3611             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3612             return -1;
3613         case 2: /* replace */
3614             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3615                 x = charmapencode_output('?', mapping, res, respos);
3616                 if (x==enc_EXCEPTION) {
3617                     return -1;
3618                 }
3619                 else if (x==enc_FAILED) {
3620                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621                     return -1;
3622                 }
3623             }
3624             /* fall through */
3625         case 3: /* ignore */
3626             *inpos = collendpos;
3627             break;
3628         case 4: /* xmlcharrefreplace */
3629             /* generate replacement (temporarily (mis)uses p) */
3630             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3631                 char buffer[2+29+1+1];
3632                 char *cp;
3633                 sprintf(buffer, "&#%d;", (int)p[collpos]);
3634                 for (cp = buffer; *cp; ++cp) {
3635                     x = charmapencode_output(*cp, mapping, res, respos);
3636                     if (x==enc_EXCEPTION)
3637                         return -1;
3638                     else if (x==enc_FAILED) {
3639                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3640                         return -1;
3641                     }
3642                 }
3643             }
3644             *inpos = collendpos;
3645             break;
3646         default:
3647             repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3648                 encoding, reason, p, size, exceptionObject,
3649                 collstartpos, collendpos, &newpos);
3650             if (repunicode == NULL)
3651                 return -1;
3652             /* generate replacement  */
3653             repsize = PyUnicode_GET_SIZE(repunicode);
3654             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3655                 x = charmapencode_output(*uni2, mapping, res, respos);
3656                 if (x==enc_EXCEPTION) {
3657                     return -1;
3658                 }
3659                 else if (x==enc_FAILED) {
3660                     Py_DECREF(repunicode);
3661                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3662                     return -1;
3663                 }
3664             }
3665             *inpos = newpos;
3666             Py_DECREF(repunicode);
3667     }
3668     return 0;
3669 }
3670
3671 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3672                                   Py_ssize_t size,
3673                                   PyObject *mapping,
3674                                   const char *errors)
3675 {
3676     /* output object */
3677     PyObject *res = NULL;
3678     /* current input position */
3679     Py_ssize_t inpos = 0;
3680     /* current output position */
3681     Py_ssize_t respos = 0;
3682     PyObject *errorHandler = NULL;
3683     PyObject *exc = NULL;
3684     /* the following variable is used for caching string comparisons
3685      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3686      * 3=ignore, 4=xmlcharrefreplace */
3687     int known_errorHandler = -1;
3688
3689     /* Default to Latin-1 */
3690     if (mapping == NULL)
3691         return PyUnicode_EncodeLatin1(p, size, errors);
3692
3693     /* allocate enough for a simple encoding without
3694        replacements, if we need more, we'll resize */
3695     res = PyString_FromStringAndSize(NULL, size);
3696     if (res == NULL)
3697         goto onError;
3698     if (size == 0)
3699         return res;
3700
3701     while (inpos<size) {
3702         /* try to encode it */
3703         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3704         if (x==enc_EXCEPTION) /* error */
3705             goto onError;
3706         if (x==enc_FAILED) { /* unencodable character */
3707             if (charmap_encoding_error(p, size, &inpos, mapping,
3708                 &exc,
3709                 &known_errorHandler, &errorHandler, errors,
3710                 &res, &respos)) {
3711                 goto onError;
3712             }
3713         }
3714         else
3715             /* done with this character => adjust input position */
3716             ++inpos;
3717     }
3718
3719     /* Resize if we allocated to much */
3720     if (respos<PyString_GET_SIZE(res)) {
3721         if (_PyString_Resize(&res, respos))
3722             goto onError;
3723     }
3724     Py_XDECREF(exc);
3725     Py_XDECREF(errorHandler);
3726     return res;
3727
3728     onError:
3729     Py_XDECREF(res);
3730     Py_XDECREF(exc);
3731     Py_XDECREF(errorHandler);
3732     return NULL;
3733 }
3734
3735 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3736                                     PyObject *mapping)
3737 {
3738     if (!PyUnicode_Check(unicode) || mapping == NULL) {
3739         PyErr_BadArgument();
3740         return NULL;
3741     }
3742     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3743                                    PyUnicode_GET_SIZE(unicode),
3744                                    mapping,
3745                                    NULL);
3746 }
3747
3748 /* create or adjust a UnicodeTranslateError */
3749 static void make_translate_exception(PyObject **exceptionObject,
3750     const Py_UNICODE *unicode, Py_ssize_t size,
3751     Py_ssize_t startpos, Py_ssize_t endpos,
3752     const char *reason)
3753 {
3754     if (*exceptionObject == NULL) {
3755         *exceptionObject = PyUnicodeTranslateError_Create(
3756             unicode, size, startpos, endpos, reason);
3757     }
3758     else {
3759         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3760             goto onError;
3761         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3762             goto onError;
3763         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3764             goto onError;
3765         return;
3766         onError:
3767         Py_DECREF(*exceptionObject);
3768         *exceptionObject = NULL;
3769     }
3770 }
3771
3772 /* raises a UnicodeTranslateError */
3773 static void raise_translate_exception(PyObject **exceptionObject,
3774     const Py_UNICODE *unicode, Py_ssize_t size,
3775     Py_ssize_t startpos, Py_ssize_t endpos,
3776     const char *reason)
3777 {
3778     make_translate_exception(exceptionObject,
3779         unicode, size, startpos, endpos, reason);
3780     if (*exceptionObject != NULL)
3781         PyCodec_StrictErrors(*exceptionObject);
3782 }
3783
3784 /* error handling callback helper:
3785    build arguments, call the callback and check the arguments,
3786    put the result into newpos and return the replacement string, which
3787    has to be freed by the caller */
3788 static PyObject *unicode_translate_call_errorhandler(const char *errors,
3789     PyObject **errorHandler,
3790     const char *reason,
3791     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3792     Py_ssize_t startpos, Py_ssize_t endpos,
3793     Py_ssize_t *newpos)
3794 {
3795     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
3796
3797     Py_ssize_t i_newpos;
3798     PyObject *restuple;
3799     PyObject *resunicode;
3800
3801     if (*errorHandler == NULL) {
3802         *errorHandler = PyCodec_LookupError(errors);
3803         if (*errorHandler == NULL)
3804             return NULL;
3805     }
3806
3807     make_translate_exception(exceptionObject,
3808         unicode, size, startpos, endpos, reason);
3809     if (*exceptionObject == NULL)
3810         return NULL;
3811
3812     restuple = PyObject_CallFunctionObjArgs(
3813         *errorHandler, *exceptionObject, NULL);
3814     if (restuple == NULL)
3815         return NULL;
3816     if (!PyTuple_Check(restuple)) {
3817         PyErr_Format(PyExc_TypeError, &argparse[4]);
3818         Py_DECREF(restuple);
3819         return NULL;
3820     }
3821     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3822         &resunicode, &i_newpos)) {
3823         Py_DECREF(restuple);
3824         return NULL;
3825     }
3826     if (i_newpos<0)
3827         *newpos = size+i_newpos;
3828     else
3829         *newpos = i_newpos;
3830     if (*newpos<0 || *newpos>size) {
3831         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3832         Py_DECREF(restuple);
3833         return NULL;
3834     }
3835     Py_INCREF(resunicode);
3836     Py_DECREF(restuple);
3837     return resunicode;
3838 }
3839
3840 /* Lookup the character ch in the mapping and put the result in result,
3841    which must be decrefed by the caller.
3842    Return 0 on success, -1 on error */
3843 static
3844 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3845 {
3846     PyObject *w = PyInt_FromLong((long)c);
3847     PyObject *x;
3848
3849     if (w == NULL)
3850          return -1;
3851     x = PyObject_GetItem(mapping, w);
3852     Py_DECREF(w);
3853     if (x == NULL) {
3854         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3855             /* No mapping found means: use 1:1 mapping. */
3856             PyErr_Clear();
3857             *result = NULL;
3858             return 0;
3859         } else
3860             return -1;
3861     }
3862     else if (x == Py_None) {
3863         *result = x;
3864         return 0;
3865     }
3866     else if (PyInt_Check(x)) {
3867         long value = PyInt_AS_LONG(x);
3868         long max = PyUnicode_GetMax();
3869         if (value < 0 || value > max) {
3870             PyErr_Format(PyExc_TypeError,
3871                              "character mapping must be in range(0x%lx)", max+1);
3872             Py_DECREF(x);
3873             return -1;
3874         }
3875         *result = x;
3876         return 0;
3877     }
3878     else if (PyUnicode_Check(x)) {
3879         *result = x;
3880         return 0;
3881     }
3882     else {
3883         /* wrong return value */
3884         PyErr_SetString(PyExc_TypeError,
3885               "character mapping must return integer, None or unicode");
3886         Py_DECREF(x);
3887         return -1;
3888     }
3889 }
3890 /* ensure that *outobj is at least requiredsize characters long,
3891 if not reallocate and adjust various state variables.
3892 Return 0 on success, -1 on error */
3893 static
3894 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3895     Py_ssize_t requiredsize)
3896 {
3897     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
3898     if (requiredsize > oldsize) {
3899         /* remember old output position */
3900         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3901         /* exponentially overallocate to minimize reallocations */
3902         if (requiredsize < 2 * oldsize)
3903             requiredsize = 2 * oldsize;
3904         if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3905             return -1;
3906         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3907     }
3908     return 0;
3909 }
3910 /* lookup the character, put the result in the output string and adjust
3911    various state variables. Return a new reference to the object that
3912    was put in the output buffer in *result, or Py_None, if the mapping was
3913    undefined (in which case no character was written).
3914    The called must decref result.
3915    Return 0 on success, -1 on error. */
3916 static
3917 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3918     Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3919     PyObject **res)
3920 {
3921     if (charmaptranslate_lookup(*curinp, mapping, res))
3922         return -1;
3923     if (*res==NULL) {
3924         /* not found => default to 1:1 mapping */
3925         *(*outp)++ = *curinp;
3926     }
3927     else if (*res==Py_None)
3928         ;
3929     else if (PyInt_Check(*res)) {
3930         /* no overflow check, because we know that the space is enough */
3931         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3932     }
3933     else if (PyUnicode_Check(*res)) {
3934         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
3935         if (repsize==1) {
3936             /* no overflow check, because we know that the space is enough */
3937             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3938         }
3939         else if (repsize!=0) {
3940             /* more than one character */
3941             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3942                 (insize - (curinp-startinp)) +
3943                 repsize - 1;
3944             if (charmaptranslate_makespace(outobj, outp, requiredsize))
3945                 return -1;
3946             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3947             *outp += repsize;
3948         }
3949     }
3950     else
3951         return -1;
3952     return 0;
3953 }
3954
3955 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3956                                      Py_ssize_t size,
3957                                      PyObject *mapping,
3958                                      const char *errors)
3959 {
3960     /* output object */
3961     PyObject *res = NULL;
3962     /* pointers to the beginning and end+1 of input */
3963     const Py_UNICODE *startp = p;
3964     const Py_UNICODE *endp = p + size;
3965     /* pointer into the output */
3966     Py_UNICODE *str;
3967     /* current output position */
3968     Py_ssize_t respos = 0;
3969     char *reason = "character maps to <undefined>";
3970     PyObject *errorHandler = NULL;
3971     PyObject *exc = NULL;
3972     /* the following variable is used for caching string comparisons
3973      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3974      * 3=ignore, 4=xmlcharrefreplace */
3975     int known_errorHandler = -1;
3976
3977     if (mapping == NULL) {
3978         PyErr_BadArgument();
3979         return NULL;
3980     }
3981
3982     /* allocate enough for a simple 1:1 translation without
3983        replacements, if we need more, we'll resize */
3984     res = PyUnicode_FromUnicode(NULL, size);
3985     if (res == NULL)
3986         goto onError;
3987     if (size == 0)
3988         return res;
3989     str = PyUnicode_AS_UNICODE(res);
3990
3991     while (p<endp) {
3992         /* try to encode it */
3993         PyObject *x = NULL;
3994         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3995             Py_XDECREF(x);
3996             goto onError;
3997         }
3998         Py_XDECREF(x);
3999         if (x!=Py_None) /* it worked => adjust input pointer */
4000             ++p;
4001         else { /* untranslatable character */
4002             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4003             Py_ssize_t repsize;
4004             Py_ssize_t newpos;
4005             Py_UNICODE *uni2;
4006             /* startpos for collecting untranslatable chars */
4007             const Py_UNICODE *collstart = p;
4008             const Py_UNICODE *collend = p+1;
4009             const Py_UNICODE *coll;
4010
4011             /* find all untranslatable characters */
4012             while (collend < endp) {
4013                 if (charmaptranslate_lookup(*collend, mapping, &x))
4014                     goto onError;
4015                 Py_XDECREF(x);
4016                 if (x!=Py_None)
4017                     break;
4018                 ++collend;
4019             }
4020             /* cache callback name lookup
4021              * (if not done yet, i.e. it's the first error) */
4022             if (known_errorHandler==-1) {
4023                 if ((errors==NULL) || (!strcmp(errors, "strict")))
4024                     known_errorHandler = 1;
4025                 else if (!strcmp(errors, "replace"))
4026                     known_errorHandler = 2;
4027                 else if (!strcmp(errors, "ignore"))
4028                     known_errorHandler = 3;
4029                 else if (!strcmp(errors, "xmlcharrefreplace"))
4030                     known_errorHandler = 4;
4031                 else
4032                     known_errorHandler = 0;
4033             }
4034             switch (known_errorHandler) {
4035                 case 1: /* strict */
4036                     raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4037                     goto onError;
4038                 case 2: /* replace */
4039                     /* No need to check for space, this is a 1:1 replacement */
4040                     for (coll = collstart; coll<collend; ++coll)
4041                         *str++ = '?';
4042                     /* fall through */
4043                 case 3: /* ignore */
4044                     p = collend;
4045                     break;
4046                 case 4: /* xmlcharrefreplace */
4047                     /* generate replacement (temporarily (mis)uses p) */
4048                     for (p = collstart; p < collend; ++p) {
4049                         char buffer[2+29+1+1];
4050                         char *cp;
4051                         sprintf(buffer, "&#%d;", (int)*p);
4052                         if (charmaptranslate_makespace(&res, &str,
4053                             (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4054                             goto onError;
4055                         for (cp = buffer; *cp; ++cp)
4056                             *str++ = *cp;
4057                     }
4058                     p = collend;
4059                     break;
4060                 default:
4061                     repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4062                         reason, startp, size, &exc,
4063                         collstart-startp, collend-startp, &newpos);
4064                     if (repunicode == NULL)
4065                         goto onError;
4066                     /* generate replacement  */
4067                     repsize = PyUnicode_GET_SIZE(repunicode);
4068                     if (charmaptranslate_makespace(&res, &str,
4069                         (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4070                         Py_DECREF(repunicode);
4071                         goto onError;
4072                     }
4073                     for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4074                         *str++ = *uni2;
4075                     p = startp + newpos;
4076                     Py_DECREF(repunicode);
4077             }
4078         }
4079     }
4080     /* Resize if we allocated to much */
4081     respos = str-PyUnicode_AS_UNICODE(res);
4082     if (respos<PyUnicode_GET_SIZE(res)) {
4083         if (_PyUnicode_Resize(&res, respos) < 0)
4084             goto onError;
4085     }
4086     Py_XDECREF(exc);
4087     Py_XDECREF(errorHandler);
4088     return res;
4089
4090     onError:
4091     Py_XDECREF(res);
4092     Py_XDECREF(exc);
4093     Py_XDECREF(errorHandler);
4094     return NULL;
4095 }
4096
4097 PyObject *PyUnicode_Translate(PyObject *str,
4098                               PyObject *mapping,
4099                               const char *errors)
4100 {
4101     PyObject *result;
4102
4103     str = PyUnicode_FromObject(str);
4104     if (str == NULL)
4105         goto onError;
4106     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4107                                         PyUnicode_GET_SIZE(str),
4108                                         mapping,
4109                                         errors);
4110     Py_DECREF(str);
4111     return result;
4112
4113  onError:
4114     Py_XDECREF(str);
4115     return NULL;
4116 }
4117
4118 /* --- Decimal Encoder ---------------------------------------------------- */
4119
4120 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4121                             Py_ssize_t length,
4122                             char *output,
4123                             const char *errors)
4124 {
4125     Py_UNICODE *p, *end;
4126     PyObject *errorHandler = NULL;
4127     PyObject *exc = NULL;
4128     const char *encoding = "decimal";
4129     const char *reason = "invalid decimal Unicode string";
4130     /* the following variable is used for caching string comparisons
4131      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132     int known_errorHandler = -1;
4133
4134     if (output == NULL) {
4135         PyErr_BadArgument();
4136         return -1;
4137     }
4138
4139     p = s;
4140     end = s + length;
4141     while (p < end) {
4142         register Py_UNICODE ch = *p;
4143         int decimal;
4144         PyObject *repunicode;
4145         Py_ssize_t repsize;
4146         Py_ssize_t newpos;
4147         Py_UNICODE *uni2;
4148         Py_UNICODE *collstart;
4149         Py_UNICODE *collend;
4150
4151         if (Py_UNICODE_ISSPACE(ch)) {
4152             *output++ = ' ';
4153             ++p;
4154             continue;
4155         }
4156         decimal = Py_UNICODE_TODECIMAL(ch);
4157         if (decimal >= 0) {
4158             *output++ = '0' + decimal;
4159             ++p;
4160             continue;
4161         }
4162         if (0 < ch && ch < 256) {
4163             *output++ = (char)ch;
4164             ++p;
4165             continue;
4166         }
4167         /* All other characters are considered unencodable */
4168         collstart = p;
4169         collend = p+1;
4170         while (collend < end) {
4171             if ((0 < *collend && *collend < 256) ||
4172                 !Py_UNICODE_ISSPACE(*collend) ||
4173                 Py_UNICODE_TODECIMAL(*collend))
4174                 break;
4175         }
4176         /* cache callback name lookup
4177          * (if not done yet, i.e. it's the first error) */
4178         if (known_errorHandler==-1) {
4179             if ((errors==NULL) || (!strcmp(errors, "strict")))
4180                 known_errorHandler = 1;
4181             else if (!strcmp(errors, "replace"))
4182                 known_errorHandler = 2;
4183             else if (!strcmp(errors, "ignore"))
4184                 known_errorHandler = 3;
4185             else if (!strcmp(errors, "xmlcharrefreplace"))
4186                 known_errorHandler = 4;
4187             else
4188                 known_errorHandler = 0;
4189         }
4190         switch (known_errorHandler) {
4191             case 1: /* strict */
4192                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4193                 goto onError;
4194             case 2: /* replace */
4195                 for (p = collstart; p < collend; ++p)
4196                     *output++ = '?';
4197                 /* fall through */
4198             case 3: /* ignore */
4199                 p = collend;
4200                 break;
4201             case 4: /* xmlcharrefreplace */
4202                 /* generate replacement (temporarily (mis)uses p) */
4203                 for (p = collstart; p < collend; ++p)
4204                     output += sprintf(output, "&#%d;", (int)*p);
4205                 p = collend;
4206                 break;
4207             default:
4208                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4209                     encoding, reason, s, length, &exc,
4210                     collstart-s, collend-s, &newpos);
4211                 if (repunicode == NULL)
4212                     goto onError;
4213                 /* generate replacement  */
4214                 repsize = PyUnicode_GET_SIZE(repunicode);
4215                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4216                     Py_UNICODE ch = *uni2;
4217                     if (Py_UNICODE_ISSPACE(ch))
4218                         *output++ = ' ';
4219                     else {
4220                         decimal = Py_UNICODE_TODECIMAL(ch);
4221                         if (decimal >= 0)
4222                             *output++ = '0' + decimal;
4223                         else if (0 < ch && ch < 256)
4224                             *output++ = (char)ch;
4225                         else {
4226                             Py_DECREF(repunicode);
4227                             raise_encode_exception(&exc, encoding,
4228                                 s, length, collstart-s, collend-s, reason);
4229                             goto onError;
4230                         }
4231                     }
4232                 }
4233                 p = s + newpos;
4234                 Py_DECREF(repunicode);
4235         }
4236     }
4237     /* 0-terminate the output string */
4238     *output++ = '\0';
4239     Py_XDECREF(exc);
4240     Py_XDECREF(errorHandler);
4241     return 0;
4242
4243  onError:
4244     Py_XDECREF(exc);
4245     Py_XDECREF(errorHandler);
4246     return -1;
4247 }
4248
4249 /* --- Helpers ------------------------------------------------------------ */
4250
4251 #define STRINGLIB_CHAR Py_UNICODE
4252
4253 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4254 #define STRINGLIB_NEW PyUnicode_FromUnicode
4255 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4256
4257 Py_LOCAL_INLINE(int)
4258 STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4259 {
4260     if (str[0] != other[0])
4261         return 1;
4262     return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4263 }
4264
4265 #define STRINGLIB_EMPTY unicode_empty
4266
4267 #include "stringlib/fastsearch.h"
4268
4269 #include "stringlib/count.h"
4270 #include "stringlib/find.h"
4271 #include "stringlib/partition.h"
4272
/* helper macro to fixup start/end slice values.
   Operates on the variables `start` and `end` of the enclosing scope:
   negative values are taken relative to (obj)->length and then clamped
   to 0; `end` is additionally capped at (obj)->length.
   Wrapped in do { } while (0) so that the expansion is a single
   statement; invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4285
4286 Py_ssize_t PyUnicode_Count(PyObject *str,
4287                            PyObject *substr,
4288                            Py_ssize_t start,
4289                            Py_ssize_t end)
4290 {
4291     Py_ssize_t result;
4292     PyUnicodeObject* str_obj;
4293     PyUnicodeObject* sub_obj;
4294
4295     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4296     if (!str_obj)
4297         return -1;
4298     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4299     if (!sub_obj) {
4300         Py_DECREF(str_obj);
4301         return -1;
4302     }
4303
4304     FIX_START_END(str_obj);
4305
4306     result = stringlib_count(
4307         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4308         );
4309
4310     Py_DECREF(sub_obj);
4311     Py_DECREF(str_obj);
4312
4313     return result;
4314 }
4315
4316 Py_ssize_t PyUnicode_Find(PyObject *str,
4317                           PyObject *sub,
4318                           Py_ssize_t start,
4319                           Py_ssize_t end,
4320                           int direction)
4321 {
4322     Py_ssize_t result;
4323
4324     str = PyUnicode_FromObject(str);
4325     if (!str)
4326         return -2;
4327     sub = PyUnicode_FromObject(sub);
4328     if (!sub) {
4329         Py_DECREF(str);
4330         return -2;
4331     }
4332
4333     if (direction > 0)
4334         result = stringlib_find_slice(
4335             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337             start, end
4338             );
4339     else
4340         result = stringlib_rfind_slice(
4341             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4342             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4343             start, end
4344             );
4345
4346     Py_DECREF(str);
4347     Py_DECREF(sub);
4348
4349     return result;
4350 }
4351
4352 static
4353 int tailmatch(PyUnicodeObject *self,
4354               PyUnicodeObject *substring,
4355               Py_ssize_t start,
4356               Py_ssize_t end,
4357               int direction)
4358 {
4359     if (substring->length == 0)
4360         return 1;
4361
4362     FIX_START_END(self);
4363
4364     end -= substring->length;
4365     if (end < start)
4366         return 0;
4367
4368     if (direction > 0) {
4369         if (Py_UNICODE_MATCH(self, end, substring))
4370             return 1;
4371     } else {
4372         if (Py_UNICODE_MATCH(self, start, substring))
4373             return 1;
4374     }
4375
4376     return 0;
4377 }
4378
4379 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4380                         PyObject *substr,
4381                         Py_ssize_t start,
4382                         Py_ssize_t end,
4383                         int direction)
4384 {
4385     Py_ssize_t result;
4386
4387     str = PyUnicode_FromObject(str);
4388     if (str == NULL)
4389         return -1;
4390     substr = PyUnicode_FromObject(substr);
4391     if (substr == NULL) {
4392         Py_DECREF(str);
4393         return -1;
4394     }
4395
4396     result = tailmatch((PyUnicodeObject *)str,
4397                        (PyUnicodeObject *)substr,
4398                        start, end, direction);
4399     Py_DECREF(str);
4400     Py_DECREF(substr);
4401     return result;
4402 }
4403
4404 /* Apply fixfct filter to the Unicode object self and return a
4405    reference to the modified object */
4406
4407 static
4408 PyObject *fixup(PyUnicodeObject *self,
4409                 int (*fixfct)(PyUnicodeObject *s))
4410 {
4411
4412     PyUnicodeObject *u;
4413
4414     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4415     if (u == NULL)
4416         return NULL;
4417
4418     Py_UNICODE_COPY(u->str, self->str, self->length);
4419
4420     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4421         /* fixfct should return TRUE if it modified the buffer. If
4422            FALSE, return a reference to the original buffer instead
4423            (to save space, not time) */
4424         Py_INCREF(self);
4425         Py_DECREF(u);
4426         return (PyObject*) self;
4427     }
4428     return (PyObject*) u;
4429 }
4430
4431 static
4432 int fixupper(PyUnicodeObject *self)
4433 {
4434     Py_ssize_t len = self->length;
4435     Py_UNICODE *s = self->str;
4436     int status = 0;
4437
4438     while (len-- > 0) {
4439         register Py_UNICODE ch;
4440
4441         ch = Py_UNICODE_TOUPPER(*s);
4442         if (ch != *s) {
4443             status = 1;
4444             *s = ch;
4445         }
4446         s++;
4447     }
4448
4449     return status;
4450 }
4451
4452 static
4453 int fixlower(PyUnicodeObject *self)
4454 {
4455     Py_ssize_t len = self->length;
4456     Py_UNICODE *s = self->str;
4457     int status = 0;
4458
4459     while (len-- > 0) {
4460         register Py_UNICODE ch;
4461
4462         ch = Py_UNICODE_TOLOWER(*s);
4463         if (ch != *s) {
4464             status = 1;
4465             *s = ch;
4466         }
4467         s++;
4468     }
4469
4470     return status;
4471 }
4472
4473 static
4474 int fixswapcase(PyUnicodeObject *self)
4475 {
4476     Py_ssize_t len = self->length;
4477     Py_UNICODE *s = self->str;
4478     int status = 0;
4479
4480     while (len-- > 0) {
4481         if (Py_UNICODE_ISUPPER(*s)) {
4482             *s = Py_UNICODE_TOLOWER(*s);
4483             status = 1;
4484         } else if (Py_UNICODE_ISLOWER(*s)) {
4485             *s = Py_UNICODE_TOUPPER(*s);
4486             status = 1;
4487         }
4488         s++;
4489     }
4490
4491     return status;
4492 }
4493
4494 static
4495 int fixcapitalize(PyUnicodeObject *self)
4496 {
4497     Py_ssize_t len = self->length;
4498     Py_UNICODE *s = self->str;
4499     int status = 0;
4500
4501     if (len == 0)
4502         return 0;
4503     if (Py_UNICODE_ISLOWER(*s)) {
4504         *s = Py_UNICODE_TOUPPER(*s);
4505         status = 1;
4506     }
4507     s++;
4508     while (--len > 0) {
4509         if (Py_UNICODE_ISUPPER(*s)) {
4510             *s = Py_UNICODE_TOLOWER(*s);
4511             status = 1;
4512         }
4513         s++;
4514     }
4515     return status;
4516 }
4517
4518 static
4519 int fixtitle(PyUnicodeObject *self)
4520 {
4521     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4522     register Py_UNICODE *e;
4523     int previous_is_cased;
4524
4525     /* Shortcut for single character strings */
4526     if (PyUnicode_GET_SIZE(self) == 1) {
4527         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4528         if (*p != ch) {
4529             *p = ch;
4530             return 1;
4531         }
4532         else
4533             return 0;
4534     }
4535
4536     e = p + PyUnicode_GET_SIZE(self);
4537     previous_is_cased = 0;
4538     for (; p < e; p++) {
4539         register const Py_UNICODE ch = *p;
4540
4541         if (previous_is_cased)
4542             *p = Py_UNICODE_TOLOWER(ch);
4543         else
4544             *p = Py_UNICODE_TOTITLE(ch);
4545
4546         if (Py_UNICODE_ISLOWER(ch) ||
4547             Py_UNICODE_ISUPPER(ch) ||
4548             Py_UNICODE_ISTITLE(ch))
4549             previous_is_cased = 1;
4550         else
4551             previous_is_cased = 0;
4552     }
4553     return 1;
4554 }
4555
4556 PyObject *
4557 PyUnicode_Join(PyObject *separator, PyObject *seq)
4558 {
4559     PyObject *internal_separator = NULL;
4560     const Py_UNICODE blank = ' ';
4561     const Py_UNICODE *sep = &blank;
4562     Py_ssize_t seplen = 1;
4563     PyUnicodeObject *res = NULL; /* the result */
4564     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
4565     Py_ssize_t res_used;         /* # used bytes */
4566     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
4567     PyObject *fseq;          /* PySequence_Fast(seq) */
4568     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
4569     PyObject *item;
4570     Py_ssize_t i;
4571
4572     fseq = PySequence_Fast(seq, "");
4573     if (fseq == NULL) {
4574         return NULL;
4575     }
4576
4577     /* Grrrr.  A codec may be invoked to convert str objects to
4578      * Unicode, and so it's possible to call back into Python code
4579      * during PyUnicode_FromObject(), and so it's possible for a sick
4580      * codec to change the size of fseq (if seq is a list).  Therefore
4581      * we have to keep refetching the size -- can't assume seqlen
4582      * is invariant.
4583      */
4584     seqlen = PySequence_Fast_GET_SIZE(fseq);
4585     /* If empty sequence, return u"". */
4586     if (seqlen == 0) {
4587         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
4588         goto Done;
4589     }
4590     /* If singleton sequence with an exact Unicode, return that. */
4591     if (seqlen == 1) {
4592         item = PySequence_Fast_GET_ITEM(fseq, 0);
4593         if (PyUnicode_CheckExact(item)) {
4594             Py_INCREF(item);
4595             res = (PyUnicodeObject *)item;
4596             goto Done;
4597         }
4598     }
4599
4600     /* At least two items to join, or one that isn't exact Unicode. */
4601     if (seqlen > 1) {
4602         /* Set up sep and seplen -- they're needed. */
4603         if (separator == NULL) {
4604             sep = &blank;
4605             seplen = 1;
4606         }
4607         else {
4608             internal_separator = PyUnicode_FromObject(separator);
4609             if (internal_separator == NULL)
4610                 goto onError;
4611             sep = PyUnicode_AS_UNICODE(internal_separator);
4612             seplen = PyUnicode_GET_SIZE(internal_separator);
4613             /* In case PyUnicode_FromObject() mutated seq. */
4614             seqlen = PySequence_Fast_GET_SIZE(fseq);
4615         }
4616     }
4617
4618     /* Get space. */
4619     res = _PyUnicode_New(res_alloc);
4620     if (res == NULL)
4621         goto onError;
4622     res_p = PyUnicode_AS_UNICODE(res);
4623     res_used = 0;
4624
4625     for (i = 0; i < seqlen; ++i) {
4626         Py_ssize_t itemlen;
4627         Py_ssize_t new_res_used;
4628
4629         item = PySequence_Fast_GET_ITEM(fseq, i);
4630         /* Convert item to Unicode. */
4631         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4632             PyErr_Format(PyExc_TypeError,
4633                          "sequence item %zd: expected string or Unicode,"
4634                          " %.80s found",
4635                          i, item->ob_type->tp_name);
4636             goto onError;
4637         }
4638         item = PyUnicode_FromObject(item);
4639         if (item == NULL)
4640             goto onError;
4641         /* We own a reference to item from here on. */
4642
4643         /* In case PyUnicode_FromObject() mutated seq. */
4644         seqlen = PySequence_Fast_GET_SIZE(fseq);
4645
4646         /* Make sure we have enough space for the separator and the item. */
4647         itemlen = PyUnicode_GET_SIZE(item);
4648         new_res_used = res_used + itemlen;
4649         if (new_res_used < 0)
4650             goto Overflow;
4651         if (i < seqlen - 1) {
4652             new_res_used += seplen;
4653             if (new_res_used < 0)
4654                 goto Overflow;
4655         }
4656         if (new_res_used > res_alloc) {
4657             /* double allocated size until it's big enough */
4658             do {
4659                 res_alloc += res_alloc;
4660                 if (res_alloc <= 0)
4661                     goto Overflow;
4662             } while (new_res_used > res_alloc);
4663             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4664                 Py_DECREF(item);
4665                 goto onError;
4666             }
4667             res_p = PyUnicode_AS_UNICODE(res) + res_used;
4668         }
4669
4670         /* Copy item, and maybe the separator. */
4671         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4672         res_p += itemlen;
4673         if (i < seqlen - 1) {
4674             Py_UNICODE_COPY(res_p, sep, seplen);
4675             res_p += seplen;
4676         }
4677         Py_DECREF(item);
4678         res_used = new_res_used;
4679     }
4680
4681     /* Shrink res to match the used area; this probably can't fail,
4682      * but it's cheap to check.
4683      */
4684     if (_PyUnicode_Resize(&res, res_used) < 0)
4685         goto onError;
4686
4687  Done:
4688     Py_XDECREF(internal_separator);
4689     Py_DECREF(fseq);
4690     return (PyObject *)res;
4691
4692  Overflow:
4693     PyErr_SetString(PyExc_OverflowError,
4694                     "join() result is too long for a Python string");
4695     Py_DECREF(item);
4696     /* fall through */
4697
4698  onError:
4699     Py_XDECREF(internal_separator);
4700     Py_DECREF(fseq);
4701     Py_XDECREF(res);
4702     return NULL;
4703 }
4704
4705 static
4706 PyUnicodeObject *pad(PyUnicodeObject *self,
4707                      Py_ssize_t left,
4708                      Py_ssize_t right,
4709                      Py_UNICODE fill)
4710 {
4711     PyUnicodeObject *u;
4712
4713     if (left < 0)
4714         left = 0;
4715     if (right < 0)
4716         right = 0;
4717
4718     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4719         Py_INCREF(self);
4720         return self;
4721     }
4722
4723     u = _PyUnicode_New(left + self->length + right);
4724     if (u) {
4725         if (left)
4726             Py_UNICODE_FILL(u->str, fill, left);
4727         Py_UNICODE_COPY(u->str + left, self->str, self->length);
4728         if (right)
4729             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4730     }
4731
4732     return u;
4733 }
4734
/* Append the slice data[left:right] to `list` as a new Unicode object.
   Expects the expanding function to provide a `PyObject *str` scratch
   variable, a `PyObject *list` target and an `onError` label, which is
   jumped to when either the slice cannot be created or the append
   fails.  The temporary reference held in `str` is always released.
   Wrapped in do { } while (0) so the macro behaves as one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed;                                        \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed = PyList_Append(list, str);                 \
        Py_DECREF(str);                                                 \
        if (split_append_failed)                                        \
            goto onError;                                               \
    } while (0)
4745
4746 static
4747 PyObject *split_whitespace(PyUnicodeObject *self,
4748                            PyObject *list,
4749                            Py_ssize_t maxcount)
4750 {
4751     register Py_ssize_t i;
4752     register Py_ssize_t j;
4753     Py_ssize_t len = self->length;
4754     PyObject *str;
4755
4756     for (i = j = 0; i < len; ) {
4757         /* find a token */
4758         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4759             i++;
4760         j = i;
4761         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4762             i++;
4763         if (j < i) {
4764             if (maxcount-- <= 0)
4765                 break;
4766             SPLIT_APPEND(self->str, j, i);
4767             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4768                 i++;
4769             j = i;
4770         }
4771     }
4772     if (j < len) {
4773         SPLIT_APPEND(self->str, j, len);
4774     }
4775     return list;
4776
4777  onError:
4778     Py_DECREF(list);
4779     return NULL;
4780 }
4781
4782 PyObject *PyUnicode_Splitlines(PyObject *string,
4783                                int keepends)
4784 {
4785     register Py_ssize_t i;
4786     register Py_ssize_t j;
4787     Py_ssize_t len;
4788     PyObject *list;
4789     PyObject *str;
4790     Py_UNICODE *data;
4791
4792     string = PyUnicode_FromObject(string);
4793     if (string == NULL)
4794         return NULL;
4795     data = PyUnicode_AS_UNICODE(string);
4796     len = PyUnicode_GET_SIZE(string);
4797
4798     list = PyList_New(0);
4799     if (!list)
4800         goto onError;
4801
4802     for (i = j = 0; i < len; ) {
4803         Py_ssize_t eol;
4804
4805         /* Find a line and append it */
4806         while (i < len && !BLOOM_LINEBREAK(data[i]))
4807             i++;
4808
4809         /* Skip the line break reading CRLF as one line break */
4810         eol = i;
4811         if (i < len) {
4812             if (data[i] == '\r' && i + 1 < len &&
4813                 data[i+1] == '\n')
4814                 i += 2;
4815             else
4816                 i++;
4817             if (keepends)
4818                 eol = i;
4819         }
4820         SPLIT_APPEND(data, j, eol);
4821         j = i;
4822     }
4823     if (j < len) {
4824         SPLIT_APPEND(data, j, len);
4825     }
4826
4827     Py_DECREF(string);
4828     return list;
4829
4830  onError:
4831     Py_XDECREF(list);
4832     Py_DECREF(string);
4833     return NULL;
4834 }
4835
4836 static
4837 PyObject *split_char(PyUnicodeObject *self,
4838                      PyObject *list,
4839                      Py_UNICODE ch,
4840                      Py_ssize_t maxcount)
4841 {
4842     register Py_ssize_t i;
4843     register Py_ssize_t j;
4844     Py_ssize_t len = self->length;
4845     PyObject *str;
4846
4847     for (i = j = 0; i < len; ) {
4848         if (self->str[i] == ch) {
4849             if (maxcount-- <= 0)
4850                 break;
4851             SPLIT_APPEND(self->str, j, i);
4852             i = j = i + 1;
4853         } else
4854             i++;
4855     }
4856     if (j <= len) {
4857         SPLIT_APPEND(self->str, j, len);
4858     }
4859     return list;
4860
4861  onError:
4862     Py_DECREF(list);
4863     return NULL;
4864 }
4865
4866 static
4867 PyObject *split_substring(PyUnicodeObject *self,
4868                           PyObject *list,
4869                           PyUnicodeObject *substring,
4870                           Py_ssize_t maxcount)
4871 {
4872     register Py_ssize_t i;
4873     register Py_ssize_t j;
4874     Py_ssize_t len = self->length;
4875     Py_ssize_t sublen = substring->length;
4876     PyObject *str;
4877
4878     for (i = j = 0; i <= len - sublen; ) {
4879         if (Py_UNICODE_MATCH(self, i, substring)) {
4880             if (maxcount-- <= 0)
4881                 break;
4882             SPLIT_APPEND(self->str, j, i);
4883             i = j = i + sublen;
4884         } else
4885             i++;
4886     }
4887     if (j <= len) {
4888         SPLIT_APPEND(self->str, j, len);
4889     }
4890     return list;
4891
4892  onError:
4893     Py_DECREF(list);
4894     return NULL;
4895 }
4896
4897 static
4898 PyObject *rsplit_whitespace(PyUnicodeObject *self,
4899                             PyObject *list,
4900                             Py_ssize_t maxcount)
4901 {
4902     register Py_ssize_t i;
4903     register Py_ssize_t j;
4904     Py_ssize_t len = self->length;
4905     PyObject *str;
4906
4907     for (i = j = len - 1; i >= 0; ) {
4908         /* find a token */
4909         while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4910             i--;
4911         j = i;
4912         while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4913             i--;
4914         if (j > i) {
4915             if (maxcount-- <= 0)
4916                 break;
4917             SPLIT_APPEND(self->str, i + 1, j + 1);
4918             while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4919                 i--;
4920             j = i;
4921         }
4922     }
4923     if (j >= 0) {
4924         SPLIT_APPEND(self->str, 0, j + 1);
4925     }
4926     if (PyList_Reverse(list) < 0)
4927         goto onError;
4928     return list;
4929
4930  onError:
4931     Py_DECREF(list);
4932     return NULL;
4933 }
4934
4935 static
4936 PyObject *rsplit_char(PyUnicodeObject *self,
4937                       PyObject *list,
4938                       Py_UNICODE ch,
4939                       Py_ssize_t maxcount)
4940 {
4941     register Py_ssize_t i;
4942     register Py_ssize_t j;
4943     Py_ssize_t len = self->length;
4944     PyObject *str;
4945
4946     for (i = j = len - 1; i >= 0; ) {
4947         if (self->str[i] == ch) {
4948             if (maxcount-- <= 0)
4949                 break;
4950             SPLIT_APPEND(self->str, i + 1, j + 1);
4951             j = i = i - 1;
4952         } else
4953             i--;
4954     }
4955     if (j >= -1) {
4956         SPLIT_APPEND(self->str, 0, j + 1);
4957     }
4958     if (PyList_Reverse(list) < 0)
4959         goto onError;
4960     return list;
4961
4962  onError:
4963     Py_DECREF(list);
4964     return NULL;
4965 }
4966
4967 static
4968 PyObject *rsplit_substring(PyUnicodeObject *self,
4969                            PyObject *list,
4970                            PyUnicodeObject *substring,
4971                            Py_ssize_t maxcount)
4972 {
4973     register Py_ssize_t i;
4974     register Py_ssize_t j;
4975     Py_ssize_t len = self->length;
4976     Py_ssize_t sublen = substring->length;
4977     PyObject *str;
4978
4979     for (i = len - sublen, j = len; i >= 0; ) {
4980         if (Py_UNICODE_MATCH(self, i, substring)) {
4981             if (maxcount-- <= 0)
4982                 break;
4983             SPLIT_APPEND(self->str, i + sublen, j);
4984             j = i;
4985             i -= sublen;
4986         } else
4987             i--;
4988     }
4989     if (j >= 0) {
4990         SPLIT_APPEND(self->str, 0, j);
4991     }
4992     if (PyList_Reverse(list) < 0)
4993         goto onError;
4994     return list;
4995
4996  onError:
4997     Py_DECREF(list);
4998     return NULL;
4999 }
5000
5001 #undef SPLIT_APPEND
5002
5003 static
5004 PyObject *split(PyUnicodeObject *self,
5005                 PyUnicodeObject *substring,
5006                 Py_ssize_t maxcount)
5007 {
5008     PyObject *list;
5009
5010     if (maxcount < 0)
5011         maxcount = PY_SSIZE_T_MAX;
5012
5013     list = PyList_New(0);
5014     if (!list)
5015         return NULL;
5016
5017     if (substring == NULL)
5018         return split_whitespace(self,list,maxcount);
5019
5020     else if (substring->length == 1)
5021         return split_char(self,list,substring->str[0],maxcount);
5022
5023     else if (substring->length == 0) {
5024         Py_DECREF(list);
5025         PyErr_SetString(PyExc_ValueError, "empty separator");
5026         return NULL;
5027     }
5028     else
5029         return split_substring(self,list,substring,maxcount);
5030 }
5031
5032 static
5033 PyObject *rsplit(PyUnicodeObject *self,
5034                  PyUnicodeObject *substring,
5035                  Py_ssize_t maxcount)
5036 {
5037     PyObject *list;
5038
5039     if (maxcount < 0)
5040         maxcount = PY_SSIZE_T_MAX;
5041
5042     list = PyList_New(0);
5043     if (!list)
5044         return NULL;
5045
5046     if (substring == NULL)
5047         return rsplit_whitespace(self,list,maxcount);
5048
5049     else if (substring->length == 1)
5050         return rsplit_char(self,list,substring->str[0],maxcount);
5051
5052     else if (substring->length == 0) {
5053         Py_DECREF(list);
5054         PyErr_SetString(PyExc_ValueError, "empty separator");
5055         return NULL;
5056     }
5057     else
5058         return rsplit_substring(self,list,substring,maxcount);
5059 }
5060
5061 static
5062 PyObject *replace(PyUnicodeObject *self,
5063                   PyUnicodeObject *str1,
5064                   PyUnicodeObject *str2,
5065                   Py_ssize_t maxcount)
5066 {
5067     PyUnicodeObject *u;
5068
5069     if (maxcount < 0)
5070         maxcount = PY_SSIZE_T_MAX;
5071
5072     if (str1->length == str2->length) {
5073         /* same length */
5074         Py_ssize_t i;
5075         if (str1->length == 1) {
5076             /* replace characters */
5077             Py_UNICODE u1, u2;
5078             if (!findchar(self->str, self->length, str1->str[0]))
5079                 goto nothing;
5080             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5081             if (!u)
5082                 return NULL;
5083             Py_UNICODE_COPY(u->str, self->str, self->length);
5084             u1 = str1->str[0];
5085             u2 = str2->str[0];
5086             for (i = 0; i < u->length; i++)
5087                 if (u->str[i] == u1) {
5088                     if (--maxcount < 0)
5089                         break;
5090                     u->str[i] = u2;
5091                 }
5092         } else {
5093             i = fastsearch(
5094                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5095                 );
5096             if (i < 0)
5097                 goto nothing;
5098             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5099             if (!u)
5100                 return NULL;
5101             Py_UNICODE_COPY(u->str, self->str, self->length);
5102             while (i <= self->length - str1->length)
5103                 if (Py_UNICODE_MATCH(self, i, str1)) {
5104                     if (--maxcount < 0)
5105                         break;
5106                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5107                     i += str1->length;
5108                 } else
5109                     i++;
5110         }
5111     } else {
5112
5113         Py_ssize_t n, i, j, e;
5114         Py_ssize_t product, new_size, delta;
5115         Py_UNICODE *p;
5116
5117         /* replace strings */
5118         n = stringlib_count(self->str, self->length, str1->str, str1->length);
5119         if (n > maxcount)
5120             n = maxcount;
5121         if (n == 0)
5122             goto nothing;
5123         /* new_size = self->length + n * (str2->length - str1->length)); */
5124         delta = (str2->length - str1->length);
5125         if (delta == 0) {
5126             new_size = self->length;
5127         } else {
5128             product = n * (str2->length - str1->length);
5129             if ((product / (str2->length - str1->length)) != n) {
5130                 PyErr_SetString(PyExc_OverflowError,
5131                                 "replace string is too long");
5132                 return NULL;
5133             }
5134             new_size = self->length + product;
5135             if (new_size < 0) {
5136                 PyErr_SetString(PyExc_OverflowError,
5137                                 "replace string is too long");
5138                 return NULL;
5139             }
5140         }
5141         u = _PyUnicode_New(new_size);
5142         if (!u)
5143             return NULL;
5144         i = 0;
5145         p = u->str;
5146         e = self->length - str1->length;
5147         if (str1->length > 0) {
5148             while (n-- > 0) {
5149                 /* look for next match */
5150                 j = i;
5151                 while (j <= e) {
5152                     if (Py_UNICODE_MATCH(self, j, str1))
5153                         break;
5154                     j++;
5155                 }
5156                 if (j > i) {
5157                     if (j > e)
5158                         break;
5159                     /* copy unchanged part [i:j] */
5160                     Py_UNICODE_COPY(p, self->str+i, j-i);
5161                     p += j - i;
5162                 }
5163                 /* copy substitution string */
5164                 if (str2->length > 0) {
5165                     Py_UNICODE_COPY(p, str2->str, str2->length);
5166                     p += str2->length;
5167                 }
5168                 i = j + str1->length;
5169             }
5170             if (i < self->length)
5171                 /* copy tail [i:] */
5172                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5173         } else {
5174             /* interleave */
5175             while (n > 0) {
5176                 Py_UNICODE_COPY(p, str2->str, str2->length);
5177                 p += str2->length;
5178                 if (--n <= 0)
5179                     break;
5180                 *p++ = self->str[i++];
5181             }
5182             Py_UNICODE_COPY(p, self->str+i, self->length-i);
5183         }
5184     }
5185     return (PyObject *) u;
5186
5187 nothing:
5188     /* nothing to replace; return original string (when possible) */
5189     if (PyUnicode_CheckExact(self)) {
5190         Py_INCREF(self);
5191         return (PyObject *) self;
5192     }
5193     return PyUnicode_FromUnicode(self->str, self->length);
5194 }
5195
5196 /* --- Unicode Object Methods --------------------------------------------- */
5197
5198 PyDoc_STRVAR(title__doc__,
5199 "S.title() -> unicode\n\
5200 \n\
5201 Return a titlecased version of S, i.e. words start with title case\n\
5202 characters, all remaining cased characters have lower case.");
5203
5204 static PyObject*
5205 unicode_title(PyUnicodeObject *self)
5206 {
5207     return fixup(self, fixtitle);
5208 }
5209
5210 PyDoc_STRVAR(capitalize__doc__,
5211 "S.capitalize() -> unicode\n\
5212 \n\
5213 Return a capitalized version of S, i.e. make the first character\n\
5214 have upper case.");
5215
5216 static PyObject*
5217 unicode_capitalize(PyUnicodeObject *self)
5218 {
5219     return fixup(self, fixcapitalize);
5220 }
5221
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, capped);
    }

    /* Join the words (NULL separator: a single blank) */
    result = PyUnicode_Join(NULL, words);

done:
    /* result is still NULL when a word could not be capitalized */
    Py_DECREF(words);
    return result;
}
#endif
5259
5260 /* Argument converter.  Coerces to a single unicode character */
5261
5262 static int
5263 convert_uc(PyObject *obj, void *addr)
5264 {
5265         Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5266         PyObject *uniobj;
5267         Py_UNICODE *unistr;
5268
5269         uniobj = PyUnicode_FromObject(obj);
5270         if (uniobj == NULL) {
5271                 PyErr_SetString(PyExc_TypeError,
5272                         "The fill character cannot be converted to Unicode");
5273                 return 0;
5274         }
5275         if (PyUnicode_GET_SIZE(uniobj) != 1) {
5276                 PyErr_SetString(PyExc_TypeError,
5277                         "The fill character must be exactly one character long");
5278                 Py_DECREF(uniobj);
5279                 return 0;
5280         }
5281         unistr = PyUnicode_AS_UNICODE(uniobj);
5282         *fillcharloc = unistr[0];
5283         Py_DECREF(uniobj);
5284         return 1;
5285 }
5286
5287 PyDoc_STRVAR(center__doc__,
5288 "S.center(width[, fillchar]) -> unicode\n\
5289 \n\
5290 Return S centered in a Unicode string of length width. Padding is\n\
5291 done using the specified fill character (default is a space)");
5292
5293 static PyObject *
5294 unicode_center(PyUnicodeObject *self, PyObject *args)
5295 {
5296     Py_ssize_t marg, left;
5297     Py_ssize_t width;
5298     Py_UNICODE fillchar = ' ';
5299
5300     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5301         return NULL;
5302
5303     if (self->length >= width && PyUnicode_CheckExact(self)) {
5304         Py_INCREF(self);
5305         return (PyObject*) self;
5306     }
5307
5308     marg = width - self->length;
5309     left = marg / 2 + (marg & width & 1);
5310
5311     return (PyObject*) pad(self, left, marg - left, fillchar);
5312 }
5313
5314 #if 0
5315
5316 /* This code should go into some future Unicode collation support
5317    module. The basic comparison should compare ordinals on a naive
5318    basis (this is what Java does and thus JPython too). */
5319
5320 /* speedy UTF-16 code point order comparison */
5321 /* gleaned from: */
5322 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5323
5324 static short utf16Fixup[32] =
5325 {
5326     0, 0, 0, 0, 0, 0, 0, 0,
5327     0, 0, 0, 0, 0, 0, 0, 0,
5328     0, 0, 0, 0, 0, 0, 0, 0,
5329     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5330 };
5331
5332 static int
5333 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5334 {
5335     Py_ssize_t len1, len2;
5336
5337     Py_UNICODE *s1 = str1->str;
5338     Py_UNICODE *s2 = str2->str;
5339
5340     len1 = str1->length;
5341     len2 = str2->length;
5342
5343     while (len1 > 0 && len2 > 0) {
5344         Py_UNICODE c1, c2;
5345
5346         c1 = *s1++;
5347         c2 = *s2++;
5348
5349         if (c1 > (1<<11) * 26)
5350             c1 += utf16Fixup[c1>>11];
5351         if (c2 > (1<<11) * 26)
5352             c2 += utf16Fixup[c2>>11];
5353         /* now c1 and c2 are in UTF-32-compatible order */
5354
5355         if (c1 != c2)
5356             return (c1 < c2) ? -1 : 1;
5357
5358         len1--; len2--;
5359     }
5360
5361     return (len1 < len2) ? -1 : (len1 != len2);
5362 }
5363
5364 #else
5365
5366 static int
5367 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5368 {
5369     register Py_ssize_t len1, len2;
5370
5371     Py_UNICODE *s1 = str1->str;
5372     Py_UNICODE *s2 = str2->str;
5373
5374     len1 = str1->length;
5375     len2 = str2->length;
5376
5377     while (len1 > 0 && len2 > 0) {
5378         Py_UNICODE c1, c2;
5379
5380         c1 = *s1++;
5381         c2 = *s2++;
5382
5383         if (c1 != c2)
5384             return (c1 < c2) ? -1 : 1;
5385
5386         len1--; len2--;
5387     }
5388
5389     return (len1 < len2) ? -1 : (len1 != len2);
5390 }
5391
5392 #endif
5393
5394 int PyUnicode_Compare(PyObject *left,
5395                       PyObject *right)
5396 {
5397     PyUnicodeObject *u = NULL, *v = NULL;
5398     int result;
5399
5400     /* Coerce the two arguments */
5401     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5402     if (u == NULL)
5403         goto onError;
5404     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5405     if (v == NULL)
5406         goto onError;
5407
5408     /* Shortcut for empty or interned objects */
5409     if (v == u) {
5410         Py_DECREF(u);
5411         Py_DECREF(v);
5412         return 0;
5413     }
5414
5415     result = unicode_compare(u, v);
5416
5417     Py_DECREF(u);
5418     Py_DECREF(v);
5419     return result;
5420
5421 onError:
5422     Py_XDECREF(u);
5423     Py_XDECREF(v);
5424     return -1;
5425 }
5426
5427 PyObject *PyUnicode_RichCompare(PyObject *left,
5428                                 PyObject *right,
5429                                 int op)
5430 {
5431     int result;
5432
5433     result = PyUnicode_Compare(left, right);
5434     if (result == -1 && PyErr_Occurred())
5435         goto onError;
5436
5437     /* Convert the return value to a Boolean */
5438     switch (op) {
5439     case Py_EQ:
5440         result = (result == 0);
5441         break;
5442     case Py_NE:
5443         result = (result != 0);
5444         break;
5445     case Py_LE:
5446         result = (result <= 0);
5447         break;
5448     case Py_GE:
5449         result = (result >= 0);
5450         break;
5451     case Py_LT:
5452         result = (result == -1);
5453         break;
5454     case Py_GT:
5455         result = (result == 1);
5456         break;
5457     }
5458     return PyBool_FromLong(result);
5459
5460  onError:
5461
5462     /* Standard case
5463
5464        Type errors mean that PyUnicode_FromObject() could not convert
5465        one of the arguments (usually the right hand side) to Unicode,
5466        ie. we can't handle the comparison request. However, it is
5467        possible that the other object knows a comparison method, which
5468        is why we return Py_NotImplemented to give the other object a
5469        chance.
5470
5471     */
5472     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5473         PyErr_Clear();
5474         Py_INCREF(Py_NotImplemented);
5475         return Py_NotImplemented;
5476     }
5477     if (op != Py_EQ && op != Py_NE)
5478         return NULL;
5479
5480     /* Equality comparison.
5481
5482        This is a special case: we silence any PyExc_UnicodeDecodeError
5483        and instead turn it into a PyErr_UnicodeWarning.
5484
5485     */
5486     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5487         return NULL;
5488     PyErr_Clear();
5489     if (PyErr_Warn(PyExc_UnicodeWarning,
5490                    (op == Py_EQ) ?
5491                    "Unicode equal comparison "
5492                    "failed to convert both arguments to Unicode - "
5493                    "interpreting them as being unequal" :
5494                    "Unicode unequal comparison "
5495                    "failed to convert both arguments to Unicode - "
5496                    "interpreting them as being unequal"
5497                    ) < 0)
5498         return NULL;
5499     result = (op == Py_NE);
5500     return PyBool_FromLong(result);
5501 }
5502
5503 int PyUnicode_Contains(PyObject *container,
5504                        PyObject *element)
5505 {
5506     PyObject *str, *sub;
5507     int result;
5508
5509     /* Coerce the two arguments */
5510     sub = PyUnicode_FromObject(element);
5511     if (!sub) {
5512         PyErr_SetString(PyExc_TypeError,
5513             "'in <string>' requires string as left operand");
5514         return -1;
5515     }
5516
5517     str = PyUnicode_FromObject(container);
5518     if (!str) {
5519         Py_DECREF(sub);
5520         return -1;
5521     }
5522
5523     result = stringlib_contains_obj(str, sub);
5524
5525     Py_DECREF(str);
5526     Py_DECREF(sub);
5527
5528     return result;
5529 }
5530
5531 /* Concat to string or Unicode object giving a new Unicode object. */
5532
5533 PyObject *PyUnicode_Concat(PyObject *left,
5534                            PyObject *right)
5535 {
5536     PyUnicodeObject *u = NULL, *v = NULL, *w;
5537
5538     /* Coerce the two arguments */
5539     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5540     if (u == NULL)
5541         goto onError;
5542     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5543     if (v == NULL)
5544         goto onError;
5545
5546     /* Shortcuts */
5547     if (v == unicode_empty) {
5548         Py_DECREF(v);
5549         return (PyObject *)u;
5550     }
5551     if (u == unicode_empty) {
5552         Py_DECREF(u);
5553         return (PyObject *)v;
5554     }
5555
5556     /* Concat the two Unicode strings */
5557     w = _PyUnicode_New(u->length + v->length);
5558     if (w == NULL)
5559         goto onError;
5560     Py_UNICODE_COPY(w->str, u->str, u->length);
5561     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5562
5563     Py_DECREF(u);
5564     Py_DECREF(v);
5565     return (PyObject *)w;
5566
5567 onError:
5568     Py_XDECREF(u);
5569     Py_XDECREF(v);
5570     return NULL;
5571 }
5572
5573 PyDoc_STRVAR(count__doc__,
5574 "S.count(sub[, start[, end]]) -> int\n\
5575 \n\
5576 Return the number of non-overlapping occurrences of substring sub in\n\
5577 Unicode string S[start:end].  Optional arguments start and end are\n\
5578 interpreted as in slice notation.");
5579
5580 static PyObject *
5581 unicode_count(PyUnicodeObject *self, PyObject *args)
5582 {
5583     PyUnicodeObject *substring;
5584     Py_ssize_t start = 0;
5585     Py_ssize_t end = PY_SSIZE_T_MAX;
5586     PyObject *result;
5587
5588     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5589                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5590         return NULL;
5591
5592     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5593         (PyObject *)substring);
5594     if (substring == NULL)
5595         return NULL;
5596
5597     FIX_START_END(self);
5598
5599     result = PyInt_FromSsize_t(
5600         stringlib_count(self->str + start, end - start,
5601                         substring->str, substring->length)
5602         );
5603
5604     Py_DECREF(substring);
5605
5606     return result;
5607 }
5608
5609 PyDoc_STRVAR(encode__doc__,
5610 "S.encode([encoding[,errors]]) -> string or unicode\n\
5611 \n\
5612 Encodes S using the codec registered for encoding. encoding defaults\n\
5613 to the default encoding. errors may be given to set a different error\n\
5614 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5615 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5616 'xmlcharrefreplace' as well as any other name registered with\n\
5617 codecs.register_error that can handle UnicodeEncodeErrors.");
5618
5619 static PyObject *
5620 unicode_encode(PyUnicodeObject *self, PyObject *args)
5621 {
5622     char *encoding = NULL;
5623     char *errors = NULL;
5624     PyObject *v;
5625
5626     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5627         return NULL;
5628     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5629     if (v == NULL)
5630         goto onError;
5631     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5632         PyErr_Format(PyExc_TypeError,
5633                      "encoder did not return a string/unicode object "
5634                      "(type=%.400s)",
5635                      v->ob_type->tp_name);
5636         Py_DECREF(v);
5637         return NULL;
5638     }
5639     return v;
5640
5641  onError:
5642     return NULL;
5643 }
5644
5645 PyDoc_STRVAR(decode__doc__,
5646 "S.decode([encoding[,errors]]) -> string or unicode\n\
5647 \n\
5648 Decodes S using the codec registered for encoding. encoding defaults\n\
5649 to the default encoding. errors may be given to set a different error\n\
5650 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5651 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5652 as well as any other name registerd with codecs.register_error that is\n\
5653 able to handle UnicodeDecodeErrors.");
5654
5655 static PyObject *
5656 unicode_decode(PyUnicodeObject *self, PyObject *args)
5657 {
5658     char *encoding = NULL;
5659     char *errors = NULL;
5660     PyObject *v;
5661
5662     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5663         return NULL;
5664     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5665     if (v == NULL)
5666         goto onError;
5667     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5668         PyErr_Format(PyExc_TypeError,
5669                      "decoder did not return a string/unicode object "
5670                      "(type=%.400s)",
5671                      v->ob_type->tp_name);
5672         Py_DECREF(v);
5673         return NULL;
5674     }
5675     return v;
5676
5677  onError:
5678     return NULL;
5679 }
5680
5681 PyDoc_STRVAR(expandtabs__doc__,
5682 "S.expandtabs([tabsize]) -> unicode\n\
5683 \n\
5684 Return a copy of S where all tab characters are expanded using spaces.\n\
5685 If tabsize is not given, a tab size of 8 characters is assumed.");
5686
5687 static PyObject*
5688 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5689 {
5690     Py_UNICODE *e;
5691     Py_UNICODE *p;
5692     Py_UNICODE *q;
5693     Py_ssize_t i, j;
5694     PyUnicodeObject *u;
5695     int tabsize = 8;
5696
5697     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5698         return NULL;
5699
5700     /* First pass: determine size of output string */
5701     i = j = 0;
5702     e = self->str + self->length;
5703     for (p = self->str; p < e; p++)
5704         if (*p == '\t') {
5705             if (tabsize > 0)
5706                 j += tabsize - (j % tabsize);
5707         }
5708         else {
5709             j++;
5710             if (*p == '\n' || *p == '\r') {
5711                 i += j;
5712                 j = 0;
5713             }
5714         }
5715
5716     /* Second pass: create output string and fill it */
5717     u = _PyUnicode_New(i + j);
5718     if (!u)
5719         return NULL;
5720
5721     j = 0;
5722     q = u->str;
5723
5724     for (p = self->str; p < e; p++)
5725         if (*p == '\t') {
5726             if (tabsize > 0) {
5727                 i = tabsize - (j % tabsize);
5728                 j += i;
5729                 while (i--)
5730                     *q++ = ' ';
5731             }
5732         }
5733         else {
5734             j++;
5735             *q++ = *p;
5736             if (*p == '\n' || *p == '\r')
5737                 j = 0;
5738         }
5739
5740     return (PyObject*) u;
5741 }
5742
5743 PyDoc_STRVAR(find__doc__,
5744 "S.find(sub [,start [,end]]) -> int\n\
5745 \n\
5746 Return the lowest index in S where substring sub is found,\n\
5747 such that sub is contained within s[start,end].  Optional\n\
5748 arguments start and end are interpreted as in slice notation.\n\
5749 \n\
5750 Return -1 on failure.");
5751
5752 static PyObject *
5753 unicode_find(PyUnicodeObject *self, PyObject *args)
5754 {
5755     PyObject *substring;
5756     Py_ssize_t start = 0;
5757     Py_ssize_t end = PY_SSIZE_T_MAX;
5758     Py_ssize_t result;
5759
5760     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5761                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5762         return NULL;
5763     substring = PyUnicode_FromObject(substring);
5764     if (!substring)
5765         return NULL;
5766
5767     result = stringlib_find_slice(
5768         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5769         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5770         start, end
5771         );
5772
5773     Py_DECREF(substring);
5774
5775     return PyInt_FromSsize_t(result);
5776 }
5777
5778 static PyObject *
5779 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
5780 {
5781     if (index < 0 || index >= self->length) {
5782         PyErr_SetString(PyExc_IndexError, "string index out of range");
5783         return NULL;
5784     }
5785
5786     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5787 }
5788
5789 static long
5790 unicode_hash(PyUnicodeObject *self)
5791 {
5792     /* Since Unicode objects compare equal to their ASCII string
5793        counterparts, they should use the individual character values
5794        as basis for their hash value.  This is needed to assure that
5795        strings and Unicode objects behave in the same way as
5796        dictionary keys. */
5797
5798     register Py_ssize_t len;
5799     register Py_UNICODE *p;
5800     register long x;
5801
5802     if (self->hash != -1)
5803         return self->hash;
5804     len = PyUnicode_GET_SIZE(self);
5805     p = PyUnicode_AS_UNICODE(self);
5806     x = *p << 7;
5807     while (--len >= 0)
5808         x = (1000003*x) ^ *p++;
5809     x ^= PyUnicode_GET_SIZE(self);
5810     if (x == -1)
5811         x = -2;
5812     self->hash = x;
5813     return x;
5814 }
5815
5816 PyDoc_STRVAR(index__doc__,
5817 "S.index(sub [,start [,end]]) -> int\n\
5818 \n\
5819 Like S.find() but raise ValueError when the substring is not found.");
5820
5821 static PyObject *
5822 unicode_index(PyUnicodeObject *self, PyObject *args)
5823 {
5824     Py_ssize_t result;
5825     PyObject *substring;
5826     Py_ssize_t start = 0;
5827     Py_ssize_t end = PY_SSIZE_T_MAX;
5828
5829     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5830                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5831         return NULL;
5832     substring = PyUnicode_FromObject(substring);
5833     if (!substring)
5834         return NULL;
5835
5836     result = stringlib_find_slice(
5837         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5838         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5839         start, end
5840         );
5841
5842     Py_DECREF(substring);
5843
5844     if (result < 0) {
5845         PyErr_SetString(PyExc_ValueError, "substring not found");
5846         return NULL;
5847     }
5848
5849     return PyInt_FromSsize_t(result);
5850 }
5851
5852 PyDoc_STRVAR(islower__doc__,
5853 "S.islower() -> bool\n\
5854 \n\
5855 Return True if all cased characters in S are lowercase and there is\n\
5856 at least one cased character in S, False otherwise.");
5857
5858 static PyObject*
5859 unicode_islower(PyUnicodeObject *self)
5860 {
5861     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5862     register const Py_UNICODE *e;
5863     int cased;
5864
5865     /* Shortcut for single character strings */
5866     if (PyUnicode_GET_SIZE(self) == 1)
5867         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5868
5869     /* Special case for empty strings */
5870     if (PyUnicode_GET_SIZE(self) == 0)
5871         return PyBool_FromLong(0);
5872
5873     e = p + PyUnicode_GET_SIZE(self);
5874     cased = 0;
5875     for (; p < e; p++) {
5876         register const Py_UNICODE ch = *p;
5877
5878         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5879             return PyBool_FromLong(0);
5880         else if (!cased && Py_UNICODE_ISLOWER(ch))
5881             cased = 1;
5882     }
5883     return PyBool_FromLong(cased);
5884 }
5885
5886 PyDoc_STRVAR(isupper__doc__,
5887 "S.isupper() -> bool\n\
5888 \n\
5889 Return True if all cased characters in S are uppercase and there is\n\
5890 at least one cased character in S, False otherwise.");
5891
5892 static PyObject*
5893 unicode_isupper(PyUnicodeObject *self)
5894 {
5895     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5896     register const Py_UNICODE *e;
5897     int cased;
5898
5899     /* Shortcut for single character strings */
5900     if (PyUnicode_GET_SIZE(self) == 1)
5901         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5902
5903     /* Special case for empty strings */
5904     if (PyUnicode_GET_SIZE(self) == 0)
5905         return PyBool_FromLong(0);
5906
5907     e = p + PyUnicode_GET_SIZE(self);
5908     cased = 0;
5909     for (; p < e; p++) {
5910         register const Py_UNICODE ch = *p;
5911
5912         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5913             return PyBool_FromLong(0);
5914         else if (!cased && Py_UNICODE_ISUPPER(ch))
5915             cased = 1;
5916     }
5917     return PyBool_FromLong(cased);
5918 }
5919
5920 PyDoc_STRVAR(istitle__doc__,
5921 "S.istitle() -> bool\n\
5922 \n\
5923 Return True if S is a titlecased string and there is at least one\n\
5924 character in S, i.e. upper- and titlecase characters may only\n\
5925 follow uncased characters and lowercase characters only cased ones.\n\
5926 Return False otherwise.");
5927
5928 static PyObject*
5929 unicode_istitle(PyUnicodeObject *self)
5930 {
5931     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5932     register const Py_UNICODE *e;
5933     int cased, previous_is_cased;
5934
5935     /* Shortcut for single character strings */
5936     if (PyUnicode_GET_SIZE(self) == 1)
5937         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5938                                (Py_UNICODE_ISUPPER(*p) != 0));
5939
5940     /* Special case for empty strings */
5941     if (PyUnicode_GET_SIZE(self) == 0)
5942         return PyBool_FromLong(0);
5943
5944     e = p + PyUnicode_GET_SIZE(self);
5945     cased = 0;
5946     previous_is_cased = 0;
5947     for (; p < e; p++) {
5948         register const Py_UNICODE ch = *p;
5949
5950         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5951             if (previous_is_cased)
5952                 return PyBool_FromLong(0);
5953             previous_is_cased = 1;
5954             cased = 1;
5955         }
5956         else if (Py_UNICODE_ISLOWER(ch)) {
5957             if (!previous_is_cased)
5958                 return PyBool_FromLong(0);
5959             previous_is_cased = 1;
5960             cased = 1;
5961         }
5962         else
5963             previous_is_cased = 0;
5964     }
5965     return PyBool_FromLong(cased);
5966 }
5967
5968 PyDoc_STRVAR(isspace__doc__,
5969 "S.isspace() -> bool\n\
5970 \n\
5971 Return True if all characters in S are whitespace\n\
5972 and there is at least one character in S, False otherwise.");
5973
5974 static PyObject*
5975 unicode_isspace(PyUnicodeObject *self)
5976 {
5977     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5978     register const Py_UNICODE *e;
5979
5980     /* Shortcut for single character strings */
5981     if (PyUnicode_GET_SIZE(self) == 1 &&
5982         Py_UNICODE_ISSPACE(*p))
5983         return PyBool_FromLong(1);
5984
5985     /* Special case for empty strings */
5986     if (PyUnicode_GET_SIZE(self) == 0)
5987         return PyBool_FromLong(0);
5988
5989     e = p + PyUnicode_GET_SIZE(self);
5990     for (; p < e; p++) {
5991         if (!Py_UNICODE_ISSPACE(*p))
5992             return PyBool_FromLong(0);
5993     }
5994     return PyBool_FromLong(1);
5995 }
5996
5997 PyDoc_STRVAR(isalpha__doc__,
5998 "S.isalpha() -> bool\n\
5999 \n\
6000 Return True if all characters in S are alphabetic\n\
6001 and there is at least one character in S, False otherwise.");
6002
6003 static PyObject*
6004 unicode_isalpha(PyUnicodeObject *self)
6005 {
6006     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6007     register const Py_UNICODE *e;
6008
6009     /* Shortcut for single character strings */
6010     if (PyUnicode_GET_SIZE(self) == 1 &&
6011         Py_UNICODE_ISALPHA(*p))
6012         return PyBool_FromLong(1);
6013
6014     /* Special case for empty strings */
6015     if (PyUnicode_GET_SIZE(self) == 0)
6016         return PyBool_FromLong(0);
6017
6018     e = p + PyUnicode_GET_SIZE(self);
6019     for (; p < e; p++) {
6020         if (!Py_UNICODE_ISALPHA(*p))
6021             return PyBool_FromLong(0);
6022     }
6023     return PyBool_FromLong(1);
6024 }
6025
6026 PyDoc_STRVAR(isalnum__doc__,
6027 "S.isalnum() -> bool\n\
6028 \n\
6029 Return True if all characters in S are alphanumeric\n\
6030 and there is at least one character in S, False otherwise.");
6031
6032 static PyObject*
6033 unicode_isalnum(PyUnicodeObject *self)
6034 {
6035     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6036     register const Py_UNICODE *e;
6037
6038     /* Shortcut for single character strings */
6039     if (PyUnicode_GET_SIZE(self) == 1 &&
6040         Py_UNICODE_ISALNUM(*p))
6041         return PyBool_FromLong(1);
6042
6043     /* Special case for empty strings */
6044     if (PyUnicode_GET_SIZE(self) == 0)
6045         return PyBool_FromLong(0);
6046
6047     e = p + PyUnicode_GET_SIZE(self);
6048     for (; p < e; p++) {
6049         if (!Py_UNICODE_ISALNUM(*p))
6050             return PyBool_FromLong(0);
6051     }
6052     return PyBool_FromLong(1);
6053 }
6054
6055 PyDoc_STRVAR(isdecimal__doc__,
6056 "S.isdecimal() -> bool\n\
6057 \n\
6058 Return True if there are only decimal characters in S,\n\
6059 False otherwise.");
6060
6061 static PyObject*
6062 unicode_isdecimal(PyUnicodeObject *self)
6063 {
6064     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6065     register const Py_UNICODE *e;
6066
6067     /* Shortcut for single character strings */
6068     if (PyUnicode_GET_SIZE(self) == 1 &&
6069         Py_UNICODE_ISDECIMAL(*p))
6070         return PyBool_FromLong(1);
6071
6072     /* Special case for empty strings */
6073     if (PyUnicode_GET_SIZE(self) == 0)
6074         return PyBool_FromLong(0);
6075
6076     e = p + PyUnicode_GET_SIZE(self);
6077     for (; p < e; p++) {
6078         if (!Py_UNICODE_ISDECIMAL(*p))
6079             return PyBool_FromLong(0);
6080     }
6081     return PyBool_FromLong(1);
6082 }
6083
6084 PyDoc_STRVAR(isdigit__doc__,
6085 "S.isdigit() -> bool\n\
6086 \n\
6087 Return True if all characters in S are digits\n\
6088 and there is at least one character in S, False otherwise.");
6089
6090 static PyObject*
6091 unicode_isdigit(PyUnicodeObject *self)
6092 {
6093     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6094     register const Py_UNICODE *e;
6095
6096     /* Shortcut for single character strings */
6097     if (PyUnicode_GET_SIZE(self) == 1 &&
6098         Py_UNICODE_ISDIGIT(*p))
6099         return PyBool_FromLong(1);
6100
6101     /* Special case for empty strings */
6102     if (PyUnicode_GET_SIZE(self) == 0)
6103         return PyBool_FromLong(0);
6104
6105     e = p + PyUnicode_GET_SIZE(self);
6106     for (; p < e; p++) {
6107         if (!Py_UNICODE_ISDIGIT(*p))
6108             return PyBool_FromLong(0);
6109     }
6110     return PyBool_FromLong(1);
6111 }
6112
6113 PyDoc_STRVAR(isnumeric__doc__,
6114 "S.isnumeric() -> bool\n\
6115 \n\
6116 Return True if there are only numeric characters in S,\n\
6117 False otherwise.");
6118
6119 static PyObject*
6120 unicode_isnumeric(PyUnicodeObject *self)
6121 {
6122     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6123     register const Py_UNICODE *e;
6124
6125     /* Shortcut for single character strings */
6126     if (PyUnicode_GET_SIZE(self) == 1 &&
6127         Py_UNICODE_ISNUMERIC(*p))
6128         return PyBool_FromLong(1);
6129
6130     /* Special case for empty strings */
6131     if (PyUnicode_GET_SIZE(self) == 0)
6132         return PyBool_FromLong(0);
6133
6134     e = p + PyUnicode_GET_SIZE(self);
6135     for (; p < e; p++) {
6136         if (!Py_UNICODE_ISNUMERIC(*p))
6137             return PyBool_FromLong(0);
6138     }
6139     return PyBool_FromLong(1);
6140 }
6141
6142 PyDoc_STRVAR(join__doc__,
6143 "S.join(sequence) -> unicode\n\
6144 \n\
6145 Return a string which is the concatenation of the strings in the\n\
6146 sequence.  The separator between elements is S.");
6147
6148 static PyObject*
6149 unicode_join(PyObject *self, PyObject *data)
6150 {
6151     return PyUnicode_Join(self, data);
6152 }
6153
6154 static Py_ssize_t
6155 unicode_length(PyUnicodeObject *self)
6156 {
6157     return self->length;
6158 }
6159
6160 PyDoc_STRVAR(ljust__doc__,
6161 "S.ljust(width[, fillchar]) -> int\n\
6162 \n\
6163 Return S left justified in a Unicode string of length width. Padding is\n\
6164 done using the specified fill character (default is a space).");
6165
6166 static PyObject *
6167 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6168 {
6169     Py_ssize_t width;
6170     Py_UNICODE fillchar = ' ';
6171
6172     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6173         return NULL;
6174
6175     if (self->length >= width && PyUnicode_CheckExact(self)) {
6176         Py_INCREF(self);
6177         return (PyObject*) self;
6178     }
6179
6180     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6181 }
6182
6183 PyDoc_STRVAR(lower__doc__,
6184 "S.lower() -> unicode\n\
6185 \n\
6186 Return a copy of the string S converted to lowercase.");
6187
6188 static PyObject*
6189 unicode_lower(PyUnicodeObject *self)
6190 {
6191     return fixup(self, fixlower);
6192 }
6193
/* Selectors telling the strip helpers which end(s) of the string to
   trim.  They double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings for lstrip/rstrip/strip. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip type: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6202
6203 /* externally visible for str.strip(unicode) */
6204 PyObject *
6205 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6206 {
6207         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6208         Py_ssize_t len = PyUnicode_GET_SIZE(self);
6209         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6210         Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6211         Py_ssize_t i, j;
6212
6213         BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6214
6215         i = 0;
6216         if (striptype != RIGHTSTRIP) {
6217             while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6218                 i++;
6219             }
6220         }
6221
6222         j = len;
6223         if (striptype != LEFTSTRIP) {
6224             do {
6225                 j--;
6226             } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6227             j++;
6228         }
6229
6230         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6231             Py_INCREF(self);
6232             return (PyObject*)self;
6233         }
6234         else
6235             return PyUnicode_FromUnicode(s+i, j-i);
6236 }
6237
6238
6239 static PyObject *
6240 do_strip(PyUnicodeObject *self, int striptype)
6241 {
6242         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6243         Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6244
6245         i = 0;
6246         if (striptype != RIGHTSTRIP) {
6247                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6248                         i++;
6249                 }
6250         }
6251
6252         j = len;
6253         if (striptype != LEFTSTRIP) {
6254                 do {
6255                         j--;
6256                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6257                 j++;
6258         }
6259
6260         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6261                 Py_INCREF(self);
6262                 return (PyObject*)self;
6263         }
6264         else
6265                 return PyUnicode_FromUnicode(s+i, j-i);
6266 }
6267
6268
6269 static PyObject *
6270 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6271 {
6272         PyObject *sep = NULL;
6273
6274         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6275                 return NULL;
6276
6277         if (sep != NULL && sep != Py_None) {
6278                 if (PyUnicode_Check(sep))
6279                         return _PyUnicode_XStrip(self, striptype, sep);
6280                 else if (PyString_Check(sep)) {
6281                         PyObject *res;
6282                         sep = PyUnicode_FromObject(sep);
6283                         if (sep==NULL)
6284                                 return NULL;
6285                         res = _PyUnicode_XStrip(self, striptype, sep);
6286                         Py_DECREF(sep);
6287                         return res;
6288                 }
6289                 else {
6290                         PyErr_Format(PyExc_TypeError,
6291                                      "%s arg must be None, unicode or str",
6292                                      STRIPNAME(striptype));
6293                         return NULL;
6294                 }
6295         }
6296
6297         return do_strip(self, striptype);
6298 }
6299
6300
6301 PyDoc_STRVAR(strip__doc__,
6302 "S.strip([chars]) -> unicode\n\
6303 \n\
6304 Return a copy of the string S with leading and trailing\n\
6305 whitespace removed.\n\
6306 If chars is given and not None, remove characters in chars instead.\n\
6307 If chars is a str, it will be converted to unicode before stripping");
6308
6309 static PyObject *
6310 unicode_strip(PyUnicodeObject *self, PyObject *args)
6311 {
6312         if (PyTuple_GET_SIZE(args) == 0)
6313                 return do_strip(self, BOTHSTRIP); /* Common case */
6314         else
6315                 return do_argstrip(self, BOTHSTRIP, args);
6316 }
6317
6318
6319 PyDoc_STRVAR(lstrip__doc__,
6320 "S.lstrip([chars]) -> unicode\n\
6321 \n\
6322 Return a copy of the string S with leading whitespace removed.\n\
6323 If chars is given and not None, remove characters in chars instead.\n\
6324 If chars is a str, it will be converted to unicode before stripping");
6325
6326 static PyObject *
6327 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6328 {
6329         if (PyTuple_GET_SIZE(args) == 0)
6330                 return do_strip(self, LEFTSTRIP); /* Common case */
6331         else
6332                 return do_argstrip(self, LEFTSTRIP, args);
6333 }
6334
6335
6336 PyDoc_STRVAR(rstrip__doc__,
6337 "S.rstrip([chars]) -> unicode\n\
6338 \n\
6339 Return a copy of the string S with trailing whitespace removed.\n\
6340 If chars is given and not None, remove characters in chars instead.\n\
6341 If chars is a str, it will be converted to unicode before stripping");
6342
6343 static PyObject *
6344 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6345 {
6346         if (PyTuple_GET_SIZE(args) == 0)
6347                 return do_strip(self, RIGHTSTRIP); /* Common case */
6348         else
6349                 return do_argstrip(self, RIGHTSTRIP, args);
6350 }
6351
6352
6353 static PyObject*
6354 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6355 {
6356     PyUnicodeObject *u;
6357     Py_UNICODE *p;
6358     Py_ssize_t nchars;
6359     size_t nbytes;
6360
6361     if (len < 0)
6362         len = 0;
6363
6364     if (len == 1 && PyUnicode_CheckExact(str)) {
6365         /* no repeat, return original string */
6366         Py_INCREF(str);
6367         return (PyObject*) str;
6368     }
6369
6370     /* ensure # of chars needed doesn't overflow int and # of bytes
6371      * needed doesn't overflow size_t
6372      */
6373     nchars = len * str->length;
6374     if (len && nchars / len != str->length) {
6375         PyErr_SetString(PyExc_OverflowError,
6376                         "repeated string is too long");
6377         return NULL;
6378     }
6379     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6380     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6381         PyErr_SetString(PyExc_OverflowError,
6382                         "repeated string is too long");
6383         return NULL;
6384     }
6385     u = _PyUnicode_New(nchars);
6386     if (!u)
6387         return NULL;
6388
6389     p = u->str;
6390
6391     if (str->length == 1 && len > 0) {
6392         Py_UNICODE_FILL(p, str->str[0], len);
6393     } else {
6394         Py_ssize_t done = 0; /* number of characters copied this far */
6395         if (done < nchars) {
6396             Py_UNICODE_COPY(p, str->str, str->length);
6397             done = str->length;
6398         }
6399         while (done < nchars) {
6400             int n = (done <= nchars-done) ? done : nchars-done;
6401             Py_UNICODE_COPY(p+done, p, n);
6402             done += n;
6403         }
6404     }
6405
6406     return (PyObject*) u;
6407 }
6408
6409 PyObject *PyUnicode_Replace(PyObject *obj,
6410                             PyObject *subobj,
6411                             PyObject *replobj,
6412                             Py_ssize_t maxcount)
6413 {
6414     PyObject *self;
6415     PyObject *str1;
6416     PyObject *str2;
6417     PyObject *result;
6418
6419     self = PyUnicode_FromObject(obj);
6420     if (self == NULL)
6421         return NULL;
6422     str1 = PyUnicode_FromObject(subobj);
6423     if (str1 == NULL) {
6424         Py_DECREF(self);
6425         return NULL;
6426     }
6427     str2 = PyUnicode_FromObject(replobj);
6428     if (str2 == NULL) {
6429         Py_DECREF(self);
6430         Py_DECREF(str1);
6431         return NULL;
6432     }
6433     result = replace((PyUnicodeObject *)self,
6434                      (PyUnicodeObject *)str1,
6435                      (PyUnicodeObject *)str2,
6436                      maxcount);
6437     Py_DECREF(self);
6438     Py_DECREF(str1);
6439     Py_DECREF(str2);
6440     return result;
6441 }
6442
6443 PyDoc_STRVAR(replace__doc__,
6444 "S.replace (old, new[, maxsplit]) -> unicode\n\
6445 \n\
6446 Return a copy of S with all occurrences of substring\n\
6447 old replaced by new.  If the optional argument maxsplit is\n\
6448 given, only the first maxsplit occurrences are replaced.");
6449
6450 static PyObject*
6451 unicode_replace(PyUnicodeObject *self, PyObject *args)
6452 {
6453     PyUnicodeObject *str1;
6454     PyUnicodeObject *str2;
6455     Py_ssize_t maxcount = -1;
6456     PyObject *result;
6457
6458     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6459         return NULL;
6460     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6461     if (str1 == NULL)
6462         return NULL;
6463     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6464     if (str2 == NULL) {
6465         Py_DECREF(str1);
6466         return NULL;
6467     }
6468
6469     result = replace(self, str1, str2, maxcount);
6470
6471     Py_DECREF(str1);
6472     Py_DECREF(str2);
6473     return result;
6474 }
6475
6476 static
6477 PyObject *unicode_repr(PyObject *unicode)
6478 {
6479     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6480                                 PyUnicode_GET_SIZE(unicode),
6481                                 1);
6482 }
6483
6484 PyDoc_STRVAR(rfind__doc__,
6485 "S.rfind(sub [,start [,end]]) -> int\n\
6486 \n\
6487 Return the highest index in S where substring sub is found,\n\
6488 such that sub is contained within s[start,end].  Optional\n\
6489 arguments start and end are interpreted as in slice notation.\n\
6490 \n\
6491 Return -1 on failure.");
6492
6493 static PyObject *
6494 unicode_rfind(PyUnicodeObject *self, PyObject *args)
6495 {
6496     PyObject *substring;
6497     Py_ssize_t start = 0;
6498     Py_ssize_t end = PY_SSIZE_T_MAX;
6499     Py_ssize_t result;
6500
6501     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6502                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6503         return NULL;
6504     substring = PyUnicode_FromObject(substring);
6505     if (!substring)
6506         return NULL;
6507
6508     result = stringlib_rfind_slice(
6509         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6510         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6511         start, end
6512         );
6513
6514     Py_DECREF(substring);
6515
6516     return PyInt_FromSsize_t(result);
6517 }
6518
6519 PyDoc_STRVAR(rindex__doc__,
6520 "S.rindex(sub [,start [,end]]) -> int\n\
6521 \n\
6522 Like S.rfind() but raise ValueError when the substring is not found.");
6523
6524 static PyObject *
6525 unicode_rindex(PyUnicodeObject *self, PyObject *args)
6526 {
6527     PyObject *substring;
6528     Py_ssize_t start = 0;
6529     Py_ssize_t end = PY_SSIZE_T_MAX;
6530     Py_ssize_t result;
6531
6532     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6533                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6534         return NULL;
6535     substring = PyUnicode_FromObject(substring);
6536     if (!substring)
6537         return NULL;
6538
6539     result = stringlib_rfind_slice(
6540         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6541         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6542         start, end
6543         );
6544
6545     Py_DECREF(substring);
6546
6547     if (result < 0) {
6548         PyErr_SetString(PyExc_ValueError, "substring not found");
6549         return NULL;
6550     }
6551     return PyInt_FromSsize_t(result);
6552 }
6553
6554 PyDoc_STRVAR(rjust__doc__,
6555 "S.rjust(width[, fillchar]) -> unicode\n\
6556 \n\
6557 Return S right justified in a Unicode string of length width. Padding is\n\
6558 done using the specified fill character (default is a space).");
6559
6560 static PyObject *
6561 unicode_rjust(PyUnicodeObject *self, PyObject *args)
6562 {
6563     Py_ssize_t width;
6564     Py_UNICODE fillchar = ' ';
6565
6566     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6567         return NULL;
6568
6569     if (self->length >= width && PyUnicode_CheckExact(self)) {
6570         Py_INCREF(self);
6571         return (PyObject*) self;
6572     }
6573
6574     return (PyObject*) pad(self, width - self->length, 0, fillchar);
6575 }
6576
6577 static PyObject*
6578 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6579 {
6580     /* standard clamping */
6581     if (start < 0)
6582         start = 0;
6583     if (end < 0)
6584         end = 0;
6585     if (end > self->length)
6586         end = self->length;
6587     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6588         /* full slice, return original string */
6589         Py_INCREF(self);
6590         return (PyObject*) self;
6591     }
6592     if (start > end)
6593         start = end;
6594     /* copy slice */
6595     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6596                                              end - start);
6597 }
6598
6599 PyObject *PyUnicode_Split(PyObject *s,
6600                           PyObject *sep,
6601                           Py_ssize_t maxsplit)
6602 {
6603     PyObject *result;
6604
6605     s = PyUnicode_FromObject(s);
6606     if (s == NULL)
6607         return NULL;
6608     if (sep != NULL) {
6609         sep = PyUnicode_FromObject(sep);
6610         if (sep == NULL) {
6611             Py_DECREF(s);
6612             return NULL;
6613         }
6614     }
6615
6616     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6617
6618     Py_DECREF(s);
6619     Py_XDECREF(sep);
6620     return result;
6621 }
6622
6623 PyDoc_STRVAR(split__doc__,
6624 "S.split([sep [,maxsplit]]) -> list of strings\n\
6625 \n\
6626 Return a list of the words in S, using sep as the\n\
6627 delimiter string.  If maxsplit is given, at most maxsplit\n\
6628 splits are done. If sep is not specified or is None,\n\
6629 any whitespace string is a separator.");
6630
6631 static PyObject*
6632 unicode_split(PyUnicodeObject *self, PyObject *args)
6633 {
6634     PyObject *substring = Py_None;
6635     Py_ssize_t maxcount = -1;
6636
6637     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6638         return NULL;
6639
6640     if (substring == Py_None)
6641         return split(self, NULL, maxcount);
6642     else if (PyUnicode_Check(substring))
6643         return split(self, (PyUnicodeObject *)substring, maxcount);
6644     else
6645         return PyUnicode_Split((PyObject *)self, substring, maxcount);
6646 }
6647
6648 PyObject *
6649 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6650 {
6651     PyObject* str_obj;
6652     PyObject* sep_obj;
6653     PyObject* out;
6654
6655     str_obj = PyUnicode_FromObject(str_in);
6656     if (!str_obj)
6657         return NULL;
6658     sep_obj = PyUnicode_FromObject(sep_in);
6659     if (!sep_obj) {
6660         Py_DECREF(str_obj);
6661         return NULL;
6662     }
6663
6664     out = stringlib_partition(
6665         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6666         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6667         );
6668
6669     Py_DECREF(sep_obj);
6670     Py_DECREF(str_obj);
6671
6672     return out;
6673 }
6674
6675
6676 PyObject *
6677 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6678 {
6679     PyObject* str_obj;
6680     PyObject* sep_obj;
6681     PyObject* out;
6682
6683     str_obj = PyUnicode_FromObject(str_in);
6684     if (!str_obj)
6685         return NULL;
6686     sep_obj = PyUnicode_FromObject(sep_in);
6687     if (!sep_obj) {
6688         Py_DECREF(str_obj);
6689         return NULL;
6690     }
6691
6692     out = stringlib_rpartition(
6693         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6694         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6695         );
6696
6697     Py_DECREF(sep_obj);
6698     Py_DECREF(str_obj);
6699
6700     return out;
6701 }
6702
6703 PyDoc_STRVAR(partition__doc__,
6704 "S.partition(sep) -> (head, sep, tail)\n\
6705 \n\
6706 Searches for the separator sep in S, and returns the part before it,\n\
6707 the separator itself, and the part after it.  If the separator is not\n\
6708 found, returns S and two empty strings.");
6709
6710 static PyObject*
6711 unicode_partition(PyUnicodeObject *self, PyObject *separator)
6712 {
6713     return PyUnicode_Partition((PyObject *)self, separator);
6714 }
6715
6716 PyDoc_STRVAR(rpartition__doc__,
6717 "S.rpartition(sep) -> (tail, sep, head)\n\
6718 \n\
6719 Searches for the separator sep in S, starting at the end of S, and returns\n\
6720 the part before it, the separator itself, and the part after it.  If the\n\
6721 separator is not found, returns two empty strings and S.");
6722
6723 static PyObject*
6724 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6725 {
6726     return PyUnicode_RPartition((PyObject *)self, separator);
6727 }
6728
6729 PyObject *PyUnicode_RSplit(PyObject *s,
6730                            PyObject *sep,
6731                            Py_ssize_t maxsplit)
6732 {
6733     PyObject *result;
6734
6735     s = PyUnicode_FromObject(s);
6736     if (s == NULL)
6737         return NULL;
6738     if (sep != NULL) {
6739         sep = PyUnicode_FromObject(sep);
6740         if (sep == NULL) {
6741             Py_DECREF(s);
6742             return NULL;
6743         }
6744     }
6745
6746     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6747
6748     Py_DECREF(s);
6749     Py_XDECREF(sep);
6750     return result;
6751 }
6752
6753 PyDoc_STRVAR(rsplit__doc__,
6754 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6755 \n\
6756 Return a list of the words in S, using sep as the\n\
6757 delimiter string, starting at the end of the string and\n\
6758 working to the front.  If maxsplit is given, at most maxsplit\n\
6759 splits are done. If sep is not specified, any whitespace string\n\
6760 is a separator.");
6761
6762 static PyObject*
6763 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6764 {
6765     PyObject *substring = Py_None;
6766     Py_ssize_t maxcount = -1;
6767
6768     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
6769         return NULL;
6770
6771     if (substring == Py_None)
6772         return rsplit(self, NULL, maxcount);
6773     else if (PyUnicode_Check(substring))
6774         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6775     else
6776         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6777 }
6778
6779 PyDoc_STRVAR(splitlines__doc__,
6780 "S.splitlines([keepends]]) -> list of strings\n\
6781 \n\
6782 Return a list of the lines in S, breaking at line boundaries.\n\
6783 Line breaks are not included in the resulting list unless keepends\n\
6784 is given and true.");
6785
6786 static PyObject*
6787 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6788 {
6789     int keepends = 0;
6790
6791     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6792         return NULL;
6793
6794     return PyUnicode_Splitlines((PyObject *)self, keepends);
6795 }
6796
6797 static
6798 PyObject *unicode_str(PyUnicodeObject *self)
6799 {
6800     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6801 }
6802
6803 PyDoc_STRVAR(swapcase__doc__,
6804 "S.swapcase() -> unicode\n\
6805 \n\
6806 Return a copy of S with uppercase characters converted to lowercase\n\
6807 and vice versa.");
6808
6809 static PyObject*
6810 unicode_swapcase(PyUnicodeObject *self)
6811 {
6812     return fixup(self, fixswapcase);
6813 }
6814
6815 PyDoc_STRVAR(translate__doc__,
6816 "S.translate(table) -> unicode\n\
6817 \n\
6818 Return a copy of the string S, where all characters have been mapped\n\
6819 through the given translation table, which must be a mapping of\n\
6820 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6821 Unmapped characters are left untouched. Characters mapped to None\n\
6822 are deleted.");
6823
6824 static PyObject*
6825 unicode_translate(PyUnicodeObject *self, PyObject *table)
6826 {
6827     return PyUnicode_TranslateCharmap(self->str,
6828                                       self->length,
6829                                       table,
6830                                       "ignore");
6831 }
6832
6833 PyDoc_STRVAR(upper__doc__,
6834 "S.upper() -> unicode\n\
6835 \n\
6836 Return a copy of S converted to uppercase.");
6837
6838 static PyObject*
6839 unicode_upper(PyUnicodeObject *self)
6840 {
6841     return fixup(self, fixupper);
6842 }
6843
6844 PyDoc_STRVAR(zfill__doc__,
6845 "S.zfill(width) -> unicode\n\
6846 \n\
6847 Pad a numeric string x with zeros on the left, to fill a field\n\
6848 of the specified width. The string x is never truncated.");
6849
6850 static PyObject *
6851 unicode_zfill(PyUnicodeObject *self, PyObject *args)
6852 {
6853     Py_ssize_t fill;
6854     PyUnicodeObject *u;
6855
6856     Py_ssize_t width;
6857     if (!PyArg_ParseTuple(args, "n:zfill", &width))
6858         return NULL;
6859
6860     if (self->length >= width) {
6861         if (PyUnicode_CheckExact(self)) {
6862             Py_INCREF(self);
6863             return (PyObject*) self;
6864         }
6865         else
6866             return PyUnicode_FromUnicode(
6867                 PyUnicode_AS_UNICODE(self),
6868                 PyUnicode_GET_SIZE(self)
6869             );
6870     }
6871
6872     fill = width - self->length;
6873
6874     u = pad(self, fill, 0, '0');
6875
6876     if (u == NULL)
6877         return NULL;
6878
6879     if (u->str[fill] == '+' || u->str[fill] == '-') {
6880         /* move sign to beginning of string */
6881         u->str[0] = u->str[fill];
6882         u->str[fill] = '0';
6883     }
6884
6885     return (PyObject*) u;
6886 }
6887
#if 0
/* Compiled out.  Would report unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6895
6896 PyDoc_STRVAR(startswith__doc__,
6897 "S.startswith(prefix[, start[, end]]) -> bool\n\
6898 \n\
6899 Return True if S starts with the specified prefix, False otherwise.\n\
6900 With optional start, test S beginning at that position.\n\
6901 With optional end, stop comparing S at that position.\n\
6902 prefix can also be a tuple of strings to try.");
6903
6904 static PyObject *
6905 unicode_startswith(PyUnicodeObject *self,
6906                    PyObject *args)
6907 {
6908     PyObject *subobj;
6909     PyUnicodeObject *substring;
6910     Py_ssize_t start = 0;
6911     Py_ssize_t end = PY_SSIZE_T_MAX;
6912     int result;
6913
6914     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
6915                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6916         return NULL;
6917     if (PyTuple_Check(subobj)) {
6918         Py_ssize_t i;
6919         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6920             substring = (PyUnicodeObject *)PyUnicode_FromObject(
6921                             PyTuple_GET_ITEM(subobj, i));
6922             if (substring == NULL)
6923                 return NULL;
6924             result = tailmatch(self, substring, start, end, -1);
6925             Py_DECREF(substring);
6926             if (result) {
6927                 Py_RETURN_TRUE;
6928             }
6929         }
6930         /* nothing matched */
6931         Py_RETURN_FALSE;
6932     }
6933     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6934     if (substring == NULL)
6935          return NULL;
6936     result = tailmatch(self, substring, start, end, -1);
6937     Py_DECREF(substring);
6938     return PyBool_FromLong(result);
6939 }
6940
6941
6942 PyDoc_STRVAR(endswith__doc__,
6943 "S.endswith(suffix[, start[, end]]) -> bool\n\
6944 \n\
6945 Return True if S ends with the specified suffix, False otherwise.\n\
6946 With optional start, test S beginning at that position.\n\
6947 With optional end, stop comparing S at that position.\n\
6948 suffix can also be a tuple of strings to try.");
6949
6950 static PyObject *
6951 unicode_endswith(PyUnicodeObject *self,
6952                  PyObject *args)
6953 {
6954     PyObject *subobj;
6955     PyUnicodeObject *substring;
6956     Py_ssize_t start = 0;
6957     Py_ssize_t end = PY_SSIZE_T_MAX;
6958     int result;
6959
6960     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6961         _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6962         return NULL;
6963     if (PyTuple_Check(subobj)) {
6964         Py_ssize_t i;
6965         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6966             substring = (PyUnicodeObject *)PyUnicode_FromObject(
6967                             PyTuple_GET_ITEM(subobj, i));
6968             if (substring == NULL)
6969             return NULL;
6970             result = tailmatch(self, substring, start, end, +1);
6971             Py_DECREF(substring);
6972             if (result) {
6973                 Py_RETURN_TRUE;
6974             }
6975         }
6976         Py_RETURN_FALSE;
6977     }
6978     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6979     if (substring == NULL)
6980     return NULL;
6981
6982     result = tailmatch(self, substring, start, end, +1);
6983     Py_DECREF(substring);
6984     return PyBool_FromLong(result);
6985 }
6986
6987
6988
6989 static PyObject *
6990 unicode_getnewargs(PyUnicodeObject *v)
6991 {
6992         return Py_BuildValue("(u#)", v->str, v->length);
6993 }
6994
6995
6996 static PyMethodDef unicode_methods[] = {
6997
6998     /* Order is according to common usage: often used methods should
6999        appear first, since lookup is done sequentially. */
7000
7001     {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7002     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7003     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7004     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7005     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7006     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7007     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7008     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7009     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7010     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7011     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7012     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7013     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7014     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7015     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7016     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7017     {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7018 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7019     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7020     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7021     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7022     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7023     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7024     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7025     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7026     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7027     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7028     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7029     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7030     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7031     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7032     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7033     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7034     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7035     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7036     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7037     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7038     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7039     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7040     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7041 #if 0
7042     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7043 #endif
7044
7045 #if 0
7046     /* This one is just used for debugging the implementation. */
7047     {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
7048 #endif
7049
7050     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
7051     {NULL, NULL}
7052 };
7053
7054 static PyObject *
7055 unicode_mod(PyObject *v, PyObject *w)
7056 {
7057        if (!PyUnicode_Check(v)) {
7058                Py_INCREF(Py_NotImplemented);
7059                return Py_NotImplemented;
7060        }
7061        return PyUnicode_Format(v, w);
7062 }
7063
7064 static PyNumberMethods unicode_as_number = {
7065         0,                              /*nb_add*/
7066         0,                              /*nb_subtract*/
7067         0,                              /*nb_multiply*/
7068         0,                              /*nb_divide*/
7069         unicode_mod,                    /*nb_remainder*/
7070 };
7071
7072 static PySequenceMethods unicode_as_sequence = {
7073     (lenfunc) unicode_length,           /* sq_length */
7074     PyUnicode_Concat,                   /* sq_concat */
7075     (ssizeargfunc) unicode_repeat,      /* sq_repeat */
7076     (ssizeargfunc) unicode_getitem,     /* sq_item */
7077     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
7078     0,                                  /* sq_ass_item */
7079     0,                                  /* sq_ass_slice */
7080     PyUnicode_Contains,                 /* sq_contains */
7081 };
7082
7083 static PyObject*
7084 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7085 {
7086     if (PyIndex_Check(item)) {
7087         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7088         if (i == -1 && PyErr_Occurred())
7089             return NULL;
7090         if (i < 0)
7091             i += PyUnicode_GET_SIZE(self);
7092         return unicode_getitem(self, i);
7093     } else if (PySlice_Check(item)) {
7094         Py_ssize_t start, stop, step, slicelength, cur, i;
7095         Py_UNICODE* source_buf;
7096         Py_UNICODE* result_buf;
7097         PyObject* result;
7098
7099         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7100                                  &start, &stop, &step, &slicelength) < 0) {
7101             return NULL;
7102         }
7103
7104         if (slicelength <= 0) {
7105             return PyUnicode_FromUnicode(NULL, 0);
7106         } else {
7107             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7108             result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7109                                                     sizeof(Py_UNICODE));
7110
7111             if (result_buf == NULL)
7112                     return PyErr_NoMemory();
7113
7114             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7115                 result_buf[i] = source_buf[cur];
7116             }
7117
7118             result = PyUnicode_FromUnicode(result_buf, slicelength);
7119             PyMem_FREE(result_buf);
7120             return result;
7121         }
7122     } else {
7123         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7124         return NULL;
7125     }
7126 }
7127
7128 static PyMappingMethods unicode_as_mapping = {
7129     (lenfunc)unicode_length,            /* mp_length */
7130     (binaryfunc)unicode_subscript,      /* mp_subscript */
7131     (objobjargproc)0,                   /* mp_ass_subscript */
7132 };
7133
7134 static Py_ssize_t
7135 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7136                           Py_ssize_t index,
7137                           const void **ptr)
7138 {
7139     if (index != 0) {
7140         PyErr_SetString(PyExc_SystemError,
7141                         "accessing non-existent unicode segment");
7142         return -1;
7143     }
7144     *ptr = (void *) self->str;
7145     return PyUnicode_GET_DATA_SIZE(self);
7146 }
7147
7148 static Py_ssize_t
7149 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7150                            const void **ptr)
7151 {
7152     PyErr_SetString(PyExc_TypeError,
7153                     "cannot use unicode as modifiable buffer");
7154     return -1;
7155 }
7156
7157 static int
7158 unicode_buffer_getsegcount(PyUnicodeObject *self,
7159                            Py_ssize_t *lenp)
7160 {
7161     if (lenp)
7162         *lenp = PyUnicode_GET_DATA_SIZE(self);
7163     return 1;
7164 }
7165
7166 static Py_ssize_t
7167 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7168                           Py_ssize_t index,
7169                           const void **ptr)
7170 {
7171     PyObject *str;
7172
7173     if (index != 0) {
7174         PyErr_SetString(PyExc_SystemError,
7175                         "accessing non-existent unicode segment");
7176         return -1;
7177     }
7178     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7179     if (str == NULL)
7180         return -1;
7181     *ptr = (void *) PyString_AS_STRING(str);
7182     return PyString_GET_SIZE(str);
7183 }
7184
7185 /* Helpers for PyUnicode_Format() */
7186
7187 static PyObject *
7188 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7189 {
7190     Py_ssize_t argidx = *p_argidx;
7191     if (argidx < arglen) {
7192         (*p_argidx)++;
7193         if (arglen < 0)
7194             return args;
7195         else
7196             return PyTuple_GetItem(args, argidx);
7197     }
7198     PyErr_SetString(PyExc_TypeError,
7199                     "not enough arguments for format string");
7200     return NULL;
7201 }
7202
/* Bit flags passed to the formatting helpers as `flags`.  F_ALT selects
   the '#' alternate form in formatfloat() and formatint().
   NOTE(review): the other bits presumably mirror the printf-style
   '-', '+', ' ' and '0' flags - confirm in PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7208
7209 static Py_ssize_t
7210 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7211 {
7212     register Py_ssize_t i;
7213     Py_ssize_t len = strlen(charbuffer);
7214     for (i = len - 1; i >= 0; i--)
7215         buffer[i] = (Py_UNICODE) charbuffer[i];
7216
7217     return len;
7218 }
7219
7220 static int
7221 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7222 {
7223     Py_ssize_t result;
7224
7225     PyOS_ascii_formatd((char *)buffer, len, format, x);
7226     result = strtounicode(buffer, (char *)buffer);
7227     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7228 }
7229
7230 static int
7231 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7232 {
7233     Py_ssize_t result;
7234
7235     PyOS_snprintf((char *)buffer, len, format, x);
7236     result = strtounicode(buffer, (char *)buffer);
7237     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7238 }
7239
7240 /* XXX To save some code duplication, formatfloat/long/int could have been
7241    shared with stringobject.c, converting from 8-bit to Unicode after the
7242    formatting is done. */
7243
7244 static int
7245 formatfloat(Py_UNICODE *buf,
7246             size_t buflen,
7247             int flags,
7248             int prec,
7249             int type,
7250             PyObject *v)
7251 {
7252     /* fmt = '%#.' + `prec` + `type`
7253        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7254     char fmt[20];
7255     double x;
7256
7257     x = PyFloat_AsDouble(v);
7258     if (x == -1.0 && PyErr_Occurred())
7259         return -1;
7260     if (prec < 0)
7261         prec = 6;
7262     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7263         type = 'g';
7264     /* Worst case length calc to ensure no buffer overrun:
7265
7266        'g' formats:
7267          fmt = %#.<prec>g
7268          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7269             for any double rep.)
7270          len = 1 + prec + 1 + 2 + 5 = 9 + prec
7271
7272        'f' formats:
7273          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7274          len = 1 + 50 + 1 + prec = 52 + prec
7275
7276        If prec=0 the effective precision is 1 (the leading digit is
7277        always given), therefore increase the length by one.
7278
7279     */
7280     if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7281         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7282         PyErr_SetString(PyExc_OverflowError,
7283                         "formatted float is too long (precision too large?)");
7284         return -1;
7285     }
7286     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7287                   (flags&F_ALT) ? "#" : "",
7288                   prec, type);
7289     return doubletounicode(buf, buflen, fmt, x);
7290 }
7291
7292 static PyObject*
7293 formatlong(PyObject *val, int flags, int prec, int type)
7294 {
7295         char *buf;
7296         int i, len;
7297         PyObject *str; /* temporary string object. */
7298         PyUnicodeObject *result;
7299
7300         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7301         if (!str)
7302                 return NULL;
7303         result = _PyUnicode_New(len);
7304         if (!result) {
7305                 Py_DECREF(str);
7306                 return NULL;
7307         }
7308         for (i = 0; i < len; i++)
7309                 result->str[i] = buf[i];
7310         result->str[len] = 0;
7311         Py_DECREF(str);
7312         return (PyObject*)result;
7313 }
7314
7315 static int
7316 formatint(Py_UNICODE *buf,
7317           size_t buflen,
7318           int flags,
7319           int prec,
7320           int type,
7321           PyObject *v)
7322 {
7323     /* fmt = '%#.' + `prec` + 'l' + `type`
7324      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7325      *                     + 1 + 1
7326      *                   = 24
7327      */
7328     char fmt[64]; /* plenty big enough! */
7329     char *sign;
7330     long x;
7331
7332     x = PyInt_AsLong(v);
7333     if (x == -1 && PyErr_Occurred())
7334         return -1;
7335     if (x < 0 && type == 'u') {
7336         type = 'd';
7337     }
7338     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7339         sign = "-";
7340     else
7341         sign = "";
7342     if (prec < 0)
7343         prec = 1;
7344
7345     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7346      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7347      */
7348     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7349         PyErr_SetString(PyExc_OverflowError,
7350                 "formatted integer is too long (precision too large?)");
7351         return -1;
7352     }
7353
7354     if ((flags & F_ALT) &&
7355         (type == 'x' || type == 'X')) {
7356         /* When converting under %#x or %#X, there are a number
7357          * of issues that cause pain:
7358          * - when 0 is being converted, the C standard leaves off
7359          *   the '0x' or '0X', which is inconsistent with other
7360          *   %#x/%#X conversions and inconsistent with Python's
7361          *   hex() function
7362          * - there are platforms that violate the standard and
7363          *   convert 0 with the '0x' or '0X'
7364          *   (Metrowerks, Compaq Tru64)
7365          * - there are platforms that give '0x' when converting
7366          *   under %#X, but convert 0 in accordance with the
7367          *   standard (OS/2 EMX)
7368          *
7369          * We can achieve the desired consistency by inserting our
7370          * own '0x' or '0X' prefix, and substituting %x/%X in place
7371          * of %#x/%#X.
7372          *
7373          * Note that this is the same approach as used in
7374          * formatint() in stringobject.c
7375          */
7376         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7377                       sign, type, prec, type);
7378     }
7379     else {
7380         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7381                       sign, (flags&F_ALT) ? "#" : "",
7382                       prec, type);
7383     }
7384     if (sign[0])
7385         return longtounicode(buf, buflen, fmt, -x);
7386     else
7387         return longtounicode(buf, buflen, fmt, x);
7388 }
7389
7390 static int
7391 formatchar(Py_UNICODE *buf,
7392            size_t buflen,
7393            PyObject *v)
7394 {
7395     /* presume that the buffer is at least 2 characters long */
7396     if (PyUnicode_Check(v)) {
7397         if (PyUnicode_GET_SIZE(v) != 1)
7398             goto onError;
7399         buf[0] = PyUnicode_AS_UNICODE(v)[0];
7400     }
7401
7402     else if (PyString_Check(v)) {
7403         if (PyString_GET_SIZE(v) != 1)
7404             goto onError;
7405         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7406     }
7407
7408     else {
7409         /* Integer input truncated to a character */
7410         long x;
7411         x = PyInt_AsLong(v);
7412         if (x == -1 && PyErr_Occurred())
7413             goto onError;
7414 #ifdef Py_UNICODE_WIDE
7415         if (x < 0 || x > 0x10ffff) {
7416             PyErr_SetString(PyExc_OverflowError,
7417                             "%c arg not in range(0x110000) "
7418                             "(wide Python build)");
7419             return -1;
7420         }
7421 #else
7422         if (x < 0 || x > 0xffff) {
7423             PyErr_SetString(PyExc_OverflowError,
7424                             "%c arg not in range(0x10000) "
7425                             "(narrow Python build)");
7426             return -1;
7427         }
7428 #endif
7429         buf[0] = (Py_UNICODE) x;
7430     }
7431     buf[1] = '\0';
7432     return 1;
7433
7434  onError:
7435     PyErr_SetString(PyExc_TypeError,
7436                     "%c requires int or char");
7437     return -1;
7438 }
7439
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
7449
7450 PyObject *PyUnicode_Format(PyObject *format,
7451                            PyObject *args)
7452 {
7453     Py_UNICODE *fmt, *res;
7454     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7455     int args_owned = 0;
7456     PyUnicodeObject *result = NULL;
7457     PyObject *dict = NULL;
7458     PyObject *uformat;
7459
7460     if (format == NULL || args == NULL) {
7461         PyErr_BadInternalCall();
7462         return NULL;
7463     }
7464     uformat = PyUnicode_FromObject(format);
7465     if (uformat == NULL)
7466         return NULL;
7467     fmt = PyUnicode_AS_UNICODE(uformat);
7468     fmtcnt = PyUnicode_GET_SIZE(uformat);
7469
7470     reslen = rescnt = fmtcnt + 100;
7471     result = _PyUnicode_New(reslen);
7472     if (result == NULL)
7473         goto onError;
7474     res = PyUnicode_AS_UNICODE(result);
7475
7476     if (PyTuple_Check(args)) {
7477         arglen = PyTuple_Size(args);
7478         argidx = 0;
7479     }
7480     else {
7481         arglen = -1;
7482         argidx = -2;
7483     }
7484     if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7485         !PyObject_TypeCheck(args, &PyBaseString_Type))
7486         dict = args;
7487
7488     while (--fmtcnt >= 0) {
7489         if (*fmt != '%') {
7490             if (--rescnt < 0) {
7491                 rescnt = fmtcnt + 100;
7492                 reslen += rescnt;
7493                 if (_PyUnicode_Resize(&result, reslen) < 0)
7494                     goto onError;
7495                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7496                 --rescnt;
7497             }
7498             *res++ = *fmt++;
7499         }
7500         else {
7501             /* Got a format specifier */
7502             int flags = 0;
7503             Py_ssize_t width = -1;
7504             int prec = -1;
7505             Py_UNICODE c = '\0';
7506             Py_UNICODE fill;
7507             PyObject *v = NULL;
7508             PyObject *temp = NULL;
7509             Py_UNICODE *pbuf;
7510             Py_UNICODE sign;
7511             Py_ssize_t len;
7512             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7513
7514             fmt++;
7515             if (*fmt == '(') {
7516                 Py_UNICODE *keystart;
7517                 Py_ssize_t keylen;
7518                 PyObject *key;
7519                 int pcount = 1;
7520
7521                 if (dict == NULL) {
7522                     PyErr_SetString(PyExc_TypeError,
7523                                     "format requires a mapping");
7524                     goto onError;
7525                 }
7526                 ++fmt;
7527                 --fmtcnt;
7528                 keystart = fmt;
7529                 /* Skip over balanced parentheses */
7530                 while (pcount > 0 && --fmtcnt >= 0) {
7531                     if (*fmt == ')')
7532                         --pcount;
7533                     else if (*fmt == '(')
7534                         ++pcount;
7535                     fmt++;
7536                 }
7537                 keylen = fmt - keystart - 1;
7538                 if (fmtcnt < 0 || pcount > 0) {
7539                     PyErr_SetString(PyExc_ValueError,
7540                                     "incomplete format key");
7541                     goto onError;
7542                 }
7543 #if 0
7544                 /* keys are converted to strings using UTF-8 and
7545                    then looked up since Python uses strings to hold
7546                    variables names etc. in its namespaces and we
7547                    wouldn't want to break common idioms. */
7548                 key = PyUnicode_EncodeUTF8(keystart,
7549                                            keylen,
7550                                            NULL);
7551 #else
7552                 key = PyUnicode_FromUnicode(keystart, keylen);
7553 #endif
7554                 if (key == NULL)
7555                     goto onError;
7556                 if (args_owned) {
7557                     Py_DECREF(args);
7558                     args_owned = 0;
7559                 }
7560                 args = PyObject_GetItem(dict, key);
7561                 Py_DECREF(key);
7562                 if (args == NULL) {
7563                     goto onError;
7564                 }
7565                 args_owned = 1;
7566                 arglen = -1;
7567                 argidx = -2;
7568             }
7569             while (--fmtcnt >= 0) {
7570                 switch (c = *fmt++) {
7571                 case '-': flags |= F_LJUST; continue;
7572                 case '+': flags |= F_SIGN; continue;
7573                 case ' ': flags |= F_BLANK; continue;
7574                 case '#': flags |= F_ALT; continue;
7575                 case '0': flags |= F_ZERO; continue;
7576                 }
7577                 break;
7578             }
7579             if (c == '*') {
7580                 v = getnextarg(args, arglen, &argidx);
7581                 if (v == NULL)
7582                     goto onError;
7583                 if (!PyInt_Check(v)) {
7584                     PyErr_SetString(PyExc_TypeError,
7585                                     "* wants int");
7586                     goto onError;
7587                 }
7588                 width = PyInt_AsLong(v);
7589                 if (width < 0) {
7590                     flags |= F_LJUST;
7591                     width = -width;
7592                 }
7593                 if (--fmtcnt >= 0)
7594                     c = *fmt++;
7595             }
7596             else if (c >= '0' && c <= '9') {
7597                 width = c - '0';
7598                 while (--fmtcnt >= 0) {
7599                     c = *fmt++;
7600                     if (c < '0' || c > '9')
7601                         break;
7602                     if ((width*10) / 10 != width) {
7603                         PyErr_SetString(PyExc_ValueError,
7604                                         "width too big");
7605                         goto onError;
7606                     }
7607                     width = width*10 + (c - '0');
7608                 }
7609             }
7610             if (c == '.') {
7611                 prec = 0;
7612                 if (--fmtcnt >= 0)
7613                     c = *fmt++;
7614                 if (c == '*') {
7615                     v = getnextarg(args, arglen, &argidx);
7616                     if (v == NULL)
7617                         goto onError;
7618                     if (!PyInt_Check(v)) {
7619                         PyErr_SetString(PyExc_TypeError,
7620                                         "* wants int");
7621                         goto onError;
7622                     }
7623                     prec = PyInt_AsLong(v);
7624                     if (prec < 0)
7625                         prec = 0;
7626                     if (--fmtcnt >= 0)
7627                         c = *fmt++;
7628                 }
7629                 else if (c >= '0' && c <= '9') {
7630                     prec = c - '0';
7631                     while (--fmtcnt >= 0) {
7632                         c = Py_CHARMASK(*fmt++);
7633                         if (c < '0' || c > '9')
7634                             break;
7635                         if ((prec*10) / 10 != prec) {
7636                             PyErr_SetString(PyExc_ValueError,
7637                                             "prec too big");
7638                             goto onError;
7639                         }
7640                         prec = prec*10 + (c - '0');
7641                     }
7642                 }
7643             } /* prec */
7644             if (fmtcnt >= 0) {
7645                 if (c == 'h' || c == 'l' || c == 'L') {
7646                     if (--fmtcnt >= 0)
7647                         c = *fmt++;
7648                 }
7649             }
7650             if (fmtcnt < 0) {
7651                 PyErr_SetString(PyExc_ValueError,
7652                                 "incomplete format");
7653                 goto onError;
7654             }
7655             if (c != '%') {
7656                 v = getnextarg(args, arglen, &argidx);
7657                 if (v == NULL)
7658                     goto onError;
7659             }
7660             sign = 0;
7661             fill = ' ';
7662             switch (c) {
7663
7664             case '%':
7665                 pbuf = formatbuf;
7666                 /* presume that buffer length is at least 1 */
7667                 pbuf[0] = '%';
7668                 len = 1;
7669                 break;
7670
7671             case 's':
7672             case 'r':
7673                 if (PyUnicode_Check(v) && c == 's') {
7674                     temp = v;
7675                     Py_INCREF(temp);
7676                 }
7677                 else {
7678                     PyObject *unicode;
7679                     if (c == 's')
7680                         temp = PyObject_Unicode(v);
7681                     else
7682                         temp = PyObject_Repr(v);
7683                     if (temp == NULL)
7684                         goto onError;
7685                     if (PyUnicode_Check(temp))
7686                         /* nothing to do */;
7687                     else if (PyString_Check(temp)) {
7688                         /* convert to string to Unicode */
7689                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7690                                                    PyString_GET_SIZE(temp),
7691                                                    NULL,
7692                                                    "strict");
7693                         Py_DECREF(temp);
7694                         temp = unicode;
7695                         if (temp == NULL)
7696                             goto onError;
7697                     }
7698                     else {
7699                         Py_DECREF(temp);
7700                         PyErr_SetString(PyExc_TypeError,
7701                                         "%s argument has non-string str()");
7702                         goto onError;
7703                     }
7704                 }
7705                 pbuf = PyUnicode_AS_UNICODE(temp);
7706                 len = PyUnicode_GET_SIZE(temp);
7707                 if (prec >= 0 && len > prec)
7708                     len = prec;
7709                 break;
7710
7711             case 'i':
7712             case 'd':
7713             case 'u':
7714             case 'o':
7715             case 'x':
7716             case 'X':
7717                 if (c == 'i')
7718                     c = 'd';
7719                 if (PyLong_Check(v)) {
7720                     temp = formatlong(v, flags, prec, c);
7721                     if (!temp)
7722                         goto onError;
7723                     pbuf = PyUnicode_AS_UNICODE(temp);
7724                     len = PyUnicode_GET_SIZE(temp);
7725                     sign = 1;
7726                 }
7727                 else {
7728                     pbuf = formatbuf;
7729                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7730                                     flags, prec, c, v);
7731                     if (len < 0)
7732                         goto onError;
7733                     sign = 1;
7734                 }
7735                 if (flags & F_ZERO)
7736                     fill = '0';
7737                 break;
7738
7739             case 'e':
7740             case 'E':
7741             case 'f':
7742             case 'F':
7743             case 'g':
7744             case 'G':
7745                 if (c == 'F')
7746                         c = 'f';
7747                 pbuf = formatbuf;
7748                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7749                         flags, prec, c, v);
7750                 if (len < 0)
7751                     goto onError;
7752                 sign = 1;
7753                 if (flags & F_ZERO)
7754                     fill = '0';
7755                 break;
7756
7757             case 'c':
7758                 pbuf = formatbuf;
7759                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7760                 if (len < 0)
7761                     goto onError;
7762                 break;
7763
7764             default:
7765                 PyErr_Format(PyExc_ValueError,
7766                              "unsupported format character '%c' (0x%x) "
7767                              "at index %zd",
7768                              (31<=c && c<=126) ? (char)c : '?',
7769                              (int)c,
7770                              (Py_ssize_t)(fmt - 1 -
7771                                           PyUnicode_AS_UNICODE(uformat)));
7772                 goto onError;
7773             }
7774             if (sign) {
7775                 if (*pbuf == '-' || *pbuf == '+') {
7776                     sign = *pbuf++;
7777                     len--;
7778                 }
7779                 else if (flags & F_SIGN)
7780                     sign = '+';
7781                 else if (flags & F_BLANK)
7782                     sign = ' ';
7783                 else
7784                     sign = 0;
7785             }
7786             if (width < len)
7787                 width = len;
7788             if (rescnt - (sign != 0) < width) {
7789                 reslen -= rescnt;
7790                 rescnt = width + fmtcnt + 100;
7791                 reslen += rescnt;
7792                 if (reslen < 0) {
7793                     Py_XDECREF(temp);
7794                     PyErr_NoMemory();
7795                     goto onError;
7796                 }
7797                 if (_PyUnicode_Resize(&result, reslen) < 0) {
7798                     Py_XDECREF(temp);
7799                     goto onError;
7800                 }
7801                 res = PyUnicode_AS_UNICODE(result)
7802                     + reslen - rescnt;
7803             }
7804             if (sign) {
7805                 if (fill != ' ')
7806                     *res++ = sign;
7807                 rescnt--;
7808                 if (width > len)
7809                     width--;
7810             }
7811             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7812                 assert(pbuf[0] == '0');
7813                 assert(pbuf[1] == c);
7814                 if (fill != ' ') {
7815                     *res++ = *pbuf++;
7816                     *res++ = *pbuf++;
7817                 }
7818                 rescnt -= 2;
7819                 width -= 2;
7820                 if (width < 0)
7821                     width = 0;
7822                 len -= 2;
7823             }
7824             if (width > len && !(flags & F_LJUST)) {
7825                 do {
7826                     --rescnt;
7827                     *res++ = fill;
7828                 } while (--width > len);
7829             }
7830             if (fill == ' ') {
7831                 if (sign)
7832                     *res++ = sign;
7833                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7834                     assert(pbuf[0] == '0');
7835                     assert(pbuf[1] == c);
7836                     *res++ = *pbuf++;
7837                     *res++ = *pbuf++;
7838                 }
7839             }
7840             Py_UNICODE_COPY(res, pbuf, len);
7841             res += len;
7842             rescnt -= len;
7843             while (--width >= len) {
7844                 --rescnt;
7845                 *res++ = ' ';
7846             }
7847             if (dict && (argidx < arglen) && c != '%') {
7848                 PyErr_SetString(PyExc_TypeError,
7849                                 "not all arguments converted during string formatting");
7850                 Py_XDECREF(temp);
7851                 goto onError;
7852             }
7853             Py_XDECREF(temp);
7854         } /* '%' */
7855     } /* until end */
7856     if (argidx < arglen && !dict) {
7857         PyErr_SetString(PyExc_TypeError,
7858                         "not all arguments converted during string formatting");
7859         goto onError;
7860     }
7861
7862     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7863         goto onError;
7864     if (args_owned) {
7865         Py_DECREF(args);
7866     }
7867     Py_DECREF(uformat);
7868     return (PyObject *)result;
7869
7870  onError:
7871     Py_XDECREF(result);
7872     Py_DECREF(uformat);
7873     if (args_owned) {
7874         Py_DECREF(args);
7875     }
7876     return NULL;
7877 }
7878
/* Buffer interface table for unicode objects.  Each entry is one of the
   unicode_buffer_* functions defined elsewhere in this file, cast to
   the slot's function type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* read buffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* write buffer */
    (segcountproc) unicode_buffer_getsegcount,      /* segment count */
    (charbufferproc) unicode_buffer_getcharbuf,     /* character buffer */
};
7885
7886 static PyObject *
7887 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7888
7889 static PyObject *
7890 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7891 {
7892         PyObject *x = NULL;
7893         static char *kwlist[] = {"string", "encoding", "errors", 0};
7894         char *encoding = NULL;
7895         char *errors = NULL;
7896
7897         if (type != &PyUnicode_Type)
7898                 return unicode_subtype_new(type, args, kwds);
7899         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7900                                           kwlist, &x, &encoding, &errors))
7901             return NULL;
7902         if (x == NULL)
7903                 return (PyObject *)_PyUnicode_New(0);
7904         if (encoding == NULL && errors == NULL)
7905             return PyObject_Unicode(x);
7906         else
7907         return PyUnicode_FromEncodedObject(x, encoding, errors);
7908 }
7909
7910 static PyObject *
7911 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7912 {
7913         PyUnicodeObject *tmp, *pnew;
7914         Py_ssize_t n;
7915
7916         assert(PyType_IsSubtype(type, &PyUnicode_Type));
7917         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7918         if (tmp == NULL)
7919                 return NULL;
7920         assert(PyUnicode_Check(tmp));
7921         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7922         if (pnew == NULL) {
7923                 Py_DECREF(tmp);
7924                 return NULL;
7925         }
7926         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7927         if (pnew->str == NULL) {
7928                 _Py_ForgetReference((PyObject *)pnew);
7929                 PyObject_Del(pnew);
7930                 Py_DECREF(tmp);
7931                 return PyErr_NoMemory();
7932         }
7933         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7934         pnew->length = n;
7935         pnew->hash = tmp->hash;
7936         Py_DECREF(tmp);
7937         return (PyObject *)pnew;
7938 }
7939
7940 PyDoc_STRVAR(unicode_doc,
7941 "unicode(string [, encoding[, errors]]) -> object\n\
7942 \n\
7943 Create a new Unicode object from the given encoded string.\n\
7944 encoding defaults to the current default string encoding.\n\
7945 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7946
/* Type object of the built-in "unicode" type.  The initializer is
   positional; each entry is labelled with the slot it fills.  Slots set
   to 0 are left unset here. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
            Py_TPFLAGS_BASETYPE,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,               /* tp_free */
};
7991
7992 /* Initialize the Unicode implementation */
7993
7994 void _PyUnicode_Init(void)
7995 {
7996     int i;
7997
7998     /* XXX - move this array to unicodectype.c ? */
7999     Py_UNICODE linebreak[] = {
8000         0x000A, /* LINE FEED */
8001         0x000D, /* CARRIAGE RETURN */
8002         0x001C, /* FILE SEPARATOR */
8003         0x001D, /* GROUP SEPARATOR */
8004         0x001E, /* RECORD SEPARATOR */
8005         0x0085, /* NEXT LINE */
8006         0x2028, /* LINE SEPARATOR */
8007         0x2029, /* PARAGRAPH SEPARATOR */
8008     };
8009
8010     /* Init the implementation */
8011     unicode_freelist = NULL;
8012     unicode_freelist_size = 0;
8013     unicode_empty = _PyUnicode_New(0);
8014     if (!unicode_empty)
8015         return;
8016
8017     strcpy(unicode_default_encoding, "ascii");
8018     for (i = 0; i < 256; i++)
8019         unicode_latin1[i] = NULL;
8020     if (PyType_Ready(&PyUnicode_Type) < 0)
8021         Py_FatalError("Can't initialize 'unicode'");
8022
8023     /* initialize the linebreak bloom filter */
8024     bloom_linebreak = make_bloom_mask(
8025         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8026         );
8027
8028     PyType_Ready(&EncodingMapType);
8029 }
8030
8031 /* Finalize the Unicode implementation */
8032
8033 void
8034 _PyUnicode_Fini(void)
8035 {
8036     PyUnicodeObject *u;
8037     int i;
8038
8039     Py_XDECREF(unicode_empty);
8040     unicode_empty = NULL;
8041
8042     for (i = 0; i < 256; i++) {
8043         if (unicode_latin1[i]) {
8044             Py_DECREF(unicode_latin1[i]);
8045             unicode_latin1[i] = NULL;
8046         }
8047     }
8048
8049     for (u = unicode_freelist; u != NULL;) {
8050         PyUnicodeObject *v = u;
8051         u = *(PyUnicodeObject **)u;
8052         if (v->str)
8053             PyMem_DEL(v->str);
8054         Py_XDECREF(v->defenc);
8055         PyObject_Del(v);
8056     }
8057     unicode_freelist = NULL;
8058     unicode_freelist_size = 0;
8059 }
8060
8061 #ifdef __cplusplus
8062 }
8063 #endif
8064
8065
8066 /*
8067 Local variables:
8068 c-basic-offset: 4
8069 indent-tabs-mode: nil
8070 End:
8071 */