Objects/stringobject.c

   1 /* String (str/bytes) object implementation */
   2
   3 #define PY_SSIZE_T_CLEAN
   4
   5 #include "Python.h"
   6 #include <ctype.h>
   7 #include <stddef.h>
   8
   9 #ifdef COUNT_ALLOCS
     /* Statistics: bumped each time the cached empty string (null_strings)
        or a cached one-character string (one_strings) is handed out by
        PyString_FromStringAndSize() / PyString_FromString(). */
  10 Py_ssize_t null_strings, one_strings;
  11 #endif
  12
     /* Caches of shared short strings.  characters[] is indexed by byte value
        (`*str & UCHAR_MAX') and holds the one-character string for that byte;
        nullstring holds the empty string.  Both are filled in lazily, the
        first time such a string is created, and are NULL until then. */
  13 static PyStringObject *characters[UCHAR_MAX + 1];
  14 static PyStringObject *nullstring;
  15
  16 /* This dictionary holds all interned strings.  Note that references to
  17    strings in this dictionary are *not* counted in the string's ob_refcnt.
  18    When the interned string reaches a refcnt of 0 the string deallocation
  19    function will delete the reference from this dictionary.
  20
  21    Another way to look at this is to say that the actual reference
  22    count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
  23 */
  24 static PyObject *interned;
  25
  26 /* PyStringObject_SIZE gives the basic size of a string; any memory allocation
  27    for a string of length n should request PyStringObject_SIZE + n bytes.
  28
  29    Using PyStringObject_SIZE instead of sizeof(PyStringObject) saves
  30    3 bytes per string allocation on a typical system.
  31 */
  32 #define PyStringObject_SIZE (offsetof(PyStringObject, ob_sval) + 1)
  33
  34 /*
  35    For both PyString_FromString() and PyString_FromStringAndSize(), the
  36    parameter `size' denotes number of characters to allocate, not counting any
  37    null terminating character.
  38
  39    For PyString_FromString(), the parameter `str' points to a null-terminated
  40    string containing exactly `size' bytes.
  41
  42    For PyString_FromStringAndSize(), the parameter `str' is
  43    either NULL or else points to a string containing at least `size' bytes.
  44    For PyString_FromStringAndSize(), the string in the `str' parameter does
  45    not have to be null-terminated.  (Therefore it is safe to construct a
  46    substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
  47    If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
  48    bytes (setting the last byte to the null terminating character) and you can
  49    fill in the data yourself.  If `str' is non-NULL then the resulting
  50    PyString object must be treated as immutable and you must not fill in nor
  51    alter the data yourself, since the strings may be shared.
  52
  53    The PyObject member `op->ob_size', which denotes the number of "extra
  54    items" in a variable-size object, will contain the number of bytes
  55    allocated for string data, not counting the null terminating character.  It
  56    is therefore equal to the `size' parameter (for
  57    PyString_FromStringAndSize()) or the length of the string in the `str'
  58    parameter (for PyString_FromString()).
  59 */
  60 PyObject *
  61 PyString_FromStringAndSize(const char *str, Py_ssize_t size)
  62 {
  63         register PyStringObject *op;
  64         if (size < 0) {
  65                 PyErr_SetString(PyExc_SystemError,
  66                     "Negative size passed to PyString_FromStringAndSize");
  67                 return NULL;
  68         }
  69         if (size == 0 && (op = nullstring) != NULL) {
  70 #ifdef COUNT_ALLOCS
  71                 null_strings++;
  72 #endif
  73                 Py_INCREF(op);
  74                 return (PyObject *)op;
  75         }
  76         if (size == 1 && str != NULL &&
  77             (op = characters[*str & UCHAR_MAX]) != NULL)
  78         {
  79 #ifdef COUNT_ALLOCS
  80                 one_strings++;
  81 #endif
  82                 Py_INCREF(op);
  83                 return (PyObject *)op;
  84         }
  85
  86         if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
  87                 PyErr_SetString(PyExc_OverflowError, "string is too large");
  88                 return NULL;
  89         }
  90
  91         /* Inline PyObject_NewVar */
  92         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
  93         if (op == NULL)
  94                 return PyErr_NoMemory();
  95         PyObject_INIT_VAR(op, &PyString_Type, size);
  96         op->ob_shash = -1;
  97         op->ob_sstate = SSTATE_NOT_INTERNED;
  98         if (str != NULL)
  99                 Py_MEMCPY(op->ob_sval, str, size);
 100         op->ob_sval[size] = '\0';
 101         /* share short strings */
 102         if (size == 0) {
 103                 PyObject *t = (PyObject *)op;
 104                 PyString_InternInPlace(&t);
 105                 op = (PyStringObject *)t;
 106                 nullstring = op;
 107                 Py_INCREF(op);
 108         } else if (size == 1 && str != NULL) {
 109                 PyObject *t = (PyObject *)op;
 110                 PyString_InternInPlace(&t);
 111                 op = (PyStringObject *)t;
 112                 characters[*str & UCHAR_MAX] = op;
 113                 Py_INCREF(op);
 114         }
 115         return (PyObject *) op;
 116 }
 117
 118 PyObject *
 119 PyString_FromString(const char *str)
 120 {
 121         register size_t size;
 122         register PyStringObject *op;
 123
 124         assert(str != NULL);
 125         size = strlen(str);
 126         if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
 127                 PyErr_SetString(PyExc_OverflowError,
 128                         "string is too long for a Python string");
 129                 return NULL;
 130         }
 131         if (size == 0 && (op = nullstring) != NULL) {
 132 #ifdef COUNT_ALLOCS
 133                 null_strings++;
 134 #endif
 135                 Py_INCREF(op);
 136                 return (PyObject *)op;
 137         }
 138         if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
 139 #ifdef COUNT_ALLOCS
 140                 one_strings++;
 141 #endif
 142                 Py_INCREF(op);
 143                 return (PyObject *)op;
 144         }
 145
 146         /* Inline PyObject_NewVar */
 147         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
 148         if (op == NULL)
 149                 return PyErr_NoMemory();
 150         PyObject_INIT_VAR(op, &PyString_Type, size);
 151         op->ob_shash = -1;
 152         op->ob_sstate = SSTATE_NOT_INTERNED;
 153         Py_MEMCPY(op->ob_sval, str, size+1);
 154         /* share short strings */
 155         if (size == 0) {
 156                 PyObject *t = (PyObject *)op;
 157                 PyString_InternInPlace(&t);
 158                 op = (PyStringObject *)t;
 159                 nullstring = op;
 160                 Py_INCREF(op);
 161         } else if (size == 1) {
 162                 PyObject *t = (PyObject *)op;
 163                 PyString_InternInPlace(&t);
 164                 op = (PyStringObject *)t;
 165                 characters[*str & UCHAR_MAX] = op;
 166                 Py_INCREF(op);
 167         }
 168         return (PyObject *) op;
 169 }
 170
 171 PyObject *
 172 PyString_FromFormatV(const char *format, va_list vargs)
 173 {
 174         va_list count;
 175         Py_ssize_t n = 0;
 176         const char* f;
 177         char *s;
 178         PyObject* string;
 179
 180 #ifdef VA_LIST_IS_ARRAY
 181         Py_MEMCPY(count, vargs, sizeof(va_list));
 182 #else
 183 #ifdef  __va_copy
 184         __va_copy(count, vargs);
 185 #else
 186         count = vargs;
 187 #endif
 188 #endif
 189         /* step 1: figure out how large a buffer we need */
 190         for (f = format; *f; f++) {
 191                 if (*f == '%') {
 192 #ifdef HAVE_LONG_LONG
 193                         int longlongflag = 0;
 194 #endif
 195                         const char* p = f;
 196                         while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
 197                                 ;
 198
 199                         /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 200                          * they don't affect the amount of space we reserve.
 201                          */
 202                         if (*f == 'l') {
 203                                 if (f[1] == 'd' || f[1] == 'u') {
 204                                         ++f;
 205                                 }
 206 #ifdef HAVE_LONG_LONG
 207                                 else if (f[1] == 'l' &&
 208                                          (f[2] == 'd' || f[2] == 'u')) {
 209                                         longlongflag = 1;
 210                                         f += 2;
 211                                 }
 212 #endif
 213                         }
 214                         else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 215                                 ++f;
 216                         }
 217
 218                         switch (*f) {
 219                         case 'c':
 220                                 (void)va_arg(count, int);
 221                                 /* fall through... */
 222                         case '%':
 223                                 n++;
 224                                 break;
 225                         case 'd': case 'u': case 'i': case 'x':
 226                                 (void) va_arg(count, int);
 227 #ifdef HAVE_LONG_LONG
 228                                 /* Need at most
 229                                    ceil(log10(256)*SIZEOF_LONG_LONG) digits,
 230                                    plus 1 for the sign.  53/22 is an upper
 231                                    bound for log10(256). */
 232                                 if (longlongflag)
 233                                         n += 2 + (SIZEOF_LONG_LONG*53-1) / 22;
 234                                 else
 235 #endif
 236                                         /* 20 bytes is enough to hold a 64-bit
 237                                            integer.  Decimal takes the most
 238                                            space.  This isn't enough for
 239                                            octal. */
 240                                         n += 20;
 241
 242                                 break;
 243                         case 's':
 244                                 s = va_arg(count, char*);
 245                                 n += strlen(s);
 246                                 break;
 247                         case 'p':
 248                                 (void) va_arg(count, int);
 249                                 /* maximum 64-bit pointer representation:
 250                                  * 0xffffffffffffffff
 251                                  * so 19 characters is enough.
 252                                  * XXX I count 18 -- what's the extra for?
 253                                  */
 254                                 n += 19;
 255                                 break;
 256                         default:
 257                                 /* if we stumble upon an unknown
 258                                    formatting code, copy the rest of
 259                                    the format string to the output
 260                                    string. (we cannot just skip the
 261                                    code, since there's no way to know
 262                                    what's in the argument list) */
 263                                 n += strlen(p);
 264                                 goto expand;
 265                         }
 266                 } else
 267                         n++;
 268         }
 269  expand:
 270         /* step 2: fill the buffer */
 271         /* Since we've analyzed how much space we need for the worst case,
 272            use sprintf directly instead of the slower PyOS_snprintf. */
 273         string = PyString_FromStringAndSize(NULL, n);
 274         if (!string)
 275                 return NULL;
 276
 277         s = PyString_AsString(string);
 278
 279         for (f = format; *f; f++) {
 280                 if (*f == '%') {
 281                         const char* p = f++;
 282                         Py_ssize_t i;
 283                         int longflag = 0;
 284 #ifdef HAVE_LONG_LONG
 285                         int longlongflag = 0;
 286 #endif
 287                         int size_tflag = 0;
 288                         /* parse the width.precision part (we're only
 289                            interested in the precision value, if any) */
 290                         n = 0;
 291                         while (isdigit(Py_CHARMASK(*f)))
 292                                 n = (n*10) + *f++ - '0';
 293                         if (*f == '.') {
 294                                 f++;
 295                                 n = 0;
 296                                 while (isdigit(Py_CHARMASK(*f)))
 297                                         n = (n*10) + *f++ - '0';
 298                         }
 299                         while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
 300                                 f++;
 301                         /* Handle %ld, %lu, %lld and %llu. */
 302                         if (*f == 'l') {
 303                                 if (f[1] == 'd' || f[1] == 'u') {
 304                                         longflag = 1;
 305                                         ++f;
 306                                 }
 307 #ifdef HAVE_LONG_LONG
 308                                 else if (f[1] == 'l' &&
 309                                          (f[2] == 'd' || f[2] == 'u')) {
 310                                         longlongflag = 1;
 311                                         f += 2;
 312                                 }
 313 #endif
 314                         }
 315                         /* handle the size_t flag. */
 316                         else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 317                                 size_tflag = 1;
 318                                 ++f;
 319                         }
 320
 321                         switch (*f) {
 322                         case 'c':
 323                                 *s++ = va_arg(vargs, int);
 324                                 break;
 325                         case 'd':
 326                                 if (longflag)
 327                                         sprintf(s, "%ld", va_arg(vargs, long));
 328 #ifdef HAVE_LONG_LONG
 329                                 else if (longlongflag)
 330                                         sprintf(s, "%" PY_FORMAT_LONG_LONG "d",
 331                                                 va_arg(vargs, PY_LONG_LONG));
 332 #endif
 333                                 else if (size_tflag)
 334                                         sprintf(s, "%" PY_FORMAT_SIZE_T "d",
 335                                                 va_arg(vargs, Py_ssize_t));
 336                                 else
 337                                         sprintf(s, "%d", va_arg(vargs, int));
 338                                 s += strlen(s);
 339                                 break;
 340                         case 'u':
 341                                 if (longflag)
 342                                         sprintf(s, "%lu",
 343                                                 va_arg(vargs, unsigned long));
 344 #ifdef HAVE_LONG_LONG
 345                                 else if (longlongflag)
 346                                         sprintf(s, "%" PY_FORMAT_LONG_LONG "u",
 347                                                 va_arg(vargs, PY_LONG_LONG));
 348 #endif
 349                                 else if (size_tflag)
 350                                         sprintf(s, "%" PY_FORMAT_SIZE_T "u",
 351                                                 va_arg(vargs, size_t));
 352                                 else
 353                                         sprintf(s, "%u",
 354                                                 va_arg(vargs, unsigned int));
 355                                 s += strlen(s);
 356                                 break;
 357                         case 'i':
 358                                 sprintf(s, "%i", va_arg(vargs, int));
 359                                 s += strlen(s);
 360                                 break;
 361                         case 'x':
 362                                 sprintf(s, "%x", va_arg(vargs, int));
 363                                 s += strlen(s);
 364                                 break;
 365                         case 's':
 366                                 p = va_arg(vargs, char*);
 367                                 i = strlen(p);
 368                                 if (n > 0 && i > n)
 369                                         i = n;
 370                                 Py_MEMCPY(s, p, i);
 371                                 s += i;
 372                                 break;
 373                         case 'p':
 374                                 sprintf(s, "%p", va_arg(vargs, void*));
 375                                 /* %p is ill-defined:  ensure leading 0x. */
 376                                 if (s[1] == 'X')
 377                                         s[1] = 'x';
 378                                 else if (s[1] != 'x') {
 379                                         memmove(s+2, s, strlen(s)+1);
 380                                         s[0] = '0';
 381                                         s[1] = 'x';
 382                                 }
 383                                 s += strlen(s);
 384                                 break;
 385                         case '%':
 386                                 *s++ = '%';
 387                                 break;
 388                         default:
 389                                 strcpy(s, p);
 390                                 s += strlen(s);
 391                                 goto end;
 392                         }
 393                 } else
 394                         *s++ = *f;
 395         }
 396
 397  end:
 398         _PyString_Resize(&string, s - PyString_AS_STRING(string));
 399         return string;
 400 }
 401
 402 PyObject *
 403 PyString_FromFormat(const char *format, ...)
 404 {
 405         PyObject* ret;
 406         va_list vargs;
 407
 408 #ifdef HAVE_STDARG_PROTOTYPES
 409         va_start(vargs, format);
 410 #else
 411         va_start(vargs);
 412 #endif
 413         ret = PyString_FromFormatV(format, vargs);
 414         va_end(vargs);
 415         return ret;
 416 }
 417
 418
 419 PyObject *PyString_Decode(const char *s,
 420                           Py_ssize_t size,
 421                           const char *encoding,
 422                           const char *errors)
 423 {
 424     PyObject *v, *str;
 425
 426     str = PyString_FromStringAndSize(s, size);
 427     if (str == NULL)
 428         return NULL;
 429     v = PyString_AsDecodedString(str, encoding, errors);
 430     Py_DECREF(str);
 431     return v;
 432 }
 433
 434 PyObject *PyString_AsDecodedObject(PyObject *str,
 435                                    const char *encoding,
 436                                    const char *errors)
 437 {
 438     PyObject *v;
 439
 440     if (!PyString_Check(str)) {
 441         PyErr_BadArgument();
 442         goto onError;
 443     }
 444
 445     if (encoding == NULL) {
 446 #ifdef Py_USING_UNICODE
 447         encoding = PyUnicode_GetDefaultEncoding();
 448 #else
 449         PyErr_SetString(PyExc_ValueError, "no encoding specified");
 450         goto onError;
 451 #endif
 452     }
 453
 454     /* Decode via the codec registry */
 455     v = PyCodec_Decode(str, encoding, errors);
 456     if (v == NULL)
 457         goto onError;
 458
 459     return v;
 460
 461  onError:
 462     return NULL;
 463 }
 464
 465 PyObject *PyString_AsDecodedString(PyObject *str,
 466                                    const char *encoding,
 467                                    const char *errors)
 468 {
 469     PyObject *v;
 470
 471     v = PyString_AsDecodedObject(str, encoding, errors);
 472     if (v == NULL)
 473         goto onError;
 474
 475 #ifdef Py_USING_UNICODE
 476     /* Convert Unicode to a string using the default encoding */
 477     if (PyUnicode_Check(v)) {
 478         PyObject *temp = v;
 479         v = PyUnicode_AsEncodedString(v, NULL, NULL);
 480         Py_DECREF(temp);
 481         if (v == NULL)
 482             goto onError;
 483     }
 484 #endif
 485     if (!PyString_Check(v)) {
 486         PyErr_Format(PyExc_TypeError,
 487                      "decoder did not return a string object (type=%.400s)",
 488                      Py_TYPE(v)->tp_name);
 489         Py_DECREF(v);
 490         goto onError;
 491     }
 492
 493     return v;
 494
 495  onError:
 496     return NULL;
 497 }
 498
 499 PyObject *PyString_Encode(const char *s,
 500                           Py_ssize_t size,
 501                           const char *encoding,
 502                           const char *errors)
 503 {
 504     PyObject *v, *str;
 505
 506     str = PyString_FromStringAndSize(s, size);
 507     if (str == NULL)
 508         return NULL;
 509     v = PyString_AsEncodedString(str, encoding, errors);
 510     Py_DECREF(str);
 511     return v;
 512 }
 513
 514 PyObject *PyString_AsEncodedObject(PyObject *str,
 515                                    const char *encoding,
 516                                    const char *errors)
 517 {
 518     PyObject *v;
 519
 520     if (!PyString_Check(str)) {
 521         PyErr_BadArgument();
 522         goto onError;
 523     }
 524
 525     if (encoding == NULL) {
 526 #ifdef Py_USING_UNICODE
 527         encoding = PyUnicode_GetDefaultEncoding();
 528 #else
 529         PyErr_SetString(PyExc_ValueError, "no encoding specified");
 530         goto onError;
 531 #endif
 532     }
 533
 534     /* Encode via the codec registry */
 535     v = PyCodec_Encode(str, encoding, errors);
 536     if (v == NULL)
 537         goto onError;
 538
 539     return v;
 540
 541  onError:
 542     return NULL;
 543 }
 544
 545 PyObject *PyString_AsEncodedString(PyObject *str,
 546                                    const char *encoding,
 547                                    const char *errors)
 548 {
 549     PyObject *v;
 550
 551     v = PyString_AsEncodedObject(str, encoding, errors);
 552     if (v == NULL)
 553         goto onError;
 554
 555 #ifdef Py_USING_UNICODE
 556     /* Convert Unicode to a string using the default encoding */
 557     if (PyUnicode_Check(v)) {
 558         PyObject *temp = v;
 559         v = PyUnicode_AsEncodedString(v, NULL, NULL);
 560         Py_DECREF(temp);
 561         if (v == NULL)
 562             goto onError;
 563     }
 564 #endif
 565     if (!PyString_Check(v)) {
 566         PyErr_Format(PyExc_TypeError,
 567                      "encoder did not return a string object (type=%.400s)",
 568                      Py_TYPE(v)->tp_name);
 569         Py_DECREF(v);
 570         goto onError;
 571     }
 572
 573     return v;
 574
 575  onError:
 576     return NULL;
 577 }
 578
 579 static void
 580 string_dealloc(PyObject *op)
 581 {
 582         switch (PyString_CHECK_INTERNED(op)) {
 583                 case SSTATE_NOT_INTERNED:
 584                         break;
 585
 586                 case SSTATE_INTERNED_MORTAL:
 587                         /* revive dead object temporarily for DelItem */
 588                         Py_REFCNT(op) = 3;
 589                         if (PyDict_DelItem(interned, op) != 0)
 590                                 Py_FatalError(
 591                                         "deletion of interned string failed");
 592                         break;
 593
 594                 case SSTATE_INTERNED_IMMORTAL:
 595                         Py_FatalError("Immortal interned string died.");
 596
 597                 default:
 598                         Py_FatalError("Inconsistent interned string state.");
 599         }
 600         Py_TYPE(op)->tp_free(op);
 601 }
 602
 603 /* Unescape a backslash-escaped string. If unicode is non-zero,
 604    the string is a u-literal. If recode_encoding is non-zero,
 605    the string is UTF-8 encoded and should be re-encoded in the
 606    specified encoding.  */
 607
 608 PyObject *PyString_DecodeEscape(const char *s,
 609                                 Py_ssize_t len,
 610                                 const char *errors,
 611                                 Py_ssize_t unicode,
 612                                 const char *recode_encoding)
 613 {
 614         int c;
 615         char *p, *buf;
 616         const char *end;
 617         PyObject *v;
 618         Py_ssize_t newlen = recode_encoding ? 4*len:len;
 619         v = PyString_FromStringAndSize((char *)NULL, newlen);
 620         if (v == NULL)
 621                 return NULL;
 622         p = buf = PyString_AsString(v);
 623         end = s + len;
 624         while (s < end) {
 625                 if (*s != '\\') {
 626                   non_esc:
 627 #ifdef Py_USING_UNICODE
 628                         if (recode_encoding && (*s & 0x80)) {
 629                                 PyObject *u, *w;
 630                                 char *r;
 631                                 const char* t;
 632                                 Py_ssize_t rn;
 633                                 t = s;
 634                                 /* Decode non-ASCII bytes as UTF-8. */
 635                                 while (t < end && (*t & 0x80)) t++;
 636                                 u = PyUnicode_DecodeUTF8(s, t - s, errors);
 637                                 if(!u) goto failed;
 638
 639                                 /* Recode them in target encoding. */
 640                                 w = PyUnicode_AsEncodedString(
 641                                         u, recode_encoding, errors);
 642                                 Py_DECREF(u);
 643                                 if (!w) goto failed;
 644
 645                                 /* Append bytes to output buffer. */
 646                                 assert(PyString_Check(w));
 647                                 r = PyString_AS_STRING(w);
 648                                 rn = PyString_GET_SIZE(w);
 649                                 Py_MEMCPY(p, r, rn);
 650                                 p += rn;
 651                                 Py_DECREF(w);
 652                                 s = t;
 653                         } else {
 654                                 *p++ = *s++;
 655                         }
 656 #else
 657                         *p++ = *s++;
 658 #endif
 659                         continue;
 660                 }
 661                 s++;
 662                 if (s==end) {
 663                         PyErr_SetString(PyExc_ValueError,
 664                                         "Trailing \\ in string");
 665                         goto failed;
 666                 }
 667                 switch (*s++) {
 668                 /* XXX This assumes ASCII! */
 669                 case '\n': break;
 670                 case '\\': *p++ = '\\'; break;
 671                 case '\'': *p++ = '\''; break;
 672                 case '\"': *p++ = '\"'; break;
 673                 case 'b': *p++ = '\b'; break;
 674                 case 'f': *p++ = '\014'; break; /* FF */
 675                 case 't': *p++ = '\t'; break;
 676                 case 'n': *p++ = '\n'; break;
 677                 case 'r': *p++ = '\r'; break;
 678                 case 'v': *p++ = '\013'; break; /* VT */
 679                 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
 680                 case '0': case '1': case '2': case '3':
 681                 case '4': case '5': case '6': case '7':
 682                         c = s[-1] - '0';
 683                         if (s < end && '0' <= *s && *s <= '7') {
 684                                 c = (c<<3) + *s++ - '0';
 685                                 if (s < end && '0' <= *s && *s <= '7')
 686                                         c = (c<<3) + *s++ - '0';
 687                         }
 688                         *p++ = c;
 689                         break;
 690                 case 'x':
 691                         if (s+1 < end &&
 692                             isxdigit(Py_CHARMASK(s[0])) &&
 693                             isxdigit(Py_CHARMASK(s[1])))
 694                         {
 695                                 unsigned int x = 0;
 696                                 c = Py_CHARMASK(*s);
 697                                 s++;
 698                                 if (isdigit(c))
 699                                         x = c - '0';
 700                                 else if (islower(c))
 701                                         x = 10 + c - 'a';
 702                                 else
 703                                         x = 10 + c - 'A';
 704                                 x = x << 4;
 705                                 c = Py_CHARMASK(*s);
 706                                 s++;
 707                                 if (isdigit(c))
 708                                         x += c - '0';
 709                                 else if (islower(c))
 710                                         x += 10 + c - 'a';
 711                                 else
 712                                         x += 10 + c - 'A';
 713                                 *p++ = x;
 714                                 break;
 715                         }
 716                         if (!errors || strcmp(errors, "strict") == 0) {
 717                                 PyErr_SetString(PyExc_ValueError,
 718                                                 "invalid \\x escape");
 719                                 goto failed;
 720                         }
 721                         if (strcmp(errors, "replace") == 0) {
 722                                 *p++ = '?';
 723                         } else if (strcmp(errors, "ignore") == 0)
 724                                 /* do nothing */;
 725                         else {
 726                                 PyErr_Format(PyExc_ValueError,
 727                                              "decoding error; "
 728                                              "unknown error handling code: %.400s",
 729                                              errors);
 730                                 goto failed;
 731                         }
 732 #ifndef Py_USING_UNICODE
 733                 case 'u':
 734                 case 'U':
 735                 case 'N':
 736                         if (unicode) {
 737                                 PyErr_SetString(PyExc_ValueError,
 738                                           "Unicode escapes not legal "
 739                                           "when Unicode disabled");
 740                                 goto failed;
 741                         }
 742 #endif
 743                 default:
 744                         *p++ = '\\';
 745                         s--;
 746                         goto non_esc; /* an arbitry number of unescaped
 747                                          UTF-8 bytes may follow. */
 748                 }
 749         }
 750         if (p-buf < newlen)
 751                 _PyString_Resize(&v, p - buf);
 752         return v;
 753   failed:
 754         Py_DECREF(v);
 755         return NULL;
 756 }
 757
 758 /* -------------------------------------------------------------------- */
 759 /* object api */
 760
 761 static Py_ssize_t
 762 string_getsize(register PyObject *op)
 763 {
 764         char *s;
 765         Py_ssize_t len;
 766         if (PyString_AsStringAndSize(op, &s, &len))
 767                 return -1;
 768         return len;
 769 }
 770
 771 static /*const*/ char *
 772 string_getbuffer(register PyObject *op)
 773 {
 774         char *s;
 775         Py_ssize_t len;
 776         if (PyString_AsStringAndSize(op, &s, &len))
 777                 return NULL;
 778         return s;
 779 }
 780
 781 Py_ssize_t
 782 PyString_Size(register PyObject *op)
 783 {
 784         if (!PyString_Check(op))
 785                 return string_getsize(op);
 786         return Py_SIZE(op);
 787 }
 788
 789 /*const*/ char *
 790 PyString_AsString(register PyObject *op)
 791 {
 792         if (!PyString_Check(op))
 793                 return string_getbuffer(op);
 794         return ((PyStringObject *)op) -> ob_sval;
 795 }
 796
 797 int
 798 PyString_AsStringAndSize(register PyObject *obj,
 799                          register char **s,
 800                          register Py_ssize_t *len)
 801 {
 802         if (s == NULL) {
 803                 PyErr_BadInternalCall();
 804                 return -1;
 805         }
 806
 807         if (!PyString_Check(obj)) {
 808 #ifdef Py_USING_UNICODE
 809                 if (PyUnicode_Check(obj)) {
 810                         obj = _PyUnicode_AsDefaultEncodedString(obj, NULL);
 811                         if (obj == NULL)
 812                                 return -1;
 813                 }
 814                 else
 815 #endif
 816                 {
 817                         PyErr_Format(PyExc_TypeError,
 818                                      "expected string or Unicode object, "
 819                                      "%.200s found", Py_TYPE(obj)->tp_name);
 820                         return -1;
 821                 }
 822         }
 823
 824         *s = PyString_AS_STRING(obj);
 825         if (len != NULL)
 826                 *len = PyString_GET_SIZE(obj);
 827         else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) {
 828                 PyErr_SetString(PyExc_TypeError,
 829                                 "expected string without null bytes");
 830                 return -1;
 831         }
 832         return 0;
 833 }
 834
 835 /* -------------------------------------------------------------------- */
 836 /* Methods */
 837
 838 #include "stringlib/stringdefs.h"
 839 #include "stringlib/fastsearch.h"
 840
 841 #include "stringlib/count.h"
 842 #include "stringlib/find.h"
 843 #include "stringlib/partition.h"
 844
 845 #define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
 846 #include "stringlib/localeutil.h"
 847
 848
 849
 850 static int
 851 string_print(PyStringObject *op, FILE *fp, int flags)
 852 {
 853         Py_ssize_t i, str_len;
 854         char c;
 855         int quote;
 856
 857         /* XXX Ought to check for interrupts when writing long strings */
 858         if (! PyString_CheckExact(op)) {
 859                 int ret;
 860                 /* A str subclass may have its own __str__ method. */
 861                 op = (PyStringObject *) PyObject_Str((PyObject *)op);
 862                 if (op == NULL)
 863                         return -1;
 864                 ret = string_print(op, fp, flags);
 865                 Py_DECREF(op);
 866                 return ret;
 867         }
 868         if (flags & Py_PRINT_RAW) {
 869                 char *data = op->ob_sval;
 870                 Py_ssize_t size = Py_SIZE(op);
 871                 Py_BEGIN_ALLOW_THREADS
 872                 while (size > INT_MAX) {
 873                         /* Very long strings cannot be written atomically.
 874                          * But don't write exactly INT_MAX bytes at a time
 875                          * to avoid memory aligment issues.
 876                          */
 877                         const int chunk_size = INT_MAX & ~0x3FFF;
 878                         fwrite(data, 1, chunk_size, fp);
 879                         data += chunk_size;
 880                         size -= chunk_size;
 881                 }
 882 #ifdef __VMS
 883                 if (size) fwrite(data, (int)size, 1, fp);
 884 #else
 885                 fwrite(data, 1, (int)size, fp);
 886 #endif
 887                 Py_END_ALLOW_THREADS
 888                 return 0;
 889         }
 890
 891         /* figure out which quote to use; single is preferred */
 892         quote = '\'';
 893         if (memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
 894             !memchr(op->ob_sval, '"', Py_SIZE(op)))
 895                 quote = '"';
 896
 897         str_len = Py_SIZE(op);
 898         Py_BEGIN_ALLOW_THREADS
 899         fputc(quote, fp);
 900         for (i = 0; i < str_len; i++) {
 901                 /* Since strings are immutable and the caller should have a
 902                 reference, accessing the interal buffer should not be an issue
 903                 with the GIL released. */
 904                 c = op->ob_sval[i];
 905                 if (c == quote || c == '\\')
 906                         fprintf(fp, "\\%c", c);
 907                 else if (c == '\t')
 908                         fprintf(fp, "\\t");
 909                 else if (c == '\n')
 910                         fprintf(fp, "\\n");
 911                 else if (c == '\r')
 912                         fprintf(fp, "\\r");
 913                 else if (c < ' ' || c >= 0x7f)
 914                         fprintf(fp, "\\x%02x", c & 0xff);
 915                 else
 916                         fputc(c, fp);
 917         }
 918         fputc(quote, fp);
 919         Py_END_ALLOW_THREADS
 920         return 0;
 921 }
 922
 923 PyObject *
 924 PyString_Repr(PyObject *obj, int smartquotes)
 925 {
 926         register PyStringObject* op = (PyStringObject*) obj;
 927         size_t newsize = 2 + 4 * Py_SIZE(op);
 928         PyObject *v;
 929         if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) {
 930                 PyErr_SetString(PyExc_OverflowError,
 931                         "string is too large to make repr");
 932                 return NULL;
 933         }
 934         v = PyString_FromStringAndSize((char *)NULL, newsize);
 935         if (v == NULL) {
 936                 return NULL;
 937         }
 938         else {
 939                 register Py_ssize_t i;
 940                 register char c;
 941                 register char *p;
 942                 int quote;
 943
 944                 /* figure out which quote to use; single is preferred */
 945                 quote = '\'';
 946                 if (smartquotes &&
 947                     memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
 948                     !memchr(op->ob_sval, '"', Py_SIZE(op)))
 949                         quote = '"';
 950
 951                 p = PyString_AS_STRING(v);
 952                 *p++ = quote;
 953                 for (i = 0; i < Py_SIZE(op); i++) {
 954                         /* There's at least enough room for a hex escape
 955                            and a closing quote. */
 956                         assert(newsize - (p - PyString_AS_STRING(v)) >= 5);
 957                         c = op->ob_sval[i];
 958                         if (c == quote || c == '\\')
 959                                 *p++ = '\\', *p++ = c;
 960                         else if (c == '\t')
 961                                 *p++ = '\\', *p++ = 't';
 962                         else if (c == '\n')
 963                                 *p++ = '\\', *p++ = 'n';
 964                         else if (c == '\r')
 965                                 *p++ = '\\', *p++ = 'r';
 966                         else if (c < ' ' || c >= 0x7f) {
 967                                 /* For performance, we don't want to call
 968                                    PyOS_snprintf here (extra layers of
 969                                    function call). */
 970                                 sprintf(p, "\\x%02x", c & 0xff);
 971                                 p += 4;
 972                         }
 973                         else
 974                                 *p++ = c;
 975                 }
 976                 assert(newsize - (p - PyString_AS_STRING(v)) >= 1);
 977                 *p++ = quote;
 978                 *p = '\0';
 979                 _PyString_Resize(
 980                         &v, (p - PyString_AS_STRING(v)));
 981                 return v;
 982         }
 983 }
 984
 985 static PyObject *
 986 string_repr(PyObject *op)
 987 {
 988         return PyString_Repr(op, 1);
 989 }
 990
 991 static PyObject *
 992 string_str(PyObject *s)
 993 {
 994         assert(PyString_Check(s));
 995         if (PyString_CheckExact(s)) {
 996                 Py_INCREF(s);
 997                 return s;
 998         }
 999         else {
1000                 /* Subtype -- return genuine string with the same value. */
1001                 PyStringObject *t = (PyStringObject *) s;
1002                 return PyString_FromStringAndSize(t->ob_sval, Py_SIZE(t));
1003         }
1004 }
1005
1006 static Py_ssize_t
1007 string_length(PyStringObject *a)
1008 {
1009         return Py_SIZE(a);
1010 }
1011
1012 static PyObject *
1013 string_concat(register PyStringObject *a, register PyObject *bb)
1014 {
1015         register Py_ssize_t size;
1016         register PyStringObject *op;
1017         if (!PyString_Check(bb)) {
1018 #ifdef Py_USING_UNICODE
1019                 if (PyUnicode_Check(bb))
1020                     return PyUnicode_Concat((PyObject *)a, bb);
1021 #endif
1022                 if (PyByteArray_Check(bb))
1023                     return PyByteArray_Concat((PyObject *)a, bb);
1024                 PyErr_Format(PyExc_TypeError,
1025                              "cannot concatenate 'str' and '%.200s' objects",
1026                              Py_TYPE(bb)->tp_name);
1027                 return NULL;
1028         }
1029 #define b ((PyStringObject *)bb)
1030         /* Optimize cases with empty left or right operand */
1031         if ((Py_SIZE(a) == 0 || Py_SIZE(b) == 0) &&
1032             PyString_CheckExact(a) && PyString_CheckExact(b)) {
1033                 if (Py_SIZE(a) == 0) {
1034                         Py_INCREF(bb);
1035                         return bb;
1036                 }
1037                 Py_INCREF(a);
1038                 return (PyObject *)a;
1039         }
1040         size = Py_SIZE(a) + Py_SIZE(b);
1041         /* Check that string sizes are not negative, to prevent an
1042            overflow in cases where we are passed incorrectly-created
1043            strings with negative lengths (due to a bug in other code).
1044         */
1045         if (Py_SIZE(a) < 0 || Py_SIZE(b) < 0 ||
1046             Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b)) {
1047                 PyErr_SetString(PyExc_OverflowError,
1048                                 "strings are too large to concat");
1049                 return NULL;
1050         }
1051
1052         /* Inline PyObject_NewVar */
1053         if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
1054                 PyErr_SetString(PyExc_OverflowError,
1055                                 "strings are too large to concat");
1056                 return NULL;
1057         }
1058         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
1059         if (op == NULL)
1060                 return PyErr_NoMemory();
1061         PyObject_INIT_VAR(op, &PyString_Type, size);
1062         op->ob_shash = -1;
1063         op->ob_sstate = SSTATE_NOT_INTERNED;
1064         Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1065         Py_MEMCPY(op->ob_sval + Py_SIZE(a), b->ob_sval, Py_SIZE(b));
1066         op->ob_sval[size] = '\0';
1067         return (PyObject *) op;
1068 #undef b
1069 }
1070
1071 static PyObject *
1072 string_repeat(register PyStringObject *a, register Py_ssize_t n)
1073 {
1074         register Py_ssize_t i;
1075         register Py_ssize_t j;
1076         register Py_ssize_t size;
1077         register PyStringObject *op;
1078         size_t nbytes;
1079         if (n < 0)
1080                 n = 0;
1081         /* watch out for overflows:  the size can overflow int,
1082          * and the # of bytes needed can overflow size_t
1083          */
1084         size = Py_SIZE(a) * n;
1085         if (n && size / n != Py_SIZE(a)) {
1086                 PyErr_SetString(PyExc_OverflowError,
1087                         "repeated string is too long");
1088                 return NULL;
1089         }
1090         if (size == Py_SIZE(a) && PyString_CheckExact(a)) {
1091                 Py_INCREF(a);
1092                 return (PyObject *)a;
1093         }
1094         nbytes = (size_t)size;
1095         if (nbytes + PyStringObject_SIZE <= nbytes) {
1096                 PyErr_SetString(PyExc_OverflowError,
1097                         "repeated string is too long");
1098                 return NULL;
1099         }
1100         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + nbytes);
1101         if (op == NULL)
1102                 return PyErr_NoMemory();
1103         PyObject_INIT_VAR(op, &PyString_Type, size);
1104         op->ob_shash = -1;
1105         op->ob_sstate = SSTATE_NOT_INTERNED;
1106         op->ob_sval[size] = '\0';
1107         if (Py_SIZE(a) == 1 && n > 0) {
1108                 memset(op->ob_sval, a->ob_sval[0] , n);
1109                 return (PyObject *) op;
1110         }
1111         i = 0;
1112         if (i < size) {
1113                 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1114                 i = Py_SIZE(a);
1115         }
1116         while (i < size) {
1117                 j = (i <= size-i)  ?  i  :  size-i;
1118                 Py_MEMCPY(op->ob_sval+i, op->ob_sval, j);
1119                 i += j;
1120         }
1121         return (PyObject *) op;
1122 }
1123
1124 /* String slice a[i:j] consists of characters a[i] ... a[j-1] */
1125
1126 static PyObject *
1127 string_slice(register PyStringObject *a, register Py_ssize_t i,
1128              register Py_ssize_t j)
1129      /* j -- may be negative! */
1130 {
1131         if (i < 0)
1132                 i = 0;
1133         if (j < 0)
1134                 j = 0; /* Avoid signed/unsigned bug in next line */
1135         if (j > Py_SIZE(a))
1136                 j = Py_SIZE(a);
1137         if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) {
1138                 /* It's the same as a */
1139                 Py_INCREF(a);
1140                 return (PyObject *)a;
1141         }
1142         if (j < i)
1143                 j = i;
1144         return PyString_FromStringAndSize(a->ob_sval + i, j-i);
1145 }
1146
1147 static int
1148 string_contains(PyObject *str_obj, PyObject *sub_obj)
1149 {
1150         if (!PyString_CheckExact(sub_obj)) {
1151 #ifdef Py_USING_UNICODE
1152                 if (PyUnicode_Check(sub_obj))
1153                         return PyUnicode_Contains(str_obj, sub_obj);
1154 #endif
1155                 if (!PyString_Check(sub_obj)) {
1156                         PyErr_Format(PyExc_TypeError,
1157                             "'in <string>' requires string as left operand, "
1158                             "not %.200s", Py_TYPE(sub_obj)->tp_name);
1159                         return -1;
1160                 }
1161         }
1162
1163         return stringlib_contains_obj(str_obj, sub_obj);
1164 }
1165
1166 static PyObject *
1167 string_item(PyStringObject *a, register Py_ssize_t i)
1168 {
1169         char pchar;
1170         PyObject *v;
1171         if (i < 0 || i >= Py_SIZE(a)) {
1172                 PyErr_SetString(PyExc_IndexError, "string index out of range");
1173                 return NULL;
1174         }
1175         pchar = a->ob_sval[i];
1176         v = (PyObject *)characters[pchar & UCHAR_MAX];
1177         if (v == NULL)
1178                 v = PyString_FromStringAndSize(&pchar, 1);
1179         else {
1180 #ifdef COUNT_ALLOCS
1181                 one_strings++;
1182 #endif
1183                 Py_INCREF(v);
1184         }
1185         return v;
1186 }
1187
1188 static PyObject*
1189 string_richcompare(PyStringObject *a, PyStringObject *b, int op)
1190 {
1191         int c;
1192         Py_ssize_t len_a, len_b;
1193         Py_ssize_t min_len;
1194         PyObject *result;
1195
1196         /* Make sure both arguments are strings. */
1197         if (!(PyString_Check(a) && PyString_Check(b))) {
1198                 result = Py_NotImplemented;
1199                 goto out;
1200         }
1201         if (a == b) {
1202                 switch (op) {
1203                 case Py_EQ:case Py_LE:case Py_GE:
1204                         result = Py_True;
1205                         goto out;
1206                 case Py_NE:case Py_LT:case Py_GT:
1207                         result = Py_False;
1208                         goto out;
1209                 }
1210         }
1211         if (op == Py_EQ) {
1212                 /* Supporting Py_NE here as well does not save
1213                    much time, since Py_NE is rarely used.  */
1214                 if (Py_SIZE(a) == Py_SIZE(b)
1215                     && (a->ob_sval[0] == b->ob_sval[0]
1216                         && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0)) {
1217                         result = Py_True;
1218                 } else {
1219                         result = Py_False;
1220                 }
1221                 goto out;
1222         }
1223         len_a = Py_SIZE(a); len_b = Py_SIZE(b);
1224         min_len = (len_a < len_b) ? len_a : len_b;
1225         if (min_len > 0) {
1226                 c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval);
1227                 if (c==0)
1228                         c = memcmp(a->ob_sval, b->ob_sval, min_len);
1229         } else
1230                 c = 0;
1231         if (c == 0)
1232                 c = (len_a < len_b) ? -1 : (len_a > len_b) ? 1 : 0;
1233         switch (op) {
1234         case Py_LT: c = c <  0; break;
1235         case Py_LE: c = c <= 0; break;
1236         case Py_EQ: assert(0);  break; /* unreachable */
1237         case Py_NE: c = c != 0; break;
1238         case Py_GT: c = c >  0; break;
1239         case Py_GE: c = c >= 0; break;
1240         default:
1241                 result = Py_NotImplemented;
1242                 goto out;
1243         }
1244         result = c ? Py_True : Py_False;
1245   out:
1246         Py_INCREF(result);
1247         return result;
1248 }
1249
1250 int
1251 _PyString_Eq(PyObject *o1, PyObject *o2)
1252 {
1253         PyStringObject *a = (PyStringObject*) o1;
1254         PyStringObject *b = (PyStringObject*) o2;
1255         return Py_SIZE(a) == Py_SIZE(b)
1256           && *a->ob_sval == *b->ob_sval
1257           && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0;
1258 }
1259
1260 static long
1261 string_hash(PyStringObject *a)
1262 {
1263         register Py_ssize_t len;
1264         register unsigned char *p;
1265         register long x;
1266
1267         if (a->ob_shash != -1)
1268                 return a->ob_shash;
1269         len = Py_SIZE(a);
1270         p = (unsigned char *) a->ob_sval;
1271         x = *p << 7;
1272         while (--len >= 0)
1273                 x = (1000003*x) ^ *p++;
1274         x ^= Py_SIZE(a);
1275         if (x == -1)
1276                 x = -2;
1277         a->ob_shash = x;
1278         return x;
1279 }
1280
1281 static PyObject*
1282 string_subscript(PyStringObject* self, PyObject* item)
1283 {
1284         if (PyIndex_Check(item)) {
1285                 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
1286                 if (i == -1 && PyErr_Occurred())
1287                         return NULL;
1288                 if (i < 0)
1289                         i += PyString_GET_SIZE(self);
1290                 return string_item(self, i);
1291         }
1292         else if (PySlice_Check(item)) {
1293                 Py_ssize_t start, stop, step, slicelength, cur, i;
1294                 char* source_buf;
1295                 char* result_buf;
1296                 PyObject* result;
1297
1298                 if (PySlice_GetIndicesEx((PySliceObject*)item,
1299                                  PyString_GET_SIZE(self),
1300                                  &start, &stop, &step, &slicelength) < 0) {
1301                         return NULL;
1302                 }
1303
1304                 if (slicelength <= 0) {
1305                         return PyString_FromStringAndSize("", 0);
1306                 }
1307                 else if (start == 0 && step == 1 &&
1308                          slicelength == PyString_GET_SIZE(self) &&
1309                          PyString_CheckExact(self)) {
1310                         Py_INCREF(self);
1311                         return (PyObject *)self;
1312                 }
1313                 else if (step == 1) {
1314                         return PyString_FromStringAndSize(
1315                                 PyString_AS_STRING(self) + start,
1316                                 slicelength);
1317                 }
1318                 else {
1319                         source_buf = PyString_AsString((PyObject*)self);
1320                         result_buf = (char *)PyMem_Malloc(slicelength);
1321                         if (result_buf == NULL)
1322                                 return PyErr_NoMemory();
1323
1324                         for (cur = start, i = 0; i < slicelength;
1325                              cur += step, i++) {
1326                                 result_buf[i] = source_buf[cur];
1327                         }
1328
1329                         result = PyString_FromStringAndSize(result_buf,
1330                                                             slicelength);
1331                         PyMem_Free(result_buf);
1332                         return result;
1333                 }
1334         }
1335         else {
1336                 PyErr_Format(PyExc_TypeError,
1337                              "string indices must be integers, not %.200s",
1338                              Py_TYPE(item)->tp_name);
1339                 return NULL;
1340         }
1341 }
1342
1343 static Py_ssize_t
1344 string_buffer_getreadbuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1345 {
1346         if ( index != 0 ) {
1347                 PyErr_SetString(PyExc_SystemError,
1348                                 "accessing non-existent string segment");
1349                 return -1;
1350         }
1351         *ptr = (void *)self->ob_sval;
1352         return Py_SIZE(self);
1353 }
1354
1355 static Py_ssize_t
1356 string_buffer_getwritebuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1357 {
1358         PyErr_SetString(PyExc_TypeError,
1359                         "Cannot use string as modifiable buffer");
1360         return -1;
1361 }
1362
1363 static Py_ssize_t
1364 string_buffer_getsegcount(PyStringObject *self, Py_ssize_t *lenp)
1365 {
1366         if ( lenp )
1367                 *lenp = Py_SIZE(self);
1368         return 1;
1369 }
1370
1371 static Py_ssize_t
1372 string_buffer_getcharbuf(PyStringObject *self, Py_ssize_t index, const char **ptr)
1373 {
1374         if ( index != 0 ) {
1375                 PyErr_SetString(PyExc_SystemError,
1376                                 "accessing non-existent string segment");
1377                 return -1;
1378         }
1379         *ptr = self->ob_sval;
1380         return Py_SIZE(self);
1381 }
1382
1383 static int
1384 string_buffer_getbuffer(PyStringObject *self, Py_buffer *view, int flags)
1385 {
1386         return PyBuffer_FillInfo(view, (PyObject*)self,
1387                                  (void *)self->ob_sval, Py_SIZE(self),
1388                                  1, flags);
1389 }
1390
1391 static PySequenceMethods string_as_sequence = {
1392         (lenfunc)string_length, /*sq_length*/
1393         (binaryfunc)string_concat, /*sq_concat*/
1394         (ssizeargfunc)string_repeat, /*sq_repeat*/
1395         (ssizeargfunc)string_item, /*sq_item*/
1396         (ssizessizeargfunc)string_slice, /*sq_slice*/
1397         0,              /*sq_ass_item*/
1398         0,              /*sq_ass_slice*/
1399         (objobjproc)string_contains /*sq_contains*/
1400 };
1401
1402 static PyMappingMethods string_as_mapping = {
1403         (lenfunc)string_length,
1404         (binaryfunc)string_subscript,
1405         0,
1406 };
1407
1408 static PyBufferProcs string_as_buffer = {
1409         (readbufferproc)string_buffer_getreadbuf,
1410         (writebufferproc)string_buffer_getwritebuf,
1411         (segcountproc)string_buffer_getsegcount,
1412         (charbufferproc)string_buffer_getcharbuf,
1413         (getbufferproc)string_buffer_getbuffer,
1414         0, /* XXX */
1415 };
1416
1417
1418
/* Strip variants; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings, indexed by the constants above.  Each has the shape
   "|O:<name>". */
static const char *stripformat[] = {
        "|O:lstrip",
        "|O:rstrip",
        "|O:strip"
};

/* Bare method name for variant `i`: skips the 3-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
1427
1428
/* Non-zero when the `length` bytes of `target` starting at `offset`
   equal the first `length` bytes of `pattern`.  The first and last
   bytes are compared directly and only the interior goes through
   memcmp().

   Don't call if length < 2: the interior length is computed as
   length - 2.

   Every parameter is parenthesized so that compound argument
   expressions (e.g. `i + 1`, `a ? b : c`) expand as intended.  Note
   that the arguments are still evaluated more than once. */
#define Py_STRING_MATCH(target, offset, pattern, length)        \
  ((target)[(offset)] == (pattern)[0] &&                        \
   (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&    \
   !memcmp((target)+(offset)+1, (pattern)+1, (length)-2) )
1434
1435
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things.*/

#define MAX_PREALLOC 12

/* Initial list size for a split limited to `maxsplit` splits: one more
   element than the number of splits (5 splits gives 6 elements), capped
   at MAX_PREALLOC.  The parameter is parenthesized so that compound
   argument expressions expand correctly. */
#define PREALLOC_SIZE(maxsplit) \
        ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)
1448
/* Append the substring data[left:right] to `list`.

   Expands to a sequence of statements (not a single one) and relies on
   names from the enclosing function: the variables `str` and `list` and
   an `onError` label, which is jumped to when either the substring or
   the append fails.  The temporary reference in `str` is released in
   both outcomes. */
#define SPLIT_APPEND(data, left, right)                         \
        str = PyString_FromStringAndSize((data) + (left),       \
                                         (right) - (left));     \
        if (str == NULL)                                        \
                goto onError;                                   \
        if (PyList_Append(list, str)) {                         \
                Py_DECREF(str);                                 \
                goto onError;                                   \
        }                                                       \
        Py_DECREF(str);
1460
/* Add the substring data[left:right] as element number `count` of
   `list`, then increment `count`.

   While `count` is below MAX_PREALLOC the item is stored directly with
   PyList_SET_ITEM (the list is expected to have been created with
   preallocated slots); beyond that it is appended.  Relies on names from
   the enclosing function: the variables `str`, `list` and `count`, and
   an `onError` label that is jumped to on failure. */
#define SPLIT_ADD(data, left, right) {                          \
        str = PyString_FromStringAndSize((data) + (left),       \
                                         (right) - (left));     \
        if (str == NULL)                                        \
                goto onError;                                   \
        if (count >= MAX_PREALLOC) {                            \
                if (PyList_Append(list, str)) {                 \
                        Py_DECREF(str);                         \
                        goto onError;                           \
                }                                               \
                Py_DECREF(str);                                 \
        } else {                                                \
                PyList_SET_ITEM(list, count, str);              \
        }                                                       \
        count++; }
1477
/* Always force the list to the expected size: overwrite the list's size
   field with `count`, a variable of the enclosing function. */
#define FIX_PREALLOC_SIZE(list) (Py_SIZE(list) = (count))

/* Whitespace scanners.  `i` must be a modifiable index variable; it is
   advanced (SKIP_*) or moved backwards (RSKIP_*) while s[i] is / is not
   whitespace according to isspace().  The forward forms stop at `len`,
   the backward forms stop once `i` drops below 0.  All parameters are
   parenthesized so that compound argument expressions expand as
   intended. */
#define SKIP_SPACE(s, i, len)    { while ((i)<(len) &&  isspace(Py_CHARMASK((s)[(i)]))) (i)++; }
#define SKIP_NONSPACE(s, i, len) { while ((i)<(len) && !isspace(Py_CHARMASK((s)[(i)]))) (i)++; }
#define RSKIP_SPACE(s, i)        { while ((i)>=0  &&  isspace(Py_CHARMASK((s)[(i)]))) (i)--; }
#define RSKIP_NONSPACE(s, i)     { while ((i)>=0  && !isspace(Py_CHARMASK((s)[(i)]))) (i)--; }
1485
1486 Py_LOCAL_INLINE(PyObject *)
1487 split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1488 {
1489         const char *s = PyString_AS_STRING(self);
1490         Py_ssize_t i, j, count=0;
1491         PyObject *str;
1492         PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1493
1494         if (list == NULL)
1495                 return NULL;
1496
1497         i = j = 0;
1498
1499         while (maxsplit-- > 0) {
1500                 SKIP_SPACE(s, i, len);
1501                 if (i==len) break;
1502                 j = i; i++;
1503                 SKIP_NONSPACE(s, i, len);
1504                 if (j == 0 && i == len && PyString_CheckExact(self)) {
1505                         /* No whitespace in self, so just use it as list[0] */
1506                         Py_INCREF(self);
1507                         PyList_SET_ITEM(list, 0, (PyObject *)self);
1508                         count++;
1509                         break;
1510                 }
1511                 SPLIT_ADD(s, j, i);
1512         }
1513
1514         if (i < len) {
1515                 /* Only occurs when maxsplit was reached */
1516                 /* Skip any remaining whitespace and copy to end of string */
1517                 SKIP_SPACE(s, i, len);
1518                 if (i != len)
1519                         SPLIT_ADD(s, i, len);
1520         }
1521         FIX_PREALLOC_SIZE(list);
1522         return list;
1523   onError:
1524         Py_DECREF(list);
1525         return NULL;
1526 }
1527
1528 Py_LOCAL_INLINE(PyObject *)
1529 split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1530 {
1531         const char *s = PyString_AS_STRING(self);
1532         register Py_ssize_t i, j, count=0;
1533         PyObject *str;
1534         PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1535
1536         if (list == NULL)
1537                 return NULL;
1538
1539         i = j = 0;
1540         while ((j < len) && (maxcount-- > 0)) {
1541                 for(; j<len; j++) {
1542                         /* I found that using memchr makes no difference */
1543                         if (s[j] == ch) {
1544                                 SPLIT_ADD(s, i, j);
1545                                 i = j = j + 1;
1546                                 break;
1547                         }
1548                 }
1549         }
1550         if (i == 0 && count == 0 && PyString_CheckExact(self)) {
1551                 /* ch not in self, so just use self as list[0] */
1552                 Py_INCREF(self);
1553                 PyList_SET_ITEM(list, 0, (PyObject *)self);
1554                 count++;
1555         }
1556         else if (i <= len) {
1557                 SPLIT_ADD(s, i, len);
1558         }
1559         FIX_PREALLOC_SIZE(list);
1560         return list;
1561
1562   onError:
1563         Py_DECREF(list);
1564         return NULL;
1565 }
1566
1567 PyDoc_STRVAR(split__doc__,
1568 "S.split([sep [,maxsplit]]) -> list of strings\n\
1569 \n\
1570 Return a list of the words in the string S, using sep as the\n\
1571 delimiter string.  If maxsplit is given, at most maxsplit\n\
1572 splits are done. If sep is not specified or is None, any\n\
1573 whitespace string is a separator and empty strings are removed\n\
1574 from the result.");
1575
1576 static PyObject *
1577 string_split(PyStringObject *self, PyObject *args)
1578 {
1579         Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1580         Py_ssize_t maxsplit = -1, count=0;
1581         const char *s = PyString_AS_STRING(self), *sub;
1582         PyObject *list, *str, *subobj = Py_None;
1583 #ifdef USE_FAST
1584         Py_ssize_t pos;
1585 #endif
1586
1587         if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
1588                 return NULL;
1589         if (maxsplit < 0)
1590                 maxsplit = PY_SSIZE_T_MAX;
1591         if (subobj == Py_None)
1592                 return split_whitespace(self, len, maxsplit);
1593         if (PyString_Check(subobj)) {
1594                 sub = PyString_AS_STRING(subobj);
1595                 n = PyString_GET_SIZE(subobj);
1596         }
1597 #ifdef Py_USING_UNICODE
1598         else if (PyUnicode_Check(subobj))
1599                 return PyUnicode_Split((PyObject *)self, subobj, maxsplit);
1600 #endif
1601         else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1602                 return NULL;
1603
1604         if (n == 0) {
1605                 PyErr_SetString(PyExc_ValueError, "empty separator");
1606                 return NULL;
1607         }
1608         else if (n == 1)
1609                 return split_char(self, len, sub[0], maxsplit);
1610
1611         list = PyList_New(PREALLOC_SIZE(maxsplit));
1612         if (list == NULL)
1613                 return NULL;
1614
1615 #ifdef USE_FAST
1616         i = j = 0;
1617         while (maxsplit-- > 0) {
1618                 pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
1619                 if (pos < 0)
1620                         break;
1621                 j = i+pos;
1622                 SPLIT_ADD(s, i, j);
1623                 i = j + n;
1624         }
1625 #else
1626         i = j = 0;
1627         while ((j+n <= len) && (maxsplit-- > 0)) {
1628                 for (; j+n <= len; j++) {
1629                         if (Py_STRING_MATCH(s, j, sub, n)) {
1630                                 SPLIT_ADD(s, i, j);
1631                                 i = j = j + n;
1632                                 break;
1633                         }
1634                 }
1635         }
1636 #endif
1637         SPLIT_ADD(s, i, len);
1638         FIX_PREALLOC_SIZE(list);
1639         return list;
1640
1641  onError:
1642         Py_DECREF(list);
1643         return NULL;
1644 }
1645
1646 PyDoc_STRVAR(partition__doc__,
1647 "S.partition(sep) -> (head, sep, tail)\n\
1648 \n\
1649 Search for the separator sep in S, and return the part before it,\n\
1650 the separator itself, and the part after it.  If the separator is not\n\
1651 found, return S and two empty strings.");
1652
1653 static PyObject *
1654 string_partition(PyStringObject *self, PyObject *sep_obj)
1655 {
1656         const char *sep;
1657         Py_ssize_t sep_len;
1658
1659         if (PyString_Check(sep_obj)) {
1660                 sep = PyString_AS_STRING(sep_obj);
1661                 sep_len = PyString_GET_SIZE(sep_obj);
1662         }
1663 #ifdef Py_USING_UNICODE
1664         else if (PyUnicode_Check(sep_obj))
1665                 return PyUnicode_Partition((PyObject *) self, sep_obj);
1666 #endif
1667         else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1668                 return NULL;
1669
1670         return stringlib_partition(
1671                 (PyObject*) self,
1672                 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1673                 sep_obj, sep, sep_len
1674                 );
1675 }
1676
1677 PyDoc_STRVAR(rpartition__doc__,
1678 "S.rpartition(sep) -> (tail, sep, head)\n\
1679 \n\
1680 Search for the separator sep in S, starting at the end of S, and return\n\
1681 the part before it, the separator itself, and the part after it.  If the\n\
1682 separator is not found, return two empty strings and S.");
1683
1684 static PyObject *
1685 string_rpartition(PyStringObject *self, PyObject *sep_obj)
1686 {
1687         const char *sep;
1688         Py_ssize_t sep_len;
1689
1690         if (PyString_Check(sep_obj)) {
1691                 sep = PyString_AS_STRING(sep_obj);
1692                 sep_len = PyString_GET_SIZE(sep_obj);
1693         }
1694 #ifdef Py_USING_UNICODE
1695         else if (PyUnicode_Check(sep_obj))
1696                 return PyUnicode_RPartition((PyObject *) self, sep_obj);
1697 #endif
1698         else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1699                 return NULL;
1700
1701         return stringlib_rpartition(
1702                 (PyObject*) self,
1703                 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1704                 sep_obj, sep, sep_len
1705                 );
1706 }
1707
1708 Py_LOCAL_INLINE(PyObject *)
1709 rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1710 {
1711         const char *s = PyString_AS_STRING(self);
1712         Py_ssize_t i, j, count=0;
1713         PyObject *str;
1714         PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1715
1716         if (list == NULL)
1717                 return NULL;
1718
1719         i = j = len-1;
1720
1721         while (maxsplit-- > 0) {
1722                 RSKIP_SPACE(s, i);
1723                 if (i<0) break;
1724                 j = i; i--;
1725                 RSKIP_NONSPACE(s, i);
1726                 if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
1727                         /* No whitespace in self, so just use it as list[0] */
1728                         Py_INCREF(self);
1729                         PyList_SET_ITEM(list, 0, (PyObject *)self);
1730                         count++;
1731                         break;
1732                 }
1733                 SPLIT_ADD(s, i + 1, j + 1);
1734         }
1735         if (i >= 0) {
1736                 /* Only occurs when maxsplit was reached */
1737                 /* Skip any remaining whitespace and copy to beginning of string */
1738                 RSKIP_SPACE(s, i);
1739                 if (i >= 0)
1740                         SPLIT_ADD(s, 0, i + 1);
1741
1742         }
1743         FIX_PREALLOC_SIZE(list);
1744         if (PyList_Reverse(list) < 0)
1745                 goto onError;
1746         return list;
1747   onError:
1748         Py_DECREF(list);
1749         return NULL;
1750 }
1751
1752 Py_LOCAL_INLINE(PyObject *)
1753 rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1754 {
1755         const char *s = PyString_AS_STRING(self);
1756         register Py_ssize_t i, j, count=0;
1757         PyObject *str;
1758         PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1759
1760         if (list == NULL)
1761                 return NULL;
1762
1763         i = j = len - 1;
1764         while ((i >= 0) && (maxcount-- > 0)) {
1765                 for (; i >= 0; i--) {
1766                         if (s[i] == ch) {
1767                                 SPLIT_ADD(s, i + 1, j + 1);
1768                                 j = i = i - 1;
1769                                 break;
1770                         }
1771                 }
1772         }
1773         if (i < 0 && count == 0 && PyString_CheckExact(self)) {
1774                 /* ch not in self, so just use self as list[0] */
1775                 Py_INCREF(self);
1776                 PyList_SET_ITEM(list, 0, (PyObject *)self);
1777                 count++;
1778         }
1779         else if (j >= -1) {
1780                 SPLIT_ADD(s, 0, j + 1);
1781         }
1782         FIX_PREALLOC_SIZE(list);
1783         if (PyList_Reverse(list) < 0)
1784                 goto onError;
1785         return list;
1786
1787  onError:
1788         Py_DECREF(list);
1789         return NULL;
1790 }
1791
1792 PyDoc_STRVAR(rsplit__doc__,
1793 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
1794 \n\
1795 Return a list of the words in the string S, using sep as the\n\
1796 delimiter string, starting at the end of the string and working\n\
1797 to the front.  If maxsplit is given, at most maxsplit splits are\n\
1798 done. If sep is not specified or is None, any whitespace string\n\
1799 is a separator.");
1800
1801 static PyObject *
1802 string_rsplit(PyStringObject *self, PyObject *args)
1803 {
1804         Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1805         Py_ssize_t maxsplit = -1, count=0;
1806         const char *s, *sub;
1807         PyObject *list, *str, *subobj = Py_None;
1808
1809         if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
1810                 return NULL;
1811         if (maxsplit < 0)
1812                 maxsplit = PY_SSIZE_T_MAX;
1813         if (subobj == Py_None)
1814                 return rsplit_whitespace(self, len, maxsplit);
1815         if (PyString_Check(subobj)) {
1816                 sub = PyString_AS_STRING(subobj);
1817                 n = PyString_GET_SIZE(subobj);
1818         }
1819 #ifdef Py_USING_UNICODE
1820         else if (PyUnicode_Check(subobj))
1821                 return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
1822 #endif
1823         else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1824                 return NULL;
1825
1826         if (n == 0) {
1827                 PyErr_SetString(PyExc_ValueError, "empty separator");
1828                 return NULL;
1829         }
1830         else if (n == 1)
1831                 return rsplit_char(self, len, sub[0], maxsplit);
1832
1833         list = PyList_New(PREALLOC_SIZE(maxsplit));
1834         if (list == NULL)
1835                 return NULL;
1836
1837         j = len;
1838         i = j - n;
1839
1840         s = PyString_AS_STRING(self);
1841         while ( (i >= 0) && (maxsplit-- > 0) ) {
1842                 for (; i>=0; i--) {
1843                         if (Py_STRING_MATCH(s, i, sub, n)) {
1844                                 SPLIT_ADD(s, i + n, j);
1845                                 j = i;
1846                                 i -= n;
1847                                 break;
1848                         }
1849                 }
1850         }
1851         SPLIT_ADD(s, 0, j);
1852         FIX_PREALLOC_SIZE(list);
1853         if (PyList_Reverse(list) < 0)
1854                 goto onError;
1855         return list;
1856
1857 onError:
1858         Py_DECREF(list);
1859         return NULL;
1860 }
1861
1862
1863 PyDoc_STRVAR(join__doc__,
1864 "S.join(iterable) -> string\n\
1865 \n\
1866 Return a string which is the concatenation of the strings in the\n\
1867 iterable.  The separator between elements is S.");
1868
1869 static PyObject *
1870 string_join(PyStringObject *self, PyObject *orig)
1871 {
1872         char *sep = PyString_AS_STRING(self);
1873         const Py_ssize_t seplen = PyString_GET_SIZE(self);
1874         PyObject *res = NULL;
1875         char *p;
1876         Py_ssize_t seqlen = 0;
1877         size_t sz = 0;
1878         Py_ssize_t i;
1879         PyObject *seq, *item;
1880
1881         seq = PySequence_Fast(orig, "");
1882         if (seq == NULL) {
1883                 return NULL;
1884         }
1885
1886         seqlen = PySequence_Size(seq);
1887         if (seqlen == 0) {
1888                 Py_DECREF(seq);
1889                 return PyString_FromString("");
1890         }
1891         if (seqlen == 1) {
1892                 item = PySequence_Fast_GET_ITEM(seq, 0);
1893                 if (PyString_CheckExact(item) || PyUnicode_CheckExact(item)) {
1894                         Py_INCREF(item);
1895                         Py_DECREF(seq);
1896                         return item;
1897                 }
1898         }
1899
1900         /* There are at least two things to join, or else we have a subclass
1901          * of the builtin types in the sequence.
1902          * Do a pre-pass to figure out the total amount of space we'll
1903          * need (sz), see whether any argument is absurd, and defer to
1904          * the Unicode join if appropriate.
1905          */
1906         for (i = 0; i < seqlen; i++) {
1907                 const size_t old_sz = sz;
1908                 item = PySequence_Fast_GET_ITEM(seq, i);
1909                 if (!PyString_Check(item)){
1910 #ifdef Py_USING_UNICODE
1911                         if (PyUnicode_Check(item)) {
1912                                 /* Defer to Unicode join.
1913                                  * CAUTION:  There's no gurantee that the
1914                                  * original sequence can be iterated over
1915                                  * again, so we must pass seq here.
1916                                  */
1917                                 PyObject *result;
1918                                 result = PyUnicode_Join((PyObject *)self, seq);
1919                                 Py_DECREF(seq);
1920                                 return result;
1921                         }
1922 #endif
1923                         PyErr_Format(PyExc_TypeError,
1924                                      "sequence item %zd: expected string,"
1925                                      " %.80s found",
1926                                      i, Py_TYPE(item)->tp_name);
1927                         Py_DECREF(seq);
1928                         return NULL;
1929                 }
1930                 sz += PyString_GET_SIZE(item);
1931                 if (i != 0)
1932                         sz += seplen;
1933                 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
1934                         PyErr_SetString(PyExc_OverflowError,
1935                                 "join() result is too long for a Python string");
1936                         Py_DECREF(seq);
1937                         return NULL;
1938                 }
1939         }
1940
1941         /* Allocate result space. */
1942         res = PyString_FromStringAndSize((char*)NULL, sz);
1943         if (res == NULL) {
1944                 Py_DECREF(seq);
1945                 return NULL;
1946         }
1947
1948         /* Catenate everything. */
1949         p = PyString_AS_STRING(res);
1950         for (i = 0; i < seqlen; ++i) {
1951                 size_t n;
1952                 item = PySequence_Fast_GET_ITEM(seq, i);
1953                 n = PyString_GET_SIZE(item);
1954                 Py_MEMCPY(p, PyString_AS_STRING(item), n);
1955                 p += n;
1956                 if (i < seqlen - 1) {
1957                         Py_MEMCPY(p, sep, seplen);
1958                         p += seplen;
1959                 }
1960         }
1961
1962         Py_DECREF(seq);
1963         return res;
1964 }
1965
1966 PyObject *
1967 _PyString_Join(PyObject *sep, PyObject *x)
1968 {
1969         assert(sep != NULL && PyString_Check(sep));
1970         assert(x != NULL);
1971         return string_join((PyStringObject *)sep, x);
1972 }
1973
1974 Py_LOCAL_INLINE(void)
1975 string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
1976 {
1977         if (*end > len)
1978                 *end = len;
1979         else if (*end < 0)
1980                 *end += len;
1981         if (*end < 0)
1982                 *end = 0;
1983         if (*start < 0)
1984                 *start += len;
1985         if (*start < 0)
1986                 *start = 0;
1987 }
1988
1989 Py_LOCAL_INLINE(Py_ssize_t)
1990 string_find_internal(PyStringObject *self, PyObject *args, int dir)
1991 {
1992         PyObject *subobj;
1993         const char *sub;
1994         Py_ssize_t sub_len;
1995         Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
1996         PyObject *obj_start=Py_None, *obj_end=Py_None;
1997
1998         if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj,
1999                 &obj_start, &obj_end))
2000                 return -2;
2001         /* To support None in "start" and "end" arguments, meaning
2002            the same as if they were not passed.
2003         */
2004         if (obj_start != Py_None)
2005                 if (!_PyEval_SliceIndex(obj_start, &start))
2006                 return -2;
2007         if (obj_end != Py_None)
2008                 if (!_PyEval_SliceIndex(obj_end, &end))
2009                 return -2;
2010
2011         if (PyString_Check(subobj)) {
2012                 sub = PyString_AS_STRING(subobj);
2013                 sub_len = PyString_GET_SIZE(subobj);
2014         }
2015 #ifdef Py_USING_UNICODE
2016         else if (PyUnicode_Check(subobj))
2017                 return PyUnicode_Find(
2018                         (PyObject *)self, subobj, start, end, dir);
2019 #endif
2020         else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
2021                 /* XXX - the "expected a character buffer object" is pretty
2022                    confusing for a non-expert.  remap to something else ? */
2023                 return -2;
2024
2025         if (dir > 0)
2026                 return stringlib_find_slice(
2027                         PyString_AS_STRING(self), PyString_GET_SIZE(self),
2028                         sub, sub_len, start, end);
2029         else
2030                 return stringlib_rfind_slice(
2031                         PyString_AS_STRING(self), PyString_GET_SIZE(self),
2032                         sub, sub_len, start, end);
2033 }
2034
2035
2036 PyDoc_STRVAR(find__doc__,
2037 "S.find(sub [,start [,end]]) -> int\n\
2038 \n\
2039 Return the lowest index in S where substring sub is found,\n\
2040 such that sub is contained within s[start:end].  Optional\n\
2041 arguments start and end are interpreted as in slice notation.\n\
2042 \n\
2043 Return -1 on failure.");
2044
2045 static PyObject *
2046 string_find(PyStringObject *self, PyObject *args)
2047 {
2048         Py_ssize_t result = string_find_internal(self, args, +1);
2049         if (result == -2)
2050                 return NULL;
2051         return PyInt_FromSsize_t(result);
2052 }
2053
2054
2055 PyDoc_STRVAR(index__doc__,
2056 "S.index(sub [,start [,end]]) -> int\n\
2057 \n\
2058 Like S.find() but raise ValueError when the substring is not found.");
2059
2060 static PyObject *
2061 string_index(PyStringObject *self, PyObject *args)
2062 {
2063         Py_ssize_t result = string_find_internal(self, args, +1);
2064         if (result == -2)
2065                 return NULL;
2066         if (result == -1) {
2067                 PyErr_SetString(PyExc_ValueError,
2068                                 "substring not found");
2069                 return NULL;
2070         }
2071         return PyInt_FromSsize_t(result);
2072 }
2073
2074
2075 PyDoc_STRVAR(rfind__doc__,
2076 "S.rfind(sub [,start [,end]]) -> int\n\
2077 \n\
2078 Return the highest index in S where substring sub is found,\n\
2079 such that sub is contained within s[start:end].  Optional\n\
2080 arguments start and end are interpreted as in slice notation.\n\
2081 \n\
2082 Return -1 on failure.");
2083
2084 static PyObject *
2085 string_rfind(PyStringObject *self, PyObject *args)
2086 {
2087         Py_ssize_t result = string_find_internal(self, args, -1);
2088         if (result == -2)
2089                 return NULL;
2090         return PyInt_FromSsize_t(result);
2091 }
2092
2093
2094 PyDoc_STRVAR(rindex__doc__,
2095 "S.rindex(sub [,start [,end]]) -> int\n\
2096 \n\
2097 Like S.rfind() but raise ValueError when the substring is not found.");
2098
2099 static PyObject *
2100 string_rindex(PyStringObject *self, PyObject *args)
2101 {
2102         Py_ssize_t result = string_find_internal(self, args, -1);
2103         if (result == -2)
2104                 return NULL;
2105         if (result == -1) {
2106                 PyErr_SetString(PyExc_ValueError,
2107                                 "substring not found");
2108                 return NULL;
2109         }
2110         return PyInt_FromSsize_t(result);
2111 }
2112
2113
2114 Py_LOCAL_INLINE(PyObject *)
2115 do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
2116 {
2117         char *s = PyString_AS_STRING(self);
2118         Py_ssize_t len = PyString_GET_SIZE(self);
2119         char *sep = PyString_AS_STRING(sepobj);
2120         Py_ssize_t seplen = PyString_GET_SIZE(sepobj);
2121         Py_ssize_t i, j;
2122
2123         i = 0;
2124         if (striptype != RIGHTSTRIP) {
2125                 while (i < len && memchr(sep, Py_CHARMASK(s[i]), seplen)) {
2126                         i++;
2127                 }
2128         }
2129
2130         j = len;
2131         if (striptype != LEFTSTRIP) {
2132                 do {
2133                         j--;
2134                 } while (j >= i && memchr(sep, Py_CHARMASK(s[j]), seplen));
2135                 j++;
2136         }
2137
2138         if (i == 0 && j == len && PyString_CheckExact(self)) {
2139                 Py_INCREF(self);
2140                 return (PyObject*)self;
2141         }
2142         else
2143                 return PyString_FromStringAndSize(s+i, j-i);
2144 }
2145
2146
2147 Py_LOCAL_INLINE(PyObject *)
2148 do_strip(PyStringObject *self, int striptype)
2149 {
2150         char *s = PyString_AS_STRING(self);
2151         Py_ssize_t len = PyString_GET_SIZE(self), i, j;
2152
2153         i = 0;
2154         if (striptype != RIGHTSTRIP) {
2155                 while (i < len && isspace(Py_CHARMASK(s[i]))) {
2156                         i++;
2157                 }
2158         }
2159
2160         j = len;
2161         if (striptype != LEFTSTRIP) {
2162                 do {
2163                         j--;
2164                 } while (j >= i && isspace(Py_CHARMASK(s[j])));
2165                 j++;
2166         }
2167
2168         if (i == 0 && j == len && PyString_CheckExact(self)) {
2169                 Py_INCREF(self);
2170                 return (PyObject*)self;
2171         }
2172         else
2173                 return PyString_FromStringAndSize(s+i, j-i);
2174 }
2175
2176
2177 Py_LOCAL_INLINE(PyObject *)
2178 do_argstrip(PyStringObject *self, int striptype, PyObject *args)
2179 {
2180         PyObject *sep = NULL;
2181
2182         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
2183                 return NULL;
2184
2185         if (sep != NULL && sep != Py_None) {
2186                 if (PyString_Check(sep))
2187                         return do_xstrip(self, striptype, sep);
2188 #ifdef Py_USING_UNICODE
2189                 else if (PyUnicode_Check(sep)) {
2190                         PyObject *uniself = PyUnicode_FromObject((PyObject *)self);
2191                         PyObject *res;
2192                         if (uniself==NULL)
2193                                 return NULL;
2194                         res = _PyUnicode_XStrip((PyUnicodeObject *)uniself,
2195                                 striptype, sep);
2196                         Py_DECREF(uniself);
2197                         return res;
2198                 }
2199 #endif
2200                 PyErr_Format(PyExc_TypeError,
2201 #ifdef Py_USING_UNICODE
2202                              "%s arg must be None, str or unicode",
2203 #else
2204                              "%s arg must be None or str",
2205 #endif
2206                              STRIPNAME(striptype));
2207                 return NULL;
2208         }
2209
2210         return do_strip(self, striptype);
2211 }
2212
2213
2214 PyDoc_STRVAR(strip__doc__,
2215 "S.strip([chars]) -> string or unicode\n\
2216 \n\
2217 Return a copy of the string S with leading and trailing\n\
2218 whitespace removed.\n\
2219 If chars is given and not None, remove characters in chars instead.\n\
2220 If chars is unicode, S will be converted to unicode before stripping");
2221
2222 static PyObject *
2223 string_strip(PyStringObject *self, PyObject *args)
2224 {
2225         if (PyTuple_GET_SIZE(args) == 0)
2226                 return do_strip(self, BOTHSTRIP); /* Common case */
2227         else
2228                 return do_argstrip(self, BOTHSTRIP, args);
2229 }
2230
2231
2232 PyDoc_STRVAR(lstrip__doc__,
2233 "S.lstrip([chars]) -> string or unicode\n\
2234 \n\
2235 Return a copy of the string S with leading whitespace removed.\n\
2236 If chars is given and not None, remove characters in chars instead.\n\
2237 If chars is unicode, S will be converted to unicode before stripping");
2238
2239 static PyObject *
2240 string_lstrip(PyStringObject *self, PyObject *args)
2241 {
2242         if (PyTuple_GET_SIZE(args) == 0)
2243                 return do_strip(self, LEFTSTRIP); /* Common case */
2244         else
2245                 return do_argstrip(self, LEFTSTRIP, args);
2246 }
2247
2248
2249 PyDoc_STRVAR(rstrip__doc__,
2250 "S.rstrip([chars]) -> string or unicode\n\
2251 \n\
2252 Return a copy of the string S with trailing whitespace removed.\n\
2253 If chars is given and not None, remove characters in chars instead.\n\
2254 If chars is unicode, S will be converted to unicode before stripping");
2255
2256 static PyObject *
2257 string_rstrip(PyStringObject *self, PyObject *args)
2258 {
2259         if (PyTuple_GET_SIZE(args) == 0)
2260                 return do_strip(self, RIGHTSTRIP); /* Common case */
2261         else
2262                 return do_argstrip(self, RIGHTSTRIP, args);
2263 }
2264
2265
2266 PyDoc_STRVAR(lower__doc__,
2267 "S.lower() -> string\n\
2268 \n\
2269 Return a copy of the string S converted to lowercase.");
2270
2271 /* _tolower and _toupper are defined by SUSv2, but they're not ISO C */
2272 #ifndef _tolower
2273 #define _tolower tolower
2274 #endif
2275
2276 static PyObject *
2277 string_lower(PyStringObject *self)
2278 {
2279         char *s;
2280         Py_ssize_t i, n = PyString_GET_SIZE(self);
2281         PyObject *newobj;
2282
2283         newobj = PyString_FromStringAndSize(NULL, n);
2284         if (!newobj)
2285                 return NULL;
2286
2287         s = PyString_AS_STRING(newobj);
2288
2289         Py_MEMCPY(s, PyString_AS_STRING(self), n);
2290
2291         for (i = 0; i < n; i++) {
2292                 int c = Py_CHARMASK(s[i]);
2293                 if (isupper(c))
2294                         s[i] = _tolower(c);
2295         }
2296
2297         return newobj;
2298 }
2299
2300 PyDoc_STRVAR(upper__doc__,
2301 "S.upper() -> string\n\
2302 \n\
2303 Return a copy of the string S converted to uppercase.");
2304
2305 #ifndef _toupper
2306 #define _toupper toupper
2307 #endif
2308
2309 static PyObject *
2310 string_upper(PyStringObject *self)
2311 {
2312         char *s;
2313         Py_ssize_t i, n = PyString_GET_SIZE(self);
2314         PyObject *newobj;
2315
2316         newobj = PyString_FromStringAndSize(NULL, n);
2317         if (!newobj)
2318                 return NULL;
2319
2320         s = PyString_AS_STRING(newobj);
2321
2322         Py_MEMCPY(s, PyString_AS_STRING(self), n);
2323
2324         for (i = 0; i < n; i++) {
2325                 int c = Py_CHARMASK(s[i]);
2326                 if (islower(c))
2327                         s[i] = _toupper(c);
2328         }
2329
2330         return newobj;
2331 }
2332
2333 PyDoc_STRVAR(title__doc__,
2334 "S.title() -> string\n\
2335 \n\
2336 Return a titlecased version of S, i.e. words start with uppercase\n\
2337 characters, all remaining cased characters have lowercase.");
2338
2339 static PyObject*
2340 string_title(PyStringObject *self)
2341 {
2342         char *s = PyString_AS_STRING(self), *s_new;
2343         Py_ssize_t i, n = PyString_GET_SIZE(self);
2344         int previous_is_cased = 0;
2345         PyObject *newobj;
2346
2347         newobj = PyString_FromStringAndSize(NULL, n);
2348         if (newobj == NULL)
2349                 return NULL;
2350         s_new = PyString_AsString(newobj);
2351         for (i = 0; i < n; i++) {
2352                 int c = Py_CHARMASK(*s++);
2353                 if (islower(c)) {
2354                         if (!previous_is_cased)
2355                             c = toupper(c);
2356                         previous_is_cased = 1;
2357                 } else if (isupper(c)) {
2358                         if (previous_is_cased)
2359                             c = tolower(c);
2360                         previous_is_cased = 1;
2361                 } else
2362                         previous_is_cased = 0;
2363                 *s_new++ = c;
2364         }
2365         return newobj;
2366 }
2367
2368 PyDoc_STRVAR(capitalize__doc__,
2369 "S.capitalize() -> string\n\
2370 \n\
2371 Return a copy of the string S with only its first character\n\
2372 capitalized.");
2373
2374 static PyObject *
2375 string_capitalize(PyStringObject *self)
2376 {
2377         char *s = PyString_AS_STRING(self), *s_new;
2378         Py_ssize_t i, n = PyString_GET_SIZE(self);
2379         PyObject *newobj;
2380
2381         newobj = PyString_FromStringAndSize(NULL, n);
2382         if (newobj == NULL)
2383                 return NULL;
2384         s_new = PyString_AsString(newobj);
2385         if (0 < n) {
2386                 int c = Py_CHARMASK(*s++);
2387                 if (islower(c))
2388                         *s_new = toupper(c);
2389                 else
2390                         *s_new = c;
2391                 s_new++;
2392         }
2393         for (i = 1; i < n; i++) {
2394                 int c = Py_CHARMASK(*s++);
2395                 if (isupper(c))
2396                         *s_new = tolower(c);
2397                 else
2398                         *s_new = c;
2399                 s_new++;
2400         }
2401         return newobj;
2402 }
2403
2404
2405 PyDoc_STRVAR(count__doc__,
2406 "S.count(sub[, start[, end]]) -> int\n\
2407 \n\
2408 Return the number of non-overlapping occurrences of substring sub in\n\
2409 string S[start:end].  Optional arguments start and end are interpreted\n\
2410 as in slice notation.");
2411
2412 static PyObject *
2413 string_count(PyStringObject *self, PyObject *args)
2414 {
2415         PyObject *sub_obj;
2416         const char *str = PyString_AS_STRING(self), *sub;
2417         Py_ssize_t sub_len;
2418         Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
2419
2420         if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj,
2421                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
2422                 return NULL;
2423
2424         if (PyString_Check(sub_obj)) {
2425                 sub = PyString_AS_STRING(sub_obj);
2426                 sub_len = PyString_GET_SIZE(sub_obj);
2427         }
2428 #ifdef Py_USING_UNICODE
2429         else if (PyUnicode_Check(sub_obj)) {
2430                 Py_ssize_t count;
2431                 count = PyUnicode_Count((PyObject *)self, sub_obj, start, end);
2432                 if (count == -1)
2433                         return NULL;
2434                 else
2435                         return PyInt_FromSsize_t(count);
2436         }
2437 #endif
2438         else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
2439                 return NULL;
2440
2441         string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
2442
2443         return PyInt_FromSsize_t(
2444                 stringlib_count(str + start, end - start, sub, sub_len)
2445                 );
2446 }
2447
2448 PyDoc_STRVAR(swapcase__doc__,
2449 "S.swapcase() -> string\n\
2450 \n\
2451 Return a copy of the string S with uppercase characters\n\
2452 converted to lowercase and vice versa.");
2453
2454 static PyObject *
2455 string_swapcase(PyStringObject *self)
2456 {
2457         char *s = PyString_AS_STRING(self), *s_new;
2458         Py_ssize_t i, n = PyString_GET_SIZE(self);
2459         PyObject *newobj;
2460
2461         newobj = PyString_FromStringAndSize(NULL, n);
2462         if (newobj == NULL)
2463                 return NULL;
2464         s_new = PyString_AsString(newobj);
2465         for (i = 0; i < n; i++) {
2466                 int c = Py_CHARMASK(*s++);
2467                 if (islower(c)) {
2468                         *s_new = toupper(c);
2469                 }
2470                 else if (isupper(c)) {
2471                         *s_new = tolower(c);
2472                 }
2473                 else
2474                         *s_new = c;
2475                 s_new++;
2476         }
2477         return newobj;
2478 }
2479
2480
2481 PyDoc_STRVAR(translate__doc__,
2482 "S.translate(table [,deletechars]) -> string\n\
2483 \n\
2484 Return a copy of the string S, where all characters occurring\n\
2485 in the optional argument deletechars are removed, and the\n\
2486 remaining characters have been mapped through the given\n\
2487 translation table, which must be a string of length 256.");
2488
2489 static PyObject *
2490 string_translate(PyStringObject *self, PyObject *args)
2491 {
2492         register char *input, *output;
2493         const char *table;
2494         register Py_ssize_t i, c, changed = 0;
2495         PyObject *input_obj = (PyObject*)self;
2496         const char *output_start, *del_table=NULL;
2497         Py_ssize_t inlen, tablen, dellen = 0;
2498         PyObject *result;
2499         int trans_table[256];
2500         PyObject *tableobj, *delobj = NULL;
2501
2502         if (!PyArg_UnpackTuple(args, "translate", 1, 2,
2503                               &tableobj, &delobj))
2504                 return NULL;
2505
2506         if (PyString_Check(tableobj)) {
2507                 table = PyString_AS_STRING(tableobj);
2508                 tablen = PyString_GET_SIZE(tableobj);
2509         }
2510         else if (tableobj == Py_None) {
2511                 table = NULL;
2512                 tablen = 256;
2513         }
2514 #ifdef Py_USING_UNICODE
2515         else if (PyUnicode_Check(tableobj)) {
2516                 /* Unicode .translate() does not support the deletechars
2517                    parameter; instead a mapping to None will cause characters
2518                    to be deleted. */
2519                 if (delobj != NULL) {
2520                         PyErr_SetString(PyExc_TypeError,
2521                         "deletions are implemented differently for unicode");
2522                         return NULL;
2523                 }
2524                 return PyUnicode_Translate((PyObject *)self, tableobj, NULL);
2525         }
2526 #endif
2527         else if (PyObject_AsCharBuffer(tableobj, &table, &tablen))
2528                 return NULL;
2529
2530         if (tablen != 256) {
2531                 PyErr_SetString(PyExc_ValueError,
2532                   "translation table must be 256 characters long");
2533                 return NULL;
2534         }
2535
2536         if (delobj != NULL) {
2537                 if (PyString_Check(delobj)) {
2538                         del_table = PyString_AS_STRING(delobj);
2539                         dellen = PyString_GET_SIZE(delobj);
2540                 }
2541 #ifdef Py_USING_UNICODE
2542                 else if (PyUnicode_Check(delobj)) {
2543                         PyErr_SetString(PyExc_TypeError,
2544                         "deletions are implemented differently for unicode");
2545                         return NULL;
2546                 }
2547 #endif
2548                 else if (PyObject_AsCharBuffer(delobj, &del_table, &dellen))
2549                         return NULL;
2550         }
2551         else {
2552                 del_table = NULL;
2553                 dellen = 0;
2554         }
2555
2556         inlen = PyString_GET_SIZE(input_obj);
2557         result = PyString_FromStringAndSize((char *)NULL, inlen);
2558         if (result == NULL)
2559                 return NULL;
2560         output_start = output = PyString_AsString(result);
2561         input = PyString_AS_STRING(input_obj);
2562
2563         if (dellen == 0 && table != NULL) {
2564                 /* If no deletions are required, use faster code */
2565                 for (i = inlen; --i >= 0; ) {
2566                         c = Py_CHARMASK(*input++);
2567                         if (Py_CHARMASK((*output++ = table[c])) != c)
2568                                 changed = 1;
2569                 }
2570                 if (changed || !PyString_CheckExact(input_obj))
2571                         return result;
2572                 Py_DECREF(result);
2573                 Py_INCREF(input_obj);
2574                 return input_obj;
2575         }
2576
2577         if (table == NULL) {
2578                 for (i = 0; i < 256; i++)
2579                         trans_table[i] = Py_CHARMASK(i);
2580         } else {
2581                 for (i = 0; i < 256; i++)
2582                         trans_table[i] = Py_CHARMASK(table[i]);
2583         }
2584
2585         for (i = 0; i < dellen; i++)
2586                 trans_table[(int) Py_CHARMASK(del_table[i])] = -1;
2587
2588         for (i = inlen; --i >= 0; ) {
2589                 c = Py_CHARMASK(*input++);
2590                 if (trans_table[c] != -1)
2591                         if (Py_CHARMASK(*output++ = (char)trans_table[c]) == c)
2592                                 continue;
2593                 changed = 1;
2594         }
2595         if (!changed && PyString_CheckExact(input_obj)) {
2596                 Py_DECREF(result);
2597                 Py_INCREF(input_obj);
2598                 return input_obj;
2599         }
2600         /* Fix the size of the resulting string */
2601         if (inlen > 0)
2602                 _PyString_Resize(&result, output - output_start);
2603         return result;
2604 }
2605
2606
/* Search directions passed as the `direction' argument of the string
   search helpers.  REVERSE is parenthesized so the negative constant
   always expands to a single operand. */
#define FORWARD 1
#define REVERSE (-1)

/* find and count characters and substrings */

/* Return a pointer to the first byte equal to c within the first
   target_len bytes of target, or NULL when there is none.  Thin wrapper
   around memchr(); every macro argument is parenthesized. */
#define findchar(target, target_len, c)                         \
  ((char *)memchr((const void *)(target), (c), (target_len)))
2614
2615 /* String ops must return a string.  */
2616 /* If the object is subclass of string, create a copy */
2617 Py_LOCAL(PyStringObject *)
2618 return_self(PyStringObject *self)
2619 {
2620         if (PyString_CheckExact(self)) {
2621                 Py_INCREF(self);
2622                 return self;
2623         }
2624         return (PyStringObject *)PyString_FromStringAndSize(
2625                 PyString_AS_STRING(self),
2626                 PyString_GET_SIZE(self));
2627 }
2628
2629 Py_LOCAL_INLINE(Py_ssize_t)
2630 countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
2631 {
2632         Py_ssize_t count=0;
2633         const char *start=target;
2634         const char *end=target+target_len;
2635
2636         while ( (start=findchar(start, end-start, c)) != NULL ) {
2637                 count++;
2638                 if (count >= maxcount)
2639                         break;
2640                 start += 1;
2641         }
2642         return count;
2643 }
2644
2645 Py_LOCAL(Py_ssize_t)
2646 findstring(const char *target, Py_ssize_t target_len,
2647            const char *pattern, Py_ssize_t pattern_len,
2648            Py_ssize_t start,
2649            Py_ssize_t end,
2650            int direction)
2651 {
2652         if (start < 0) {
2653                 start += target_len;
2654                 if (start < 0)
2655                         start = 0;
2656         }
2657         if (end > target_len) {
2658                 end = target_len;
2659         } else if (end < 0) {
2660                 end += target_len;
2661                 if (end < 0)
2662                         end = 0;
2663         }
2664
2665         /* zero-length substrings always match at the first attempt */
2666         if (pattern_len == 0)
2667                 return (direction > 0) ? start : end;
2668
2669         end -= pattern_len;
2670
2671         if (direction < 0) {
2672                 for (; end >= start; end--)
2673                         if (Py_STRING_MATCH(target, end, pattern, pattern_len))
2674                                 return end;
2675         } else {
2676                 for (; start <= end; start++)
2677                         if (Py_STRING_MATCH(target, start, pattern, pattern_len))
2678                                 return start;
2679         }
2680         return -1;
2681 }
2682
2683 Py_LOCAL_INLINE(Py_ssize_t)
2684 countstring(const char *target, Py_ssize_t target_len,
2685             const char *pattern, Py_ssize_t pattern_len,
2686             Py_ssize_t start,
2687             Py_ssize_t end,
2688             int direction, Py_ssize_t maxcount)
2689 {
2690         Py_ssize_t count=0;
2691
2692         if (start < 0) {
2693                 start += target_len;
2694                 if (start < 0)
2695                         start = 0;
2696         }
2697         if (end > target_len) {
2698                 end = target_len;
2699         } else if (end < 0) {
2700                 end += target_len;
2701                 if (end < 0)
2702                         end = 0;
2703         }
2704
2705         /* zero-length substrings match everywhere */
2706         if (pattern_len == 0 || maxcount == 0) {
2707                 if (target_len+1 < maxcount)
2708                         return target_len+1;
2709                 return maxcount;
2710         }
2711
2712         end -= pattern_len;
2713         if (direction < 0) {
2714                 for (; (end >= start); end--)
2715                         if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
2716                                 count++;
2717                                 if (--maxcount <= 0) break;
2718                                 end -= pattern_len-1;
2719                         }
2720         } else {
2721                 for (; (start <= end); start++)
2722                         if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
2723                                 count++;
2724                                 if (--maxcount <= 0)
2725                                         break;
2726                                 start += pattern_len-1;
2727                         }
2728         }
2729         return count;
2730 }
2731
2732
2733 /* Algorithms for different cases of string replacement */
2734
2735 /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
2736 Py_LOCAL(PyStringObject *)
2737 replace_interleave(PyStringObject *self,
2738                    const char *to_s, Py_ssize_t to_len,
2739                    Py_ssize_t maxcount)
2740 {
2741         char *self_s, *result_s;
2742         Py_ssize_t self_len, result_len;
2743         Py_ssize_t count, i, product;
2744         PyStringObject *result;
2745
2746         self_len = PyString_GET_SIZE(self);
2747
2748         /* 1 at the end plus 1 after every character */
2749         count = self_len+1;
2750         if (maxcount < count)
2751                 count = maxcount;
2752
2753         /* Check for overflow */
2754         /*   result_len = count * to_len + self_len; */
2755         product = count * to_len;
2756         if (product / to_len != count) {
2757                 PyErr_SetString(PyExc_OverflowError,
2758                                 "replace string is too long");
2759                 return NULL;
2760         }
2761         result_len = product + self_len;
2762         if (result_len < 0) {
2763                 PyErr_SetString(PyExc_OverflowError,
2764                                 "replace string is too long");
2765                 return NULL;
2766         }
2767
2768         if (! (result = (PyStringObject *)
2769                          PyString_FromStringAndSize(NULL, result_len)) )
2770                 return NULL;
2771
2772         self_s = PyString_AS_STRING(self);
2773         result_s = PyString_AS_STRING(result);
2774
2775         /* TODO: special case single character, which doesn't need memcpy */
2776
2777         /* Lay the first one down (guaranteed this will occur) */
2778         Py_MEMCPY(result_s, to_s, to_len);
2779         result_s += to_len;
2780         count -= 1;
2781
2782         for (i=0; i<count; i++) {
2783                 *result_s++ = *self_s++;
2784                 Py_MEMCPY(result_s, to_s, to_len);
2785                 result_s += to_len;
2786         }
2787
2788         /* Copy the rest of the original string */
2789         Py_MEMCPY(result_s, self_s, self_len-i);
2790
2791         return result;
2792 }
2793
2794 /* Special case for deleting a single character */
2795 /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
2796 Py_LOCAL(PyStringObject *)
2797 replace_delete_single_character(PyStringObject *self,
2798                                 char from_c, Py_ssize_t maxcount)
2799 {
2800         char *self_s, *result_s;
2801         char *start, *next, *end;
2802         Py_ssize_t self_len, result_len;
2803         Py_ssize_t count;
2804         PyStringObject *result;
2805
2806         self_len = PyString_GET_SIZE(self);
2807         self_s = PyString_AS_STRING(self);
2808
2809         count = countchar(self_s, self_len, from_c, maxcount);
2810         if (count == 0) {
2811                 return return_self(self);
2812         }
2813
2814         result_len = self_len - count;  /* from_len == 1 */
2815         assert(result_len>=0);
2816
2817         if ( (result = (PyStringObject *)
2818                         PyString_FromStringAndSize(NULL, result_len)) == NULL)
2819                 return NULL;
2820         result_s = PyString_AS_STRING(result);
2821
2822         start = self_s;
2823         end = self_s + self_len;
2824         while (count-- > 0) {
2825                 next = findchar(start, end-start, from_c);
2826                 if (next == NULL)
2827                         break;
2828                 Py_MEMCPY(result_s, start, next-start);
2829                 result_s += (next-start);
2830                 start = next+1;
2831         }
2832         Py_MEMCPY(result_s, start, end-start);
2833
2834         return result;
2835 }
2836
2837 /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
2838
2839 Py_LOCAL(PyStringObject *)
2840 replace_delete_substring(PyStringObject *self,
2841                          const char *from_s, Py_ssize_t from_len,
2842                          Py_ssize_t maxcount) {
2843         char *self_s, *result_s;
2844         char *start, *next, *end;
2845         Py_ssize_t self_len, result_len;
2846         Py_ssize_t count, offset;
2847         PyStringObject *result;
2848
2849         self_len = PyString_GET_SIZE(self);
2850         self_s = PyString_AS_STRING(self);
2851
2852         count = countstring(self_s, self_len,
2853                             from_s, from_len,
2854                             0, self_len, 1,
2855                             maxcount);
2856
2857         if (count == 0) {
2858                 /* no matches */
2859                 return return_self(self);
2860         }
2861
2862         result_len = self_len - (count * from_len);
2863         assert (result_len>=0);
2864
2865         if ( (result = (PyStringObject *)
2866               PyString_FromStringAndSize(NULL, result_len)) == NULL )
2867                 return NULL;
2868
2869         result_s = PyString_AS_STRING(result);
2870
2871         start = self_s;
2872         end = self_s + self_len;
2873         while (count-- > 0) {
2874                 offset = findstring(start, end-start,
2875                                     from_s, from_len,
2876                                     0, end-start, FORWARD);
2877                 if (offset == -1)
2878                         break;
2879                 next = start + offset;
2880
2881                 Py_MEMCPY(result_s, start, next-start);
2882
2883                 result_s += (next-start);
2884                 start = next+from_len;
2885         }
2886         Py_MEMCPY(result_s, start, end-start);
2887         return result;
2888 }
2889
2890 /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
2891 Py_LOCAL(PyStringObject *)
2892 replace_single_character_in_place(PyStringObject *self,
2893                                   char from_c, char to_c,
2894                                   Py_ssize_t maxcount)
2895 {
2896         char *self_s, *result_s, *start, *end, *next;
2897         Py_ssize_t self_len;
2898         PyStringObject *result;
2899
2900         /* The result string will be the same size */
2901         self_s = PyString_AS_STRING(self);
2902         self_len = PyString_GET_SIZE(self);
2903
2904         next = findchar(self_s, self_len, from_c);
2905
2906         if (next == NULL) {
2907                 /* No matches; return the original string */
2908                 return return_self(self);
2909         }
2910
2911         /* Need to make a new string */
2912         result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2913         if (result == NULL)
2914                 return NULL;
2915         result_s = PyString_AS_STRING(result);
2916         Py_MEMCPY(result_s, self_s, self_len);
2917
2918         /* change everything in-place, starting with this one */
2919         start =  result_s + (next-self_s);
2920         *start = to_c;
2921         start++;
2922         end = result_s + self_len;
2923
2924         while (--maxcount > 0) {
2925                 next = findchar(start, end-start, from_c);
2926                 if (next == NULL)
2927                         break;
2928                 *next = to_c;
2929                 start = next+1;
2930         }
2931
2932         return result;
2933 }
2934
2935 /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
2936 Py_LOCAL(PyStringObject *)
2937 replace_substring_in_place(PyStringObject *self,
2938                            const char *from_s, Py_ssize_t from_len,
2939                            const char *to_s, Py_ssize_t to_len,
2940                            Py_ssize_t maxcount)
2941 {
2942         char *result_s, *start, *end;
2943         char *self_s;
2944         Py_ssize_t self_len, offset;
2945         PyStringObject *result;
2946
2947         /* The result string will be the same size */
2948
2949         self_s = PyString_AS_STRING(self);
2950         self_len = PyString_GET_SIZE(self);
2951
2952         offset = findstring(self_s, self_len,
2953                             from_s, from_len,
2954                             0, self_len, FORWARD);
2955         if (offset == -1) {
2956                 /* No matches; return the original string */
2957                 return return_self(self);
2958         }
2959
2960         /* Need to make a new string */
2961         result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2962         if (result == NULL)
2963                 return NULL;
2964         result_s = PyString_AS_STRING(result);
2965         Py_MEMCPY(result_s, self_s, self_len);
2966
2967         /* change everything in-place, starting with this one */
2968         start =  result_s + offset;
2969         Py_MEMCPY(start, to_s, from_len);
2970         start += from_len;
2971         end = result_s + self_len;
2972
2973         while ( --maxcount > 0) {
2974                 offset = findstring(start, end-start,
2975                                     from_s, from_len,
2976                                     0, end-start, FORWARD);
2977                 if (offset==-1)
2978                         break;
2979                 Py_MEMCPY(start+offset, to_s, from_len);
2980                 start += offset+from_len;
2981         }
2982
2983         return result;
2984 }
2985
2986 /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
2987 Py_LOCAL(PyStringObject *)
2988 replace_single_character(PyStringObject *self,
2989                          char from_c,
2990                          const char *to_s, Py_ssize_t to_len,
2991                          Py_ssize_t maxcount)
2992 {
2993         char *self_s, *result_s;
2994         char *start, *next, *end;
2995         Py_ssize_t self_len, result_len;
2996         Py_ssize_t count, product;
2997         PyStringObject *result;
2998
2999         self_s = PyString_AS_STRING(self);
3000         self_len = PyString_GET_SIZE(self);
3001
3002         count = countchar(self_s, self_len, from_c, maxcount);
3003         if (count == 0) {
3004                 /* no matches, return unchanged */
3005                 return return_self(self);
3006         }
3007
3008         /* use the difference between current and new, hence the "-1" */
3009         /*   result_len = self_len + count * (to_len-1)  */
3010         product = count * (to_len-1);
3011         if (product / (to_len-1) != count) {
3012                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3013                 return NULL;
3014         }
3015         result_len = self_len + product;
3016         if (result_len < 0) {
3017                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3018                 return NULL;
3019         }
3020
3021         if ( (result = (PyStringObject *)
3022               PyString_FromStringAndSize(NULL, result_len)) == NULL)
3023                 return NULL;
3024         result_s = PyString_AS_STRING(result);
3025
3026         start = self_s;
3027         end = self_s + self_len;
3028         while (count-- > 0) {
3029                 next = findchar(start, end-start, from_c);
3030                 if (next == NULL)
3031                         break;
3032
3033                 if (next == start) {
3034                         /* replace with the 'to' */
3035                         Py_MEMCPY(result_s, to_s, to_len);
3036                         result_s += to_len;
3037                         start += 1;
3038                 } else {
3039                         /* copy the unchanged old then the 'to' */
3040                         Py_MEMCPY(result_s, start, next-start);
3041                         result_s += (next-start);
3042                         Py_MEMCPY(result_s, to_s, to_len);
3043                         result_s += to_len;
3044                         start = next+1;
3045                 }
3046         }
3047         /* Copy the remainder of the remaining string */
3048         Py_MEMCPY(result_s, start, end-start);
3049
3050         return result;
3051 }
3052
3053 /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
3054 Py_LOCAL(PyStringObject *)
3055 replace_substring(PyStringObject *self,
3056                   const char *from_s, Py_ssize_t from_len,
3057                   const char *to_s, Py_ssize_t to_len,
3058                   Py_ssize_t maxcount) {
3059         char *self_s, *result_s;
3060         char *start, *next, *end;
3061         Py_ssize_t self_len, result_len;
3062         Py_ssize_t count, offset, product;
3063         PyStringObject *result;
3064
3065         self_s = PyString_AS_STRING(self);
3066         self_len = PyString_GET_SIZE(self);
3067
3068         count = countstring(self_s, self_len,
3069                             from_s, from_len,
3070                             0, self_len, FORWARD, maxcount);
3071         if (count == 0) {
3072                 /* no matches, return unchanged */
3073                 return return_self(self);
3074         }
3075
3076         /* Check for overflow */
3077         /*    result_len = self_len + count * (to_len-from_len) */
3078         product = count * (to_len-from_len);
3079         if (product / (to_len-from_len) != count) {
3080                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3081                 return NULL;
3082         }
3083         result_len = self_len + product;
3084         if (result_len < 0) {
3085                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3086                 return NULL;
3087         }
3088
3089         if ( (result = (PyStringObject *)
3090               PyString_FromStringAndSize(NULL, result_len)) == NULL)
3091                 return NULL;
3092         result_s = PyString_AS_STRING(result);
3093
3094         start = self_s;
3095         end = self_s + self_len;
3096         while (count-- > 0) {
3097                 offset = findstring(start, end-start,
3098                                     from_s, from_len,
3099                                     0, end-start, FORWARD);
3100                 if (offset == -1)
3101                         break;
3102                 next = start+offset;
3103                 if (next == start) {
3104                         /* replace with the 'to' */
3105                         Py_MEMCPY(result_s, to_s, to_len);
3106                         result_s += to_len;
3107                         start += from_len;
3108                 } else {
3109                         /* copy the unchanged old then the 'to' */
3110                         Py_MEMCPY(result_s, start, next-start);
3111                         result_s += (next-start);
3112                         Py_MEMCPY(result_s, to_s, to_len);
3113                         result_s += to_len;
3114                         start = next+from_len;
3115                 }
3116         }
3117         /* Copy the remainder of the remaining string */
3118         Py_MEMCPY(result_s, start, end-start);
3119
3120         return result;
3121 }
3122
3123
3124 Py_LOCAL(PyStringObject *)
3125 replace(PyStringObject *self,
3126         const char *from_s, Py_ssize_t from_len,
3127         const char *to_s, Py_ssize_t to_len,
3128         Py_ssize_t maxcount)
3129 {
3130         if (maxcount < 0) {
3131                 maxcount = PY_SSIZE_T_MAX;
3132         } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
3133                 /* nothing to do; return the original string */
3134                 return return_self(self);
3135         }
3136
3137         if (maxcount == 0 ||
3138             (from_len == 0 && to_len == 0)) {
3139                 /* nothing to do; return the original string */
3140                 return return_self(self);
3141         }
3142
3143         /* Handle zero-length special cases */
3144
3145         if (from_len == 0) {
3146                 /* insert the 'to' string everywhere.   */
3147                 /*    >>> "Python".replace("", ".")     */
3148                 /*    '.P.y.t.h.o.n.'                   */
3149                 return replace_interleave(self, to_s, to_len, maxcount);
3150         }
3151
3152         /* Except for "".replace("", "A") == "A" there is no way beyond this */
3153         /* point for an empty self string to generate a non-empty string */
3154         /* Special case so the remaining code always gets a non-empty string */
3155         if (PyString_GET_SIZE(self) == 0) {
3156                 return return_self(self);
3157         }
3158
3159         if (to_len == 0) {
3160                 /* delete all occurances of 'from' string */
3161                 if (from_len == 1) {
3162                         return replace_delete_single_character(
3163                                 self, from_s[0], maxcount);
3164                 } else {
3165                         return replace_delete_substring(self, from_s, from_len, maxcount);
3166                 }
3167         }
3168
3169         /* Handle special case where both strings have the same length */
3170
3171         if (from_len == to_len) {
3172                 if (from_len == 1) {
3173                         return replace_single_character_in_place(
3174                                 self,
3175                                 from_s[0],
3176                                 to_s[0],
3177                                 maxcount);
3178                 } else {
3179                         return replace_substring_in_place(
3180                                 self, from_s, from_len, to_s, to_len, maxcount);
3181                 }
3182         }
3183
3184         /* Otherwise use the more generic algorithms */
3185         if (from_len == 1) {
3186                 return replace_single_character(self, from_s[0],
3187                                                 to_s, to_len, maxcount);
3188         } else {
3189                 /* len('from')>=2, len('to')>=1 */
3190                 return replace_substring(self, from_s, from_len, to_s, to_len, maxcount);
3191         }
3192 }
3193
3194 PyDoc_STRVAR(replace__doc__,
3195 "S.replace (old, new[, count]) -> string\n\
3196 \n\
3197 Return a copy of string S with all occurrences of substring\n\
3198 old replaced by new.  If the optional argument count is\n\
3199 given, only the first count occurrences are replaced.");
3200
3201 static PyObject *
3202 string_replace(PyStringObject *self, PyObject *args)
3203 {
3204         Py_ssize_t count = -1;
3205         PyObject *from, *to;
3206         const char *from_s, *to_s;
3207         Py_ssize_t from_len, to_len;
3208
3209         if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
3210                 return NULL;
3211
3212         if (PyString_Check(from)) {
3213                 from_s = PyString_AS_STRING(from);
3214                 from_len = PyString_GET_SIZE(from);
3215         }
3216 #ifdef Py_USING_UNICODE
3217         if (PyUnicode_Check(from))
3218                 return PyUnicode_Replace((PyObject *)self,
3219                                          from, to, count);
3220 #endif
3221         else if (PyObject_AsCharBuffer(from, &from_s, &from_len))
3222                 return NULL;
3223
3224         if (PyString_Check(to)) {
3225                 to_s = PyString_AS_STRING(to);
3226                 to_len = PyString_GET_SIZE(to);
3227         }
3228 #ifdef Py_USING_UNICODE
3229         else if (PyUnicode_Check(to))
3230                 return PyUnicode_Replace((PyObject *)self,
3231                                          from, to, count);
3232 #endif
3233         else if (PyObject_AsCharBuffer(to, &to_s, &to_len))
3234                 return NULL;
3235
3236         return (PyObject *)replace((PyStringObject *) self,
3237                                    from_s, from_len,
3238                                    to_s, to_len, count);
3239 }
3240
3241 /** End DALKE **/
3242
3243 /* Matches the end (direction >= 0) or start (direction < 0) of self
3244  * against substr, using the start and end arguments. Returns
3245  * -1 on error, 0 if not found and 1 if found.
3246  */
3247 Py_LOCAL(int)
3248 _string_tailmatch(PyStringObject *self, PyObject *substr, Py_ssize_t start,
3249                   Py_ssize_t end, int direction)
3250 {
3251         Py_ssize_t len = PyString_GET_SIZE(self);
3252         Py_ssize_t slen;
3253         const char* sub;
3254         const char* str;
3255
3256         if (PyString_Check(substr)) {
3257                 sub = PyString_AS_STRING(substr);
3258                 slen = PyString_GET_SIZE(substr);
3259         }
3260 #ifdef Py_USING_UNICODE
3261         else if (PyUnicode_Check(substr))
3262                 return PyUnicode_Tailmatch((PyObject *)self,
3263                                            substr, start, end, direction);
3264 #endif
3265         else if (PyObject_AsCharBuffer(substr, &sub, &slen))
3266                 return -1;
3267         str = PyString_AS_STRING(self);
3268
3269         string_adjust_indices(&start, &end, len);
3270
3271         if (direction < 0) {
3272                 /* startswith */
3273                 if (start+slen > len)
3274                         return 0;
3275         } else {
3276                 /* endswith */
3277                 if (end-start < slen || start > len)
3278                         return 0;
3279
3280                 if (end-slen > start)
3281                         start = end - slen;
3282         }
3283         if (end-start >= slen)
3284                 return ! memcmp(str+start, sub, slen);
3285         return 0;
3286 }
3287
3288
3289 PyDoc_STRVAR(startswith__doc__,
3290 "S.startswith(prefix[, start[, end]]) -> bool\n\
3291 \n\
3292 Return True if S starts with the specified prefix, False otherwise.\n\
3293 With optional start, test S beginning at that position.\n\
3294 With optional end, stop comparing S at that position.\n\
3295 prefix can also be a tuple of strings to try.");
3296
3297 static PyObject *
3298 string_startswith(PyStringObject *self, PyObject *args)
3299 {
3300         Py_ssize_t start = 0;
3301         Py_ssize_t end = PY_SSIZE_T_MAX;
3302         PyObject *subobj;
3303         int result;
3304
3305         if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
3306                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3307                 return NULL;
3308         if (PyTuple_Check(subobj)) {
3309                 Py_ssize_t i;
3310                 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3311                         result = _string_tailmatch(self,
3312                                         PyTuple_GET_ITEM(subobj, i),
3313                                         start, end, -1);
3314                         if (result == -1)
3315                                 return NULL;
3316                         else if (result) {
3317                                 Py_RETURN_TRUE;
3318                         }
3319                 }
3320                 Py_RETURN_FALSE;
3321         }
3322         result = _string_tailmatch(self, subobj, start, end, -1);
3323         if (result == -1)
3324                 return NULL;
3325         else
3326                 return PyBool_FromLong(result);
3327 }
3328
3329
3330 PyDoc_STRVAR(endswith__doc__,
3331 "S.endswith(suffix[, start[, end]]) -> bool\n\
3332 \n\
3333 Return True if S ends with the specified suffix, False otherwise.\n\
3334 With optional start, test S beginning at that position.\n\
3335 With optional end, stop comparing S at that position.\n\
3336 suffix can also be a tuple of strings to try.");
3337
3338 static PyObject *
3339 string_endswith(PyStringObject *self, PyObject *args)
3340 {
3341         Py_ssize_t start = 0;
3342         Py_ssize_t end = PY_SSIZE_T_MAX;
3343         PyObject *subobj;
3344         int result;
3345
3346         if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
3347                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3348                 return NULL;
3349         if (PyTuple_Check(subobj)) {
3350                 Py_ssize_t i;
3351                 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3352                         result = _string_tailmatch(self,
3353                                         PyTuple_GET_ITEM(subobj, i),
3354                                         start, end, +1);
3355                         if (result == -1)
3356                                 return NULL;
3357                         else if (result) {
3358                                 Py_RETURN_TRUE;
3359                         }
3360                 }
3361                 Py_RETURN_FALSE;
3362         }
3363         result = _string_tailmatch(self, subobj, start, end, +1);
3364         if (result == -1)
3365                 return NULL;
3366         else
3367                 return PyBool_FromLong(result);
3368 }
3369
3370
3371 PyDoc_STRVAR(encode__doc__,
3372 "S.encode([encoding[,errors]]) -> object\n\
3373 \n\
3374 Encodes S using the codec registered for encoding. encoding defaults\n\
3375 to the default encoding. errors may be given to set a different error\n\
3376 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3377 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
3378 'xmlcharrefreplace' as well as any other name registered with\n\
3379 codecs.register_error that is able to handle UnicodeEncodeErrors.");
3380
3381 static PyObject *
3382 string_encode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3383 {
3384     static char *kwlist[] = {"encoding", "errors", 0};
3385     char *encoding = NULL;
3386     char *errors = NULL;
3387     PyObject *v;
3388
3389     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
3390                                      kwlist, &encoding, &errors))
3391         return NULL;
3392     v = PyString_AsEncodedObject((PyObject *)self, encoding, errors);
3393     if (v == NULL)
3394         goto onError;
3395     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3396         PyErr_Format(PyExc_TypeError,
3397                      "encoder did not return a string/unicode object "
3398                      "(type=%.400s)",
3399                      Py_TYPE(v)->tp_name);
3400         Py_DECREF(v);
3401         return NULL;
3402     }
3403     return v;
3404
3405  onError:
3406     return NULL;
3407 }
3408
3409
3410 PyDoc_STRVAR(decode__doc__,
3411 "S.decode([encoding[,errors]]) -> object\n\
3412 \n\
3413 Decodes S using the codec registered for encoding. encoding defaults\n\
3414 to the default encoding. errors may be given to set a different error\n\
3415 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3416 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
3417 as well as any other name registered with codecs.register_error that is\n\
3418 able to handle UnicodeDecodeErrors.");
3419
3420 static PyObject *
3421 string_decode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3422 {
3423     static char *kwlist[] = {"encoding", "errors", 0};
3424     char *encoding = NULL;
3425     char *errors = NULL;
3426     PyObject *v;
3427
3428     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
3429                                      kwlist, &encoding, &errors))
3430         return NULL;
3431     v = PyString_AsDecodedObject((PyObject *)self, encoding, errors);
3432     if (v == NULL)
3433         goto onError;
3434     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3435         PyErr_Format(PyExc_TypeError,
3436                      "decoder did not return a string/unicode object "
3437                      "(type=%.400s)",
3438                      Py_TYPE(v)->tp_name);
3439         Py_DECREF(v);
3440         return NULL;
3441     }
3442     return v;
3443
3444  onError:
3445     return NULL;
3446 }
3447
3448
3449 PyDoc_STRVAR(expandtabs__doc__,
3450 "S.expandtabs([tabsize]) -> string\n\
3451 \n\
3452 Return a copy of S where all tab characters are expanded using spaces.\n\
3453 If tabsize is not given, a tab size of 8 characters is assumed.");
3454
3455 static PyObject*
3456 string_expandtabs(PyStringObject *self, PyObject *args)
3457 {
3458     const char *e, *p, *qe;
3459     char *q;
3460     Py_ssize_t i, j, incr;
3461     PyObject *u;
3462     int tabsize = 8;
3463
3464     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3465         return NULL;
3466
3467     /* First pass: determine size of output string */
3468     i = 0; /* chars up to and including most recent \n or \r */
3469     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
3470     e = PyString_AS_STRING(self) + PyString_GET_SIZE(self); /* end of input */
3471     for (p = PyString_AS_STRING(self); p < e; p++)
3472         if (*p == '\t') {
3473             if (tabsize > 0) {
3474                 incr = tabsize - (j % tabsize);
3475                 if (j > PY_SSIZE_T_MAX - incr)
3476                     goto overflow1;
3477                 j += incr;
3478             }
3479         }
3480         else {
3481             if (j > PY_SSIZE_T_MAX - 1)
3482                 goto overflow1;
3483             j++;
3484             if (*p == '\n' || *p == '\r') {
3485                 if (i > PY_SSIZE_T_MAX - j)
3486                     goto overflow1;
3487                 i += j;
3488                 j = 0;
3489             }
3490         }
3491
3492     if (i > PY_SSIZE_T_MAX - j)
3493         goto overflow1;
3494
3495     /* Second pass: create output string and fill it */
3496     u = PyString_FromStringAndSize(NULL, i + j);
3497     if (!u)
3498         return NULL;
3499
3500     j = 0; /* same as in first pass */
3501     q = PyString_AS_STRING(u); /* next output char */
3502     qe = PyString_AS_STRING(u) + PyString_GET_SIZE(u); /* end of output */
3503
3504     for (p = PyString_AS_STRING(self); p < e; p++)
3505         if (*p == '\t') {
3506             if (tabsize > 0) {
3507                 i = tabsize - (j % tabsize);
3508                 j += i;
3509                 while (i--) {
3510                     if (q >= qe)
3511                         goto overflow2;
3512                     *q++ = ' ';
3513                 }
3514             }
3515         }
3516         else {
3517             if (q >= qe)
3518                 goto overflow2;
3519             *q++ = *p;
3520             j++;
3521             if (*p == '\n' || *p == '\r')
3522                 j = 0;
3523         }
3524
3525     return u;
3526
3527   overflow2:
3528     Py_DECREF(u);
3529   overflow1:
3530     PyErr_SetString(PyExc_OverflowError, "new string is too long");
3531     return NULL;
3532 }
3533
3534 Py_LOCAL_INLINE(PyObject *)
3535 pad(PyStringObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
3536 {
3537     PyObject *u;
3538
3539     if (left < 0)
3540         left = 0;
3541     if (right < 0)
3542         right = 0;
3543
3544     if (left == 0 && right == 0 && PyString_CheckExact(self)) {
3545         Py_INCREF(self);
3546         return (PyObject *)self;
3547     }
3548
3549     u = PyString_FromStringAndSize(NULL,
3550                                    left + PyString_GET_SIZE(self) + right);
3551     if (u) {
3552         if (left)
3553             memset(PyString_AS_STRING(u), fill, left);
3554         Py_MEMCPY(PyString_AS_STRING(u) + left,
3555                PyString_AS_STRING(self),
3556                PyString_GET_SIZE(self));
3557         if (right)
3558             memset(PyString_AS_STRING(u) + left + PyString_GET_SIZE(self),
3559                    fill, right);
3560     }
3561
3562     return u;
3563 }
3564
3565 PyDoc_STRVAR(ljust__doc__,
3566 "S.ljust(width[, fillchar]) -> string\n"
3567 "\n"
3568 "Return S left-justified in a string of length width. Padding is\n"
3569 "done using the specified fill character (default is a space).");
3570
3571 static PyObject *
3572 string_ljust(PyStringObject *self, PyObject *args)
3573 {
3574     Py_ssize_t width;
3575     char fillchar = ' ';
3576
3577     if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))
3578         return NULL;
3579
3580     if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3581         Py_INCREF(self);
3582         return (PyObject*) self;
3583     }
3584
3585     return pad(self, 0, width - PyString_GET_SIZE(self), fillchar);
3586 }
3587
3588
3589 PyDoc_STRVAR(rjust__doc__,
3590 "S.rjust(width[, fillchar]) -> string\n"
3591 "\n"
3592 "Return S right-justified in a string of length width. Padding is\n"
3593 "done using the specified fill character (default is a space)");
3594
3595 static PyObject *
3596 string_rjust(PyStringObject *self, PyObject *args)
3597 {
3598     Py_ssize_t width;
3599     char fillchar = ' ';
3600
3601     if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))
3602         return NULL;
3603
3604     if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3605         Py_INCREF(self);
3606         return (PyObject*) self;
3607     }
3608
3609     return pad(self, width - PyString_GET_SIZE(self), 0, fillchar);
3610 }
3611
3612
3613 PyDoc_STRVAR(center__doc__,
3614 "S.center(width[, fillchar]) -> string\n"
3615 "\n"
3616 "Return S centered in a string of length width. Padding is\n"
3617 "done using the specified fill character (default is a space)");
3618
3619 static PyObject *
3620 string_center(PyStringObject *self, PyObject *args)
3621 {
3622     Py_ssize_t marg, left;
3623     Py_ssize_t width;
3624     char fillchar = ' ';
3625
3626     if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))
3627         return NULL;
3628
3629     if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3630         Py_INCREF(self);
3631         return (PyObject*) self;
3632     }
3633
3634     marg = width - PyString_GET_SIZE(self);
3635     left = marg / 2 + (marg & width & 1);
3636
3637     return pad(self, left, marg - left, fillchar);
3638 }
3639
3640 PyDoc_STRVAR(zfill__doc__,
3641 "S.zfill(width) -> string\n"
3642 "\n"
3643 "Pad a numeric string S with zeros on the left, to fill a field\n"
3644 "of the specified width.  The string S is never truncated.");
3645
3646 static PyObject *
3647 string_zfill(PyStringObject *self, PyObject *args)
3648 {
3649     Py_ssize_t fill;
3650     PyObject *s;
3651     char *p;
3652     Py_ssize_t width;
3653
3654     if (!PyArg_ParseTuple(args, "n:zfill", &width))
3655         return NULL;
3656
3657     if (PyString_GET_SIZE(self) >= width) {
3658         if (PyString_CheckExact(self)) {
3659             Py_INCREF(self);
3660             return (PyObject*) self;
3661         }
3662         else
3663             return PyString_FromStringAndSize(
3664                 PyString_AS_STRING(self),
3665                 PyString_GET_SIZE(self)
3666             );
3667     }
3668
3669     fill = width - PyString_GET_SIZE(self);
3670
3671     s = pad(self, fill, 0, '0');
3672
3673     if (s == NULL)
3674         return NULL;
3675
3676     p = PyString_AS_STRING(s);
3677     if (p[fill] == '+' || p[fill] == '-') {
3678         /* move sign to beginning of string */
3679         p[0] = p[fill];
3680         p[fill] = '0';
3681     }
3682
3683     return (PyObject*) s;
3684 }
3685
3686 PyDoc_STRVAR(isspace__doc__,
3687 "S.isspace() -> bool\n\
3688 \n\
3689 Return True if all characters in S are whitespace\n\
3690 and there is at least one character in S, False otherwise.");
3691
3692 static PyObject*
3693 string_isspace(PyStringObject *self)
3694 {
3695     register const unsigned char *p
3696         = (unsigned char *) PyString_AS_STRING(self);
3697     register const unsigned char *e;
3698
3699     /* Shortcut for single character strings */
3700     if (PyString_GET_SIZE(self) == 1 &&
3701         isspace(*p))
3702         return PyBool_FromLong(1);
3703
3704     /* Special case for empty strings */
3705     if (PyString_GET_SIZE(self) == 0)
3706         return PyBool_FromLong(0);
3707
3708     e = p + PyString_GET_SIZE(self);
3709     for (; p < e; p++) {
3710         if (!isspace(*p))
3711             return PyBool_FromLong(0);
3712     }
3713     return PyBool_FromLong(1);
3714 }
3715
3716
3717 PyDoc_STRVAR(isalpha__doc__,
3718 "S.isalpha() -> bool\n\
3719 \n\
3720 Return True if all characters in S are alphabetic\n\
3721 and there is at least one character in S, False otherwise.");
3722
3723 static PyObject*
3724 string_isalpha(PyStringObject *self)
3725 {
3726     register const unsigned char *p
3727         = (unsigned char *) PyString_AS_STRING(self);
3728     register const unsigned char *e;
3729
3730     /* Shortcut for single character strings */
3731     if (PyString_GET_SIZE(self) == 1 &&
3732         isalpha(*p))
3733         return PyBool_FromLong(1);
3734
3735     /* Special case for empty strings */
3736     if (PyString_GET_SIZE(self) == 0)
3737         return PyBool_FromLong(0);
3738
3739     e = p + PyString_GET_SIZE(self);
3740     for (; p < e; p++) {
3741         if (!isalpha(*p))
3742             return PyBool_FromLong(0);
3743     }
3744     return PyBool_FromLong(1);
3745 }
3746
3747
3748 PyDoc_STRVAR(isalnum__doc__,
3749 "S.isalnum() -> bool\n\
3750 \n\
3751 Return True if all characters in S are alphanumeric\n\
3752 and there is at least one character in S, False otherwise.");
3753
3754 static PyObject*
3755 string_isalnum(PyStringObject *self)
3756 {
3757     register const unsigned char *p
3758         = (unsigned char *) PyString_AS_STRING(self);
3759     register const unsigned char *e;
3760
3761     /* Shortcut for single character strings */
3762     if (PyString_GET_SIZE(self) == 1 &&
3763         isalnum(*p))
3764         return PyBool_FromLong(1);
3765
3766     /* Special case for empty strings */
3767     if (PyString_GET_SIZE(self) == 0)
3768         return PyBool_FromLong(0);
3769
3770     e = p + PyString_GET_SIZE(self);
3771     for (; p < e; p++) {
3772         if (!isalnum(*p))
3773             return PyBool_FromLong(0);
3774     }
3775     return PyBool_FromLong(1);
3776 }
3777
3778
3779 PyDoc_STRVAR(isdigit__doc__,
3780 "S.isdigit() -> bool\n\
3781 \n\
3782 Return True if all characters in S are digits\n\
3783 and there is at least one character in S, False otherwise.");
3784
3785 static PyObject*
3786 string_isdigit(PyStringObject *self)
3787 {
3788     register const unsigned char *p
3789         = (unsigned char *) PyString_AS_STRING(self);
3790     register const unsigned char *e;
3791
3792     /* Shortcut for single character strings */
3793     if (PyString_GET_SIZE(self) == 1 &&
3794         isdigit(*p))
3795         return PyBool_FromLong(1);
3796
3797     /* Special case for empty strings */
3798     if (PyString_GET_SIZE(self) == 0)
3799         return PyBool_FromLong(0);
3800
3801     e = p + PyString_GET_SIZE(self);
3802     for (; p < e; p++) {
3803         if (!isdigit(*p))
3804             return PyBool_FromLong(0);
3805     }
3806     return PyBool_FromLong(1);
3807 }
3808
3809
3810 PyDoc_STRVAR(islower__doc__,
3811 "S.islower() -> bool\n\
3812 \n\
3813 Return True if all cased characters in S are lowercase and there is\n\
3814 at least one cased character in S, False otherwise.");
3815
3816 static PyObject*
3817 string_islower(PyStringObject *self)
3818 {
3819     register const unsigned char *p
3820         = (unsigned char *) PyString_AS_STRING(self);
3821     register const unsigned char *e;
3822     int cased;
3823
3824     /* Shortcut for single character strings */
3825     if (PyString_GET_SIZE(self) == 1)
3826         return PyBool_FromLong(islower(*p) != 0);
3827
3828     /* Special case for empty strings */
3829     if (PyString_GET_SIZE(self) == 0)
3830         return PyBool_FromLong(0);
3831
3832     e = p + PyString_GET_SIZE(self);
3833     cased = 0;
3834     for (; p < e; p++) {
3835         if (isupper(*p))
3836             return PyBool_FromLong(0);
3837         else if (!cased && islower(*p))
3838             cased = 1;
3839     }
3840     return PyBool_FromLong(cased);
3841 }
3842
3843
3844 PyDoc_STRVAR(isupper__doc__,
3845 "S.isupper() -> bool\n\
3846 \n\
3847 Return True if all cased characters in S are uppercase and there is\n\
3848 at least one cased character in S, False otherwise.");
3849
3850 static PyObject*
3851 string_isupper(PyStringObject *self)
3852 {
3853     register const unsigned char *p
3854         = (unsigned char *) PyString_AS_STRING(self);
3855     register const unsigned char *e;
3856     int cased;
3857
3858     /* Shortcut for single character strings */
3859     if (PyString_GET_SIZE(self) == 1)
3860         return PyBool_FromLong(isupper(*p) != 0);
3861
3862     /* Special case for empty strings */
3863     if (PyString_GET_SIZE(self) == 0)
3864         return PyBool_FromLong(0);
3865
3866     e = p + PyString_GET_SIZE(self);
3867     cased = 0;
3868     for (; p < e; p++) {
3869         if (islower(*p))
3870             return PyBool_FromLong(0);
3871         else if (!cased && isupper(*p))
3872             cased = 1;
3873     }
3874     return PyBool_FromLong(cased);
3875 }
3876
3877
3878 PyDoc_STRVAR(istitle__doc__,
3879 "S.istitle() -> bool\n\
3880 \n\
3881 Return True if S is a titlecased string and there is at least one\n\
3882 character in S, i.e. uppercase characters may only follow uncased\n\
3883 characters and lowercase characters only cased ones. Return False\n\
3884 otherwise.");
3885
3886 static PyObject*
3887 string_istitle(PyStringObject *self, PyObject *uncased)
3888 {
3889     register const unsigned char *p
3890         = (unsigned char *) PyString_AS_STRING(self);
3891     register const unsigned char *e;
3892     int cased, previous_is_cased;
3893
3894     /* Shortcut for single character strings */
3895     if (PyString_GET_SIZE(self) == 1)
3896         return PyBool_FromLong(isupper(*p) != 0);
3897
3898     /* Special case for empty strings */
3899     if (PyString_GET_SIZE(self) == 0)
3900         return PyBool_FromLong(0);
3901
3902     e = p + PyString_GET_SIZE(self);
3903     cased = 0;
3904     previous_is_cased = 0;
3905     for (; p < e; p++) {
3906         register const unsigned char ch = *p;
3907
3908         if (isupper(ch)) {
3909             if (previous_is_cased)
3910                 return PyBool_FromLong(0);
3911             previous_is_cased = 1;
3912             cased = 1;
3913         }
3914         else if (islower(ch)) {
3915             if (!previous_is_cased)
3916                 return PyBool_FromLong(0);
3917             previous_is_cased = 1;
3918             cased = 1;
3919         }
3920         else
3921             previous_is_cased = 0;
3922     }
3923     return PyBool_FromLong(cased);
3924 }
3925
3926
3927 PyDoc_STRVAR(splitlines__doc__,
3928 "S.splitlines([keepends]) -> list of strings\n\
3929 \n\
3930 Return a list of the lines in S, breaking at line boundaries.\n\
3931 Line breaks are not included in the resulting list unless keepends\n\
3932 is given and true.");
3933
3934 static PyObject*
3935 string_splitlines(PyStringObject *self, PyObject *args)
3936 {
3937     register Py_ssize_t i;
3938     register Py_ssize_t j;
3939     Py_ssize_t len;
3940     int keepends = 0;
3941     PyObject *list;
3942     PyObject *str;
3943     char *data;
3944
3945     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
3946         return NULL;
3947
3948     data = PyString_AS_STRING(self);
3949     len = PyString_GET_SIZE(self);
3950
3951     /* This does not use the preallocated list because splitlines is
3952        usually run with hundreds of newlines.  The overhead of
3953        switching between PyList_SET_ITEM and append causes about a
3954        2-3% slowdown for that common case.  A smarter implementation
3955        could move the if check out, so the SET_ITEMs are done first
3956        and the appends only done when the prealloc buffer is full.
3957        That's too much work for little gain.*/
3958
3959     list = PyList_New(0);
3960     if (!list)
3961         goto onError;
3962
3963     for (i = j = 0; i < len; ) {
3964         Py_ssize_t eol;
3965
3966         /* Find a line and append it */
3967         while (i < len && data[i] != '\n' && data[i] != '\r')
3968             i++;
3969
3970         /* Skip the line break reading CRLF as one line break */
3971         eol = i;
3972         if (i < len) {
3973             if (data[i] == '\r' && i + 1 < len &&
3974                 data[i+1] == '\n')
3975                 i += 2;
3976             else
3977                 i++;
3978             if (keepends)
3979                 eol = i;
3980         }
3981         SPLIT_APPEND(data, j, eol);
3982         j = i;
3983     }
3984     if (j < len) {
3985         SPLIT_APPEND(data, j, len);
3986     }
3987
3988     return list;
3989
3990  onError:
3991     Py_XDECREF(list);
3992     return NULL;
3993 }
3994
3995 PyDoc_STRVAR(sizeof__doc__,
3996 "S.__sizeof__() -> size of S in memory, in bytes");
3997
3998 static PyObject *
3999 string_sizeof(PyStringObject *v)
4000 {
4001         Py_ssize_t res;
4002         res = PyStringObject_SIZE + PyString_GET_SIZE(v) * Py_TYPE(v)->tp_itemsize;
4003         return PyInt_FromSsize_t(res);
4004 }
4005
4006 #undef SPLIT_APPEND
4007 #undef SPLIT_ADD
4008 #undef MAX_PREALLOC
4009 #undef PREALLOC_SIZE
4010
4011 static PyObject *
4012 string_getnewargs(PyStringObject *v)
4013 {
4014         return Py_BuildValue("(s#)", v->ob_sval, Py_SIZE(v));
4015 }
4016
4017
4018 #include "stringlib/string_format.h"
4019
4020 PyDoc_STRVAR(format__doc__,
4021 "S.format(*args, **kwargs) -> unicode\n\
4022 \n\
4023 ");
4024
4025 static PyObject *
4026 string__format__(PyObject* self, PyObject* args)
4027 {
4028     PyObject *format_spec;
4029     PyObject *result = NULL;
4030     PyObject *tmp = NULL;
4031
4032     /* If 2.x, convert format_spec to the same type as value */
4033     /* This is to allow things like u''.format('') */
4034     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
4035         goto done;
4036     if (!(PyString_Check(format_spec) || PyUnicode_Check(format_spec))) {
4037         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
4038                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
4039         goto done;
4040     }
4041     tmp = PyObject_Str(format_spec);
4042     if (tmp == NULL)
4043         goto done;
4044     format_spec = tmp;
4045
4046     result = _PyBytes_FormatAdvanced(self,
4047                                      PyString_AS_STRING(format_spec),
4048                                      PyString_GET_SIZE(format_spec));
4049 done:
4050     Py_XDECREF(tmp);
4051     return result;
4052 }
4053
4054 PyDoc_STRVAR(p_format__doc__,
4055 "S.__format__(format_spec) -> unicode\n\
4056 \n\
4057 ");
4058
4059
/* Method table for string objects.  Each entry gives the Python-visible
   method name, the C function implementing it, the calling-convention
   flags (METH_NOARGS, METH_O, METH_VARARGS, optionally with
   METH_KEYWORDS) and, where one is supplied, the docstring.  The table is
   terminated by a NULL-name sentinel entry.
   NOTE(review): presumably installed through the tp_methods slot of
   PyString_Type, which is defined further down -- confirm there. */
static PyMethodDef
string_methods[] = {
        /* Counterparts of the obsolete stropmodule functions; except
           string.maketrans(). */
        {"join", (PyCFunction)string_join, METH_O, join__doc__},
        {"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
        {"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
        {"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
        {"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
        {"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
        {"isupper", (PyCFunction)string_isupper, METH_NOARGS, isupper__doc__},
        {"isspace", (PyCFunction)string_isspace, METH_NOARGS, isspace__doc__},
        {"isdigit", (PyCFunction)string_isdigit, METH_NOARGS, isdigit__doc__},
        {"istitle", (PyCFunction)string_istitle, METH_NOARGS, istitle__doc__},
        {"isalpha", (PyCFunction)string_isalpha, METH_NOARGS, isalpha__doc__},
        {"isalnum", (PyCFunction)string_isalnum, METH_NOARGS, isalnum__doc__},
        {"capitalize", (PyCFunction)string_capitalize, METH_NOARGS,
         capitalize__doc__},
        {"count", (PyCFunction)string_count, METH_VARARGS, count__doc__},
        {"endswith", (PyCFunction)string_endswith, METH_VARARGS,
         endswith__doc__},
        {"partition", (PyCFunction)string_partition, METH_O, partition__doc__},
        {"find", (PyCFunction)string_find, METH_VARARGS, find__doc__},
        {"index", (PyCFunction)string_index, METH_VARARGS, index__doc__},
        {"lstrip", (PyCFunction)string_lstrip, METH_VARARGS, lstrip__doc__},
        {"replace", (PyCFunction)string_replace, METH_VARARGS, replace__doc__},
        {"rfind", (PyCFunction)string_rfind, METH_VARARGS, rfind__doc__},
        {"rindex", (PyCFunction)string_rindex, METH_VARARGS, rindex__doc__},
        {"rstrip", (PyCFunction)string_rstrip, METH_VARARGS, rstrip__doc__},
        {"rpartition", (PyCFunction)string_rpartition, METH_O,
         rpartition__doc__},
        {"startswith", (PyCFunction)string_startswith, METH_VARARGS,
         startswith__doc__},
        {"strip", (PyCFunction)string_strip, METH_VARARGS, strip__doc__},
        {"swapcase", (PyCFunction)string_swapcase, METH_NOARGS,
         swapcase__doc__},
        {"translate", (PyCFunction)string_translate, METH_VARARGS,
         translate__doc__},
        {"title", (PyCFunction)string_title, METH_NOARGS, title__doc__},
        {"ljust", (PyCFunction)string_ljust, METH_VARARGS, ljust__doc__},
        {"rjust", (PyCFunction)string_rjust, METH_VARARGS, rjust__doc__},
        {"center", (PyCFunction)string_center, METH_VARARGS, center__doc__},
        {"zfill", (PyCFunction)string_zfill, METH_VARARGS, zfill__doc__},
        /* Formatting support: format() and __format__ plus two underscore-
           prefixed helpers that are registered without a docstring. */
        {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
        {"__format__", (PyCFunction) string__format__, METH_VARARGS, p_format__doc__},
        {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
        {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
        {"encode", (PyCFunction)string_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
        {"decode", (PyCFunction)string_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
        {"expandtabs", (PyCFunction)string_expandtabs, METH_VARARGS,
         expandtabs__doc__},
        {"splitlines", (PyCFunction)string_splitlines, METH_VARARGS,
         splitlines__doc__},
        {"__sizeof__", (PyCFunction)string_sizeof, METH_NOARGS,
         sizeof__doc__},
        {"__getnewargs__",      (PyCFunction)string_getnewargs, METH_NOARGS},
        {NULL,     NULL}                     /* sentinel */
};
4118
4119 static PyObject *
4120 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
4121
4122 static PyObject *
4123 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4124 {
4125         PyObject *x = NULL;
4126         static char *kwlist[] = {"object", 0};
4127
4128         if (type != &PyString_Type)
4129                 return str_subtype_new(type, args, kwds);
4130         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str", kwlist, &x))
4131                 return NULL;
4132         if (x == NULL)
4133                 return PyString_FromString("");
4134         return PyObject_Str(x);
4135 }
4136
4137 static PyObject *
4138 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4139 {
4140         PyObject *tmp, *pnew;
4141         Py_ssize_t n;
4142
4143         assert(PyType_IsSubtype(type, &PyString_Type));
4144         tmp = string_new(&PyString_Type, args, kwds);
4145         if (tmp == NULL)
4146                 return NULL;
4147         assert(PyString_CheckExact(tmp));
4148         n = PyString_GET_SIZE(tmp);
4149         pnew = type->tp_alloc(type, n);
4150         if (pnew != NULL) {
4151                 Py_MEMCPY(PyString_AS_STRING(pnew), PyString_AS_STRING(tmp), n+1);
4152                 ((PyStringObject *)pnew)->ob_shash =
4153                         ((PyStringObject *)tmp)->ob_shash;
4154                 ((PyStringObject *)pnew)->ob_sstate = SSTATE_NOT_INTERNED;
4155         }
4156         Py_DECREF(tmp);
4157         return pnew;
4158 }
4159
4160 static PyObject *
4161 basestring_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4162 {
4163         PyErr_SetString(PyExc_TypeError,
4164                         "The basestring type cannot be instantiated");
4165         return NULL;
4166 }
4167
4168 static PyObject *
4169 string_mod(PyObject *v, PyObject *w)
4170 {
4171         if (!PyString_Check(v)) {
4172                 Py_INCREF(Py_NotImplemented);
4173                 return Py_NotImplemented;
4174         }
4175         return PyString_Format(v, w);
4176 }
4177
4178 PyDoc_STRVAR(basestring_doc,
4179 "Type basestring cannot be instantiated; it is the base for str and unicode.");
4180
/* Number-protocol table for str.  Only nb_remainder is filled in (with
   string_mod); the four slots before it are explicitly 0 and the slots
   after it are omitted from the initializer. */
static PyNumberMethods string_as_number = {
        0,                      /*nb_add*/
        0,                      /*nb_subtract*/
        0,                      /*nb_multiply*/
        0,                      /*nb_divide*/
        string_mod,             /*nb_remainder*/
};
4188
4189
/* Type object for `basestring`.  Every behaviour slot is left at 0 except
   tp_new, which is basestring_new (always raises TypeError).  The flags
   include Py_TPFLAGS_BASETYPE, the docstring is basestring_doc, and the
   base type is PyBaseObject_Type. */
PyTypeObject PyBaseString_Type = {
        PyVarObject_HEAD_INIT(&PyType_Type, 0)
        "basestring",                           /* tp_name */
        0,                                      /* tp_basicsize */
        0,                                      /* tp_itemsize */
        0,                                      /* tp_dealloc */
        0,                                      /* tp_print */
        0,                                      /* tp_getattr */
        0,                                      /* tp_setattr */
        0,                                      /* tp_compare */
        0,                                      /* tp_repr */
        0,                                      /* tp_as_number */
        0,                                      /* tp_as_sequence */
        0,                                      /* tp_as_mapping */
        0,                                      /* tp_hash */
        0,                                      /* tp_call */
        0,                                      /* tp_str */
        0,                                      /* tp_getattro */
        0,                                      /* tp_setattro */
        0,                                      /* tp_as_buffer */
        Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
        basestring_doc,                         /* tp_doc */
        0,                                      /* tp_traverse */
        0,                                      /* tp_clear */
        0,                                      /* tp_richcompare */
        0,                                      /* tp_weaklistoffset */
        0,                                      /* tp_iter */
        0,                                      /* tp_iternext */
        0,                                      /* tp_methods */
        0,                                      /* tp_members */
        0,                                      /* tp_getset */
        &PyBaseObject_Type,                     /* tp_base */
        0,                                      /* tp_dict */
        0,                                      /* tp_descr_get */
        0,                                      /* tp_descr_set */
        0,                                      /* tp_dictoffset */
        0,                                      /* tp_init */
        0,                                      /* tp_alloc */
        basestring_new,                         /* tp_new */
        0,                                      /* tp_free */
};
4231
/* Docstring installed in the tp_doc slot of PyString_Type below. */
4232 PyDoc_STRVAR(string_doc,
4233 "str(object) -> string\n\
4234 \n\
4235 Return a nice string representation of the object.\n\
4236 If the argument is a string, the return value is the same object.");
4237
/* Type object for "str".  Instances are variable-sized: the basic size is
   PyStringObject_SIZE and each item is one char.  The type derives from
   PyBaseString_Type, may itself be subclassed, and plugs in the number,
   sequence, mapping and buffer method tables defined in this file.
   Memory is released through string_dealloc / PyObject_Del. */
4238 PyTypeObject PyString_Type = {
4239         PyVarObject_HEAD_INIT(&PyType_Type, 0)
4240         "str",                                  /* tp_name */
4241         PyStringObject_SIZE,                    /* tp_basicsize */
4242         sizeof(char),                           /* tp_itemsize */
4243         string_dealloc,                         /* tp_dealloc */
4244         (printfunc)string_print,                /* tp_print */
4245         0,                                      /* tp_getattr */
4246         0,                                      /* tp_setattr */
4247         0,                                      /* tp_compare */
4248         string_repr,                            /* tp_repr */
4249         &string_as_number,                      /* tp_as_number */
4250         &string_as_sequence,                    /* tp_as_sequence */
4251         &string_as_mapping,                     /* tp_as_mapping */
4252         (hashfunc)string_hash,                  /* tp_hash */
4253         0,                                      /* tp_call */
4254         string_str,                             /* tp_str */
4255         PyObject_GenericGetAttr,                /* tp_getattro */
4256         0,                                      /* tp_setattro */
4257         &string_as_buffer,                      /* tp_as_buffer */
4258         Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
4259                 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_STRING_SUBCLASS |
4260                 Py_TPFLAGS_HAVE_NEWBUFFER,      /* tp_flags */
4261         string_doc,                             /* tp_doc */
4262         0,                                      /* tp_traverse */
4263         0,                                      /* tp_clear */
4264         (richcmpfunc)string_richcompare,        /* tp_richcompare */
4265         0,                                      /* tp_weaklistoffset */
4266         0,                                      /* tp_iter */
4267         0,                                      /* tp_iternext */
4268         string_methods,                         /* tp_methods */
4269         0,                                      /* tp_members */
4270         0,                                      /* tp_getset */
4271         &PyBaseString_Type,                     /* tp_base */
4272         0,                                      /* tp_dict */
4273         0,                                      /* tp_descr_get */
4274         0,                                      /* tp_descr_set */
4275         0,                                      /* tp_dictoffset */
4276         0,                                      /* tp_init */
4277         0,                                      /* tp_alloc */
4278         string_new,                             /* tp_new */
4279         PyObject_Del,                           /* tp_free */
4280 };
4281
4282 void
4283 PyString_Concat(register PyObject **pv, register PyObject *w)
4284 {
4285         register PyObject *v;
4286         if (*pv == NULL)
4287                 return;
4288         if (w == NULL || !PyString_Check(*pv)) {
4289                 Py_DECREF(*pv);
4290                 *pv = NULL;
4291                 return;
4292         }
4293         v = string_concat((PyStringObject *) *pv, w);
4294         Py_DECREF(*pv);
4295         *pv = v;
4296 }
4297
4298 void
4299 PyString_ConcatAndDel(register PyObject **pv, register PyObject *w)
4300 {
4301         PyString_Concat(pv, w);
4302         Py_XDECREF(w);
4303 }
4304
4305
4306 /* The following function breaks the notion that strings are immutable:
4307    it changes the size of a string.  We get away with this only if there
4308    is only one module referencing the object.  You can also think of it
4309    as creating a new string object and destroying the old one, only
4310    more efficiently.  In any case, don't use this if the string may
4311    already be known to some other part of the code...
4312    Note that if there's not enough memory to resize the string, the original
4313    string object at *pv is deallocated, *pv is set to NULL, an "out of
4314    memory" exception is set, and -1 is returned.  Else (on success) 0 is
4315    returned, and the value in *pv may or may not be the same as on input.
4316    As always, an extra byte is allocated for a trailing \0 byte (newsize
4317    does *not* include that), and a trailing \0 byte is stored.
4318 */
4319
4320 int
4321 _PyString_Resize(PyObject **pv, Py_ssize_t newsize)
4322 {
4323         register PyObject *v;
4324         register PyStringObject *sv;
4325         v = *pv;
4326         if (!PyString_Check(v) || Py_REFCNT(v) != 1 || newsize < 0 ||
4327             PyString_CHECK_INTERNED(v)) {
4328                 *pv = 0;
4329                 Py_DECREF(v);
4330                 PyErr_BadInternalCall();
4331                 return -1;
4332         }
4333         /* XXX UNREF/NEWREF interface should be more symmetrical */
4334         _Py_DEC_REFTOTAL;
4335         _Py_ForgetReference(v);
4336         *pv = (PyObject *)
4337                 PyObject_REALLOC((char *)v, PyStringObject_SIZE + newsize);
4338         if (*pv == NULL) {
4339                 PyObject_Del(v);
4340                 PyErr_NoMemory();
4341                 return -1;
4342         }
4343         _Py_NewReference(*pv);
4344         sv = (PyStringObject *) *pv;
4345         Py_SIZE(sv) = newsize;
4346         sv->ob_sval[newsize] = '\0';
4347         sv->ob_shash = -1;      /* invalidate cached hash value */
4348         return 0;
4349 }
4350
4351 /* Helpers for formatstring */
4352
4353 Py_LOCAL_INLINE(PyObject *)
4354 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
4355 {
4356         Py_ssize_t argidx = *p_argidx;
4357         if (argidx < arglen) {
4358                 (*p_argidx)++;
4359                 if (arglen < 0)
4360                         return args;
4361                 else
4362                         return PyTuple_GetItem(args, argidx);
4363         }
4364         PyErr_SetString(PyExc_TypeError,
4365                         "not enough arguments for format string");
4366         return NULL;
4367 }
4368
4369 /* Format codes (bit flags OR-ed into `flags`, one per flag character)
4370  * F_LJUST      '-'
4371  * F_SIGN       '+'
4372  * F_BLANK      ' '
4373  * F_ALT        '#'
4374  * F_ZERO       '0'
4375  */
4376 #define F_LJUST (1<<0)
4377 #define F_SIGN  (1<<1)
4378 #define F_BLANK (1<<2)
4379 #define F_ALT   (1<<3)
4380 #define F_ZERO  (1<<4)
4381
4382 /* Returns a new reference to a PyString object, or NULL on failure. */
4383
4384 static PyObject *
4385 formatfloat(PyObject *v, int flags, int prec, int type)
4386 {
4387         char *p;
4388         PyObject *result;
4389         double x;
4390
4391         x = PyFloat_AsDouble(v);
4392         if (x == -1.0 && PyErr_Occurred()) {
4393                 PyErr_Format(PyExc_TypeError, "float argument required, "
4394                              "not %.200s", Py_TYPE(v)->tp_name);
4395                 return NULL;
4396         }
4397
4398         if (prec < 0)
4399                 prec = 6;
4400
4401         p = PyOS_double_to_string(x, type, prec,
4402                                   (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
4403
4404         if (p == NULL)
4405                 return NULL;
4406         result = PyString_FromStringAndSize(p, strlen(p));
4407         PyMem_Free(p);
4408         return result;
4409 }
4410
4411 /* _PyString_FormatLong emulates the format codes d, u, o, x and X, and
4412  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
4413  * Python's regular ints.
4414  * Return value:  a new PyString*, or NULL if error.
4415  *  .  *pbuf is set to point into it,
4416  *     *plen set to the # of chars following that.
4417  *     Caller must decref it when done using pbuf.
4418  *     The string starting at *pbuf is of the form
4419  *         "-"? ("0x" | "0X")? digit+
4420  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
4421  *         set in flags.  The case of hex digits will be correct,
4422  *     There will be at least prec digits, zero-filled on the left if
4423  *         necessary to get that many.
4424  * val          object to be converted
4425  * flags        bitmask of format flags; only F_ALT is looked at
4426  * prec         minimum number of digits; 0-fill on left if needed
4427  * type         a character in [duoxX]; u acts the same as d
4428  *
4429  * CAUTION:  o, x and X conversions on regular ints can never
4430  * produce a '-' sign, but can for Python's unbounded ints.
4431  */
4432 PyObject*
4433 _PyString_FormatLong(PyObject *val, int flags, int prec, int type,
4434                      char **pbuf, int *plen)
4435 {
4436         PyObject *result = NULL;
4437         char *buf;
4438         Py_ssize_t i;
4439         int sign;       /* 1 if '-', else 0 */
4440         int len;        /* number of characters */
4441         Py_ssize_t llen;
4442         int numdigits;  /* len == numnondigits + numdigits */
4443         int numnondigits = 0;
4444
4445         switch (type) {
4446         case 'd':
4447         case 'u':
4448                 result = Py_TYPE(val)->tp_str(val);
4449                 break;
4450         case 'o':
4451                 result = Py_TYPE(val)->tp_as_number->nb_oct(val);
4452                 break;
4453         case 'x':
4454         case 'X':
4455                 numnondigits = 2;
4456                 result = Py_TYPE(val)->tp_as_number->nb_hex(val);
4457                 break;
4458         default:
4459                 assert(!"'type' not in [duoxX]");
4460         }
4461         if (!result)
4462                 return NULL;
4463
4464         buf = PyString_AsString(result);
4465         if (!buf) {
4466                 Py_DECREF(result);
4467                 return NULL;
4468         }
4469
4470         /* To modify the string in-place, there can only be one reference. */
4471         if (Py_REFCNT(result) != 1) {
4472                 PyErr_BadInternalCall();
4473                 return NULL;
4474         }
4475         llen = PyString_Size(result);
4476         if (llen > INT_MAX) {
4477                 PyErr_SetString(PyExc_ValueError, "string too large in _PyString_FormatLong");
4478                 return NULL;
4479         }
4480         len = (int)llen;
4481         if (buf[len-1] == 'L') {
4482                 --len;
4483                 buf[len] = '\0';
4484         }
4485         sign = buf[0] == '-';
4486         numnondigits += sign;
4487         numdigits = len - numnondigits;
4488         assert(numdigits > 0);
4489
4490         /* Get rid of base marker unless F_ALT */
4491         if ((flags & F_ALT) == 0) {
4492                 /* Need to skip 0x, 0X or 0. */
4493                 int skipped = 0;
4494                 switch (type) {
4495                 case 'o':
4496                         assert(buf[sign] == '0');
4497                         /* If 0 is only digit, leave it alone. */
4498                         if (numdigits > 1) {
4499                                 skipped = 1;
4500                                 --numdigits;
4501                         }
4502                         break;
4503                 case 'x':
4504                 case 'X':
4505                         assert(buf[sign] == '0');
4506                         assert(buf[sign + 1] == 'x');
4507                         skipped = 2;
4508                         numnondigits -= 2;
4509                         break;
4510                 }
4511                 if (skipped) {
4512                         buf += skipped;
4513                         len -= skipped;
4514                         if (sign)
4515                                 buf[0] = '-';
4516                 }
4517                 assert(len == numnondigits + numdigits);
4518                 assert(numdigits > 0);
4519         }
4520
4521         /* Fill with leading zeroes to meet minimum width. */
4522         if (prec > numdigits) {
4523                 PyObject *r1 = PyString_FromStringAndSize(NULL,
4524                                         numnondigits + prec);
4525                 char *b1;
4526                 if (!r1) {
4527                         Py_DECREF(result);
4528                         return NULL;
4529                 }
4530                 b1 = PyString_AS_STRING(r1);
4531                 for (i = 0; i < numnondigits; ++i)
4532                         *b1++ = *buf++;
4533                 for (i = 0; i < prec - numdigits; i++)
4534                         *b1++ = '0';
4535                 for (i = 0; i < numdigits; i++)
4536                         *b1++ = *buf++;
4537                 *b1 = '\0';
4538                 Py_DECREF(result);
4539                 result = r1;
4540                 buf = PyString_AS_STRING(result);
4541                 len = numnondigits + prec;
4542         }
4543
4544         /* Fix up case for hex conversions. */
4545         if (type == 'X') {
4546                 /* Need to convert all lower case letters to upper case.
4547                    and need to convert 0x to 0X (and -0x to -0X). */
4548                 for (i = 0; i < len; i++)
4549                         if (buf[i] >= 'a' && buf[i] <= 'x')
4550                                 buf[i] -= 'a'-'A';
4551         }
4552         *pbuf = buf;
4553         *plen = len;
4554         return result;
4555 }
4556
4557 Py_LOCAL_INLINE(int)
4558 formatint(char *buf, size_t buflen, int flags,
4559           int prec, int type, PyObject *v)
4560 {
4561         /* fmt = '%#.' + `prec` + 'l' + `type`
4562            worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4563            + 1 + 1 = 24 */
4564         char fmt[64];   /* plenty big enough! */
4565         char *sign;
4566         long x;
4567
4568         x = PyInt_AsLong(v);
4569         if (x == -1 && PyErr_Occurred()) {
4570                 PyErr_Format(PyExc_TypeError, "int argument required, not %.200s",
4571                              Py_TYPE(v)->tp_name);
4572                 return -1;
4573         }
4574         if (x < 0 && type == 'u') {
4575                 type = 'd';
4576         }
4577         if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
4578                 sign = "-";
4579         else
4580                 sign = "";
4581         if (prec < 0)
4582                 prec = 1;
4583
4584         if ((flags & F_ALT) &&
4585             (type == 'x' || type == 'X')) {
4586                 /* When converting under %#x or %#X, there are a number
4587                  * of issues that cause pain:
4588                  * - when 0 is being converted, the C standard leaves off
4589                  *   the '0x' or '0X', which is inconsistent with other
4590                  *   %#x/%#X conversions and inconsistent with Python's
4591                  *   hex() function
4592                  * - there are platforms that violate the standard and
4593                  *   convert 0 with the '0x' or '0X'
4594                  *   (Metrowerks, Compaq Tru64)
4595                  * - there are platforms that give '0x' when converting
4596                  *   under %#X, but convert 0 in accordance with the
4597                  *   standard (OS/2 EMX)
4598                  *
4599                  * We can achieve the desired consistency by inserting our
4600                  * own '0x' or '0X' prefix, and substituting %x/%X in place
4601                  * of %#x/%#X.
4602                  *
4603                  * Note that this is the same approach as used in
4604                  * formatint() in unicodeobject.c
4605                  */
4606                 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
4607                               sign, type, prec, type);
4608         }
4609         else {
4610                 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
4611                               sign, (flags&F_ALT) ? "#" : "",
4612                               prec, type);
4613         }
4614
4615         /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
4616          * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
4617          */
4618         if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
4619                 PyErr_SetString(PyExc_OverflowError,
4620                     "formatted integer is too long (precision too large?)");
4621                 return -1;
4622         }
4623         if (sign[0])
4624                 PyOS_snprintf(buf, buflen, fmt, -x);
4625         else
4626                 PyOS_snprintf(buf, buflen, fmt, x);
4627         return (int)strlen(buf);
4628 }
4629
4630 Py_LOCAL_INLINE(int)
4631 formatchar(char *buf, size_t buflen, PyObject *v)
4632 {
4633         /* presume that the buffer is at least 2 characters long */
4634         if (PyString_Check(v)) {
4635                 if (!PyArg_Parse(v, "c;%c requires int or char", &buf[0]))
4636                         return -1;
4637         }
4638         else {
4639                 if (!PyArg_Parse(v, "b;%c requires int or char", &buf[0]))
4640                         return -1;
4641         }
4642         buf[1] = '\0';
4643         return 1;
4644 }
4645
4646 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4647
4648    FORMATBUFLEN is the length of the buffer in which the ints &
4649    chars are formatted. XXX This is a magic number. formatint()
4650    bounds-checks against it and formatchar() needs only 2 bytes, but a
4651    better solution may be to malloc a buffer of appropriate size for
4652    each format. For now, the current solution is sufficient.
4653 */
4654 #define FORMATBUFLEN (size_t)120
4655
4656 PyObject *
4657 PyString_Format(PyObject *format, PyObject *args)
4658 {
4659         char *fmt, *res;
4660         Py_ssize_t arglen, argidx;
4661         Py_ssize_t reslen, rescnt, fmtcnt;
4662         int args_owned = 0;
4663         PyObject *result, *orig_args;
4664 #ifdef Py_USING_UNICODE
4665         PyObject *v, *w;
4666 #endif
4667         PyObject *dict = NULL;
4668         if (format == NULL || !PyString_Check(format) || args == NULL) {
4669                 PyErr_BadInternalCall();
4670                 return NULL;
4671         }
4672         orig_args = args;
4673         fmt = PyString_AS_STRING(format);
4674         fmtcnt = PyString_GET_SIZE(format);
4675         reslen = rescnt = fmtcnt + 100;
4676         result = PyString_FromStringAndSize((char *)NULL, reslen);
4677         if (result == NULL)
4678                 return NULL;
4679         res = PyString_AsString(result);
4680         if (PyTuple_Check(args)) {
4681                 arglen = PyTuple_GET_SIZE(args);
4682                 argidx = 0;
4683         }
4684         else {
4685                 arglen = -1;
4686                 argidx = -2;
4687         }
4688         if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
4689             !PyObject_TypeCheck(args, &PyBaseString_Type))
4690                 dict = args;
4691         while (--fmtcnt >= 0) {
4692                 if (*fmt != '%') {
4693                         if (--rescnt < 0) {
4694                                 rescnt = fmtcnt + 100;
4695                                 reslen += rescnt;
4696                                 if (_PyString_Resize(&result, reslen) < 0)
4697                                         return NULL;
4698                                 res = PyString_AS_STRING(result)
4699                                         + reslen - rescnt;
4700                                 --rescnt;
4701                         }
4702                         *res++ = *fmt++;
4703                 }
4704                 else {
4705                         /* Got a format specifier */
4706                         int flags = 0;
4707                         Py_ssize_t width = -1;
4708                         int prec = -1;
4709                         int c = '\0';
4710                         int fill;
4711                         int isnumok;
4712                         PyObject *v = NULL;
4713                         PyObject *temp = NULL;
4714                         char *pbuf;
4715                         int sign;
4716                         Py_ssize_t len;
4717                         char formatbuf[FORMATBUFLEN];
4718                              /* For format{int,char}() */
4719 #ifdef Py_USING_UNICODE
4720                         char *fmt_start = fmt;
4721                         Py_ssize_t argidx_start = argidx;
4722 #endif
4723
4724                         fmt++;
4725                         if (*fmt == '(') {
4726                                 char *keystart;
4727                                 Py_ssize_t keylen;
4728                                 PyObject *key;
4729                                 int pcount = 1;
4730
4731                                 if (dict == NULL) {
4732                                         PyErr_SetString(PyExc_TypeError,
4733                                                  "format requires a mapping");
4734                                         goto error;
4735                                 }
4736                                 ++fmt;
4737                                 --fmtcnt;
4738                                 keystart = fmt;
4739                                 /* Skip over balanced parentheses */
4740                                 while (pcount > 0 && --fmtcnt >= 0) {
4741                                         if (*fmt == ')')
4742                                                 --pcount;
4743                                         else if (*fmt == '(')
4744                                                 ++pcount;
4745                                         fmt++;
4746                                 }
4747                                 keylen = fmt - keystart - 1;
4748                                 if (fmtcnt < 0 || pcount > 0) {
4749                                         PyErr_SetString(PyExc_ValueError,
4750                                                    "incomplete format key");
4751                                         goto error;
4752                                 }
4753                                 key = PyString_FromStringAndSize(keystart,
4754                                                                  keylen);
4755                                 if (key == NULL)
4756                                         goto error;
4757                                 if (args_owned) {
4758                                         Py_DECREF(args);
4759                                         args_owned = 0;
4760                                 }
4761                                 args = PyObject_GetItem(dict, key);
4762                                 Py_DECREF(key);
4763                                 if (args == NULL) {
4764                                         goto error;
4765                                 }
4766                                 args_owned = 1;
4767                                 arglen = -1;
4768                                 argidx = -2;
4769                         }
4770                         while (--fmtcnt >= 0) {
4771                                 switch (c = *fmt++) {
4772                                 case '-': flags |= F_LJUST; continue;
4773                                 case '+': flags |= F_SIGN; continue;
4774                                 case ' ': flags |= F_BLANK; continue;
4775                                 case '#': flags |= F_ALT; continue;
4776                                 case '0': flags |= F_ZERO; continue;
4777                                 }
4778                                 break;
4779                         }
4780                         if (c == '*') {
4781                                 v = getnextarg(args, arglen, &argidx);
4782                                 if (v == NULL)
4783                                         goto error;
4784                                 if (!PyInt_Check(v)) {
4785                                         PyErr_SetString(PyExc_TypeError,
4786                                                         "* wants int");
4787                                         goto error;
4788                                 }
4789                                 width = PyInt_AsLong(v);
4790                                 if (width < 0) {
4791                                         flags |= F_LJUST;
4792                                         width = -width;
4793                                 }
4794                                 if (--fmtcnt >= 0)
4795                                         c = *fmt++;
4796                         }
4797                         else if (c >= 0 && isdigit(c)) {
4798                                 width = c - '0';
4799                                 while (--fmtcnt >= 0) {
4800                                         c = Py_CHARMASK(*fmt++);
4801                                         if (!isdigit(c))
4802                                                 break;
4803                                         if ((width*10) / 10 != width) {
4804                                                 PyErr_SetString(
4805                                                         PyExc_ValueError,
4806                                                         "width too big");
4807                                                 goto error;
4808                                         }
4809                                         width = width*10 + (c - '0');
4810                                 }
4811                         }
4812                         if (c == '.') {
4813                                 prec = 0;
4814                                 if (--fmtcnt >= 0)
4815                                         c = *fmt++;
4816                                 if (c == '*') {
4817                                         v = getnextarg(args, arglen, &argidx);
4818                                         if (v == NULL)
4819                                                 goto error;
4820                                         if (!PyInt_Check(v)) {
4821                                                 PyErr_SetString(
4822                                                         PyExc_TypeError,
4823                                                         "* wants int");
4824                                                 goto error;
4825                                         }
4826                                         prec = PyInt_AsLong(v);
4827                                         if (prec < 0)
4828                                                 prec = 0;
4829                                         if (--fmtcnt >= 0)
4830                                                 c = *fmt++;
4831                                 }
4832                                 else if (c >= 0 && isdigit(c)) {
4833                                         prec = c - '0';
4834                                         while (--fmtcnt >= 0) {
4835                                                 c = Py_CHARMASK(*fmt++);
4836                                                 if (!isdigit(c))
4837                                                         break;
4838                                                 if ((prec*10) / 10 != prec) {
4839                                                         PyErr_SetString(
4840                                                             PyExc_ValueError,
4841                                                             "prec too big");
4842                                                         goto error;
4843                                                 }
4844                                                 prec = prec*10 + (c - '0');
4845                                         }
4846                                 }
4847                         } /* prec */
4848                         if (fmtcnt >= 0) {
4849                                 if (c == 'h' || c == 'l' || c == 'L') {
4850                                         if (--fmtcnt >= 0)
4851                                                 c = *fmt++;
4852                                 }
4853                         }
4854                         if (fmtcnt < 0) {
4855                                 PyErr_SetString(PyExc_ValueError,
4856                                                 "incomplete format");
4857                                 goto error;
4858                         }
4859                         if (c != '%') {
4860                                 v = getnextarg(args, arglen, &argidx);
4861                                 if (v == NULL)
4862                                         goto error;
4863                         }
4864                         sign = 0;
4865                         fill = ' ';
4866                         switch (c) {
4867                         case '%':
4868                                 pbuf = "%";
4869                                 len = 1;
4870                                 break;
4871                         case 's':
4872 #ifdef Py_USING_UNICODE
4873                                 if (PyUnicode_Check(v)) {
4874                                         fmt = fmt_start;
4875                                         argidx = argidx_start;
4876                                         goto unicode;
4877                                 }
4878 #endif
4879                                 temp = _PyObject_Str(v);
4880 #ifdef Py_USING_UNICODE
4881                                 if (temp != NULL && PyUnicode_Check(temp)) {
4882                                         Py_DECREF(temp);
4883                                         fmt = fmt_start;
4884                                         argidx = argidx_start;
4885                                         goto unicode;
4886                                 }
4887 #endif
4888                                 /* Fall through */
4889                         case 'r':
4890                                 if (c == 'r')
4891                                         temp = PyObject_Repr(v);
4892                                 if (temp == NULL)
4893                                         goto error;
4894                                 if (!PyString_Check(temp)) {
4895                                         PyErr_SetString(PyExc_TypeError,
4896                                           "%s argument has non-string str()");
4897                                         Py_DECREF(temp);
4898                                         goto error;
4899                                 }
4900                                 pbuf = PyString_AS_STRING(temp);
4901                                 len = PyString_GET_SIZE(temp);
4902                                 if (prec >= 0 && len > prec)
4903                                         len = prec;
4904                                 break;
4905                         case 'i':
4906                         case 'd':
4907                         case 'u':
4908                         case 'o':
4909                         case 'x':
4910                         case 'X':
4911                                 if (c == 'i')
4912                                         c = 'd';
4913                                 isnumok = 0;
4914                                 if (PyNumber_Check(v)) {
4915                                         PyObject *iobj=NULL;
4916
4917                                         if (PyInt_Check(v) || (PyLong_Check(v))) {
4918                                                 iobj = v;
4919                                                 Py_INCREF(iobj);
4920                                         }
4921                                         else {
4922                                                 iobj = PyNumber_Int(v);
4923                                                 if (iobj==NULL) iobj = PyNumber_Long(v);
4924                                         }
4925                                         if (iobj!=NULL) {
4926                                                 if (PyInt_Check(iobj)) {
4927                                                         isnumok = 1;
4928                                                         pbuf = formatbuf;
4929                                                         len = formatint(pbuf,
4930                                                                         sizeof(formatbuf),
4931                                                                         flags, prec, c, iobj);
4932                                                         Py_DECREF(iobj);
4933                                                         if (len < 0)
4934                                                                 goto error;
4935                                                         sign = 1;
4936                                                 }
4937                                                 else if (PyLong_Check(iobj)) {
4938                                                         int ilen;
4939
4940                                                         isnumok = 1;
4941                                                         temp = _PyString_FormatLong(iobj, flags,
4942                                                                 prec, c, &pbuf, &ilen);
4943                                                         Py_DECREF(iobj);
4944                                                         len = ilen;
4945                                                         if (!temp)
4946                                                                 goto error;
4947                                                         sign = 1;
4948                                                 }
4949                                                 else {
4950                                                         Py_DECREF(iobj);
4951                                                 }
4952                                         }
4953                                 }
4954                                 if (!isnumok) {
4955                                         PyErr_Format(PyExc_TypeError,
4956                                             "%%%c format: a number is required, "
4957                                             "not %.200s", c, Py_TYPE(v)->tp_name);
4958                                         goto error;
4959                                 }
4960                                 if (flags & F_ZERO)
4961                                         fill = '0';
4962                                 break;
4963                         case 'e':
4964                         case 'E':
4965                         case 'f':
4966                         case 'F':
4967                         case 'g':
4968                         case 'G':
4969                                 temp = formatfloat(v, flags, prec, c);
4970                                 if (temp == NULL)
4971                                         goto error;
4972                                 pbuf = PyString_AS_STRING(temp);
4973                                 len = PyString_GET_SIZE(temp);
4974                                 sign = 1;
4975                                 if (flags & F_ZERO)
4976                                         fill = '0';
4977                                 break;
4978                         case 'c':
4979 #ifdef Py_USING_UNICODE
4980                                 if (PyUnicode_Check(v)) {
4981                                         fmt = fmt_start;
4982                                         argidx = argidx_start;
4983                                         goto unicode;
4984                                 }
4985 #endif
4986                                 pbuf = formatbuf;
4987                                 len = formatchar(pbuf, sizeof(formatbuf), v);
4988                                 if (len < 0)
4989                                         goto error;
4990                                 break;
4991                         default:
4992                                 PyErr_Format(PyExc_ValueError,
4993                                   "unsupported format character '%c' (0x%x) "
4994                                   "at index %zd",
4995                                   c, c,
4996                                   (Py_ssize_t)(fmt - 1 -
4997                                                PyString_AsString(format)));
4998                                 goto error;
4999                         }
5000                         if (sign) {
5001                                 if (*pbuf == '-' || *pbuf == '+') {
5002                                         sign = *pbuf++;
5003                                         len--;
5004                                 }
5005                                 else if (flags & F_SIGN)
5006                                         sign = '+';
5007                                 else if (flags & F_BLANK)
5008                                         sign = ' ';
5009                                 else
5010                                         sign = 0;
5011                         }
5012                         if (width < len)
5013                                 width = len;
5014                         if (rescnt - (sign != 0) < width) {
5015                                 reslen -= rescnt;
5016                                 rescnt = width + fmtcnt + 100;
5017                                 reslen += rescnt;
5018                                 if (reslen < 0) {
5019                                         Py_DECREF(result);
5020                                         Py_XDECREF(temp);
5021                                         return PyErr_NoMemory();
5022                                 }
5023                                 if (_PyString_Resize(&result, reslen) < 0) {
5024                                         Py_XDECREF(temp);
5025                                         return NULL;
5026                                 }
5027                                 res = PyString_AS_STRING(result)
5028                                         + reslen - rescnt;
5029                         }
5030                         if (sign) {
5031                                 if (fill != ' ')
5032                                         *res++ = sign;
5033                                 rescnt--;
5034                                 if (width > len)
5035                                         width--;
5036                         }
5037                         if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5038                                 assert(pbuf[0] == '0');
5039                                 assert(pbuf[1] == c);
5040                                 if (fill != ' ') {
5041                                         *res++ = *pbuf++;
5042                                         *res++ = *pbuf++;
5043                                 }
5044                                 rescnt -= 2;
5045                                 width -= 2;
5046                                 if (width < 0)
5047                                         width = 0;
5048                                 len -= 2;
5049                         }
5050                         if (width > len && !(flags & F_LJUST)) {
5051                                 do {
5052                                         --rescnt;
5053                                         *res++ = fill;
5054                                 } while (--width > len);
5055                         }
5056                         if (fill == ' ') {
5057                                 if (sign)
5058                                         *res++ = sign;
5059                                 if ((flags & F_ALT) &&
5060                                     (c == 'x' || c == 'X')) {
5061                                         assert(pbuf[0] == '0');
5062                                         assert(pbuf[1] == c);
5063                                         *res++ = *pbuf++;
5064                                         *res++ = *pbuf++;
5065                                 }
5066                         }
5067                         Py_MEMCPY(res, pbuf, len);
5068                         res += len;
5069                         rescnt -= len;
5070                         while (--width >= len) {
5071                                 --rescnt;
5072                                 *res++ = ' ';
5073                         }
5074                         if (dict && (argidx < arglen) && c != '%') {
5075                                 PyErr_SetString(PyExc_TypeError,
5076                                            "not all arguments converted during string formatting");
5077                                 Py_XDECREF(temp);
5078                                 goto error;
5079                         }
5080                         Py_XDECREF(temp);
5081                 } /* '%' */
5082         } /* until end */
5083         if (argidx < arglen && !dict) {
5084                 PyErr_SetString(PyExc_TypeError,
5085                                 "not all arguments converted during string formatting");
5086                 goto error;
5087         }
5088         if (args_owned) {
5089                 Py_DECREF(args);
5090         }
5091         _PyString_Resize(&result, reslen - rescnt);
5092         return result;
5093
5094 #ifdef Py_USING_UNICODE
5095  unicode:
5096         if (args_owned) {
5097                 Py_DECREF(args);
5098                 args_owned = 0;
5099         }
5100         /* Fiddle args right (remove the first argidx arguments) */
5101         if (PyTuple_Check(orig_args) && argidx > 0) {
5102                 PyObject *v;
5103                 Py_ssize_t n = PyTuple_GET_SIZE(orig_args) - argidx;
5104                 v = PyTuple_New(n);
5105                 if (v == NULL)
5106                         goto error;
5107                 while (--n >= 0) {
5108                         PyObject *w = PyTuple_GET_ITEM(orig_args, n + argidx);
5109                         Py_INCREF(w);
5110                         PyTuple_SET_ITEM(v, n, w);
5111                 }
5112                 args = v;
5113         } else {
5114                 Py_INCREF(orig_args);
5115                 args = orig_args;
5116         }
5117         args_owned = 1;
5118         /* Take what we have of the result and let the Unicode formatting
5119            function format the rest of the input. */
5120         rescnt = res - PyString_AS_STRING(result);
5121         if (_PyString_Resize(&result, rescnt))
5122                 goto error;
5123         fmtcnt = PyString_GET_SIZE(format) - \
5124                  (fmt - PyString_AS_STRING(format));
5125         format = PyUnicode_Decode(fmt, fmtcnt, NULL, NULL);
5126         if (format == NULL)
5127                 goto error;
5128         v = PyUnicode_Format(format, args);
5129         Py_DECREF(format);
5130         if (v == NULL)
5131                 goto error;
5132         /* Paste what we have (result) to what the Unicode formatting
5133            function returned (v) and return the result (or error) */
5134         w = PyUnicode_Concat(result, v);
5135         Py_DECREF(result);
5136         Py_DECREF(v);
5137         Py_DECREF(args);
5138         return w;
5139 #endif /* Py_USING_UNICODE */
5140
5141  error:
5142         Py_DECREF(result);
5143         if (args_owned) {
5144                 Py_DECREF(args);
5145         }
5146         return NULL;
5147 }
5148
5149 void
5150 PyString_InternInPlace(PyObject **p)
5151 {
5152         register PyStringObject *s = (PyStringObject *)(*p);
5153         PyObject *t;
5154         if (s == NULL || !PyString_Check(s))
5155                 Py_FatalError("PyString_InternInPlace: strings only please!");
5156         /* If it's a string subclass, we don't really know what putting
5157            it in the interned dict might do. */
5158         if (!PyString_CheckExact(s))
5159                 return;
5160         if (PyString_CHECK_INTERNED(s))
5161                 return;
5162         if (interned == NULL) {
5163                 interned = PyDict_New();
5164                 if (interned == NULL) {
5165                         PyErr_Clear(); /* Don't leave an exception */
5166                         return;
5167                 }
5168         }
5169         t = PyDict_GetItem(interned, (PyObject *)s);
5170         if (t) {
5171                 Py_INCREF(t);
5172                 Py_DECREF(*p);
5173                 *p = t;
5174                 return;
5175         }
5176
5177         if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
5178                 PyErr_Clear();
5179                 return;
5180         }
5181         /* The two references in interned are not counted by refcnt.
5182            The string deallocator will take care of this */
5183         Py_REFCNT(s) -= 2;
5184         PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
5185 }
5186
5187 void
5188 PyString_InternImmortal(PyObject **p)
5189 {
5190         PyString_InternInPlace(p);
5191         if (PyString_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
5192                 PyString_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
5193                 Py_INCREF(*p);
5194         }
5195 }
5196
5197
5198 PyObject *
5199 PyString_InternFromString(const char *cp)
5200 {
5201         PyObject *s = PyString_FromString(cp);
5202         if (s == NULL)
5203                 return NULL;
5204         PyString_InternInPlace(&s);
5205         return s;
5206 }
5207
5208 void
5209 PyString_Fini(void)
5210 {
5211         int i;
5212         for (i = 0; i < UCHAR_MAX + 1; i++) {
5213                 Py_XDECREF(characters[i]);
5214                 characters[i] = NULL;
5215         }
5216         Py_XDECREF(nullstring);
5217         nullstring = NULL;
5218 }
5219
5220 void _Py_ReleaseInternedStrings(void)
5221 {
5222         PyObject *keys;
5223         PyStringObject *s;
5224         Py_ssize_t i, n;
5225         Py_ssize_t immortal_size = 0, mortal_size = 0;
5226
5227         if (interned == NULL || !PyDict_Check(interned))
5228                 return;
5229         keys = PyDict_Keys(interned);
5230         if (keys == NULL || !PyList_Check(keys)) {
5231                 PyErr_Clear();
5232                 return;
5233         }
5234
5235         /* Since _Py_ReleaseInternedStrings() is intended to help a leak
5236            detector, interned strings are not forcibly deallocated; rather, we
5237            give them their stolen references back, and then clear and DECREF
5238            the interned dict. */
5239
5240         n = PyList_GET_SIZE(keys);
5241         fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
5242                 n);
5243         for (i = 0; i < n; i++) {
5244                 s = (PyStringObject *) PyList_GET_ITEM(keys, i);
5245                 switch (s->ob_sstate) {
5246                 case SSTATE_NOT_INTERNED:
5247                         /* XXX Shouldn't happen */
5248                         break;
5249                 case SSTATE_INTERNED_IMMORTAL:
5250                         Py_REFCNT(s) += 1;
5251                         immortal_size += Py_SIZE(s);
5252                         break;
5253                 case SSTATE_INTERNED_MORTAL:
5254                         Py_REFCNT(s) += 2;
5255                         mortal_size += Py_SIZE(s);
5256                         break;
5257                 default:
5258                         Py_FatalError("Inconsistent interned string state.");
5259                 }
5260                 s->ob_sstate = SSTATE_NOT_INTERNED;
5261         }
5262         fprintf(stderr, "total size of all interned strings: "
5263                         "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
5264                         "mortal/immortal\n", mortal_size, immortal_size);
5265         Py_DECREF(keys);
5266         PyDict_Clear(interned);
5267         Py_DECREF(interned);
5268         interned = NULL;
5269 }