Objects/stringobject.c

   1 /* String (str/bytes) object implementation */
   2
   3 #define PY_SSIZE_T_CLEAN
   4
   5 #include "Python.h"
   6 #include <ctype.h>
   7 #include <stddef.h>
   8
   /* Allocation statistics, present only when COUNT_ALLOCS is defined.
      The string constructors below increment null_strings each time they
      return the cached empty string and one_strings each time they return
      a cached one-character string. */
   9 #ifdef COUNT_ALLOCS
  10 Py_ssize_t null_strings, one_strings;
  11 #endif
  12
  /* Caches filled in by the string constructors below.
     characters[] holds the shared one-character strings, indexed by the
     byte value (`*str & UCHAR_MAX'); nullstring is the shared empty string.
     Each entry stays NULL until the corresponding string has been created
     (and interned) for the first time. */
  13 static PyStringObject *characters[UCHAR_MAX + 1];
  14 static PyStringObject *nullstring;
  15
  16 /* This dictionary holds all interned strings.  Note that references to
  17    strings in this dictionary are *not* counted in the string's ob_refcnt.
  18    When the interned string reaches a refcnt of 0 the string deallocation
  19    function will delete the reference from this dictionary.
  20
  21    Another way to look at this is to say that the actual reference
  22    count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
  23 */
  24 static PyObject *interned;
  25
  26 /* PyStringObject_SIZE gives the basic size of a string; any memory allocation
  27    for a string of length n should request PyStringObject_SIZE + n bytes.
  28
  29    Using PyStringObject_SIZE instead of sizeof(PyStringObject) saves
  30    3 bytes per string allocation on a typical system.

        The `+ 1' after the offset of ob_sval is the byte for the terminating
        null that the constructors store right after the n data bytes.
  31 */
  32 #define PyStringObject_SIZE (offsetof(PyStringObject, ob_sval) + 1)
  33
  34 /*
  35    For both PyString_FromString() and PyString_FromStringAndSize(), the
  36    parameter `size' denotes number of characters to allocate, not counting any
  37    null terminating character.
  38
  39    For PyString_FromString(), the parameter `str' points to a null-terminated
  40    string containing exactly `size' bytes.
  41
  42    For PyString_FromStringAndSize(), the parameter the parameter `str' is
  43    either NULL or else points to a string containing at least `size' bytes.
  44    For PyString_FromStringAndSize(), the string in the `str' parameter does
  45    not have to be null-terminated.  (Therefore it is safe to construct a
  46    substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
  47    If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
  48    bytes (setting the last byte to the null terminating character) and you can
  49    fill in the data yourself.  If `str' is non-NULL then the resulting
  50    PyString object must be treated as immutable and you must not fill in nor
  51    alter the data yourself, since the strings may be shared.
  52
  53    The PyObject member `op->ob_size', which denotes the number of "extra
  54    items" in a variable-size object, will contain the number of bytes
  55    allocated for string data, not counting the null terminating character.  It
  56    is therefore equal to the equal to the `size' parameter (for
  57    PyString_FromStringAndSize()) or the length of the string in the `str'
  58    parameter (for PyString_FromString()).
  59 */
  60 PyObject *
  61 PyString_FromStringAndSize(const char *str, Py_ssize_t size)
  62 {
  63         register PyStringObject *op;
  64         if (size < 0) {
  65                 PyErr_SetString(PyExc_SystemError,
  66                     "Negative size passed to PyString_FromStringAndSize");
  67                 return NULL;
  68         }
  69         if (size == 0 && (op = nullstring) != NULL) {
  70 #ifdef COUNT_ALLOCS
  71                 null_strings++;
  72 #endif
  73                 Py_INCREF(op);
  74                 return (PyObject *)op;
  75         }
  76         if (size == 1 && str != NULL &&
  77             (op = characters[*str & UCHAR_MAX]) != NULL)
  78         {
  79 #ifdef COUNT_ALLOCS
  80                 one_strings++;
  81 #endif
  82                 Py_INCREF(op);
  83                 return (PyObject *)op;
  84         }
  85
  86         if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
  87                 PyErr_SetString(PyExc_OverflowError, "string is too large");
  88                 return NULL;
  89         }
  90
  91         /* Inline PyObject_NewVar */
  92         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
  93         if (op == NULL)
  94                 return PyErr_NoMemory();
  95         PyObject_INIT_VAR(op, &PyString_Type, size);
  96         op->ob_shash = -1;
  97         op->ob_sstate = SSTATE_NOT_INTERNED;
  98         if (str != NULL)
  99                 Py_MEMCPY(op->ob_sval, str, size);
 100         op->ob_sval[size] = '\0';
 101         /* share short strings */
 102         if (size == 0) {
 103                 PyObject *t = (PyObject *)op;
 104                 PyString_InternInPlace(&t);
 105                 op = (PyStringObject *)t;
 106                 nullstring = op;
 107                 Py_INCREF(op);
 108         } else if (size == 1 && str != NULL) {
 109                 PyObject *t = (PyObject *)op;
 110                 PyString_InternInPlace(&t);
 111                 op = (PyStringObject *)t;
 112                 characters[*str & UCHAR_MAX] = op;
 113                 Py_INCREF(op);
 114         }
 115         return (PyObject *) op;
 116 }
 117
 118 PyObject *
 119 PyString_FromString(const char *str)
 120 {
 121         register size_t size;
 122         register PyStringObject *op;
 123
 124         assert(str != NULL);
 125         size = strlen(str);
 126         if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
 127                 PyErr_SetString(PyExc_OverflowError,
 128                         "string is too long for a Python string");
 129                 return NULL;
 130         }
 131         if (size == 0 && (op = nullstring) != NULL) {
 132 #ifdef COUNT_ALLOCS
 133                 null_strings++;
 134 #endif
 135                 Py_INCREF(op);
 136                 return (PyObject *)op;
 137         }
 138         if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
 139 #ifdef COUNT_ALLOCS
 140                 one_strings++;
 141 #endif
 142                 Py_INCREF(op);
 143                 return (PyObject *)op;
 144         }
 145
 146         /* Inline PyObject_NewVar */
 147         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
 148         if (op == NULL)
 149                 return PyErr_NoMemory();
 150         PyObject_INIT_VAR(op, &PyString_Type, size);
 151         op->ob_shash = -1;
 152         op->ob_sstate = SSTATE_NOT_INTERNED;
 153         Py_MEMCPY(op->ob_sval, str, size+1);
 154         /* share short strings */
 155         if (size == 0) {
 156                 PyObject *t = (PyObject *)op;
 157                 PyString_InternInPlace(&t);
 158                 op = (PyStringObject *)t;
 159                 nullstring = op;
 160                 Py_INCREF(op);
 161         } else if (size == 1) {
 162                 PyObject *t = (PyObject *)op;
 163                 PyString_InternInPlace(&t);
 164                 op = (PyStringObject *)t;
 165                 characters[*str & UCHAR_MAX] = op;
 166                 Py_INCREF(op);
 167         }
 168         return (PyObject *) op;
 169 }
 170
 171 PyObject *
 172 PyString_FromFormatV(const char *format, va_list vargs)
 173 {
 174         va_list count;
 175         Py_ssize_t n = 0;
 176         const char* f;
 177         char *s;
 178         PyObject* string;
 179
 180 #ifdef VA_LIST_IS_ARRAY
 181         Py_MEMCPY(count, vargs, sizeof(va_list));
 182 #else
 183 #ifdef  __va_copy
 184         __va_copy(count, vargs);
 185 #else
 186         count = vargs;
 187 #endif
 188 #endif
 189         /* step 1: figure out how large a buffer we need */
 190         for (f = format; *f; f++) {
 191                 if (*f == '%') {
 192                         const char* p = f;
 193                         while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
 194                                 ;
 195
 196                         /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 197                          * they don't affect the amount of space we reserve.
 198                          */
 199                         if ((*f == 'l' || *f == 'z') &&
 200                                         (f[1] == 'd' || f[1] == 'u'))
 201                                 ++f;
 202
 203                         switch (*f) {
 204                         case 'c':
 205                                 (void)va_arg(count, int);
 206                                 /* fall through... */
 207                         case '%':
 208                                 n++;
 209                                 break;
 210                         case 'd': case 'u': case 'i': case 'x':
 211                                 (void) va_arg(count, int);
 212                                 /* 20 bytes is enough to hold a 64-bit
 213                                    integer.  Decimal takes the most space.
 214                                    This isn't enough for octal. */
 215                                 n += 20;
 216                                 break;
 217                         case 's':
 218                                 s = va_arg(count, char*);
 219                                 n += strlen(s);
 220                                 break;
 221                         case 'p':
 222                                 (void) va_arg(count, int);
 223                                 /* maximum 64-bit pointer representation:
 224                                  * 0xffffffffffffffff
 225                                  * so 19 characters is enough.
 226                                  * XXX I count 18 -- what's the extra for?
 227                                  */
 228                                 n += 19;
 229                                 break;
 230                         default:
 231                                 /* if we stumble upon an unknown
 232                                    formatting code, copy the rest of
 233                                    the format string to the output
 234                                    string. (we cannot just skip the
 235                                    code, since there's no way to know
 236                                    what's in the argument list) */
 237                                 n += strlen(p);
 238                                 goto expand;
 239                         }
 240                 } else
 241                         n++;
 242         }
 243  expand:
 244         /* step 2: fill the buffer */
 245         /* Since we've analyzed how much space we need for the worst case,
 246            use sprintf directly instead of the slower PyOS_snprintf. */
 247         string = PyString_FromStringAndSize(NULL, n);
 248         if (!string)
 249                 return NULL;
 250
 251         s = PyString_AsString(string);
 252
 253         for (f = format; *f; f++) {
 254                 if (*f == '%') {
 255                         const char* p = f++;
 256                         Py_ssize_t i;
 257                         int longflag = 0;
 258                         int size_tflag = 0;
 259                         /* parse the width.precision part (we're only
 260                            interested in the precision value, if any) */
 261                         n = 0;
 262                         while (isdigit(Py_CHARMASK(*f)))
 263                                 n = (n*10) + *f++ - '0';
 264                         if (*f == '.') {
 265                                 f++;
 266                                 n = 0;
 267                                 while (isdigit(Py_CHARMASK(*f)))
 268                                         n = (n*10) + *f++ - '0';
 269                         }
 270                         while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
 271                                 f++;
 272                         /* handle the long flag, but only for %ld and %lu.
 273                            others can be added when necessary. */
 274                         if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 275                                 longflag = 1;
 276                                 ++f;
 277                         }
 278                         /* handle the size_t flag. */
 279                         if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 280                                 size_tflag = 1;
 281                                 ++f;
 282                         }
 283
 284                         switch (*f) {
 285                         case 'c':
 286                                 *s++ = va_arg(vargs, int);
 287                                 break;
 288                         case 'd':
 289                                 if (longflag)
 290                                         sprintf(s, "%ld", va_arg(vargs, long));
 291                                 else if (size_tflag)
 292                                         sprintf(s, "%" PY_FORMAT_SIZE_T "d",
 293                                                 va_arg(vargs, Py_ssize_t));
 294                                 else
 295                                         sprintf(s, "%d", va_arg(vargs, int));
 296                                 s += strlen(s);
 297                                 break;
 298                         case 'u':
 299                                 if (longflag)
 300                                         sprintf(s, "%lu",
 301                                                 va_arg(vargs, unsigned long));
 302                                 else if (size_tflag)
 303                                         sprintf(s, "%" PY_FORMAT_SIZE_T "u",
 304                                                 va_arg(vargs, size_t));
 305                                 else
 306                                         sprintf(s, "%u",
 307                                                 va_arg(vargs, unsigned int));
 308                                 s += strlen(s);
 309                                 break;
 310                         case 'i':
 311                                 sprintf(s, "%i", va_arg(vargs, int));
 312                                 s += strlen(s);
 313                                 break;
 314                         case 'x':
 315                                 sprintf(s, "%x", va_arg(vargs, int));
 316                                 s += strlen(s);
 317                                 break;
 318                         case 's':
 319                                 p = va_arg(vargs, char*);
 320                                 i = strlen(p);
 321                                 if (n > 0 && i > n)
 322                                         i = n;
 323                                 Py_MEMCPY(s, p, i);
 324                                 s += i;
 325                                 break;
 326                         case 'p':
 327                                 sprintf(s, "%p", va_arg(vargs, void*));
 328                                 /* %p is ill-defined:  ensure leading 0x. */
 329                                 if (s[1] == 'X')
 330                                         s[1] = 'x';
 331                                 else if (s[1] != 'x') {
 332                                         memmove(s+2, s, strlen(s)+1);
 333                                         s[0] = '0';
 334                                         s[1] = 'x';
 335                                 }
 336                                 s += strlen(s);
 337                                 break;
 338                         case '%':
 339                                 *s++ = '%';
 340                                 break;
 341                         default:
 342                                 strcpy(s, p);
 343                                 s += strlen(s);
 344                                 goto end;
 345                         }
 346                 } else
 347                         *s++ = *f;
 348         }
 349
 350  end:
 351         _PyString_Resize(&string, s - PyString_AS_STRING(string));
 352         return string;
 353 }
 354
 355 PyObject *
 356 PyString_FromFormat(const char *format, ...)
 357 {
 358         PyObject* ret;
 359         va_list vargs;
 360
 361 #ifdef HAVE_STDARG_PROTOTYPES
 362         va_start(vargs, format);
 363 #else
 364         va_start(vargs);
 365 #endif
 366         ret = PyString_FromFormatV(format, vargs);
 367         va_end(vargs);
 368         return ret;
 369 }
 370
 371
 372 PyObject *PyString_Decode(const char *s,
 373                           Py_ssize_t size,
 374                           const char *encoding,
 375                           const char *errors)
 376 {
 377     PyObject *v, *str;
 378
 379     str = PyString_FromStringAndSize(s, size);
 380     if (str == NULL)
 381         return NULL;
 382     v = PyString_AsDecodedString(str, encoding, errors);
 383     Py_DECREF(str);
 384     return v;
 385 }
 386
 387 PyObject *PyString_AsDecodedObject(PyObject *str,
 388                                    const char *encoding,
 389                                    const char *errors)
 390 {
 391     PyObject *v;
 392
 393     if (!PyString_Check(str)) {
 394         PyErr_BadArgument();
 395         goto onError;
 396     }
 397
 398     if (encoding == NULL) {
 399 #ifdef Py_USING_UNICODE
 400         encoding = PyUnicode_GetDefaultEncoding();
 401 #else
 402         PyErr_SetString(PyExc_ValueError, "no encoding specified");
 403         goto onError;
 404 #endif
 405     }
 406
 407     /* Decode via the codec registry */
 408     v = PyCodec_Decode(str, encoding, errors);
 409     if (v == NULL)
 410         goto onError;
 411
 412     return v;
 413
 414  onError:
 415     return NULL;
 416 }
 417
 418 PyObject *PyString_AsDecodedString(PyObject *str,
 419                                    const char *encoding,
 420                                    const char *errors)
 421 {
 422     PyObject *v;
 423
 424     v = PyString_AsDecodedObject(str, encoding, errors);
 425     if (v == NULL)
 426         goto onError;
 427
 428 #ifdef Py_USING_UNICODE
 429     /* Convert Unicode to a string using the default encoding */
 430     if (PyUnicode_Check(v)) {
 431         PyObject *temp = v;
 432         v = PyUnicode_AsEncodedString(v, NULL, NULL);
 433         Py_DECREF(temp);
 434         if (v == NULL)
 435             goto onError;
 436     }
 437 #endif
 438     if (!PyString_Check(v)) {
 439         PyErr_Format(PyExc_TypeError,
 440                      "decoder did not return a string object (type=%.400s)",
 441                      Py_TYPE(v)->tp_name);
 442         Py_DECREF(v);
 443         goto onError;
 444     }
 445
 446     return v;
 447
 448  onError:
 449     return NULL;
 450 }
 451
 452 PyObject *PyString_Encode(const char *s,
 453                           Py_ssize_t size,
 454                           const char *encoding,
 455                           const char *errors)
 456 {
 457     PyObject *v, *str;
 458
 459     str = PyString_FromStringAndSize(s, size);
 460     if (str == NULL)
 461         return NULL;
 462     v = PyString_AsEncodedString(str, encoding, errors);
 463     Py_DECREF(str);
 464     return v;
 465 }
 466
 467 PyObject *PyString_AsEncodedObject(PyObject *str,
 468                                    const char *encoding,
 469                                    const char *errors)
 470 {
 471     PyObject *v;
 472
 473     if (!PyString_Check(str)) {
 474         PyErr_BadArgument();
 475         goto onError;
 476     }
 477
 478     if (encoding == NULL) {
 479 #ifdef Py_USING_UNICODE
 480         encoding = PyUnicode_GetDefaultEncoding();
 481 #else
 482         PyErr_SetString(PyExc_ValueError, "no encoding specified");
 483         goto onError;
 484 #endif
 485     }
 486
 487     /* Encode via the codec registry */
 488     v = PyCodec_Encode(str, encoding, errors);
 489     if (v == NULL)
 490         goto onError;
 491
 492     return v;
 493
 494  onError:
 495     return NULL;
 496 }
 497
 498 PyObject *PyString_AsEncodedString(PyObject *str,
 499                                    const char *encoding,
 500                                    const char *errors)
 501 {
 502     PyObject *v;
 503
 504     v = PyString_AsEncodedObject(str, encoding, errors);
 505     if (v == NULL)
 506         goto onError;
 507
 508 #ifdef Py_USING_UNICODE
 509     /* Convert Unicode to a string using the default encoding */
 510     if (PyUnicode_Check(v)) {
 511         PyObject *temp = v;
 512         v = PyUnicode_AsEncodedString(v, NULL, NULL);
 513         Py_DECREF(temp);
 514         if (v == NULL)
 515             goto onError;
 516     }
 517 #endif
 518     if (!PyString_Check(v)) {
 519         PyErr_Format(PyExc_TypeError,
 520                      "encoder did not return a string object (type=%.400s)",
 521                      Py_TYPE(v)->tp_name);
 522         Py_DECREF(v);
 523         goto onError;
 524     }
 525
 526     return v;
 527
 528  onError:
 529     return NULL;
 530 }
 531
 532 static void
 533 string_dealloc(PyObject *op)
 534 {
 535         switch (PyString_CHECK_INTERNED(op)) {
 536                 case SSTATE_NOT_INTERNED:
 537                         break;
 538
 539                 case SSTATE_INTERNED_MORTAL:
 540                         /* revive dead object temporarily for DelItem */
 541                         Py_REFCNT(op) = 3;
 542                         if (PyDict_DelItem(interned, op) != 0)
 543                                 Py_FatalError(
 544                                         "deletion of interned string failed");
 545                         break;
 546
 547                 case SSTATE_INTERNED_IMMORTAL:
 548                         Py_FatalError("Immortal interned string died.");
 549
 550                 default:
 551                         Py_FatalError("Inconsistent interned string state.");
 552         }
 553         Py_TYPE(op)->tp_free(op);
 554 }
 555
 556 /* Unescape a backslash-escaped string. If unicode is non-zero,
 557    the string is a u-literal. If recode_encoding is non-zero,
 558    the string is UTF-8 encoded and should be re-encoded in the
 559    specified encoding.  */
 560
 561 PyObject *PyString_DecodeEscape(const char *s,
 562                                 Py_ssize_t len,
 563                                 const char *errors,
 564                                 Py_ssize_t unicode,
 565                                 const char *recode_encoding)
 566 {
 567         int c;
 568         char *p, *buf;
 569         const char *end;
 570         PyObject *v;
 571         Py_ssize_t newlen = recode_encoding ? 4*len:len;
 572         v = PyString_FromStringAndSize((char *)NULL, newlen);
 573         if (v == NULL)
 574                 return NULL;
 575         p = buf = PyString_AsString(v);
 576         end = s + len;
 577         while (s < end) {
 578                 if (*s != '\\') {
 579                   non_esc:
 580 #ifdef Py_USING_UNICODE
 581                         if (recode_encoding && (*s & 0x80)) {
 582                                 PyObject *u, *w;
 583                                 char *r;
 584                                 const char* t;
 585                                 Py_ssize_t rn;
 586                                 t = s;
 587                                 /* Decode non-ASCII bytes as UTF-8. */
 588                                 while (t < end && (*t & 0x80)) t++;
 589                                 u = PyUnicode_DecodeUTF8(s, t - s, errors);
 590                                 if(!u) goto failed;
 591
 592                                 /* Recode them in target encoding. */
 593                                 w = PyUnicode_AsEncodedString(
 594                                         u, recode_encoding, errors);
 595                                 Py_DECREF(u);
 596                                 if (!w) goto failed;
 597
 598                                 /* Append bytes to output buffer. */
 599                                 assert(PyString_Check(w));
 600                                 r = PyString_AS_STRING(w);
 601                                 rn = PyString_GET_SIZE(w);
 602                                 Py_MEMCPY(p, r, rn);
 603                                 p += rn;
 604                                 Py_DECREF(w);
 605                                 s = t;
 606                         } else {
 607                                 *p++ = *s++;
 608                         }
 609 #else
 610                         *p++ = *s++;
 611 #endif
 612                         continue;
 613                 }
 614                 s++;
 615                 if (s==end) {
 616                         PyErr_SetString(PyExc_ValueError,
 617                                         "Trailing \\ in string");
 618                         goto failed;
 619                 }
 620                 switch (*s++) {
 621                 /* XXX This assumes ASCII! */
 622                 case '\n': break;
 623                 case '\\': *p++ = '\\'; break;
 624                 case '\'': *p++ = '\''; break;
 625                 case '\"': *p++ = '\"'; break;
 626                 case 'b': *p++ = '\b'; break;
 627                 case 'f': *p++ = '\014'; break; /* FF */
 628                 case 't': *p++ = '\t'; break;
 629                 case 'n': *p++ = '\n'; break;
 630                 case 'r': *p++ = '\r'; break;
 631                 case 'v': *p++ = '\013'; break; /* VT */
 632                 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
 633                 case '0': case '1': case '2': case '3':
 634                 case '4': case '5': case '6': case '7':
 635                         c = s[-1] - '0';
 636                         if (s < end && '0' <= *s && *s <= '7') {
 637                                 c = (c<<3) + *s++ - '0';
 638                                 if (s < end && '0' <= *s && *s <= '7')
 639                                         c = (c<<3) + *s++ - '0';
 640                         }
 641                         *p++ = c;
 642                         break;
 643                 case 'x':
 644                         if (s+1 < end &&
 645                             isxdigit(Py_CHARMASK(s[0])) &&
 646                             isxdigit(Py_CHARMASK(s[1])))
 647                         {
 648                                 unsigned int x = 0;
 649                                 c = Py_CHARMASK(*s);
 650                                 s++;
 651                                 if (isdigit(c))
 652                                         x = c - '0';
 653                                 else if (islower(c))
 654                                         x = 10 + c - 'a';
 655                                 else
 656                                         x = 10 + c - 'A';
 657                                 x = x << 4;
 658                                 c = Py_CHARMASK(*s);
 659                                 s++;
 660                                 if (isdigit(c))
 661                                         x += c - '0';
 662                                 else if (islower(c))
 663                                         x += 10 + c - 'a';
 664                                 else
 665                                         x += 10 + c - 'A';
 666                                 *p++ = x;
 667                                 break;
 668                         }
 669                         if (!errors || strcmp(errors, "strict") == 0) {
 670                                 PyErr_SetString(PyExc_ValueError,
 671                                                 "invalid \\x escape");
 672                                 goto failed;
 673                         }
 674                         if (strcmp(errors, "replace") == 0) {
 675                                 *p++ = '?';
 676                         } else if (strcmp(errors, "ignore") == 0)
 677                                 /* do nothing */;
 678                         else {
 679                                 PyErr_Format(PyExc_ValueError,
 680                                              "decoding error; "
 681                                              "unknown error handling code: %.400s",
 682                                              errors);
 683                                 goto failed;
 684                         }
 685 #ifndef Py_USING_UNICODE
 686                 case 'u':
 687                 case 'U':
 688                 case 'N':
 689                         if (unicode) {
 690                                 PyErr_SetString(PyExc_ValueError,
 691                                           "Unicode escapes not legal "
 692                                           "when Unicode disabled");
 693                                 goto failed;
 694                         }
 695 #endif
 696                 default:
 697                         *p++ = '\\';
 698                         s--;
 699                         goto non_esc; /* an arbitry number of unescaped
 700                                          UTF-8 bytes may follow. */
 701                 }
 702         }
 703         if (p-buf < newlen)
 704                 _PyString_Resize(&v, p - buf);
 705         return v;
 706   failed:
 707         Py_DECREF(v);
 708         return NULL;
 709 }
 710
 711 /* -------------------------------------------------------------------- */
 712 /* object api */
 713
 714 static Py_ssize_t
 715 string_getsize(register PyObject *op)
 716 {
 717         char *s;
 718         Py_ssize_t len;
 719         if (PyString_AsStringAndSize(op, &s, &len))
 720                 return -1;
 721         return len;
 722 }
 723
 724 static /*const*/ char *
 725 string_getbuffer(register PyObject *op)
 726 {
 727         char *s;
 728         Py_ssize_t len;
 729         if (PyString_AsStringAndSize(op, &s, &len))
 730                 return NULL;
 731         return s;
 732 }
 733
 734 Py_ssize_t
 735 PyString_Size(register PyObject *op)
 736 {
 737         if (!PyString_Check(op))
 738                 return string_getsize(op);
 739         return Py_SIZE(op);
 740 }
 741
 742 /*const*/ char *
 743 PyString_AsString(register PyObject *op)
 744 {
 745         if (!PyString_Check(op))
 746                 return string_getbuffer(op);
 747         return ((PyStringObject *)op) -> ob_sval;
 748 }
 749
 750 int
 751 PyString_AsStringAndSize(register PyObject *obj,
 752                          register char **s,
 753                          register Py_ssize_t *len)
 754 {
 755         if (s == NULL) {
 756                 PyErr_BadInternalCall();
 757                 return -1;
 758         }
 759
 760         if (!PyString_Check(obj)) {
 761 #ifdef Py_USING_UNICODE
 762                 if (PyUnicode_Check(obj)) {
 763                         obj = _PyUnicode_AsDefaultEncodedString(obj, NULL);
 764                         if (obj == NULL)
 765                                 return -1;
 766                 }
 767                 else
 768 #endif
 769                 {
 770                         PyErr_Format(PyExc_TypeError,
 771                                      "expected string or Unicode object, "
 772                                      "%.200s found", Py_TYPE(obj)->tp_name);
 773                         return -1;
 774                 }
 775         }
 776
 777         *s = PyString_AS_STRING(obj);
 778         if (len != NULL)
 779                 *len = PyString_GET_SIZE(obj);
 780         else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) {
 781                 PyErr_SetString(PyExc_TypeError,
 782                                 "expected string without null bytes");
 783                 return -1;
 784         }
 785         return 0;
 786 }
 787
 788 /* -------------------------------------------------------------------- */
 789 /* Methods */
 790
 791 #include "stringlib/stringdefs.h"
 792 #include "stringlib/fastsearch.h"
 793
 794 #include "stringlib/count.h"
 795 #include "stringlib/find.h"
 796 #include "stringlib/partition.h"
 797
 798 #define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
 799 #include "stringlib/localeutil.h"
 800
 801
 802
 803 static int
 804 string_print(PyStringObject *op, FILE *fp, int flags)
 805 {
 806         Py_ssize_t i, str_len;
 807         char c;
 808         int quote;
 809
 810         /* XXX Ought to check for interrupts when writing long strings */
 811         if (! PyString_CheckExact(op)) {
 812                 int ret;
 813                 /* A str subclass may have its own __str__ method. */
 814                 op = (PyStringObject *) PyObject_Str((PyObject *)op);
 815                 if (op == NULL)
 816                         return -1;
 817                 ret = string_print(op, fp, flags);
 818                 Py_DECREF(op);
 819                 return ret;
 820         }
 821         if (flags & Py_PRINT_RAW) {
 822                 char *data = op->ob_sval;
 823                 Py_ssize_t size = Py_SIZE(op);
 824                 Py_BEGIN_ALLOW_THREADS
 825                 while (size > INT_MAX) {
 826                         /* Very long strings cannot be written atomically.
 827                          * But don't write exactly INT_MAX bytes at a time
 828                          * to avoid memory aligment issues.
 829                          */
 830                         const int chunk_size = INT_MAX & ~0x3FFF;
 831                         fwrite(data, 1, chunk_size, fp);
 832                         data += chunk_size;
 833                         size -= chunk_size;
 834                 }
 835 #ifdef __VMS
 836                 if (size) fwrite(data, (int)size, 1, fp);
 837 #else
 838                 fwrite(data, 1, (int)size, fp);
 839 #endif
 840                 Py_END_ALLOW_THREADS
 841                 return 0;
 842         }
 843
 844         /* figure out which quote to use; single is preferred */
 845         quote = '\'';
 846         if (memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
 847             !memchr(op->ob_sval, '"', Py_SIZE(op)))
 848                 quote = '"';
 849
 850         str_len = Py_SIZE(op);
 851         Py_BEGIN_ALLOW_THREADS
 852         fputc(quote, fp);
 853         for (i = 0; i < str_len; i++) {
 854                 /* Since strings are immutable and the caller should have a
 855                 reference, accessing the interal buffer should not be an issue
 856                 with the GIL released. */
 857                 c = op->ob_sval[i];
 858                 if (c == quote || c == '\\')
 859                         fprintf(fp, "\\%c", c);
 860                 else if (c == '\t')
 861                         fprintf(fp, "\\t");
 862                 else if (c == '\n')
 863                         fprintf(fp, "\\n");
 864                 else if (c == '\r')
 865                         fprintf(fp, "\\r");
 866                 else if (c < ' ' || c >= 0x7f)
 867                         fprintf(fp, "\\x%02x", c & 0xff);
 868                 else
 869                         fputc(c, fp);
 870         }
 871         fputc(quote, fp);
 872         Py_END_ALLOW_THREADS
 873         return 0;
 874 }
 875
 876 PyObject *
 877 PyString_Repr(PyObject *obj, int smartquotes)
 878 {
 879         register PyStringObject* op = (PyStringObject*) obj;
 880         size_t newsize = 2 + 4 * Py_SIZE(op);
 881         PyObject *v;
 882         if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) {
 883                 PyErr_SetString(PyExc_OverflowError,
 884                         "string is too large to make repr");
 885                 return NULL;
 886         }
 887         v = PyString_FromStringAndSize((char *)NULL, newsize);
 888         if (v == NULL) {
 889                 return NULL;
 890         }
 891         else {
 892                 register Py_ssize_t i;
 893                 register char c;
 894                 register char *p;
 895                 int quote;
 896
 897                 /* figure out which quote to use; single is preferred */
 898                 quote = '\'';
 899                 if (smartquotes &&
 900                     memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
 901                     !memchr(op->ob_sval, '"', Py_SIZE(op)))
 902                         quote = '"';
 903
 904                 p = PyString_AS_STRING(v);
 905                 *p++ = quote;
 906                 for (i = 0; i < Py_SIZE(op); i++) {
 907                         /* There's at least enough room for a hex escape
 908                            and a closing quote. */
 909                         assert(newsize - (p - PyString_AS_STRING(v)) >= 5);
 910                         c = op->ob_sval[i];
 911                         if (c == quote || c == '\\')
 912                                 *p++ = '\\', *p++ = c;
 913                         else if (c == '\t')
 914                                 *p++ = '\\', *p++ = 't';
 915                         else if (c == '\n')
 916                                 *p++ = '\\', *p++ = 'n';
 917                         else if (c == '\r')
 918                                 *p++ = '\\', *p++ = 'r';
 919                         else if (c < ' ' || c >= 0x7f) {
 920                                 /* For performance, we don't want to call
 921                                    PyOS_snprintf here (extra layers of
 922                                    function call). */
 923                                 sprintf(p, "\\x%02x", c & 0xff);
 924                                 p += 4;
 925                         }
 926                         else
 927                                 *p++ = c;
 928                 }
 929                 assert(newsize - (p - PyString_AS_STRING(v)) >= 1);
 930                 *p++ = quote;
 931                 *p = '\0';
 932                 _PyString_Resize(
 933                         &v, (p - PyString_AS_STRING(v)));
 934                 return v;
 935         }
 936 }
 937
 938 static PyObject *
 939 string_repr(PyObject *op)
 940 {
 941         return PyString_Repr(op, 1);
 942 }
 943
 944 static PyObject *
 945 string_str(PyObject *s)
 946 {
 947         assert(PyString_Check(s));
 948         if (PyString_CheckExact(s)) {
 949                 Py_INCREF(s);
 950                 return s;
 951         }
 952         else {
 953                 /* Subtype -- return genuine string with the same value. */
 954                 PyStringObject *t = (PyStringObject *) s;
 955                 return PyString_FromStringAndSize(t->ob_sval, Py_SIZE(t));
 956         }
 957 }
 958
 959 static Py_ssize_t
 960 string_length(PyStringObject *a)
 961 {
 962         return Py_SIZE(a);
 963 }
 964
 965 static PyObject *
 966 string_concat(register PyStringObject *a, register PyObject *bb)
 967 {
 968         register Py_ssize_t size;
 969         register PyStringObject *op;
 970         if (!PyString_Check(bb)) {
 971 #ifdef Py_USING_UNICODE
 972                 if (PyUnicode_Check(bb))
 973                     return PyUnicode_Concat((PyObject *)a, bb);
 974 #endif
 975                 if (PyByteArray_Check(bb))
 976                     return PyByteArray_Concat((PyObject *)a, bb);
 977                 PyErr_Format(PyExc_TypeError,
 978                              "cannot concatenate 'str' and '%.200s' objects",
 979                              Py_TYPE(bb)->tp_name);
 980                 return NULL;
 981         }
 982 #define b ((PyStringObject *)bb)
 983         /* Optimize cases with empty left or right operand */
 984         if ((Py_SIZE(a) == 0 || Py_SIZE(b) == 0) &&
 985             PyString_CheckExact(a) && PyString_CheckExact(b)) {
 986                 if (Py_SIZE(a) == 0) {
 987                         Py_INCREF(bb);
 988                         return bb;
 989                 }
 990                 Py_INCREF(a);
 991                 return (PyObject *)a;
 992         }
 993         size = Py_SIZE(a) + Py_SIZE(b);
 994         /* Check that string sizes are not negative, to prevent an
 995            overflow in cases where we are passed incorrectly-created
 996            strings with negative lengths (due to a bug in other code).
 997         */
 998         if (Py_SIZE(a) < 0 || Py_SIZE(b) < 0 ||
 999             Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b)) {
1000                 PyErr_SetString(PyExc_OverflowError,
1001                                 "strings are too large to concat");
1002                 return NULL;
1003         }
1004
1005         /* Inline PyObject_NewVar */
1006         if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
1007                 PyErr_SetString(PyExc_OverflowError,
1008                                 "strings are too large to concat");
1009                 return NULL;
1010         }
1011         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
1012         if (op == NULL)
1013                 return PyErr_NoMemory();
1014         PyObject_INIT_VAR(op, &PyString_Type, size);
1015         op->ob_shash = -1;
1016         op->ob_sstate = SSTATE_NOT_INTERNED;
1017         Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1018         Py_MEMCPY(op->ob_sval + Py_SIZE(a), b->ob_sval, Py_SIZE(b));
1019         op->ob_sval[size] = '\0';
1020         return (PyObject *) op;
1021 #undef b
1022 }
1023
1024 static PyObject *
1025 string_repeat(register PyStringObject *a, register Py_ssize_t n)
1026 {
1027         register Py_ssize_t i;
1028         register Py_ssize_t j;
1029         register Py_ssize_t size;
1030         register PyStringObject *op;
1031         size_t nbytes;
1032         if (n < 0)
1033                 n = 0;
1034         /* watch out for overflows:  the size can overflow int,
1035          * and the # of bytes needed can overflow size_t
1036          */
1037         size = Py_SIZE(a) * n;
1038         if (n && size / n != Py_SIZE(a)) {
1039                 PyErr_SetString(PyExc_OverflowError,
1040                         "repeated string is too long");
1041                 return NULL;
1042         }
1043         if (size == Py_SIZE(a) && PyString_CheckExact(a)) {
1044                 Py_INCREF(a);
1045                 return (PyObject *)a;
1046         }
1047         nbytes = (size_t)size;
1048         if (nbytes + PyStringObject_SIZE <= nbytes) {
1049                 PyErr_SetString(PyExc_OverflowError,
1050                         "repeated string is too long");
1051                 return NULL;
1052         }
1053         op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + nbytes);
1054         if (op == NULL)
1055                 return PyErr_NoMemory();
1056         PyObject_INIT_VAR(op, &PyString_Type, size);
1057         op->ob_shash = -1;
1058         op->ob_sstate = SSTATE_NOT_INTERNED;
1059         op->ob_sval[size] = '\0';
1060         if (Py_SIZE(a) == 1 && n > 0) {
1061                 memset(op->ob_sval, a->ob_sval[0] , n);
1062                 return (PyObject *) op;
1063         }
1064         i = 0;
1065         if (i < size) {
1066                 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1067                 i = Py_SIZE(a);
1068         }
1069         while (i < size) {
1070                 j = (i <= size-i)  ?  i  :  size-i;
1071                 Py_MEMCPY(op->ob_sval+i, op->ob_sval, j);
1072                 i += j;
1073         }
1074         return (PyObject *) op;
1075 }
1076
1077 /* String slice a[i:j] consists of characters a[i] ... a[j-1] */
1078
1079 static PyObject *
1080 string_slice(register PyStringObject *a, register Py_ssize_t i,
1081              register Py_ssize_t j)
1082      /* j -- may be negative! */
1083 {
1084         if (i < 0)
1085                 i = 0;
1086         if (j < 0)
1087                 j = 0; /* Avoid signed/unsigned bug in next line */
1088         if (j > Py_SIZE(a))
1089                 j = Py_SIZE(a);
1090         if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) {
1091                 /* It's the same as a */
1092                 Py_INCREF(a);
1093                 return (PyObject *)a;
1094         }
1095         if (j < i)
1096                 j = i;
1097         return PyString_FromStringAndSize(a->ob_sval + i, j-i);
1098 }
1099
1100 static int
1101 string_contains(PyObject *str_obj, PyObject *sub_obj)
1102 {
1103         if (!PyString_CheckExact(sub_obj)) {
1104 #ifdef Py_USING_UNICODE
1105                 if (PyUnicode_Check(sub_obj))
1106                         return PyUnicode_Contains(str_obj, sub_obj);
1107 #endif
1108                 if (!PyString_Check(sub_obj)) {
1109                         PyErr_Format(PyExc_TypeError,
1110                             "'in <string>' requires string as left operand, "
1111                             "not %.200s", Py_TYPE(sub_obj)->tp_name);
1112                         return -1;
1113                 }
1114         }
1115
1116         return stringlib_contains_obj(str_obj, sub_obj);
1117 }
1118
1119 static PyObject *
1120 string_item(PyStringObject *a, register Py_ssize_t i)
1121 {
1122         char pchar;
1123         PyObject *v;
1124         if (i < 0 || i >= Py_SIZE(a)) {
1125                 PyErr_SetString(PyExc_IndexError, "string index out of range");
1126                 return NULL;
1127         }
1128         pchar = a->ob_sval[i];
1129         v = (PyObject *)characters[pchar & UCHAR_MAX];
1130         if (v == NULL)
1131                 v = PyString_FromStringAndSize(&pchar, 1);
1132         else {
1133 #ifdef COUNT_ALLOCS
1134                 one_strings++;
1135 #endif
1136                 Py_INCREF(v);
1137         }
1138         return v;
1139 }
1140
1141 static PyObject*
1142 string_richcompare(PyStringObject *a, PyStringObject *b, int op)
1143 {
1144         int c;
1145         Py_ssize_t len_a, len_b;
1146         Py_ssize_t min_len;
1147         PyObject *result;
1148
1149         /* Make sure both arguments are strings. */
1150         if (!(PyString_Check(a) && PyString_Check(b))) {
1151                 result = Py_NotImplemented;
1152                 goto out;
1153         }
1154         if (a == b) {
1155                 switch (op) {
1156                 case Py_EQ:case Py_LE:case Py_GE:
1157                         result = Py_True;
1158                         goto out;
1159                 case Py_NE:case Py_LT:case Py_GT:
1160                         result = Py_False;
1161                         goto out;
1162                 }
1163         }
1164         if (op == Py_EQ) {
1165                 /* Supporting Py_NE here as well does not save
1166                    much time, since Py_NE is rarely used.  */
1167                 if (Py_SIZE(a) == Py_SIZE(b)
1168                     && (a->ob_sval[0] == b->ob_sval[0]
1169                         && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0)) {
1170                         result = Py_True;
1171                 } else {
1172                         result = Py_False;
1173                 }
1174                 goto out;
1175         }
1176         len_a = Py_SIZE(a); len_b = Py_SIZE(b);
1177         min_len = (len_a < len_b) ? len_a : len_b;
1178         if (min_len > 0) {
1179                 c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval);
1180                 if (c==0)
1181                         c = memcmp(a->ob_sval, b->ob_sval, min_len);
1182         } else
1183                 c = 0;
1184         if (c == 0)
1185                 c = (len_a < len_b) ? -1 : (len_a > len_b) ? 1 : 0;
1186         switch (op) {
1187         case Py_LT: c = c <  0; break;
1188         case Py_LE: c = c <= 0; break;
1189         case Py_EQ: assert(0);  break; /* unreachable */
1190         case Py_NE: c = c != 0; break;
1191         case Py_GT: c = c >  0; break;
1192         case Py_GE: c = c >= 0; break;
1193         default:
1194                 result = Py_NotImplemented;
1195                 goto out;
1196         }
1197         result = c ? Py_True : Py_False;
1198   out:
1199         Py_INCREF(result);
1200         return result;
1201 }
1202
1203 int
1204 _PyString_Eq(PyObject *o1, PyObject *o2)
1205 {
1206         PyStringObject *a = (PyStringObject*) o1;
1207         PyStringObject *b = (PyStringObject*) o2;
1208         return Py_SIZE(a) == Py_SIZE(b)
1209           && *a->ob_sval == *b->ob_sval
1210           && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0;
1211 }
1212
1213 static long
1214 string_hash(PyStringObject *a)
1215 {
1216         register Py_ssize_t len;
1217         register unsigned char *p;
1218         register long x;
1219
1220         if (a->ob_shash != -1)
1221                 return a->ob_shash;
1222         len = Py_SIZE(a);
1223         p = (unsigned char *) a->ob_sval;
1224         x = *p << 7;
1225         while (--len >= 0)
1226                 x = (1000003*x) ^ *p++;
1227         x ^= Py_SIZE(a);
1228         if (x == -1)
1229                 x = -2;
1230         a->ob_shash = x;
1231         return x;
1232 }
1233
1234 static PyObject*
1235 string_subscript(PyStringObject* self, PyObject* item)
1236 {
1237         if (PyIndex_Check(item)) {
1238                 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
1239                 if (i == -1 && PyErr_Occurred())
1240                         return NULL;
1241                 if (i < 0)
1242                         i += PyString_GET_SIZE(self);
1243                 return string_item(self, i);
1244         }
1245         else if (PySlice_Check(item)) {
1246                 Py_ssize_t start, stop, step, slicelength, cur, i;
1247                 char* source_buf;
1248                 char* result_buf;
1249                 PyObject* result;
1250
1251                 if (PySlice_GetIndicesEx((PySliceObject*)item,
1252                                  PyString_GET_SIZE(self),
1253                                  &start, &stop, &step, &slicelength) < 0) {
1254                         return NULL;
1255                 }
1256
1257                 if (slicelength <= 0) {
1258                         return PyString_FromStringAndSize("", 0);
1259                 }
1260                 else if (start == 0 && step == 1 &&
1261                          slicelength == PyString_GET_SIZE(self) &&
1262                          PyString_CheckExact(self)) {
1263                         Py_INCREF(self);
1264                         return (PyObject *)self;
1265                 }
1266                 else if (step == 1) {
1267                         return PyString_FromStringAndSize(
1268                                 PyString_AS_STRING(self) + start,
1269                                 slicelength);
1270                 }
1271                 else {
1272                         source_buf = PyString_AsString((PyObject*)self);
1273                         result_buf = (char *)PyMem_Malloc(slicelength);
1274                         if (result_buf == NULL)
1275                                 return PyErr_NoMemory();
1276
1277                         for (cur = start, i = 0; i < slicelength;
1278                              cur += step, i++) {
1279                                 result_buf[i] = source_buf[cur];
1280                         }
1281
1282                         result = PyString_FromStringAndSize(result_buf,
1283                                                             slicelength);
1284                         PyMem_Free(result_buf);
1285                         return result;
1286                 }
1287         }
1288         else {
1289                 PyErr_Format(PyExc_TypeError,
1290                              "string indices must be integers, not %.200s",
1291                              Py_TYPE(item)->tp_name);
1292                 return NULL;
1293         }
1294 }
1295
1296 static Py_ssize_t
1297 string_buffer_getreadbuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1298 {
1299         if ( index != 0 ) {
1300                 PyErr_SetString(PyExc_SystemError,
1301                                 "accessing non-existent string segment");
1302                 return -1;
1303         }
1304         *ptr = (void *)self->ob_sval;
1305         return Py_SIZE(self);
1306 }
1307
1308 static Py_ssize_t
1309 string_buffer_getwritebuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1310 {
1311         PyErr_SetString(PyExc_TypeError,
1312                         "Cannot use string as modifiable buffer");
1313         return -1;
1314 }
1315
1316 static Py_ssize_t
1317 string_buffer_getsegcount(PyStringObject *self, Py_ssize_t *lenp)
1318 {
1319         if ( lenp )
1320                 *lenp = Py_SIZE(self);
1321         return 1;
1322 }
1323
1324 static Py_ssize_t
1325 string_buffer_getcharbuf(PyStringObject *self, Py_ssize_t index, const char **ptr)
1326 {
1327         if ( index != 0 ) {
1328                 PyErr_SetString(PyExc_SystemError,
1329                                 "accessing non-existent string segment");
1330                 return -1;
1331         }
1332         *ptr = self->ob_sval;
1333         return Py_SIZE(self);
1334 }
1335
1336 static int
1337 string_buffer_getbuffer(PyStringObject *self, Py_buffer *view, int flags)
1338 {
1339         return PyBuffer_FillInfo(view, (PyObject*)self,
1340                                  (void *)self->ob_sval, Py_SIZE(self),
1341                                  1, flags);
1342 }
1343
/* Sequence-protocol slot table for strings.  Entries are positional and
   follow the order of the PySequenceMethods fields named in the trailing
   comments; the two item/slice assignment slots are left empty (0). */
static PySequenceMethods string_as_sequence = {
        (lenfunc)string_length, /*sq_length*/
        (binaryfunc)string_concat, /*sq_concat*/
        (ssizeargfunc)string_repeat, /*sq_repeat*/
        (ssizeargfunc)string_item, /*sq_item*/
        (ssizessizeargfunc)string_slice, /*sq_slice*/
        0,              /*sq_ass_item*/
        0,              /*sq_ass_slice*/
        (objobjproc)string_contains /*sq_contains*/
};
1354
/* Mapping-protocol slot table for strings: length and subscript lookup are
   provided, the third (assignment) slot is left empty. */
static PyMappingMethods string_as_mapping = {
        (lenfunc)string_length, /*mp_length*/
        (binaryfunc)string_subscript, /*mp_subscript*/
        0, /*mp_ass_subscript*/
};
1360
/* Buffer-protocol slot table for strings.  The first four entries are the
   segment-based handlers (read, write, segment count, char buffer) and the
   fifth is the Py_buffer-based handler. */
static PyBufferProcs string_as_buffer = {
        (readbufferproc)string_buffer_getreadbuf,
        (writebufferproc)string_buffer_getwritebuf,
        (segcountproc)string_buffer_getsegcount,
        (charbufferproc)string_buffer_getcharbuf,
        (getbufferproc)string_buffer_getbuffer,
        0, /* XXX final slot left empty; presumably the release-buffer
              handler -- confirm against the PyBufferProcs declaration */
};
1369
1370
1371
/* Strip-mode selectors; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
1380
1381
/* True when the `length' bytes of `target' starting at `offset' equal the
   first `length' bytes of `pattern'.  The first and last bytes are compared
   before memcmp() is called on the interior.
   Don't call if length < 2 (the interior length would be negative).
   Every argument is parenthesized so that expression arguments expand
   correctly; note that arguments are evaluated more than once. */
#define Py_STRING_MATCH(target, offset, pattern, length)                \
  ((target)[offset] == (pattern)[0] &&                                  \
   (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&            \
   !memcmp((target)+(offset)+1, (pattern)+1, (length)-2) )
1387
1388
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things.*/

#define MAX_PREALLOC 12

/* Initial list size for a split limited to `maxsplit' splits: one more
   element than the number of splits (5 splits gives 6 elements), capped at
   MAX_PREALLOC.  The argument is parenthesized so that expression arguments
   expand correctly; it is evaluated twice. */
#define PREALLOC_SIZE(maxsplit) \
        ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit)+1)
1401
/* Create the substring data[left:right] and append it to `list' with
   PyList_Append().  Relies on the enclosing function providing the
   variables `str' and `list' and an `onError' label, which is jumped to on
   failure.  NOTE: this expands to several statements with no enclosing
   block, so it must not be used as the unbraced body of an if/else/loop. */
#define SPLIT_APPEND(data, left, right)                         \
        str = PyString_FromStringAndSize((data) + (left),       \
                                         (right) - (left));     \
        if (str == NULL)                                        \
                goto onError;                                   \
        if (PyList_Append(list, str)) {                         \
                Py_DECREF(str);                                 \
                goto onError;                                   \
        }                                                       \
        else                                                    \
                Py_DECREF(str);

/* Create the substring data[left:right] and add it to `list' as element
   number `count', then increment `count'.  While count < MAX_PREALLOC the
   new string is stored straight into the list's item array with
   PyList_SET_ITEM; beyond that it is appended with PyList_Append().
   Relies on the enclosing function providing `str', `list', `count' and an
   `onError' label. */
#define SPLIT_ADD(data, left, right) {                          \
        str = PyString_FromStringAndSize((data) + (left),       \
                                         (right) - (left));     \
        if (str == NULL)                                        \
                goto onError;                                   \
        if (count < MAX_PREALLOC) {                             \
                PyList_SET_ITEM(list, count, str);              \
        } else {                                                \
                if (PyList_Append(list, str)) {                 \
                        Py_DECREF(str);                         \
                        goto onError;                           \
                }                                               \
                else                                            \
                        Py_DECREF(str);                         \
        }                                                       \
        count++; }

/* Always force the list to the expected size: overwrite the list's size
   field with the number of elements actually stored (`count'). */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
1433
/* Cursor helpers for whitespace splitting.  Each one moves the index
   variable `i' in place over string `s': the SKIP_ forms advance forward
   while i < len, the RSKIP_ forms move backward while i >= 0, past
   characters that are (SPACE) or are not (NONSPACE) whitespace according
   to isspace(). */
#define SKIP_SPACE(s, i, len)    { while (i<len &&  isspace(Py_CHARMASK(s[i]))) i++; }
#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; }
#define RSKIP_SPACE(s, i)        { while (i>=0  &&  isspace(Py_CHARMASK(s[i]))) i--; }
#define RSKIP_NONSPACE(s, i)     { while (i>=0  && !isspace(Py_CHARMASK(s[i]))) i--; }
1438
1439 Py_LOCAL_INLINE(PyObject *)
1440 split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1441 {
1442         const char *s = PyString_AS_STRING(self);
1443         Py_ssize_t i, j, count=0;
1444         PyObject *str;
1445         PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1446
1447         if (list == NULL)
1448                 return NULL;
1449
1450         i = j = 0;
1451
1452         while (maxsplit-- > 0) {
1453                 SKIP_SPACE(s, i, len);
1454                 if (i==len) break;
1455                 j = i; i++;
1456                 SKIP_NONSPACE(s, i, len);
1457                 if (j == 0 && i == len && PyString_CheckExact(self)) {
1458                         /* No whitespace in self, so just use it as list[0] */
1459                         Py_INCREF(self);
1460                         PyList_SET_ITEM(list, 0, (PyObject *)self);
1461                         count++;
1462                         break;
1463                 }
1464                 SPLIT_ADD(s, j, i);
1465         }
1466
1467         if (i < len) {
1468                 /* Only occurs when maxsplit was reached */
1469                 /* Skip any remaining whitespace and copy to end of string */
1470                 SKIP_SPACE(s, i, len);
1471                 if (i != len)
1472                         SPLIT_ADD(s, i, len);
1473         }
1474         FIX_PREALLOC_SIZE(list);
1475         return list;
1476   onError:
1477         Py_DECREF(list);
1478         return NULL;
1479 }
1480
1481 Py_LOCAL_INLINE(PyObject *)
1482 split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1483 {
1484         const char *s = PyString_AS_STRING(self);
1485         register Py_ssize_t i, j, count=0;
1486         PyObject *str;
1487         PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1488
1489         if (list == NULL)
1490                 return NULL;
1491
1492         i = j = 0;
1493         while ((j < len) && (maxcount-- > 0)) {
1494                 for(; j<len; j++) {
1495                         /* I found that using memchr makes no difference */
1496                         if (s[j] == ch) {
1497                                 SPLIT_ADD(s, i, j);
1498                                 i = j = j + 1;
1499                                 break;
1500                         }
1501                 }
1502         }
1503         if (i == 0 && count == 0 && PyString_CheckExact(self)) {
1504                 /* ch not in self, so just use self as list[0] */
1505                 Py_INCREF(self);
1506                 PyList_SET_ITEM(list, 0, (PyObject *)self);
1507                 count++;
1508         }
1509         else if (i <= len) {
1510                 SPLIT_ADD(s, i, len);
1511         }
1512         FIX_PREALLOC_SIZE(list);
1513         return list;
1514
1515   onError:
1516         Py_DECREF(list);
1517         return NULL;
1518 }
1519
1520 PyDoc_STRVAR(split__doc__,
1521 "S.split([sep [,maxsplit]]) -> list of strings\n\
1522 \n\
1523 Return a list of the words in the string S, using sep as the\n\
1524 delimiter string.  If maxsplit is given, at most maxsplit\n\
1525 splits are done. If sep is not specified or is None, any\n\
1526 whitespace string is a separator and empty strings are removed\n\
1527 from the result.");
1528
1529 static PyObject *
1530 string_split(PyStringObject *self, PyObject *args)
1531 {
1532         Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1533         Py_ssize_t maxsplit = -1, count=0;
1534         const char *s = PyString_AS_STRING(self), *sub;
1535         PyObject *list, *str, *subobj = Py_None;
1536 #ifdef USE_FAST
1537         Py_ssize_t pos;
1538 #endif
1539
1540         if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
1541                 return NULL;
1542         if (maxsplit < 0)
1543                 maxsplit = PY_SSIZE_T_MAX;
1544         if (subobj == Py_None)
1545                 return split_whitespace(self, len, maxsplit);
1546         if (PyString_Check(subobj)) {
1547                 sub = PyString_AS_STRING(subobj);
1548                 n = PyString_GET_SIZE(subobj);
1549         }
1550 #ifdef Py_USING_UNICODE
1551         else if (PyUnicode_Check(subobj))
1552                 return PyUnicode_Split((PyObject *)self, subobj, maxsplit);
1553 #endif
1554         else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1555                 return NULL;
1556
1557         if (n == 0) {
1558                 PyErr_SetString(PyExc_ValueError, "empty separator");
1559                 return NULL;
1560         }
1561         else if (n == 1)
1562                 return split_char(self, len, sub[0], maxsplit);
1563
1564         list = PyList_New(PREALLOC_SIZE(maxsplit));
1565         if (list == NULL)
1566                 return NULL;
1567
1568 #ifdef USE_FAST
1569         i = j = 0;
1570         while (maxsplit-- > 0) {
1571                 pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
1572                 if (pos < 0)
1573                         break;
1574                 j = i+pos;
1575                 SPLIT_ADD(s, i, j);
1576                 i = j + n;
1577         }
1578 #else
1579         i = j = 0;
1580         while ((j+n <= len) && (maxsplit-- > 0)) {
1581                 for (; j+n <= len; j++) {
1582                         if (Py_STRING_MATCH(s, j, sub, n)) {
1583                                 SPLIT_ADD(s, i, j);
1584                                 i = j = j + n;
1585                                 break;
1586                         }
1587                 }
1588         }
1589 #endif
1590         SPLIT_ADD(s, i, len);
1591         FIX_PREALLOC_SIZE(list);
1592         return list;
1593
1594  onError:
1595         Py_DECREF(list);
1596         return NULL;
1597 }
1598
1599 PyDoc_STRVAR(partition__doc__,
1600 "S.partition(sep) -> (head, sep, tail)\n\
1601 \n\
1602 Search for the separator sep in S, and return the part before it,\n\
1603 the separator itself, and the part after it.  If the separator is not\n\
1604 found, return S and two empty strings.");
1605
1606 static PyObject *
1607 string_partition(PyStringObject *self, PyObject *sep_obj)
1608 {
1609         const char *sep;
1610         Py_ssize_t sep_len;
1611
1612         if (PyString_Check(sep_obj)) {
1613                 sep = PyString_AS_STRING(sep_obj);
1614                 sep_len = PyString_GET_SIZE(sep_obj);
1615         }
1616 #ifdef Py_USING_UNICODE
1617         else if (PyUnicode_Check(sep_obj))
1618                 return PyUnicode_Partition((PyObject *) self, sep_obj);
1619 #endif
1620         else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1621                 return NULL;
1622
1623         return stringlib_partition(
1624                 (PyObject*) self,
1625                 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1626                 sep_obj, sep, sep_len
1627                 );
1628 }
1629
1630 PyDoc_STRVAR(rpartition__doc__,
1631 "S.rpartition(sep) -> (tail, sep, head)\n\
1632 \n\
1633 Search for the separator sep in S, starting at the end of S, and return\n\
1634 the part before it, the separator itself, and the part after it.  If the\n\
1635 separator is not found, return two empty strings and S.");
1636
1637 static PyObject *
1638 string_rpartition(PyStringObject *self, PyObject *sep_obj)
1639 {
1640         const char *sep;
1641         Py_ssize_t sep_len;
1642
1643         if (PyString_Check(sep_obj)) {
1644                 sep = PyString_AS_STRING(sep_obj);
1645                 sep_len = PyString_GET_SIZE(sep_obj);
1646         }
1647 #ifdef Py_USING_UNICODE
1648         else if (PyUnicode_Check(sep_obj))
1649                 return PyUnicode_RPartition((PyObject *) self, sep_obj);
1650 #endif
1651         else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1652                 return NULL;
1653
1654         return stringlib_rpartition(
1655                 (PyObject*) self,
1656                 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1657                 sep_obj, sep, sep_len
1658                 );
1659 }
1660
/* Split self on runs of whitespace, scanning from the end of the string
 * toward the front.  string_rsplit() calls this when the separator
 * argument is None.
 *
 *   self      the string being split
 *   len       length of self as supplied by the caller
 *   maxsplit  maximum number of splits to perform
 *
 * Returns a new list, or NULL on failure.  Pieces are collected
 * right-to-left and the list is reversed just before it is returned.
 *
 * NOTE(review): PREALLOC_SIZE, RSKIP_SPACE, RSKIP_NONSPACE, SPLIT_ADD and
 * FIX_PREALLOC_SIZE are macros defined elsewhere in this file.  The local
 * `str` is never referenced directly here, and `onError` is only jumped
 * to explicitly after PyList_Reverse(), so the macros presumably use
 * `str`, `count`, `list` and `onError` by name -- confirm before renaming
 * or removing any of them.
 */
Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
{
        const char *s = PyString_AS_STRING(self);
        Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));

        if (list == NULL)
                return NULL;

        /* i is the scan cursor; j remembers the last index of the word
           currently being delimited. */
        i = j = len-1;

        while (maxsplit-- > 0) {
                /* presumably moves i left past trailing whitespace */
                RSKIP_SPACE(s, i);
                if (i<0) break;
                j = i; i--;
                /* presumably moves i left to just before the word start */
                RSKIP_NONSPACE(s, i);
                if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
                        /* The first word found ends at the last index and
                           the scan ran off the front, i.e. no whitespace
                           in self, so just use it as list[0] instead of
                           copying.  Only done for exact str instances. */
                        Py_INCREF(self);
                        PyList_SET_ITEM(list, 0, (PyObject *)self);
                        count++;
                        break;
                }
                /* Add the word s[i+1 .. j] to the list. */
                SPLIT_ADD(s, i + 1, j + 1);
        }
        if (i >= 0) {
                /* Only occurs when maxsplit was reached */
                /* Skip any remaining whitespace, then add everything from
                   the beginning of the string up to index i as one final
                   piece. */
                RSKIP_SPACE(s, i);
                if (i >= 0)
                        SPLIT_ADD(s, 0, i + 1);

        }
        FIX_PREALLOC_SIZE(list);
        /* Pieces were gathered back-to-front; restore string order. */
        if (PyList_Reverse(list) < 0)
                goto onError;
        return list;
  onError:
        Py_DECREF(list);
        return NULL;
}
1704
/* Split self on the single byte `ch`, scanning from the end of the
 * string toward the front.  string_rsplit() calls this when the
 * separator has length 1.
 *
 *   self      the string being split
 *   len       length of self as supplied by the caller
 *   ch        separator byte
 *   maxcount  maximum number of splits to perform
 *
 * Returns a new list, or NULL on failure.  Pieces are collected
 * right-to-left and the list is reversed just before it is returned.
 *
 * NOTE(review): PREALLOC_SIZE, SPLIT_ADD and FIX_PREALLOC_SIZE are macros
 * defined elsewhere in this file.  The local `str` is never referenced
 * directly here, so SPLIT_ADD presumably uses `str`, `count`, `list` and
 * the `onError` label by name -- confirm before renaming any of them.
 */
Py_LOCAL_INLINE(PyObject *)
rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
        const char *s = PyString_AS_STRING(self);
        register Py_ssize_t i, j, count=0;
        PyObject *str;
        PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

        if (list == NULL)
                return NULL;

        /* i is the scan cursor; j is the last index of the piece that
           will be emitted when the next separator is found. */
        i = j = len - 1;
        while ((i >= 0) && (maxcount-- > 0)) {
                for (; i >= 0; i--) {
                        if (s[i] == ch) {
                                /* Emit s[i+1 .. j], then continue to the
                                   left of the separator. */
                                SPLIT_ADD(s, i + 1, j + 1);
                                j = i = i - 1;
                                break;
                        }
                }
        }
        if (i < 0 && count == 0 && PyString_CheckExact(self)) {
                /* ch not in self, so just use self as list[0] */
                Py_INCREF(self);
                PyList_SET_ITEM(list, 0, (PyObject *)self);
                count++;
        }
        else if (j >= -1) {
                /* Emit the remaining head s[0 .. j]; when j == -1 this is
                   an empty piece (separator at index 0). */
                SPLIT_ADD(s, 0, j + 1);
        }
        FIX_PREALLOC_SIZE(list);
        /* Pieces were gathered back-to-front; restore string order. */
        if (PyList_Reverse(list) < 0)
                goto onError;
        return list;

 onError:
        Py_DECREF(list);
        return NULL;
}
1744
PyDoc_STRVAR(rsplit__doc__,
"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
\n\
Return a list of the words in the string S, using sep as the\n\
delimiter string, starting at the end of the string and working\n\
to the front.  If maxsplit is given, at most maxsplit splits are\n\
done. If sep is not specified or is None, any whitespace string\n\
is a separator.");

/* Implementation of S.rsplit([sep [,maxsplit]]).
 *
 * Dispatches on the separator:
 *   - None              -> rsplit_whitespace()
 *   - unicode           -> PyUnicode_RSplit() (when compiled in)
 *   - empty buffer      -> ValueError("empty separator")
 *   - one-byte buffer   -> rsplit_char()
 *   - longer buffer     -> handled by the loop below
 *
 * Returns a new list, or NULL on failure.
 *
 * NOTE(review): PREALLOC_SIZE, Py_STRING_MATCH, SPLIT_ADD and
 * FIX_PREALLOC_SIZE are macros defined elsewhere in this file.  The
 * locals `str` and `count` are never referenced directly here, so the
 * macros presumably use them (and `list` / `onError`) by name -- confirm
 * before renaming any of them.
 */
static PyObject *
string_rsplit(PyStringObject *self, PyObject *args)
{
        Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
        Py_ssize_t maxsplit = -1, count=0;
        const char *s, *sub;
        PyObject *list, *str, *subobj = Py_None;

        if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
                return NULL;
        /* A negative maxsplit means "no limit". */
        if (maxsplit < 0)
                maxsplit = PY_SSIZE_T_MAX;
        if (subobj == Py_None)
                return rsplit_whitespace(self, len, maxsplit);
        if (PyString_Check(subobj)) {
                sub = PyString_AS_STRING(subobj);
                n = PyString_GET_SIZE(subobj);
        }
#ifdef Py_USING_UNICODE
        else if (PyUnicode_Check(subobj))
                return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
#endif
        else if (PyObject_AsCharBuffer(subobj, &sub, &n))
                return NULL;

        if (n == 0) {
                PyErr_SetString(PyExc_ValueError, "empty separator");
                return NULL;
        }
        else if (n == 1)
                return rsplit_char(self, len, sub[0], maxsplit);

        list = PyList_New(PREALLOC_SIZE(maxsplit));
        if (list == NULL)
                return NULL;

        /* j is one past the end of the piece to emit next; i is the
           rightmost position at which a separator of length n could
           still start. */
        j = len;
        i = j - n;

        s = PyString_AS_STRING(self);
        while ( (i >= 0) && (maxsplit-- > 0) ) {
                for (; i>=0; i--) {
                        if (Py_STRING_MATCH(s, i, sub, n)) {
                                /* Emit the text between this separator
                                   and the previous cut, then jump a full
                                   separator length to the left. */
                                SPLIT_ADD(s, i + n, j);
                                j = i;
                                i -= n;
                                break;
                        }
                }
        }
        /* Whatever remains at the front of the string is the last piece. */
        SPLIT_ADD(s, 0, j);
        FIX_PREALLOC_SIZE(list);
        /* Pieces were gathered back-to-front; restore string order. */
        if (PyList_Reverse(list) < 0)
                goto onError;
        return list;

onError:
        Py_DECREF(list);
        return NULL;
}
1814
1815
1816 PyDoc_STRVAR(join__doc__,
1817 "S.join(sequence) -> string\n\
1818 \n\
1819 Return a string which is the concatenation of the strings in the\n\
1820 sequence.  The separator between elements is S.");
1821
1822 static PyObject *
1823 string_join(PyStringObject *self, PyObject *orig)
1824 {
1825         char *sep = PyString_AS_STRING(self);
1826         const Py_ssize_t seplen = PyString_GET_SIZE(self);
1827         PyObject *res = NULL;
1828         char *p;
1829         Py_ssize_t seqlen = 0;
1830         size_t sz = 0;
1831         Py_ssize_t i;
1832         PyObject *seq, *item;
1833
1834         seq = PySequence_Fast(orig, "");
1835         if (seq == NULL) {
1836                 return NULL;
1837         }
1838
1839         seqlen = PySequence_Size(seq);
1840         if (seqlen == 0) {
1841                 Py_DECREF(seq);
1842                 return PyString_FromString("");
1843         }
1844         if (seqlen == 1) {
1845                 item = PySequence_Fast_GET_ITEM(seq, 0);
1846                 if (PyString_CheckExact(item) || PyUnicode_CheckExact(item)) {
1847                         Py_INCREF(item);
1848                         Py_DECREF(seq);
1849                         return item;
1850                 }
1851         }
1852
1853         /* There are at least two things to join, or else we have a subclass
1854          * of the builtin types in the sequence.
1855          * Do a pre-pass to figure out the total amount of space we'll
1856          * need (sz), see whether any argument is absurd, and defer to
1857          * the Unicode join if appropriate.
1858          */
1859         for (i = 0; i < seqlen; i++) {
1860                 const size_t old_sz = sz;
1861                 item = PySequence_Fast_GET_ITEM(seq, i);
1862                 if (!PyString_Check(item)){
1863 #ifdef Py_USING_UNICODE
1864                         if (PyUnicode_Check(item)) {
1865                                 /* Defer to Unicode join.
1866                                  * CAUTION:  There's no gurantee that the
1867                                  * original sequence can be iterated over
1868                                  * again, so we must pass seq here.
1869                                  */
1870                                 PyObject *result;
1871                                 result = PyUnicode_Join((PyObject *)self, seq);
1872                                 Py_DECREF(seq);
1873                                 return result;
1874                         }
1875 #endif
1876                         PyErr_Format(PyExc_TypeError,
1877                                      "sequence item %zd: expected string,"
1878                                      " %.80s found",
1879                                      i, Py_TYPE(item)->tp_name);
1880                         Py_DECREF(seq);
1881                         return NULL;
1882                 }
1883                 sz += PyString_GET_SIZE(item);
1884                 if (i != 0)
1885                         sz += seplen;
1886                 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
1887                         PyErr_SetString(PyExc_OverflowError,
1888                                 "join() result is too long for a Python string");
1889                         Py_DECREF(seq);
1890                         return NULL;
1891                 }
1892         }
1893
1894         /* Allocate result space. */
1895         res = PyString_FromStringAndSize((char*)NULL, sz);
1896         if (res == NULL) {
1897                 Py_DECREF(seq);
1898                 return NULL;
1899         }
1900
1901         /* Catenate everything. */
1902         p = PyString_AS_STRING(res);
1903         for (i = 0; i < seqlen; ++i) {
1904                 size_t n;
1905                 item = PySequence_Fast_GET_ITEM(seq, i);
1906                 n = PyString_GET_SIZE(item);
1907                 Py_MEMCPY(p, PyString_AS_STRING(item), n);
1908                 p += n;
1909                 if (i < seqlen - 1) {
1910                         Py_MEMCPY(p, sep, seplen);
1911                         p += seplen;
1912                 }
1913         }
1914
1915         Py_DECREF(seq);
1916         return res;
1917 }
1918
1919 PyObject *
1920 _PyString_Join(PyObject *sep, PyObject *x)
1921 {
1922         assert(sep != NULL && PyString_Check(sep));
1923         assert(x != NULL);
1924         return string_join((PyStringObject *)sep, x);
1925 }
1926
1927 Py_LOCAL_INLINE(void)
1928 string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
1929 {
1930         if (*end > len)
1931                 *end = len;
1932         else if (*end < 0)
1933                 *end += len;
1934         if (*end < 0)
1935                 *end = 0;
1936         if (*start < 0)
1937                 *start += len;
1938         if (*start < 0)
1939                 *start = 0;
1940 }
1941
1942 Py_LOCAL_INLINE(Py_ssize_t)
1943 string_find_internal(PyStringObject *self, PyObject *args, int dir)
1944 {
1945         PyObject *subobj;
1946         const char *sub;
1947         Py_ssize_t sub_len;
1948         Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
1949         PyObject *obj_start=Py_None, *obj_end=Py_None;
1950
1951         if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj,
1952                 &obj_start, &obj_end))
1953                 return -2;
1954         /* To support None in "start" and "end" arguments, meaning
1955            the same as if they were not passed.
1956         */
1957         if (obj_start != Py_None)
1958                 if (!_PyEval_SliceIndex(obj_start, &start))
1959                 return -2;
1960         if (obj_end != Py_None)
1961                 if (!_PyEval_SliceIndex(obj_end, &end))
1962                 return -2;
1963
1964         if (PyString_Check(subobj)) {
1965                 sub = PyString_AS_STRING(subobj);
1966                 sub_len = PyString_GET_SIZE(subobj);
1967         }
1968 #ifdef Py_USING_UNICODE
1969         else if (PyUnicode_Check(subobj))
1970                 return PyUnicode_Find(
1971                         (PyObject *)self, subobj, start, end, dir);
1972 #endif
1973         else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
1974                 /* XXX - the "expected a character buffer object" is pretty
1975                    confusing for a non-expert.  remap to something else ? */
1976                 return -2;
1977
1978         if (dir > 0)
1979                 return stringlib_find_slice(
1980                         PyString_AS_STRING(self), PyString_GET_SIZE(self),
1981                         sub, sub_len, start, end);
1982         else
1983                 return stringlib_rfind_slice(
1984                         PyString_AS_STRING(self), PyString_GET_SIZE(self),
1985                         sub, sub_len, start, end);
1986 }
1987
1988
1989 PyDoc_STRVAR(find__doc__,
1990 "S.find(sub [,start [,end]]) -> int\n\
1991 \n\
1992 Return the lowest index in S where substring sub is found,\n\
1993 such that sub is contained within s[start:end].  Optional\n\
1994 arguments start and end are interpreted as in slice notation.\n\
1995 \n\
1996 Return -1 on failure.");
1997
1998 static PyObject *
1999 string_find(PyStringObject *self, PyObject *args)
2000 {
2001         Py_ssize_t result = string_find_internal(self, args, +1);
2002         if (result == -2)
2003                 return NULL;
2004         return PyInt_FromSsize_t(result);
2005 }
2006
2007
2008 PyDoc_STRVAR(index__doc__,
2009 "S.index(sub [,start [,end]]) -> int\n\
2010 \n\
2011 Like S.find() but raise ValueError when the substring is not found.");
2012
2013 static PyObject *
2014 string_index(PyStringObject *self, PyObject *args)
2015 {
2016         Py_ssize_t result = string_find_internal(self, args, +1);
2017         if (result == -2)
2018                 return NULL;
2019         if (result == -1) {
2020                 PyErr_SetString(PyExc_ValueError,
2021                                 "substring not found");
2022                 return NULL;
2023         }
2024         return PyInt_FromSsize_t(result);
2025 }
2026
2027
2028 PyDoc_STRVAR(rfind__doc__,
2029 "S.rfind(sub [,start [,end]]) -> int\n\
2030 \n\
2031 Return the highest index in S where substring sub is found,\n\
2032 such that sub is contained within s[start:end].  Optional\n\
2033 arguments start and end are interpreted as in slice notation.\n\
2034 \n\
2035 Return -1 on failure.");
2036
2037 static PyObject *
2038 string_rfind(PyStringObject *self, PyObject *args)
2039 {
2040         Py_ssize_t result = string_find_internal(self, args, -1);
2041         if (result == -2)
2042                 return NULL;
2043         return PyInt_FromSsize_t(result);
2044 }
2045
2046
2047 PyDoc_STRVAR(rindex__doc__,
2048 "S.rindex(sub [,start [,end]]) -> int\n\
2049 \n\
2050 Like S.rfind() but raise ValueError when the substring is not found.");
2051
2052 static PyObject *
2053 string_rindex(PyStringObject *self, PyObject *args)
2054 {
2055         Py_ssize_t result = string_find_internal(self, args, -1);
2056         if (result == -2)
2057                 return NULL;
2058         if (result == -1) {
2059                 PyErr_SetString(PyExc_ValueError,
2060                                 "substring not found");
2061                 return NULL;
2062         }
2063         return PyInt_FromSsize_t(result);
2064 }
2065
2066
2067 Py_LOCAL_INLINE(PyObject *)
2068 do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
2069 {
2070         char *s = PyString_AS_STRING(self);
2071         Py_ssize_t len = PyString_GET_SIZE(self);
2072         char *sep = PyString_AS_STRING(sepobj);
2073         Py_ssize_t seplen = PyString_GET_SIZE(sepobj);
2074         Py_ssize_t i, j;
2075
2076         i = 0;
2077         if (striptype != RIGHTSTRIP) {
2078                 while (i < len && memchr(sep, Py_CHARMASK(s[i]), seplen)) {
2079                         i++;
2080                 }
2081         }
2082
2083         j = len;
2084         if (striptype != LEFTSTRIP) {
2085                 do {
2086                         j--;
2087                 } while (j >= i && memchr(sep, Py_CHARMASK(s[j]), seplen));
2088                 j++;
2089         }
2090
2091         if (i == 0 && j == len && PyString_CheckExact(self)) {
2092                 Py_INCREF(self);
2093                 return (PyObject*)self;
2094         }
2095         else
2096                 return PyString_FromStringAndSize(s+i, j-i);
2097 }
2098
2099
2100 Py_LOCAL_INLINE(PyObject *)
2101 do_strip(PyStringObject *self, int striptype)
2102 {
2103         char *s = PyString_AS_STRING(self);
2104         Py_ssize_t len = PyString_GET_SIZE(self), i, j;
2105
2106         i = 0;
2107         if (striptype != RIGHTSTRIP) {
2108                 while (i < len && isspace(Py_CHARMASK(s[i]))) {
2109                         i++;
2110                 }
2111         }
2112
2113         j = len;
2114         if (striptype != LEFTSTRIP) {
2115                 do {
2116                         j--;
2117                 } while (j >= i && isspace(Py_CHARMASK(s[j])));
2118                 j++;
2119         }
2120
2121         if (i == 0 && j == len && PyString_CheckExact(self)) {
2122                 Py_INCREF(self);
2123                 return (PyObject*)self;
2124         }
2125         else
2126                 return PyString_FromStringAndSize(s+i, j-i);
2127 }
2128
2129
2130 Py_LOCAL_INLINE(PyObject *)
2131 do_argstrip(PyStringObject *self, int striptype, PyObject *args)
2132 {
2133         PyObject *sep = NULL;
2134
2135         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
2136                 return NULL;
2137
2138         if (sep != NULL && sep != Py_None) {
2139                 if (PyString_Check(sep))
2140                         return do_xstrip(self, striptype, sep);
2141 #ifdef Py_USING_UNICODE
2142                 else if (PyUnicode_Check(sep)) {
2143                         PyObject *uniself = PyUnicode_FromObject((PyObject *)self);
2144                         PyObject *res;
2145                         if (uniself==NULL)
2146                                 return NULL;
2147                         res = _PyUnicode_XStrip((PyUnicodeObject *)uniself,
2148                                 striptype, sep);
2149                         Py_DECREF(uniself);
2150                         return res;
2151                 }
2152 #endif
2153                 PyErr_Format(PyExc_TypeError,
2154 #ifdef Py_USING_UNICODE
2155                              "%s arg must be None, str or unicode",
2156 #else
2157                              "%s arg must be None or str",
2158 #endif
2159                              STRIPNAME(striptype));
2160                 return NULL;
2161         }
2162
2163         return do_strip(self, striptype);
2164 }
2165
2166
2167 PyDoc_STRVAR(strip__doc__,
2168 "S.strip([chars]) -> string or unicode\n\
2169 \n\
2170 Return a copy of the string S with leading and trailing\n\
2171 whitespace removed.\n\
2172 If chars is given and not None, remove characters in chars instead.\n\
2173 If chars is unicode, S will be converted to unicode before stripping");
2174
2175 static PyObject *
2176 string_strip(PyStringObject *self, PyObject *args)
2177 {
2178         if (PyTuple_GET_SIZE(args) == 0)
2179                 return do_strip(self, BOTHSTRIP); /* Common case */
2180         else
2181                 return do_argstrip(self, BOTHSTRIP, args);
2182 }
2183
2184
2185 PyDoc_STRVAR(lstrip__doc__,
2186 "S.lstrip([chars]) -> string or unicode\n\
2187 \n\
2188 Return a copy of the string S with leading whitespace removed.\n\
2189 If chars is given and not None, remove characters in chars instead.\n\
2190 If chars is unicode, S will be converted to unicode before stripping");
2191
2192 static PyObject *
2193 string_lstrip(PyStringObject *self, PyObject *args)
2194 {
2195         if (PyTuple_GET_SIZE(args) == 0)
2196                 return do_strip(self, LEFTSTRIP); /* Common case */
2197         else
2198                 return do_argstrip(self, LEFTSTRIP, args);
2199 }
2200
2201
2202 PyDoc_STRVAR(rstrip__doc__,
2203 "S.rstrip([chars]) -> string or unicode\n\
2204 \n\
2205 Return a copy of the string S with trailing whitespace removed.\n\
2206 If chars is given and not None, remove characters in chars instead.\n\
2207 If chars is unicode, S will be converted to unicode before stripping");
2208
2209 static PyObject *
2210 string_rstrip(PyStringObject *self, PyObject *args)
2211 {
2212         if (PyTuple_GET_SIZE(args) == 0)
2213                 return do_strip(self, RIGHTSTRIP); /* Common case */
2214         else
2215                 return do_argstrip(self, RIGHTSTRIP, args);
2216 }
2217
2218
2219 PyDoc_STRVAR(lower__doc__,
2220 "S.lower() -> string\n\
2221 \n\
2222 Return a copy of the string S converted to lowercase.");
2223
2224 /* _tolower and _toupper are defined by SUSv2, but they're not ISO C */
2225 #ifndef _tolower
2226 #define _tolower tolower
2227 #endif
2228
2229 static PyObject *
2230 string_lower(PyStringObject *self)
2231 {
2232         char *s;
2233         Py_ssize_t i, n = PyString_GET_SIZE(self);
2234         PyObject *newobj;
2235
2236         newobj = PyString_FromStringAndSize(NULL, n);
2237         if (!newobj)
2238                 return NULL;
2239
2240         s = PyString_AS_STRING(newobj);
2241
2242         Py_MEMCPY(s, PyString_AS_STRING(self), n);
2243
2244         for (i = 0; i < n; i++) {
2245                 int c = Py_CHARMASK(s[i]);
2246                 if (isupper(c))
2247                         s[i] = _tolower(c);
2248         }
2249
2250         return newobj;
2251 }
2252
2253 PyDoc_STRVAR(upper__doc__,
2254 "S.upper() -> string\n\
2255 \n\
2256 Return a copy of the string S converted to uppercase.");
2257
2258 #ifndef _toupper
2259 #define _toupper toupper
2260 #endif
2261
2262 static PyObject *
2263 string_upper(PyStringObject *self)
2264 {
2265         char *s;
2266         Py_ssize_t i, n = PyString_GET_SIZE(self);
2267         PyObject *newobj;
2268
2269         newobj = PyString_FromStringAndSize(NULL, n);
2270         if (!newobj)
2271                 return NULL;
2272
2273         s = PyString_AS_STRING(newobj);
2274
2275         Py_MEMCPY(s, PyString_AS_STRING(self), n);
2276
2277         for (i = 0; i < n; i++) {
2278                 int c = Py_CHARMASK(s[i]);
2279                 if (islower(c))
2280                         s[i] = _toupper(c);
2281         }
2282
2283         return newobj;
2284 }
2285
2286 PyDoc_STRVAR(title__doc__,
2287 "S.title() -> string\n\
2288 \n\
2289 Return a titlecased version of S, i.e. words start with uppercase\n\
2290 characters, all remaining cased characters have lowercase.");
2291
2292 static PyObject*
2293 string_title(PyStringObject *self)
2294 {
2295         char *s = PyString_AS_STRING(self), *s_new;
2296         Py_ssize_t i, n = PyString_GET_SIZE(self);
2297         int previous_is_cased = 0;
2298         PyObject *newobj;
2299
2300         newobj = PyString_FromStringAndSize(NULL, n);
2301         if (newobj == NULL)
2302                 return NULL;
2303         s_new = PyString_AsString(newobj);
2304         for (i = 0; i < n; i++) {
2305                 int c = Py_CHARMASK(*s++);
2306                 if (islower(c)) {
2307                         if (!previous_is_cased)
2308                             c = toupper(c);
2309                         previous_is_cased = 1;
2310                 } else if (isupper(c)) {
2311                         if (previous_is_cased)
2312                             c = tolower(c);
2313                         previous_is_cased = 1;
2314                 } else
2315                         previous_is_cased = 0;
2316                 *s_new++ = c;
2317         }
2318         return newobj;
2319 }
2320
2321 PyDoc_STRVAR(capitalize__doc__,
2322 "S.capitalize() -> string\n\
2323 \n\
2324 Return a copy of the string S with only its first character\n\
2325 capitalized.");
2326
2327 static PyObject *
2328 string_capitalize(PyStringObject *self)
2329 {
2330         char *s = PyString_AS_STRING(self), *s_new;
2331         Py_ssize_t i, n = PyString_GET_SIZE(self);
2332         PyObject *newobj;
2333
2334         newobj = PyString_FromStringAndSize(NULL, n);
2335         if (newobj == NULL)
2336                 return NULL;
2337         s_new = PyString_AsString(newobj);
2338         if (0 < n) {
2339                 int c = Py_CHARMASK(*s++);
2340                 if (islower(c))
2341                         *s_new = toupper(c);
2342                 else
2343                         *s_new = c;
2344                 s_new++;
2345         }
2346         for (i = 1; i < n; i++) {
2347                 int c = Py_CHARMASK(*s++);
2348                 if (isupper(c))
2349                         *s_new = tolower(c);
2350                 else
2351                         *s_new = c;
2352                 s_new++;
2353         }
2354         return newobj;
2355 }
2356
2357
2358 PyDoc_STRVAR(count__doc__,
2359 "S.count(sub[, start[, end]]) -> int\n\
2360 \n\
2361 Return the number of non-overlapping occurrences of substring sub in\n\
2362 string S[start:end].  Optional arguments start and end are interpreted\n\
2363 as in slice notation.");
2364
2365 static PyObject *
2366 string_count(PyStringObject *self, PyObject *args)
2367 {
2368         PyObject *sub_obj;
2369         const char *str = PyString_AS_STRING(self), *sub;
2370         Py_ssize_t sub_len;
2371         Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
2372
2373         if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj,
2374                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
2375                 return NULL;
2376
2377         if (PyString_Check(sub_obj)) {
2378                 sub = PyString_AS_STRING(sub_obj);
2379                 sub_len = PyString_GET_SIZE(sub_obj);
2380         }
2381 #ifdef Py_USING_UNICODE
2382         else if (PyUnicode_Check(sub_obj)) {
2383                 Py_ssize_t count;
2384                 count = PyUnicode_Count((PyObject *)self, sub_obj, start, end);
2385                 if (count == -1)
2386                         return NULL;
2387                 else
2388                         return PyInt_FromSsize_t(count);
2389         }
2390 #endif
2391         else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
2392                 return NULL;
2393
2394         string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
2395
2396         return PyInt_FromSsize_t(
2397                 stringlib_count(str + start, end - start, sub, sub_len)
2398                 );
2399 }
2400
2401 PyDoc_STRVAR(swapcase__doc__,
2402 "S.swapcase() -> string\n\
2403 \n\
2404 Return a copy of the string S with uppercase characters\n\
2405 converted to lowercase and vice versa.");
2406
2407 static PyObject *
2408 string_swapcase(PyStringObject *self)
2409 {
2410         char *s = PyString_AS_STRING(self), *s_new;
2411         Py_ssize_t i, n = PyString_GET_SIZE(self);
2412         PyObject *newobj;
2413
2414         newobj = PyString_FromStringAndSize(NULL, n);
2415         if (newobj == NULL)
2416                 return NULL;
2417         s_new = PyString_AsString(newobj);
2418         for (i = 0; i < n; i++) {
2419                 int c = Py_CHARMASK(*s++);
2420                 if (islower(c)) {
2421                         *s_new = toupper(c);
2422                 }
2423                 else if (isupper(c)) {
2424                         *s_new = tolower(c);
2425                 }
2426                 else
2427                         *s_new = c;
2428                 s_new++;
2429         }
2430         return newobj;
2431 }
2432
2433
2434 PyDoc_STRVAR(translate__doc__,
2435 "S.translate(table [,deletechars]) -> string\n\
2436 \n\
2437 Return a copy of the string S, where all characters occurring\n\
2438 in the optional argument deletechars are removed, and the\n\
2439 remaining characters have been mapped through the given\n\
2440 translation table, which must be a string of length 256.");
2441
2442 static PyObject *
2443 string_translate(PyStringObject *self, PyObject *args)
2444 {
2445         register char *input, *output;
2446         const char *table;
2447         register Py_ssize_t i, c, changed = 0;
2448         PyObject *input_obj = (PyObject*)self;
2449         const char *output_start, *del_table=NULL;
2450         Py_ssize_t inlen, tablen, dellen = 0;
2451         PyObject *result;
2452         int trans_table[256];
2453         PyObject *tableobj, *delobj = NULL;
2454
2455         if (!PyArg_UnpackTuple(args, "translate", 1, 2,
2456                               &tableobj, &delobj))
2457                 return NULL;
2458
2459         if (PyString_Check(tableobj)) {
2460                 table = PyString_AS_STRING(tableobj);
2461                 tablen = PyString_GET_SIZE(tableobj);
2462         }
2463         else if (tableobj == Py_None) {
2464                 table = NULL;
2465                 tablen = 256;
2466         }
2467 #ifdef Py_USING_UNICODE
2468         else if (PyUnicode_Check(tableobj)) {
2469                 /* Unicode .translate() does not support the deletechars
2470                    parameter; instead a mapping to None will cause characters
2471                    to be deleted. */
2472                 if (delobj != NULL) {
2473                         PyErr_SetString(PyExc_TypeError,
2474                         "deletions are implemented differently for unicode");
2475                         return NULL;
2476                 }
2477                 return PyUnicode_Translate((PyObject *)self, tableobj, NULL);
2478         }
2479 #endif
2480         else if (PyObject_AsCharBuffer(tableobj, &table, &tablen))
2481                 return NULL;
2482
2483         if (tablen != 256) {
2484                 PyErr_SetString(PyExc_ValueError,
2485                   "translation table must be 256 characters long");
2486                 return NULL;
2487         }
2488
2489         if (delobj != NULL) {
2490                 if (PyString_Check(delobj)) {
2491                         del_table = PyString_AS_STRING(delobj);
2492                         dellen = PyString_GET_SIZE(delobj);
2493                 }
2494 #ifdef Py_USING_UNICODE
2495                 else if (PyUnicode_Check(delobj)) {
2496                         PyErr_SetString(PyExc_TypeError,
2497                         "deletions are implemented differently for unicode");
2498                         return NULL;
2499                 }
2500 #endif
2501                 else if (PyObject_AsCharBuffer(delobj, &del_table, &dellen))
2502                         return NULL;
2503         }
2504         else {
2505                 del_table = NULL;
2506                 dellen = 0;
2507         }
2508
2509         inlen = PyString_GET_SIZE(input_obj);
2510         result = PyString_FromStringAndSize((char *)NULL, inlen);
2511         if (result == NULL)
2512                 return NULL;
2513         output_start = output = PyString_AsString(result);
2514         input = PyString_AS_STRING(input_obj);
2515
2516         if (dellen == 0 && table != NULL) {
2517                 /* If no deletions are required, use faster code */
2518                 for (i = inlen; --i >= 0; ) {
2519                         c = Py_CHARMASK(*input++);
2520                         if (Py_CHARMASK((*output++ = table[c])) != c)
2521                                 changed = 1;
2522                 }
2523                 if (changed || !PyString_CheckExact(input_obj))
2524                         return result;
2525                 Py_DECREF(result);
2526                 Py_INCREF(input_obj);
2527                 return input_obj;
2528         }
2529
2530         if (table == NULL) {
2531                 for (i = 0; i < 256; i++)
2532                         trans_table[i] = Py_CHARMASK(i);
2533         } else {
2534                 for (i = 0; i < 256; i++)
2535                         trans_table[i] = Py_CHARMASK(table[i]);
2536         }
2537
2538         for (i = 0; i < dellen; i++)
2539                 trans_table[(int) Py_CHARMASK(del_table[i])] = -1;
2540
2541         for (i = inlen; --i >= 0; ) {
2542                 c = Py_CHARMASK(*input++);
2543                 if (trans_table[c] != -1)
2544                         if (Py_CHARMASK(*output++ = (char)trans_table[c]) == c)
2545                                 continue;
2546                 changed = 1;
2547         }
2548         if (!changed && PyString_CheckExact(input_obj)) {
2549                 Py_DECREF(result);
2550                 Py_INCREF(input_obj);
2551                 return input_obj;
2552         }
2553         /* Fix the size of the resulting string */
2554         if (inlen > 0)
2555                 _PyString_Resize(&result, output - output_start);
2556         return result;
2557 }
2558
2559
/* Search directions for the `direction' argument of findstring() and
   countstring(): positive scans left-to-right, negative right-to-left. */
#define FORWARD 1
#define REVERSE (-1)
2562
2563 /* find and count characters and substrings */
2564
/* Locate the first byte equal to c within target[0 .. target_len-1].
   Evaluates to a char * addressing that byte, or NULL if there is none. */
#define findchar(target, target_len, c)                         \
  ((char *)memchr((const void *)(target), (c), (target_len)))
2567
2568 /* String ops must return a string.  */
2569 /* If the object is subclass of string, create a copy */
2570 Py_LOCAL(PyStringObject *)
2571 return_self(PyStringObject *self)
2572 {
2573         if (PyString_CheckExact(self)) {
2574                 Py_INCREF(self);
2575                 return self;
2576         }
2577         return (PyStringObject *)PyString_FromStringAndSize(
2578                 PyString_AS_STRING(self),
2579                 PyString_GET_SIZE(self));
2580 }
2581
2582 Py_LOCAL_INLINE(Py_ssize_t)
2583 countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
2584 {
2585         Py_ssize_t count=0;
2586         const char *start=target;
2587         const char *end=target+target_len;
2588
2589         while ( (start=findchar(start, end-start, c)) != NULL ) {
2590                 count++;
2591                 if (count >= maxcount)
2592                         break;
2593                 start += 1;
2594         }
2595         return count;
2596 }
2597
2598 Py_LOCAL(Py_ssize_t)
2599 findstring(const char *target, Py_ssize_t target_len,
2600            const char *pattern, Py_ssize_t pattern_len,
2601            Py_ssize_t start,
2602            Py_ssize_t end,
2603            int direction)
2604 {
2605         if (start < 0) {
2606                 start += target_len;
2607                 if (start < 0)
2608                         start = 0;
2609         }
2610         if (end > target_len) {
2611                 end = target_len;
2612         } else if (end < 0) {
2613                 end += target_len;
2614                 if (end < 0)
2615                         end = 0;
2616         }
2617
2618         /* zero-length substrings always match at the first attempt */
2619         if (pattern_len == 0)
2620                 return (direction > 0) ? start : end;
2621
2622         end -= pattern_len;
2623
2624         if (direction < 0) {
2625                 for (; end >= start; end--)
2626                         if (Py_STRING_MATCH(target, end, pattern, pattern_len))
2627                                 return end;
2628         } else {
2629                 for (; start <= end; start++)
2630                         if (Py_STRING_MATCH(target, start, pattern, pattern_len))
2631                                 return start;
2632         }
2633         return -1;
2634 }
2635
2636 Py_LOCAL_INLINE(Py_ssize_t)
2637 countstring(const char *target, Py_ssize_t target_len,
2638             const char *pattern, Py_ssize_t pattern_len,
2639             Py_ssize_t start,
2640             Py_ssize_t end,
2641             int direction, Py_ssize_t maxcount)
2642 {
2643         Py_ssize_t count=0;
2644
2645         if (start < 0) {
2646                 start += target_len;
2647                 if (start < 0)
2648                         start = 0;
2649         }
2650         if (end > target_len) {
2651                 end = target_len;
2652         } else if (end < 0) {
2653                 end += target_len;
2654                 if (end < 0)
2655                         end = 0;
2656         }
2657
2658         /* zero-length substrings match everywhere */
2659         if (pattern_len == 0 || maxcount == 0) {
2660                 if (target_len+1 < maxcount)
2661                         return target_len+1;
2662                 return maxcount;
2663         }
2664
2665         end -= pattern_len;
2666         if (direction < 0) {
2667                 for (; (end >= start); end--)
2668                         if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
2669                                 count++;
2670                                 if (--maxcount <= 0) break;
2671                                 end -= pattern_len-1;
2672                         }
2673         } else {
2674                 for (; (start <= end); start++)
2675                         if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
2676                                 count++;
2677                                 if (--maxcount <= 0)
2678                                         break;
2679                                 start += pattern_len-1;
2680                         }
2681         }
2682         return count;
2683 }
2684
2685
2686 /* Algorithms for different cases of string replacement */
2687
2688 /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
2689 Py_LOCAL(PyStringObject *)
2690 replace_interleave(PyStringObject *self,
2691                    const char *to_s, Py_ssize_t to_len,
2692                    Py_ssize_t maxcount)
2693 {
2694         char *self_s, *result_s;
2695         Py_ssize_t self_len, result_len;
2696         Py_ssize_t count, i, product;
2697         PyStringObject *result;
2698
2699         self_len = PyString_GET_SIZE(self);
2700
2701         /* 1 at the end plus 1 after every character */
2702         count = self_len+1;
2703         if (maxcount < count)
2704                 count = maxcount;
2705
2706         /* Check for overflow */
2707         /*   result_len = count * to_len + self_len; */
2708         product = count * to_len;
2709         if (product / to_len != count) {
2710                 PyErr_SetString(PyExc_OverflowError,
2711                                 "replace string is too long");
2712                 return NULL;
2713         }
2714         result_len = product + self_len;
2715         if (result_len < 0) {
2716                 PyErr_SetString(PyExc_OverflowError,
2717                                 "replace string is too long");
2718                 return NULL;
2719         }
2720
2721         if (! (result = (PyStringObject *)
2722                          PyString_FromStringAndSize(NULL, result_len)) )
2723                 return NULL;
2724
2725         self_s = PyString_AS_STRING(self);
2726         result_s = PyString_AS_STRING(result);
2727
2728         /* TODO: special case single character, which doesn't need memcpy */
2729
2730         /* Lay the first one down (guaranteed this will occur) */
2731         Py_MEMCPY(result_s, to_s, to_len);
2732         result_s += to_len;
2733         count -= 1;
2734
2735         for (i=0; i<count; i++) {
2736                 *result_s++ = *self_s++;
2737                 Py_MEMCPY(result_s, to_s, to_len);
2738                 result_s += to_len;
2739         }
2740
2741         /* Copy the rest of the original string */
2742         Py_MEMCPY(result_s, self_s, self_len-i);
2743
2744         return result;
2745 }
2746
2747 /* Special case for deleting a single character */
2748 /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
2749 Py_LOCAL(PyStringObject *)
2750 replace_delete_single_character(PyStringObject *self,
2751                                 char from_c, Py_ssize_t maxcount)
2752 {
2753         char *self_s, *result_s;
2754         char *start, *next, *end;
2755         Py_ssize_t self_len, result_len;
2756         Py_ssize_t count;
2757         PyStringObject *result;
2758
2759         self_len = PyString_GET_SIZE(self);
2760         self_s = PyString_AS_STRING(self);
2761
2762         count = countchar(self_s, self_len, from_c, maxcount);
2763         if (count == 0) {
2764                 return return_self(self);
2765         }
2766
2767         result_len = self_len - count;  /* from_len == 1 */
2768         assert(result_len>=0);
2769
2770         if ( (result = (PyStringObject *)
2771                         PyString_FromStringAndSize(NULL, result_len)) == NULL)
2772                 return NULL;
2773         result_s = PyString_AS_STRING(result);
2774
2775         start = self_s;
2776         end = self_s + self_len;
2777         while (count-- > 0) {
2778                 next = findchar(start, end-start, from_c);
2779                 if (next == NULL)
2780                         break;
2781                 Py_MEMCPY(result_s, start, next-start);
2782                 result_s += (next-start);
2783                 start = next+1;
2784         }
2785         Py_MEMCPY(result_s, start, end-start);
2786
2787         return result;
2788 }
2789
2790 /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
2791
2792 Py_LOCAL(PyStringObject *)
2793 replace_delete_substring(PyStringObject *self,
2794                          const char *from_s, Py_ssize_t from_len,
2795                          Py_ssize_t maxcount) {
2796         char *self_s, *result_s;
2797         char *start, *next, *end;
2798         Py_ssize_t self_len, result_len;
2799         Py_ssize_t count, offset;
2800         PyStringObject *result;
2801
2802         self_len = PyString_GET_SIZE(self);
2803         self_s = PyString_AS_STRING(self);
2804
2805         count = countstring(self_s, self_len,
2806                             from_s, from_len,
2807                             0, self_len, 1,
2808                             maxcount);
2809
2810         if (count == 0) {
2811                 /* no matches */
2812                 return return_self(self);
2813         }
2814
2815         result_len = self_len - (count * from_len);
2816         assert (result_len>=0);
2817
2818         if ( (result = (PyStringObject *)
2819               PyString_FromStringAndSize(NULL, result_len)) == NULL )
2820                 return NULL;
2821
2822         result_s = PyString_AS_STRING(result);
2823
2824         start = self_s;
2825         end = self_s + self_len;
2826         while (count-- > 0) {
2827                 offset = findstring(start, end-start,
2828                                     from_s, from_len,
2829                                     0, end-start, FORWARD);
2830                 if (offset == -1)
2831                         break;
2832                 next = start + offset;
2833
2834                 Py_MEMCPY(result_s, start, next-start);
2835
2836                 result_s += (next-start);
2837                 start = next+from_len;
2838         }
2839         Py_MEMCPY(result_s, start, end-start);
2840         return result;
2841 }
2842
2843 /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
2844 Py_LOCAL(PyStringObject *)
2845 replace_single_character_in_place(PyStringObject *self,
2846                                   char from_c, char to_c,
2847                                   Py_ssize_t maxcount)
2848 {
2849         char *self_s, *result_s, *start, *end, *next;
2850         Py_ssize_t self_len;
2851         PyStringObject *result;
2852
2853         /* The result string will be the same size */
2854         self_s = PyString_AS_STRING(self);
2855         self_len = PyString_GET_SIZE(self);
2856
2857         next = findchar(self_s, self_len, from_c);
2858
2859         if (next == NULL) {
2860                 /* No matches; return the original string */
2861                 return return_self(self);
2862         }
2863
2864         /* Need to make a new string */
2865         result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2866         if (result == NULL)
2867                 return NULL;
2868         result_s = PyString_AS_STRING(result);
2869         Py_MEMCPY(result_s, self_s, self_len);
2870
2871         /* change everything in-place, starting with this one */
2872         start =  result_s + (next-self_s);
2873         *start = to_c;
2874         start++;
2875         end = result_s + self_len;
2876
2877         while (--maxcount > 0) {
2878                 next = findchar(start, end-start, from_c);
2879                 if (next == NULL)
2880                         break;
2881                 *next = to_c;
2882                 start = next+1;
2883         }
2884
2885         return result;
2886 }
2887
2888 /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
2889 Py_LOCAL(PyStringObject *)
2890 replace_substring_in_place(PyStringObject *self,
2891                            const char *from_s, Py_ssize_t from_len,
2892                            const char *to_s, Py_ssize_t to_len,
2893                            Py_ssize_t maxcount)
2894 {
2895         char *result_s, *start, *end;
2896         char *self_s;
2897         Py_ssize_t self_len, offset;
2898         PyStringObject *result;
2899
2900         /* The result string will be the same size */
2901
2902         self_s = PyString_AS_STRING(self);
2903         self_len = PyString_GET_SIZE(self);
2904
2905         offset = findstring(self_s, self_len,
2906                             from_s, from_len,
2907                             0, self_len, FORWARD);
2908         if (offset == -1) {
2909                 /* No matches; return the original string */
2910                 return return_self(self);
2911         }
2912
2913         /* Need to make a new string */
2914         result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2915         if (result == NULL)
2916                 return NULL;
2917         result_s = PyString_AS_STRING(result);
2918         Py_MEMCPY(result_s, self_s, self_len);
2919
2920         /* change everything in-place, starting with this one */
2921         start =  result_s + offset;
2922         Py_MEMCPY(start, to_s, from_len);
2923         start += from_len;
2924         end = result_s + self_len;
2925
2926         while ( --maxcount > 0) {
2927                 offset = findstring(start, end-start,
2928                                     from_s, from_len,
2929                                     0, end-start, FORWARD);
2930                 if (offset==-1)
2931                         break;
2932                 Py_MEMCPY(start+offset, to_s, from_len);
2933                 start += offset+from_len;
2934         }
2935
2936         return result;
2937 }
2938
2939 /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
2940 Py_LOCAL(PyStringObject *)
2941 replace_single_character(PyStringObject *self,
2942                          char from_c,
2943                          const char *to_s, Py_ssize_t to_len,
2944                          Py_ssize_t maxcount)
2945 {
2946         char *self_s, *result_s;
2947         char *start, *next, *end;
2948         Py_ssize_t self_len, result_len;
2949         Py_ssize_t count, product;
2950         PyStringObject *result;
2951
2952         self_s = PyString_AS_STRING(self);
2953         self_len = PyString_GET_SIZE(self);
2954
2955         count = countchar(self_s, self_len, from_c, maxcount);
2956         if (count == 0) {
2957                 /* no matches, return unchanged */
2958                 return return_self(self);
2959         }
2960
2961         /* use the difference between current and new, hence the "-1" */
2962         /*   result_len = self_len + count * (to_len-1)  */
2963         product = count * (to_len-1);
2964         if (product / (to_len-1) != count) {
2965                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
2966                 return NULL;
2967         }
2968         result_len = self_len + product;
2969         if (result_len < 0) {
2970                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
2971                 return NULL;
2972         }
2973
2974         if ( (result = (PyStringObject *)
2975               PyString_FromStringAndSize(NULL, result_len)) == NULL)
2976                 return NULL;
2977         result_s = PyString_AS_STRING(result);
2978
2979         start = self_s;
2980         end = self_s + self_len;
2981         while (count-- > 0) {
2982                 next = findchar(start, end-start, from_c);
2983                 if (next == NULL)
2984                         break;
2985
2986                 if (next == start) {
2987                         /* replace with the 'to' */
2988                         Py_MEMCPY(result_s, to_s, to_len);
2989                         result_s += to_len;
2990                         start += 1;
2991                 } else {
2992                         /* copy the unchanged old then the 'to' */
2993                         Py_MEMCPY(result_s, start, next-start);
2994                         result_s += (next-start);
2995                         Py_MEMCPY(result_s, to_s, to_len);
2996                         result_s += to_len;
2997                         start = next+1;
2998                 }
2999         }
3000         /* Copy the remainder of the remaining string */
3001         Py_MEMCPY(result_s, start, end-start);
3002
3003         return result;
3004 }
3005
3006 /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
3007 Py_LOCAL(PyStringObject *)
3008 replace_substring(PyStringObject *self,
3009                   const char *from_s, Py_ssize_t from_len,
3010                   const char *to_s, Py_ssize_t to_len,
3011                   Py_ssize_t maxcount) {
3012         char *self_s, *result_s;
3013         char *start, *next, *end;
3014         Py_ssize_t self_len, result_len;
3015         Py_ssize_t count, offset, product;
3016         PyStringObject *result;
3017
3018         self_s = PyString_AS_STRING(self);
3019         self_len = PyString_GET_SIZE(self);
3020
3021         count = countstring(self_s, self_len,
3022                             from_s, from_len,
3023                             0, self_len, FORWARD, maxcount);
3024         if (count == 0) {
3025                 /* no matches, return unchanged */
3026                 return return_self(self);
3027         }
3028
3029         /* Check for overflow */
3030         /*    result_len = self_len + count * (to_len-from_len) */
3031         product = count * (to_len-from_len);
3032         if (product / (to_len-from_len) != count) {
3033                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3034                 return NULL;
3035         }
3036         result_len = self_len + product;
3037         if (result_len < 0) {
3038                 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3039                 return NULL;
3040         }
3041
3042         if ( (result = (PyStringObject *)
3043               PyString_FromStringAndSize(NULL, result_len)) == NULL)
3044                 return NULL;
3045         result_s = PyString_AS_STRING(result);
3046
3047         start = self_s;
3048         end = self_s + self_len;
3049         while (count-- > 0) {
3050                 offset = findstring(start, end-start,
3051                                     from_s, from_len,
3052                                     0, end-start, FORWARD);
3053                 if (offset == -1)
3054                         break;
3055                 next = start+offset;
3056                 if (next == start) {
3057                         /* replace with the 'to' */
3058                         Py_MEMCPY(result_s, to_s, to_len);
3059                         result_s += to_len;
3060                         start += from_len;
3061                 } else {
3062                         /* copy the unchanged old then the 'to' */
3063                         Py_MEMCPY(result_s, start, next-start);
3064                         result_s += (next-start);
3065                         Py_MEMCPY(result_s, to_s, to_len);
3066                         result_s += to_len;
3067                         start = next+from_len;
3068                 }
3069         }
3070         /* Copy the remainder of the remaining string */
3071         Py_MEMCPY(result_s, start, end-start);
3072
3073         return result;
3074 }
3075
3076
3077 Py_LOCAL(PyStringObject *)
3078 replace(PyStringObject *self,
3079         const char *from_s, Py_ssize_t from_len,
3080         const char *to_s, Py_ssize_t to_len,
3081         Py_ssize_t maxcount)
3082 {
3083         if (maxcount < 0) {
3084                 maxcount = PY_SSIZE_T_MAX;
3085         } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
3086                 /* nothing to do; return the original string */
3087                 return return_self(self);
3088         }
3089
3090         if (maxcount == 0 ||
3091             (from_len == 0 && to_len == 0)) {
3092                 /* nothing to do; return the original string */
3093                 return return_self(self);
3094         }
3095
3096         /* Handle zero-length special cases */
3097
3098         if (from_len == 0) {
3099                 /* insert the 'to' string everywhere.   */
3100                 /*    >>> "Python".replace("", ".")     */
3101                 /*    '.P.y.t.h.o.n.'                   */
3102                 return replace_interleave(self, to_s, to_len, maxcount);
3103         }
3104
3105         /* Except for "".replace("", "A") == "A" there is no way beyond this */
3106         /* point for an empty self string to generate a non-empty string */
3107         /* Special case so the remaining code always gets a non-empty string */
3108         if (PyString_GET_SIZE(self) == 0) {
3109                 return return_self(self);
3110         }
3111
3112         if (to_len == 0) {
3113                 /* delete all occurances of 'from' string */
3114                 if (from_len == 1) {
3115                         return replace_delete_single_character(
3116                                 self, from_s[0], maxcount);
3117                 } else {
3118                         return replace_delete_substring(self, from_s, from_len, maxcount);
3119                 }
3120         }
3121
3122         /* Handle special case where both strings have the same length */
3123
3124         if (from_len == to_len) {
3125                 if (from_len == 1) {
3126                         return replace_single_character_in_place(
3127                                 self,
3128                                 from_s[0],
3129                                 to_s[0],
3130                                 maxcount);
3131                 } else {
3132                         return replace_substring_in_place(
3133                                 self, from_s, from_len, to_s, to_len, maxcount);
3134                 }
3135         }
3136
3137         /* Otherwise use the more generic algorithms */
3138         if (from_len == 1) {
3139                 return replace_single_character(self, from_s[0],
3140                                                 to_s, to_len, maxcount);
3141         } else {
3142                 /* len('from')>=2, len('to')>=1 */
3143                 return replace_substring(self, from_s, from_len, to_s, to_len, maxcount);
3144         }
3145 }
3146
3147 PyDoc_STRVAR(replace__doc__,
3148 "S.replace (old, new[, count]) -> string\n\
3149 \n\
3150 Return a copy of string S with all occurrences of substring\n\
3151 old replaced by new.  If the optional argument count is\n\
3152 given, only the first count occurrences are replaced.");
3153
3154 static PyObject *
3155 string_replace(PyStringObject *self, PyObject *args)
3156 {
3157         Py_ssize_t count = -1;
3158         PyObject *from, *to;
3159         const char *from_s, *to_s;
3160         Py_ssize_t from_len, to_len;
3161
3162         if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
3163                 return NULL;
3164
3165         if (PyString_Check(from)) {
3166                 from_s = PyString_AS_STRING(from);
3167                 from_len = PyString_GET_SIZE(from);
3168         }
3169 #ifdef Py_USING_UNICODE
3170         if (PyUnicode_Check(from))
3171                 return PyUnicode_Replace((PyObject *)self,
3172                                          from, to, count);
3173 #endif
3174         else if (PyObject_AsCharBuffer(from, &from_s, &from_len))
3175                 return NULL;
3176
3177         if (PyString_Check(to)) {
3178                 to_s = PyString_AS_STRING(to);
3179                 to_len = PyString_GET_SIZE(to);
3180         }
3181 #ifdef Py_USING_UNICODE
3182         else if (PyUnicode_Check(to))
3183                 return PyUnicode_Replace((PyObject *)self,
3184                                          from, to, count);
3185 #endif
3186         else if (PyObject_AsCharBuffer(to, &to_s, &to_len))
3187                 return NULL;
3188
3189         return (PyObject *)replace((PyStringObject *) self,
3190                                    from_s, from_len,
3191                                    to_s, to_len, count);
3192 }
3193
3194 /** End DALKE **/
3195
3196 /* Matches the end (direction >= 0) or start (direction < 0) of self
3197  * against substr, using the start and end arguments. Returns
3198  * -1 on error, 0 if not found and 1 if found.
3199  */
3200 Py_LOCAL(int)
3201 _string_tailmatch(PyStringObject *self, PyObject *substr, Py_ssize_t start,
3202                   Py_ssize_t end, int direction)
3203 {
3204         Py_ssize_t len = PyString_GET_SIZE(self);
3205         Py_ssize_t slen;
3206         const char* sub;
3207         const char* str;
3208
3209         if (PyString_Check(substr)) {
3210                 sub = PyString_AS_STRING(substr);
3211                 slen = PyString_GET_SIZE(substr);
3212         }
3213 #ifdef Py_USING_UNICODE
3214         else if (PyUnicode_Check(substr))
3215                 return PyUnicode_Tailmatch((PyObject *)self,
3216                                            substr, start, end, direction);
3217 #endif
3218         else if (PyObject_AsCharBuffer(substr, &sub, &slen))
3219                 return -1;
3220         str = PyString_AS_STRING(self);
3221
3222         string_adjust_indices(&start, &end, len);
3223
3224         if (direction < 0) {
3225                 /* startswith */
3226                 if (start+slen > len)
3227                         return 0;
3228         } else {
3229                 /* endswith */
3230                 if (end-start < slen || start > len)
3231                         return 0;
3232
3233                 if (end-slen > start)
3234                         start = end - slen;
3235         }
3236         if (end-start >= slen)
3237                 return ! memcmp(str+start, sub, slen);
3238         return 0;
3239 }
3240
3241
3242 PyDoc_STRVAR(startswith__doc__,
3243 "S.startswith(prefix[, start[, end]]) -> bool\n\
3244 \n\
3245 Return True if S starts with the specified prefix, False otherwise.\n\
3246 With optional start, test S beginning at that position.\n\
3247 With optional end, stop comparing S at that position.\n\
3248 prefix can also be a tuple of strings to try.");
3249
3250 static PyObject *
3251 string_startswith(PyStringObject *self, PyObject *args)
3252 {
3253         Py_ssize_t start = 0;
3254         Py_ssize_t end = PY_SSIZE_T_MAX;
3255         PyObject *subobj;
3256         int result;
3257
3258         if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
3259                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3260                 return NULL;
3261         if (PyTuple_Check(subobj)) {
3262                 Py_ssize_t i;
3263                 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3264                         result = _string_tailmatch(self,
3265                                         PyTuple_GET_ITEM(subobj, i),
3266                                         start, end, -1);
3267                         if (result == -1)
3268                                 return NULL;
3269                         else if (result) {
3270                                 Py_RETURN_TRUE;
3271                         }
3272                 }
3273                 Py_RETURN_FALSE;
3274         }
3275         result = _string_tailmatch(self, subobj, start, end, -1);
3276         if (result == -1)
3277                 return NULL;
3278         else
3279                 return PyBool_FromLong(result);
3280 }
3281
3282
3283 PyDoc_STRVAR(endswith__doc__,
3284 "S.endswith(suffix[, start[, end]]) -> bool\n\
3285 \n\
3286 Return True if S ends with the specified suffix, False otherwise.\n\
3287 With optional start, test S beginning at that position.\n\
3288 With optional end, stop comparing S at that position.\n\
3289 suffix can also be a tuple of strings to try.");
3290
3291 static PyObject *
3292 string_endswith(PyStringObject *self, PyObject *args)
3293 {
3294         Py_ssize_t start = 0;
3295         Py_ssize_t end = PY_SSIZE_T_MAX;
3296         PyObject *subobj;
3297         int result;
3298
3299         if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
3300                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3301                 return NULL;
3302         if (PyTuple_Check(subobj)) {
3303                 Py_ssize_t i;
3304                 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3305                         result = _string_tailmatch(self,
3306                                         PyTuple_GET_ITEM(subobj, i),
3307                                         start, end, +1);
3308                         if (result == -1)
3309                                 return NULL;
3310                         else if (result) {
3311                                 Py_RETURN_TRUE;
3312                         }
3313                 }
3314                 Py_RETURN_FALSE;
3315         }
3316         result = _string_tailmatch(self, subobj, start, end, +1);
3317         if (result == -1)
3318                 return NULL;
3319         else
3320                 return PyBool_FromLong(result);
3321 }
3322
3323
3324 PyDoc_STRVAR(encode__doc__,
3325 "S.encode([encoding[,errors]]) -> object\n\
3326 \n\
3327 Encodes S using the codec registered for encoding. encoding defaults\n\
3328 to the default encoding. errors may be given to set a different error\n\
3329 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3330 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
3331 'xmlcharrefreplace' as well as any other name registered with\n\
3332 codecs.register_error that is able to handle UnicodeEncodeErrors.");
3333
3334 static PyObject *
3335 string_encode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3336 {
3337     static char *kwlist[] = {"encoding", "errors", 0};
3338     char *encoding = NULL;
3339     char *errors = NULL;
3340     PyObject *v;
3341
3342     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
3343                                      kwlist, &encoding, &errors))
3344         return NULL;
3345     v = PyString_AsEncodedObject((PyObject *)self, encoding, errors);
3346     if (v == NULL)
3347         goto onError;
3348     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3349         PyErr_Format(PyExc_TypeError,
3350                      "encoder did not return a string/unicode object "
3351                      "(type=%.400s)",
3352                      Py_TYPE(v)->tp_name);
3353         Py_DECREF(v);
3354         return NULL;
3355     }
3356     return v;
3357
3358  onError:
3359     return NULL;
3360 }
3361
3362
3363 PyDoc_STRVAR(decode__doc__,
3364 "S.decode([encoding[,errors]]) -> object\n\
3365 \n\
3366 Decodes S using the codec registered for encoding. encoding defaults\n\
3367 to the default encoding. errors may be given to set a different error\n\
3368 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3369 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
3370 as well as any other name registered with codecs.register_error that is\n\
3371 able to handle UnicodeDecodeErrors.");
3372
3373 static PyObject *
3374 string_decode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3375 {
3376     static char *kwlist[] = {"encoding", "errors", 0};
3377     char *encoding = NULL;
3378     char *errors = NULL;
3379     PyObject *v;
3380
3381     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
3382                                      kwlist, &encoding, &errors))
3383         return NULL;
3384     v = PyString_AsDecodedObject((PyObject *)self, encoding, errors);
3385     if (v == NULL)
3386         goto onError;
3387     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3388         PyErr_Format(PyExc_TypeError,
3389                      "decoder did not return a string/unicode object "
3390                      "(type=%.400s)",
3391                      Py_TYPE(v)->tp_name);
3392         Py_DECREF(v);
3393         return NULL;
3394     }
3395     return v;
3396
3397  onError:
3398     return NULL;
3399 }
3400
3401
3402 PyDoc_STRVAR(expandtabs__doc__,
3403 "S.expandtabs([tabsize]) -> string\n\
3404 \n\
3405 Return a copy of S where all tab characters are expanded using spaces.\n\
3406 If tabsize is not given, a tab size of 8 characters is assumed.");
3407
3408 static PyObject*
3409 string_expandtabs(PyStringObject *self, PyObject *args)
3410 {
3411     const char *e, *p, *qe;
3412     char *q;
3413     Py_ssize_t i, j, incr;
3414     PyObject *u;
3415     int tabsize = 8;
3416
3417     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3418         return NULL;
3419
3420     /* First pass: determine size of output string */
3421     i = 0; /* chars up to and including most recent \n or \r */
3422     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
3423     e = PyString_AS_STRING(self) + PyString_GET_SIZE(self); /* end of input */
3424     for (p = PyString_AS_STRING(self); p < e; p++)
3425         if (*p == '\t') {
3426             if (tabsize > 0) {
3427                 incr = tabsize - (j % tabsize);
3428                 if (j > PY_SSIZE_T_MAX - incr)
3429                     goto overflow1;
3430                 j += incr;
3431             }
3432         }
3433         else {
3434             if (j > PY_SSIZE_T_MAX - 1)
3435                 goto overflow1;
3436             j++;
3437             if (*p == '\n' || *p == '\r') {
3438                 if (i > PY_SSIZE_T_MAX - j)
3439                     goto overflow1;
3440                 i += j;
3441                 j = 0;
3442             }
3443         }
3444
3445     if (i > PY_SSIZE_T_MAX - j)
3446         goto overflow1;
3447
3448     /* Second pass: create output string and fill it */
3449     u = PyString_FromStringAndSize(NULL, i + j);
3450     if (!u)
3451         return NULL;
3452
3453     j = 0; /* same as in first pass */
3454     q = PyString_AS_STRING(u); /* next output char */
3455     qe = PyString_AS_STRING(u) + PyString_GET_SIZE(u); /* end of output */
3456
3457     for (p = PyString_AS_STRING(self); p < e; p++)
3458         if (*p == '\t') {
3459             if (tabsize > 0) {
3460                 i = tabsize - (j % tabsize);
3461                 j += i;
3462                 while (i--) {
3463                     if (q >= qe)
3464                         goto overflow2;
3465                     *q++ = ' ';
3466                 }
3467             }
3468         }
3469         else {
3470             if (q >= qe)
3471                 goto overflow2;
3472             *q++ = *p;
3473             j++;
3474             if (*p == '\n' || *p == '\r')
3475                 j = 0;
3476         }
3477
3478     return u;
3479
3480   overflow2:
3481     Py_DECREF(u);
3482   overflow1:
3483     PyErr_SetString(PyExc_OverflowError, "new string is too long");
3484     return NULL;
3485 }
3486
3487 Py_LOCAL_INLINE(PyObject *)
3488 pad(PyStringObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
3489 {
3490     PyObject *u;
3491
3492     if (left < 0)
3493         left = 0;
3494     if (right < 0)
3495         right = 0;
3496
3497     if (left == 0 && right == 0 && PyString_CheckExact(self)) {
3498         Py_INCREF(self);
3499         return (PyObject *)self;
3500     }
3501
3502     u = PyString_FromStringAndSize(NULL,
3503                                    left + PyString_GET_SIZE(self) + right);
3504     if (u) {
3505         if (left)
3506             memset(PyString_AS_STRING(u), fill, left);
3507         Py_MEMCPY(PyString_AS_STRING(u) + left,
3508                PyString_AS_STRING(self),
3509                PyString_GET_SIZE(self));
3510         if (right)
3511             memset(PyString_AS_STRING(u) + left + PyString_GET_SIZE(self),
3512                    fill, right);
3513     }
3514
3515     return u;
3516 }
3517
3518 PyDoc_STRVAR(ljust__doc__,
3519 "S.ljust(width[, fillchar]) -> string\n"
3520 "\n"
3521 "Return S left-justified in a string of length width. Padding is\n"
3522 "done using the specified fill character (default is a space).");
3523
3524 static PyObject *
3525 string_ljust(PyStringObject *self, PyObject *args)
3526 {
3527     Py_ssize_t width;
3528     char fillchar = ' ';
3529
3530     if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))
3531         return NULL;
3532
3533     if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3534         Py_INCREF(self);
3535         return (PyObject*) self;
3536     }
3537
3538     return pad(self, 0, width - PyString_GET_SIZE(self), fillchar);
3539 }
3540
3541
3542 PyDoc_STRVAR(rjust__doc__,
3543 "S.rjust(width[, fillchar]) -> string\n"
3544 "\n"
3545 "Return S right-justified in a string of length width. Padding is\n"
3546 "done using the specified fill character (default is a space)");
3547
3548 static PyObject *
3549 string_rjust(PyStringObject *self, PyObject *args)
3550 {
3551     Py_ssize_t width;
3552     char fillchar = ' ';
3553
3554     if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))
3555         return NULL;
3556
3557     if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3558         Py_INCREF(self);
3559         return (PyObject*) self;
3560     }
3561
3562     return pad(self, width - PyString_GET_SIZE(self), 0, fillchar);
3563 }
3564
3565
3566 PyDoc_STRVAR(center__doc__,
3567 "S.center(width[, fillchar]) -> string\n"
3568 "\n"
3569 "Return S centered in a string of length width. Padding is\n"
3570 "done using the specified fill character (default is a space)");
3571
3572 static PyObject *
3573 string_center(PyStringObject *self, PyObject *args)
3574 {
3575     Py_ssize_t marg, left;
3576     Py_ssize_t width;
3577     char fillchar = ' ';
3578
3579     if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))
3580         return NULL;
3581
3582     if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3583         Py_INCREF(self);
3584         return (PyObject*) self;
3585     }
3586
3587     marg = width - PyString_GET_SIZE(self);
3588     left = marg / 2 + (marg & width & 1);
3589
3590     return pad(self, left, marg - left, fillchar);
3591 }
3592
3593 PyDoc_STRVAR(zfill__doc__,
3594 "S.zfill(width) -> string\n"
3595 "\n"
3596 "Pad a numeric string S with zeros on the left, to fill a field\n"
3597 "of the specified width.  The string S is never truncated.");
3598
3599 static PyObject *
3600 string_zfill(PyStringObject *self, PyObject *args)
3601 {
3602     Py_ssize_t fill;
3603     PyObject *s;
3604     char *p;
3605     Py_ssize_t width;
3606
3607     if (!PyArg_ParseTuple(args, "n:zfill", &width))
3608         return NULL;
3609
3610     if (PyString_GET_SIZE(self) >= width) {
3611         if (PyString_CheckExact(self)) {
3612             Py_INCREF(self);
3613             return (PyObject*) self;
3614         }
3615         else
3616             return PyString_FromStringAndSize(
3617                 PyString_AS_STRING(self),
3618                 PyString_GET_SIZE(self)
3619             );
3620     }
3621
3622     fill = width - PyString_GET_SIZE(self);
3623
3624     s = pad(self, fill, 0, '0');
3625
3626     if (s == NULL)
3627         return NULL;
3628
3629     p = PyString_AS_STRING(s);
3630     if (p[fill] == '+' || p[fill] == '-') {
3631         /* move sign to beginning of string */
3632         p[0] = p[fill];
3633         p[fill] = '0';
3634     }
3635
3636     return (PyObject*) s;
3637 }
3638
3639 PyDoc_STRVAR(isspace__doc__,
3640 "S.isspace() -> bool\n\
3641 \n\
3642 Return True if all characters in S are whitespace\n\
3643 and there is at least one character in S, False otherwise.");
3644
3645 static PyObject*
3646 string_isspace(PyStringObject *self)
3647 {
3648     register const unsigned char *p
3649         = (unsigned char *) PyString_AS_STRING(self);
3650     register const unsigned char *e;
3651
3652     /* Shortcut for single character strings */
3653     if (PyString_GET_SIZE(self) == 1 &&
3654         isspace(*p))
3655         return PyBool_FromLong(1);
3656
3657     /* Special case for empty strings */
3658     if (PyString_GET_SIZE(self) == 0)
3659         return PyBool_FromLong(0);
3660
3661     e = p + PyString_GET_SIZE(self);
3662     for (; p < e; p++) {
3663         if (!isspace(*p))
3664             return PyBool_FromLong(0);
3665     }
3666     return PyBool_FromLong(1);
3667 }
3668
3669
3670 PyDoc_STRVAR(isalpha__doc__,
3671 "S.isalpha() -> bool\n\
3672 \n\
3673 Return True if all characters in S are alphabetic\n\
3674 and there is at least one character in S, False otherwise.");
3675
3676 static PyObject*
3677 string_isalpha(PyStringObject *self)
3678 {
3679     register const unsigned char *p
3680         = (unsigned char *) PyString_AS_STRING(self);
3681     register const unsigned char *e;
3682
3683     /* Shortcut for single character strings */
3684     if (PyString_GET_SIZE(self) == 1 &&
3685         isalpha(*p))
3686         return PyBool_FromLong(1);
3687
3688     /* Special case for empty strings */
3689     if (PyString_GET_SIZE(self) == 0)
3690         return PyBool_FromLong(0);
3691
3692     e = p + PyString_GET_SIZE(self);
3693     for (; p < e; p++) {
3694         if (!isalpha(*p))
3695             return PyBool_FromLong(0);
3696     }
3697     return PyBool_FromLong(1);
3698 }
3699
3700
3701 PyDoc_STRVAR(isalnum__doc__,
3702 "S.isalnum() -> bool\n\
3703 \n\
3704 Return True if all characters in S are alphanumeric\n\
3705 and there is at least one character in S, False otherwise.");
3706
3707 static PyObject*
3708 string_isalnum(PyStringObject *self)
3709 {
3710     register const unsigned char *p
3711         = (unsigned char *) PyString_AS_STRING(self);
3712     register const unsigned char *e;
3713
3714     /* Shortcut for single character strings */
3715     if (PyString_GET_SIZE(self) == 1 &&
3716         isalnum(*p))
3717         return PyBool_FromLong(1);
3718
3719     /* Special case for empty strings */
3720     if (PyString_GET_SIZE(self) == 0)
3721         return PyBool_FromLong(0);
3722
3723     e = p + PyString_GET_SIZE(self);
3724     for (; p < e; p++) {
3725         if (!isalnum(*p))
3726             return PyBool_FromLong(0);
3727     }
3728     return PyBool_FromLong(1);
3729 }
3730
3731
3732 PyDoc_STRVAR(isdigit__doc__,
3733 "S.isdigit() -> bool\n\
3734 \n\
3735 Return True if all characters in S are digits\n\
3736 and there is at least one character in S, False otherwise.");
3737
3738 static PyObject*
3739 string_isdigit(PyStringObject *self)
3740 {
3741     register const unsigned char *p
3742         = (unsigned char *) PyString_AS_STRING(self);
3743     register const unsigned char *e;
3744
3745     /* Shortcut for single character strings */
3746     if (PyString_GET_SIZE(self) == 1 &&
3747         isdigit(*p))
3748         return PyBool_FromLong(1);
3749
3750     /* Special case for empty strings */
3751     if (PyString_GET_SIZE(self) == 0)
3752         return PyBool_FromLong(0);
3753
3754     e = p + PyString_GET_SIZE(self);
3755     for (; p < e; p++) {
3756         if (!isdigit(*p))
3757             return PyBool_FromLong(0);
3758     }
3759     return PyBool_FromLong(1);
3760 }
3761
3762
3763 PyDoc_STRVAR(islower__doc__,
3764 "S.islower() -> bool\n\
3765 \n\
3766 Return True if all cased characters in S are lowercase and there is\n\
3767 at least one cased character in S, False otherwise.");
3768
3769 static PyObject*
3770 string_islower(PyStringObject *self)
3771 {
3772     register const unsigned char *p
3773         = (unsigned char *) PyString_AS_STRING(self);
3774     register const unsigned char *e;
3775     int cased;
3776
3777     /* Shortcut for single character strings */
3778     if (PyString_GET_SIZE(self) == 1)
3779         return PyBool_FromLong(islower(*p) != 0);
3780
3781     /* Special case for empty strings */
3782     if (PyString_GET_SIZE(self) == 0)
3783         return PyBool_FromLong(0);
3784
3785     e = p + PyString_GET_SIZE(self);
3786     cased = 0;
3787     for (; p < e; p++) {
3788         if (isupper(*p))
3789             return PyBool_FromLong(0);
3790         else if (!cased && islower(*p))
3791             cased = 1;
3792     }
3793     return PyBool_FromLong(cased);
3794 }
3795
3796
3797 PyDoc_STRVAR(isupper__doc__,
3798 "S.isupper() -> bool\n\
3799 \n\
3800 Return True if all cased characters in S are uppercase and there is\n\
3801 at least one cased character in S, False otherwise.");
3802
3803 static PyObject*
3804 string_isupper(PyStringObject *self)
3805 {
3806     register const unsigned char *p
3807         = (unsigned char *) PyString_AS_STRING(self);
3808     register const unsigned char *e;
3809     int cased;
3810
3811     /* Shortcut for single character strings */
3812     if (PyString_GET_SIZE(self) == 1)
3813         return PyBool_FromLong(isupper(*p) != 0);
3814
3815     /* Special case for empty strings */
3816     if (PyString_GET_SIZE(self) == 0)
3817         return PyBool_FromLong(0);
3818
3819     e = p + PyString_GET_SIZE(self);
3820     cased = 0;
3821     for (; p < e; p++) {
3822         if (islower(*p))
3823             return PyBool_FromLong(0);
3824         else if (!cased && isupper(*p))
3825             cased = 1;
3826     }
3827     return PyBool_FromLong(cased);
3828 }
3829
3830
3831 PyDoc_STRVAR(istitle__doc__,
3832 "S.istitle() -> bool\n\
3833 \n\
3834 Return True if S is a titlecased string and there is at least one\n\
3835 character in S, i.e. uppercase characters may only follow uncased\n\
3836 characters and lowercase characters only cased ones. Return False\n\
3837 otherwise.");
3838
3839 static PyObject*
3840 string_istitle(PyStringObject *self, PyObject *uncased)
3841 {
3842     register const unsigned char *p
3843         = (unsigned char *) PyString_AS_STRING(self);
3844     register const unsigned char *e;
3845     int cased, previous_is_cased;
3846
3847     /* Shortcut for single character strings */
3848     if (PyString_GET_SIZE(self) == 1)
3849         return PyBool_FromLong(isupper(*p) != 0);
3850
3851     /* Special case for empty strings */
3852     if (PyString_GET_SIZE(self) == 0)
3853         return PyBool_FromLong(0);
3854
3855     e = p + PyString_GET_SIZE(self);
3856     cased = 0;
3857     previous_is_cased = 0;
3858     for (; p < e; p++) {
3859         register const unsigned char ch = *p;
3860
3861         if (isupper(ch)) {
3862             if (previous_is_cased)
3863                 return PyBool_FromLong(0);
3864             previous_is_cased = 1;
3865             cased = 1;
3866         }
3867         else if (islower(ch)) {
3868             if (!previous_is_cased)
3869                 return PyBool_FromLong(0);
3870             previous_is_cased = 1;
3871             cased = 1;
3872         }
3873         else
3874             previous_is_cased = 0;
3875     }
3876     return PyBool_FromLong(cased);
3877 }
3878
3879
3880 PyDoc_STRVAR(splitlines__doc__,
3881 "S.splitlines([keepends]) -> list of strings\n\
3882 \n\
3883 Return a list of the lines in S, breaking at line boundaries.\n\
3884 Line breaks are not included in the resulting list unless keepends\n\
3885 is given and true.");
3886
3887 static PyObject*
3888 string_splitlines(PyStringObject *self, PyObject *args)
3889 {
3890     register Py_ssize_t i;
3891     register Py_ssize_t j;
3892     Py_ssize_t len;
3893     int keepends = 0;
3894     PyObject *list;
3895     PyObject *str;
3896     char *data;
3897
3898     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
3899         return NULL;
3900
3901     data = PyString_AS_STRING(self);
3902     len = PyString_GET_SIZE(self);
3903
3904     /* This does not use the preallocated list because splitlines is
3905        usually run with hundreds of newlines.  The overhead of
3906        switching between PyList_SET_ITEM and append causes about a
3907        2-3% slowdown for that common case.  A smarter implementation
3908        could move the if check out, so the SET_ITEMs are done first
3909        and the appends only done when the prealloc buffer is full.
3910        That's too much work for little gain.*/
3911
3912     list = PyList_New(0);
3913     if (!list)
3914         goto onError;
3915
3916     for (i = j = 0; i < len; ) {
3917         Py_ssize_t eol;
3918
3919         /* Find a line and append it */
3920         while (i < len && data[i] != '\n' && data[i] != '\r')
3921             i++;
3922
3923         /* Skip the line break reading CRLF as one line break */
3924         eol = i;
3925         if (i < len) {
3926             if (data[i] == '\r' && i + 1 < len &&
3927                 data[i+1] == '\n')
3928                 i += 2;
3929             else
3930                 i++;
3931             if (keepends)
3932                 eol = i;
3933         }
3934         SPLIT_APPEND(data, j, eol);
3935         j = i;
3936     }
3937     if (j < len) {
3938         SPLIT_APPEND(data, j, len);
3939     }
3940
3941     return list;
3942
3943  onError:
3944     Py_XDECREF(list);
3945     return NULL;
3946 }
3947
3948 PyDoc_STRVAR(sizeof__doc__,
3949 "S.__sizeof__() -> size of S in memory, in bytes");
3950
3951 static PyObject *
3952 string_sizeof(PyStringObject *v)
3953 {
3954         Py_ssize_t res;
3955         res = PyStringObject_SIZE + PyString_GET_SIZE(v) * Py_TYPE(v)->tp_itemsize;
3956         return PyInt_FromSsize_t(res);
3957 }
3958
3959 #undef SPLIT_APPEND
3960 #undef SPLIT_ADD
3961 #undef MAX_PREALLOC
3962 #undef PREALLOC_SIZE
3963
3964 static PyObject *
3965 string_getnewargs(PyStringObject *v)
3966 {
3967         return Py_BuildValue("(s#)", v->ob_sval, Py_SIZE(v));
3968 }
3969
3970
3971 #include "stringlib/string_format.h"
3972
3973 PyDoc_STRVAR(format__doc__,
3974 "S.format(*args, **kwargs) -> unicode\n\
3975 \n\
3976 ");
3977
3978 static PyObject *
3979 string__format__(PyObject* self, PyObject* args)
3980 {
3981     PyObject *format_spec;
3982     PyObject *result = NULL;
3983     PyObject *tmp = NULL;
3984
3985     /* If 2.x, convert format_spec to the same type as value */
3986     /* This is to allow things like u''.format('') */
3987     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
3988         goto done;
3989     if (!(PyString_Check(format_spec) || PyUnicode_Check(format_spec))) {
3990         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
3991                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
3992         goto done;
3993     }
3994     tmp = PyObject_Str(format_spec);
3995     if (tmp == NULL)
3996         goto done;
3997     format_spec = tmp;
3998
3999     result = _PyBytes_FormatAdvanced(self,
4000                                      PyString_AS_STRING(format_spec),
4001                                      PyString_GET_SIZE(format_spec));
4002 done:
4003     Py_XDECREF(tmp);
4004     return result;
4005 }
4006
4007 PyDoc_STRVAR(p_format__doc__,
4008 "S.__format__(format_spec) -> unicode\n\
4009 \n\
4010 ");
4011
4012
4013 static PyMethodDef
4014 string_methods[] = {
4015         /* Counterparts of the obsolete stropmodule functions; except
4016            string.maketrans(). */
4017         {"join", (PyCFunction)string_join, METH_O, join__doc__},
4018         {"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
4019         {"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
4020         {"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
4021         {"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
4022         {"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
4023         {"isupper", (PyCFunction)string_isupper, METH_NOARGS, isupper__doc__},
4024         {"isspace", (PyCFunction)string_isspace, METH_NOARGS, isspace__doc__},
4025         {"isdigit", (PyCFunction)string_isdigit, METH_NOARGS, isdigit__doc__},
4026         {"istitle", (PyCFunction)string_istitle, METH_NOARGS, istitle__doc__},
4027         {"isalpha", (PyCFunction)string_isalpha, METH_NOARGS, isalpha__doc__},
4028         {"isalnum", (PyCFunction)string_isalnum, METH_NOARGS, isalnum__doc__},
4029         {"capitalize", (PyCFunction)string_capitalize, METH_NOARGS,
4030          capitalize__doc__},
4031         {"count", (PyCFunction)string_count, METH_VARARGS, count__doc__},
4032         {"endswith", (PyCFunction)string_endswith, METH_VARARGS,
4033          endswith__doc__},
4034         {"partition", (PyCFunction)string_partition, METH_O, partition__doc__},
4035         {"find", (PyCFunction)string_find, METH_VARARGS, find__doc__},
4036         {"index", (PyCFunction)string_index, METH_VARARGS, index__doc__},
4037         {"lstrip", (PyCFunction)string_lstrip, METH_VARARGS, lstrip__doc__},
4038         {"replace", (PyCFunction)string_replace, METH_VARARGS, replace__doc__},
4039         {"rfind", (PyCFunction)string_rfind, METH_VARARGS, rfind__doc__},
4040         {"rindex", (PyCFunction)string_rindex, METH_VARARGS, rindex__doc__},
4041         {"rstrip", (PyCFunction)string_rstrip, METH_VARARGS, rstrip__doc__},
4042         {"rpartition", (PyCFunction)string_rpartition, METH_O,
4043          rpartition__doc__},
4044         {"startswith", (PyCFunction)string_startswith, METH_VARARGS,
4045          startswith__doc__},
4046         {"strip", (PyCFunction)string_strip, METH_VARARGS, strip__doc__},
4047         {"swapcase", (PyCFunction)string_swapcase, METH_NOARGS,
4048          swapcase__doc__},
4049         {"translate", (PyCFunction)string_translate, METH_VARARGS,
4050          translate__doc__},
4051         {"title", (PyCFunction)string_title, METH_NOARGS, title__doc__},
4052         {"ljust", (PyCFunction)string_ljust, METH_VARARGS, ljust__doc__},
4053         {"rjust", (PyCFunction)string_rjust, METH_VARARGS, rjust__doc__},
4054         {"center", (PyCFunction)string_center, METH_VARARGS, center__doc__},
4055         {"zfill", (PyCFunction)string_zfill, METH_VARARGS, zfill__doc__},
4056         {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
4057         {"__format__", (PyCFunction) string__format__, METH_VARARGS, p_format__doc__},
4058         {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
4059         {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
4060         {"encode", (PyCFunction)string_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
4061         {"decode", (PyCFunction)string_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
4062         {"expandtabs", (PyCFunction)string_expandtabs, METH_VARARGS,
4063          expandtabs__doc__},
4064         {"splitlines", (PyCFunction)string_splitlines, METH_VARARGS,
4065          splitlines__doc__},
4066         {"__sizeof__", (PyCFunction)string_sizeof, METH_NOARGS,
4067          sizeof__doc__},
4068         {"__getnewargs__",      (PyCFunction)string_getnewargs, METH_NOARGS},
4069         {NULL,     NULL}                     /* sentinel */
4070 };
4071
4072 static PyObject *
4073 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
4074
4075 static PyObject *
4076 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4077 {
4078         PyObject *x = NULL;
4079         static char *kwlist[] = {"object", 0};
4080
4081         if (type != &PyString_Type)
4082                 return str_subtype_new(type, args, kwds);
4083         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str", kwlist, &x))
4084                 return NULL;
4085         if (x == NULL)
4086                 return PyString_FromString("");
4087         return PyObject_Str(x);
4088 }
4089
4090 static PyObject *
4091 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4092 {
4093         PyObject *tmp, *pnew;
4094         Py_ssize_t n;
4095
4096         assert(PyType_IsSubtype(type, &PyString_Type));
4097         tmp = string_new(&PyString_Type, args, kwds);
4098         if (tmp == NULL)
4099                 return NULL;
4100         assert(PyString_CheckExact(tmp));
4101         n = PyString_GET_SIZE(tmp);
4102         pnew = type->tp_alloc(type, n);
4103         if (pnew != NULL) {
4104                 Py_MEMCPY(PyString_AS_STRING(pnew), PyString_AS_STRING(tmp), n+1);
4105                 ((PyStringObject *)pnew)->ob_shash =
4106                         ((PyStringObject *)tmp)->ob_shash;
4107                 ((PyStringObject *)pnew)->ob_sstate = SSTATE_NOT_INTERNED;
4108         }
4109         Py_DECREF(tmp);
4110         return pnew;
4111 }
4112
4113 static PyObject *
4114 basestring_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4115 {
4116         PyErr_SetString(PyExc_TypeError,
4117                         "The basestring type cannot be instantiated");
4118         return NULL;
4119 }
4120
4121 static PyObject *
4122 string_mod(PyObject *v, PyObject *w)
4123 {
4124         if (!PyString_Check(v)) {
4125                 Py_INCREF(Py_NotImplemented);
4126                 return Py_NotImplemented;
4127         }
4128         return PyString_Format(v, w);
4129 }
4130
4131 PyDoc_STRVAR(basestring_doc,
4132 "Type basestring cannot be instantiated; it is the base for str and unicode.");
4133
4134 static PyNumberMethods string_as_number = {
4135         0,                      /*nb_add*/
4136         0,                      /*nb_subtract*/
4137         0,                      /*nb_multiply*/
4138         0,                      /*nb_divide*/
4139         string_mod,             /*nb_remainder*/
4140 };
4141
4142
4143 PyTypeObject PyBaseString_Type = {
4144         PyVarObject_HEAD_INIT(&PyType_Type, 0)
4145         "basestring",
4146         0,
4147         0,
4148         0,                                      /* tp_dealloc */
4149         0,                                      /* tp_print */
4150         0,                                      /* tp_getattr */
4151         0,                                      /* tp_setattr */
4152         0,                                      /* tp_compare */
4153         0,                                      /* tp_repr */
4154         0,                                      /* tp_as_number */
4155         0,                                      /* tp_as_sequence */
4156         0,                                      /* tp_as_mapping */
4157         0,                                      /* tp_hash */
4158         0,                                      /* tp_call */
4159         0,                                      /* tp_str */
4160         0,                                      /* tp_getattro */
4161         0,                                      /* tp_setattro */
4162         0,                                      /* tp_as_buffer */
4163         Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
4164         basestring_doc,                         /* tp_doc */
4165         0,                                      /* tp_traverse */
4166         0,                                      /* tp_clear */
4167         0,                                      /* tp_richcompare */
4168         0,                                      /* tp_weaklistoffset */
4169         0,                                      /* tp_iter */
4170         0,                                      /* tp_iternext */
4171         0,                                      /* tp_methods */
4172         0,                                      /* tp_members */
4173         0,                                      /* tp_getset */
4174         &PyBaseObject_Type,                     /* tp_base */
4175         0,                                      /* tp_dict */
4176         0,                                      /* tp_descr_get */
4177         0,                                      /* tp_descr_set */
4178         0,                                      /* tp_dictoffset */
4179         0,                                      /* tp_init */
4180         0,                                      /* tp_alloc */
4181         basestring_new,                         /* tp_new */
4182         0,                                      /* tp_free */
4183 };
4184
4185 PyDoc_STRVAR(string_doc,
4186 "str(object) -> string\n\
4187 \n\
4188 Return a nice string representation of the object.\n\
4189 If the argument is a string, the return value is the same object.");
4190
4191 PyTypeObject PyString_Type = {
4192         PyVarObject_HEAD_INIT(&PyType_Type, 0)
4193         "str",
4194         PyStringObject_SIZE,
4195         sizeof(char),
4196         string_dealloc,                         /* tp_dealloc */
4197         (printfunc)string_print,                /* tp_print */
4198         0,                                      /* tp_getattr */
4199         0,                                      /* tp_setattr */
4200         0,                                      /* tp_compare */
4201         string_repr,                            /* tp_repr */
4202         &string_as_number,                      /* tp_as_number */
4203         &string_as_sequence,                    /* tp_as_sequence */
4204         &string_as_mapping,                     /* tp_as_mapping */
4205         (hashfunc)string_hash,                  /* tp_hash */
4206         0,                                      /* tp_call */
4207         string_str,                             /* tp_str */
4208         PyObject_GenericGetAttr,                /* tp_getattro */
4209         0,                                      /* tp_setattro */
4210         &string_as_buffer,                      /* tp_as_buffer */
4211         Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
4212                 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_STRING_SUBCLASS |
4213                 Py_TPFLAGS_HAVE_NEWBUFFER,      /* tp_flags */
4214         string_doc,                             /* tp_doc */
4215         0,                                      /* tp_traverse */
4216         0,                                      /* tp_clear */
4217         (richcmpfunc)string_richcompare,        /* tp_richcompare */
4218         0,                                      /* tp_weaklistoffset */
4219         0,                                      /* tp_iter */
4220         0,                                      /* tp_iternext */
4221         string_methods,                         /* tp_methods */
4222         0,                                      /* tp_members */
4223         0,                                      /* tp_getset */
4224         &PyBaseString_Type,                     /* tp_base */
4225         0,                                      /* tp_dict */
4226         0,                                      /* tp_descr_get */
4227         0,                                      /* tp_descr_set */
4228         0,                                      /* tp_dictoffset */
4229         0,                                      /* tp_init */
4230         0,                                      /* tp_alloc */
4231         string_new,                             /* tp_new */
4232         PyObject_Del,                           /* tp_free */
4233 };
4234
4235 void
4236 PyString_Concat(register PyObject **pv, register PyObject *w)
4237 {
4238         register PyObject *v;
4239         if (*pv == NULL)
4240                 return;
4241         if (w == NULL || !PyString_Check(*pv)) {
4242                 Py_DECREF(*pv);
4243                 *pv = NULL;
4244                 return;
4245         }
4246         v = string_concat((PyStringObject *) *pv, w);
4247         Py_DECREF(*pv);
4248         *pv = v;
4249 }
4250
4251 void
4252 PyString_ConcatAndDel(register PyObject **pv, register PyObject *w)
4253 {
4254         PyString_Concat(pv, w);
4255         Py_XDECREF(w);
4256 }
4257
4258
4259 /* The following function breaks the notion that strings are immutable:
4260    it changes the size of a string.  We get away with this only if there
4261    is only one module referencing the object.  You can also think of it
4262    as creating a new string object and destroying the old one, only
4263    more efficiently.  In any case, don't use this if the string may
4264    already be known to some other part of the code...
4265    Note that if there's not enough memory to resize the string, the original
4266    string object at *pv is deallocated, *pv is set to NULL, an "out of
4267    memory" exception is set, and -1 is returned.  Else (on success) 0 is
4268    returned, and the value in *pv may or may not be the same as on input.
4269    As always, an extra byte is allocated for a trailing \0 byte (newsize
4270    does *not* include that), and a trailing \0 byte is stored.
4271 */
4272
4273 int
4274 _PyString_Resize(PyObject **pv, Py_ssize_t newsize)
4275 {
4276         register PyObject *v;
4277         register PyStringObject *sv;
4278         v = *pv;
4279         if (!PyString_Check(v) || Py_REFCNT(v) != 1 || newsize < 0 ||
4280             PyString_CHECK_INTERNED(v)) {
4281                 *pv = 0;
4282                 Py_DECREF(v);
4283                 PyErr_BadInternalCall();
4284                 return -1;
4285         }
4286         /* XXX UNREF/NEWREF interface should be more symmetrical */
4287         _Py_DEC_REFTOTAL;
4288         _Py_ForgetReference(v);
4289         *pv = (PyObject *)
4290                 PyObject_REALLOC((char *)v, PyStringObject_SIZE + newsize);
4291         if (*pv == NULL) {
4292                 PyObject_Del(v);
4293                 PyErr_NoMemory();
4294                 return -1;
4295         }
4296         _Py_NewReference(*pv);
4297         sv = (PyStringObject *) *pv;
4298         Py_SIZE(sv) = newsize;
4299         sv->ob_sval[newsize] = '\0';
4300         sv->ob_shash = -1;      /* invalidate cached hash value */
4301         return 0;
4302 }
4303
4304 /* Helpers for formatstring */
4305
4306 Py_LOCAL_INLINE(PyObject *)
4307 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
4308 {
4309         Py_ssize_t argidx = *p_argidx;
4310         if (argidx < arglen) {
4311                 (*p_argidx)++;
4312                 if (arglen < 0)
4313                         return args;
4314                 else
4315                         return PyTuple_GetItem(args, argidx);
4316         }
4317         PyErr_SetString(PyExc_TypeError,
4318                         "not enough arguments for format string");
4319         return NULL;
4320 }
4321
/* Bit flags recorded while parsing a conversion specifier.  Each bit
 * stands for one flag character that may follow the '%'.
 */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4334
4335 Py_LOCAL_INLINE(int)
4336 formatfloat(char *buf, size_t buflen, int flags,
4337             int prec, int type, PyObject *v)
4338 {
4339         double x;
4340         x = PyFloat_AsDouble(v);
4341         if (x == -1.0 && PyErr_Occurred()) {
4342                 PyErr_Format(PyExc_TypeError, "float argument required, "
4343                              "not %.200s", Py_TYPE(v)->tp_name);
4344                 return -1;
4345         }
4346         if (prec < 0)
4347                 prec = 6;
4348 #if SIZEOF_INT > 4
4349         /* make sure that the decimal representation of precision really does
4350            need at most 10 digits: platforms with sizeof(int) == 8 exist! */
4351         if (prec > 0x7fffffff) {
4352                 PyErr_SetString(PyExc_OverflowError,
4353                                 "outrageously large precision "
4354                                 "for formatted float");
4355                 return -1;
4356         }
4357 #endif
4358
4359         if (type == 'f' && fabs(x) >= 1e50)
4360                 type = 'g';
4361         /* Worst case length calc to ensure no buffer overrun:
4362
4363            'g' formats:
4364              fmt = %#.<prec>g
4365              buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4366                 for any double rep.)
4367              len = 1 + prec + 1 + 2 + 5 = 9 + prec
4368
4369            'f' formats:
4370              buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
4371              len = 1 + 50 + 1 + prec = 52 + prec
4372
4373            If prec=0 the effective precision is 1 (the leading digit is
4374            always given), therefore increase the length by one.
4375
4376         */
4377         if (((type == 'g' || type == 'G') &&
4378               buflen <= (size_t)10 + (size_t)prec) ||
4379             (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
4380                 PyErr_SetString(PyExc_OverflowError,
4381                         "formatted float is too long (precision too large?)");
4382                 return -1;
4383         }
4384         _PyOS_double_to_string(buf, buflen, x, type, prec,
4385                             (flags&F_ALT)?Py_DTSF_ALT:0, NULL);
4386         return (int)strlen(buf);
4387 }
4388
4389 /* _PyString_FormatLong emulates the format codes d, u, o, x and X, and
4390  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
4391  * Python's regular ints.
4392  * Return value:  a new PyString*, or NULL if error.
4393  *  .  *pbuf is set to point into it,
4394  *     *plen set to the # of chars following that.
4395  *     Caller must decref it when done using pbuf.
4396  *     The string starting at *pbuf is of the form
4397  *         "-"? ("0x" | "0X")? digit+
4398  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
4399  *         set in flags.  The case of hex digits will be correct,
4400  *     There will be at least prec digits, zero-filled on the left if
4401  *         necessary to get that many.
4402  * val          object to be converted
4403  * flags        bitmask of format flags; only F_ALT is looked at
4404  * prec         minimum number of digits; 0-fill on left if needed
4405  * type         a character in [duoxX]; u acts the same as d
4406  *
4407  * CAUTION:  o, x and X conversions on regular ints can never
4408  * produce a '-' sign, but can for Python's unbounded ints.
4409  */
4410 PyObject*
4411 _PyString_FormatLong(PyObject *val, int flags, int prec, int type,
4412                      char **pbuf, int *plen)
4413 {
4414         PyObject *result = NULL;
4415         char *buf;
4416         Py_ssize_t i;
4417         int sign;       /* 1 if '-', else 0 */
4418         int len;        /* number of characters */
4419         Py_ssize_t llen;
4420         int numdigits;  /* len == numnondigits + numdigits */
4421         int numnondigits = 0;
4422
4423         switch (type) {
4424         case 'd':
4425         case 'u':
4426                 result = Py_TYPE(val)->tp_str(val);
4427                 break;
4428         case 'o':
4429                 result = Py_TYPE(val)->tp_as_number->nb_oct(val);
4430                 break;
4431         case 'x':
4432         case 'X':
4433                 numnondigits = 2;
4434                 result = Py_TYPE(val)->tp_as_number->nb_hex(val);
4435                 break;
4436         default:
4437                 assert(!"'type' not in [duoxX]");
4438         }
4439         if (!result)
4440                 return NULL;
4441
4442         buf = PyString_AsString(result);
4443         if (!buf) {
4444                 Py_DECREF(result);
4445                 return NULL;
4446         }
4447
4448         /* To modify the string in-place, there can only be one reference. */
4449         if (Py_REFCNT(result) != 1) {
4450                 PyErr_BadInternalCall();
4451                 return NULL;
4452         }
4453         llen = PyString_Size(result);
4454         if (llen > INT_MAX) {
4455                 PyErr_SetString(PyExc_ValueError, "string too large in _PyString_FormatLong");
4456                 return NULL;
4457         }
4458         len = (int)llen;
4459         if (buf[len-1] == 'L') {
4460                 --len;
4461                 buf[len] = '\0';
4462         }
4463         sign = buf[0] == '-';
4464         numnondigits += sign;
4465         numdigits = len - numnondigits;
4466         assert(numdigits > 0);
4467
4468         /* Get rid of base marker unless F_ALT */
4469         if ((flags & F_ALT) == 0) {
4470                 /* Need to skip 0x, 0X or 0. */
4471                 int skipped = 0;
4472                 switch (type) {
4473                 case 'o':
4474                         assert(buf[sign] == '0');
4475                         /* If 0 is only digit, leave it alone. */
4476                         if (numdigits > 1) {
4477                                 skipped = 1;
4478                                 --numdigits;
4479                         }
4480                         break;
4481                 case 'x':
4482                 case 'X':
4483                         assert(buf[sign] == '0');
4484                         assert(buf[sign + 1] == 'x');
4485                         skipped = 2;
4486                         numnondigits -= 2;
4487                         break;
4488                 }
4489                 if (skipped) {
4490                         buf += skipped;
4491                         len -= skipped;
4492                         if (sign)
4493                                 buf[0] = '-';
4494                 }
4495                 assert(len == numnondigits + numdigits);
4496                 assert(numdigits > 0);
4497         }
4498
4499         /* Fill with leading zeroes to meet minimum width. */
4500         if (prec > numdigits) {
4501                 PyObject *r1 = PyString_FromStringAndSize(NULL,
4502                                         numnondigits + prec);
4503                 char *b1;
4504                 if (!r1) {
4505                         Py_DECREF(result);
4506                         return NULL;
4507                 }
4508                 b1 = PyString_AS_STRING(r1);
4509                 for (i = 0; i < numnondigits; ++i)
4510                         *b1++ = *buf++;
4511                 for (i = 0; i < prec - numdigits; i++)
4512                         *b1++ = '0';
4513                 for (i = 0; i < numdigits; i++)
4514                         *b1++ = *buf++;
4515                 *b1 = '\0';
4516                 Py_DECREF(result);
4517                 result = r1;
4518                 buf = PyString_AS_STRING(result);
4519                 len = numnondigits + prec;
4520         }
4521
4522         /* Fix up case for hex conversions. */
4523         if (type == 'X') {
4524                 /* Need to convert all lower case letters to upper case.
4525                    and need to convert 0x to 0X (and -0x to -0X). */
4526                 for (i = 0; i < len; i++)
4527                         if (buf[i] >= 'a' && buf[i] <= 'x')
4528                                 buf[i] -= 'a'-'A';
4529         }
4530         *pbuf = buf;
4531         *plen = len;
4532         return result;
4533 }
4534
4535 Py_LOCAL_INLINE(int)
4536 formatint(char *buf, size_t buflen, int flags,
4537           int prec, int type, PyObject *v)
4538 {
4539         /* fmt = '%#.' + `prec` + 'l' + `type`
4540            worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4541            + 1 + 1 = 24 */
4542         char fmt[64];   /* plenty big enough! */
4543         char *sign;
4544         long x;
4545
4546         x = PyInt_AsLong(v);
4547         if (x == -1 && PyErr_Occurred()) {
4548                 PyErr_Format(PyExc_TypeError, "int argument required, not %.200s",
4549                              Py_TYPE(v)->tp_name);
4550                 return -1;
4551         }
4552         if (x < 0 && type == 'u') {
4553                 type = 'd';
4554         }
4555         if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
4556                 sign = "-";
4557         else
4558                 sign = "";
4559         if (prec < 0)
4560                 prec = 1;
4561
4562         if ((flags & F_ALT) &&
4563             (type == 'x' || type == 'X')) {
4564                 /* When converting under %#x or %#X, there are a number
4565                  * of issues that cause pain:
4566                  * - when 0 is being converted, the C standard leaves off
4567                  *   the '0x' or '0X', which is inconsistent with other
4568                  *   %#x/%#X conversions and inconsistent with Python's
4569                  *   hex() function
4570                  * - there are platforms that violate the standard and
4571                  *   convert 0 with the '0x' or '0X'
4572                  *   (Metrowerks, Compaq Tru64)
4573                  * - there are platforms that give '0x' when converting
4574                  *   under %#X, but convert 0 in accordance with the
4575                  *   standard (OS/2 EMX)
4576                  *
4577                  * We can achieve the desired consistency by inserting our
4578                  * own '0x' or '0X' prefix, and substituting %x/%X in place
4579                  * of %#x/%#X.
4580                  *
4581                  * Note that this is the same approach as used in
4582                  * formatint() in unicodeobject.c
4583                  */
4584                 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
4585                               sign, type, prec, type);
4586         }
4587         else {
4588                 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
4589                               sign, (flags&F_ALT) ? "#" : "",
4590                               prec, type);
4591         }
4592
4593         /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
4594          * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
4595          */
4596         if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
4597                 PyErr_SetString(PyExc_OverflowError,
4598                     "formatted integer is too long (precision too large?)");
4599                 return -1;
4600         }
4601         if (sign[0])
4602                 PyOS_snprintf(buf, buflen, fmt, -x);
4603         else
4604                 PyOS_snprintf(buf, buflen, fmt, x);
4605         return (int)strlen(buf);
4606 }
4607
4608 Py_LOCAL_INLINE(int)
4609 formatchar(char *buf, size_t buflen, PyObject *v)
4610 {
4611         /* presume that the buffer is at least 2 characters long */
4612         if (PyString_Check(v)) {
4613                 if (!PyArg_Parse(v, "c;%c requires int or char", &buf[0]))
4614                         return -1;
4615         }
4616         else {
4617                 if (!PyArg_Parse(v, "b;%c requires int or char", &buf[0]))
4618                         return -1;
4619         }
4620         buf[1] = '\0';
4621         return 1;
4622 }
4623
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesised so the macro behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
4633
4634 PyObject *
4635 PyString_Format(PyObject *format, PyObject *args)
4636 {
4637         char *fmt, *res;
4638         Py_ssize_t arglen, argidx;
4639         Py_ssize_t reslen, rescnt, fmtcnt;
4640         int args_owned = 0;
4641         PyObject *result, *orig_args;
4642 #ifdef Py_USING_UNICODE
4643         PyObject *v, *w;
4644 #endif
4645         PyObject *dict = NULL;
4646         if (format == NULL || !PyString_Check(format) || args == NULL) {
4647                 PyErr_BadInternalCall();
4648                 return NULL;
4649         }
4650         orig_args = args;
4651         fmt = PyString_AS_STRING(format);
4652         fmtcnt = PyString_GET_SIZE(format);
4653         reslen = rescnt = fmtcnt + 100;
4654         result = PyString_FromStringAndSize((char *)NULL, reslen);
4655         if (result == NULL)
4656                 return NULL;
4657         res = PyString_AsString(result);
4658         if (PyTuple_Check(args)) {
4659                 arglen = PyTuple_GET_SIZE(args);
4660                 argidx = 0;
4661         }
4662         else {
4663                 arglen = -1;
4664                 argidx = -2;
4665         }
4666         if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
4667             !PyObject_TypeCheck(args, &PyBaseString_Type))
4668                 dict = args;
4669         while (--fmtcnt >= 0) {
4670                 if (*fmt != '%') {
4671                         if (--rescnt < 0) {
4672                                 rescnt = fmtcnt + 100;
4673                                 reslen += rescnt;
4674                                 if (_PyString_Resize(&result, reslen) < 0)
4675                                         return NULL;
4676                                 res = PyString_AS_STRING(result)
4677                                         + reslen - rescnt;
4678                                 --rescnt;
4679                         }
4680                         *res++ = *fmt++;
4681                 }
4682                 else {
4683                         /* Got a format specifier */
4684                         int flags = 0;
4685                         Py_ssize_t width = -1;
4686                         int prec = -1;
4687                         int c = '\0';
4688                         int fill;
4689                         int isnumok;
4690                         PyObject *v = NULL;
4691                         PyObject *temp = NULL;
4692                         char *pbuf;
4693                         int sign;
4694                         Py_ssize_t len;
4695                         char formatbuf[FORMATBUFLEN];
4696                              /* For format{float,int,char}() */
4697 #ifdef Py_USING_UNICODE
4698                         char *fmt_start = fmt;
4699                         Py_ssize_t argidx_start = argidx;
4700 #endif
4701
4702                         fmt++;
4703                         if (*fmt == '(') {
4704                                 char *keystart;
4705                                 Py_ssize_t keylen;
4706                                 PyObject *key;
4707                                 int pcount = 1;
4708
4709                                 if (dict == NULL) {
4710                                         PyErr_SetString(PyExc_TypeError,
4711                                                  "format requires a mapping");
4712                                         goto error;
4713                                 }
4714                                 ++fmt;
4715                                 --fmtcnt;
4716                                 keystart = fmt;
4717                                 /* Skip over balanced parentheses */
4718                                 while (pcount > 0 && --fmtcnt >= 0) {
4719                                         if (*fmt == ')')
4720                                                 --pcount;
4721                                         else if (*fmt == '(')
4722                                                 ++pcount;
4723                                         fmt++;
4724                                 }
4725                                 keylen = fmt - keystart - 1;
4726                                 if (fmtcnt < 0 || pcount > 0) {
4727                                         PyErr_SetString(PyExc_ValueError,
4728                                                    "incomplete format key");
4729                                         goto error;
4730                                 }
4731                                 key = PyString_FromStringAndSize(keystart,
4732                                                                  keylen);
4733                                 if (key == NULL)
4734                                         goto error;
4735                                 if (args_owned) {
4736                                         Py_DECREF(args);
4737                                         args_owned = 0;
4738                                 }
4739                                 args = PyObject_GetItem(dict, key);
4740                                 Py_DECREF(key);
4741                                 if (args == NULL) {
4742                                         goto error;
4743                                 }
4744                                 args_owned = 1;
4745                                 arglen = -1;
4746                                 argidx = -2;
4747                         }
4748                         while (--fmtcnt >= 0) {
4749                                 switch (c = *fmt++) {
4750                                 case '-': flags |= F_LJUST; continue;
4751                                 case '+': flags |= F_SIGN; continue;
4752                                 case ' ': flags |= F_BLANK; continue;
4753                                 case '#': flags |= F_ALT; continue;
4754                                 case '0': flags |= F_ZERO; continue;
4755                                 }
4756                                 break;
4757                         }
4758                         if (c == '*') {
4759                                 v = getnextarg(args, arglen, &argidx);
4760                                 if (v == NULL)
4761                                         goto error;
4762                                 if (!PyInt_Check(v)) {
4763                                         PyErr_SetString(PyExc_TypeError,
4764                                                         "* wants int");
4765                                         goto error;
4766                                 }
4767                                 width = PyInt_AsLong(v);
4768                                 if (width < 0) {
4769                                         flags |= F_LJUST;
4770                                         width = -width;
4771                                 }
4772                                 if (--fmtcnt >= 0)
4773                                         c = *fmt++;
4774                         }
4775                         else if (c >= 0 && isdigit(c)) {
4776                                 width = c - '0';
4777                                 while (--fmtcnt >= 0) {
4778                                         c = Py_CHARMASK(*fmt++);
4779                                         if (!isdigit(c))
4780                                                 break;
4781                                         if ((width*10) / 10 != width) {
4782                                                 PyErr_SetString(
4783                                                         PyExc_ValueError,
4784                                                         "width too big");
4785                                                 goto error;
4786                                         }
4787                                         width = width*10 + (c - '0');
4788                                 }
4789                         }
4790                         if (c == '.') {
4791                                 prec = 0;
4792                                 if (--fmtcnt >= 0)
4793                                         c = *fmt++;
4794                                 if (c == '*') {
4795                                         v = getnextarg(args, arglen, &argidx);
4796                                         if (v == NULL)
4797                                                 goto error;
4798                                         if (!PyInt_Check(v)) {
4799                                                 PyErr_SetString(
4800                                                         PyExc_TypeError,
4801                                                         "* wants int");
4802                                                 goto error;
4803                                         }
4804                                         prec = PyInt_AsLong(v);
4805                                         if (prec < 0)
4806                                                 prec = 0;
4807                                         if (--fmtcnt >= 0)
4808                                                 c = *fmt++;
4809                                 }
4810                                 else if (c >= 0 && isdigit(c)) {
4811                                         prec = c - '0';
4812                                         while (--fmtcnt >= 0) {
4813                                                 c = Py_CHARMASK(*fmt++);
4814                                                 if (!isdigit(c))
4815                                                         break;
4816                                                 if ((prec*10) / 10 != prec) {
4817                                                         PyErr_SetString(
4818                                                             PyExc_ValueError,
4819                                                             "prec too big");
4820                                                         goto error;
4821                                                 }
4822                                                 prec = prec*10 + (c - '0');
4823                                         }
4824                                 }
4825                         } /* prec */
4826                         if (fmtcnt >= 0) {
4827                                 if (c == 'h' || c == 'l' || c == 'L') {
4828                                         if (--fmtcnt >= 0)
4829                                                 c = *fmt++;
4830                                 }
4831                         }
4832                         if (fmtcnt < 0) {
4833                                 PyErr_SetString(PyExc_ValueError,
4834                                                 "incomplete format");
4835                                 goto error;
4836                         }
4837                         if (c != '%') {
4838                                 v = getnextarg(args, arglen, &argidx);
4839                                 if (v == NULL)
4840                                         goto error;
4841                         }
4842                         sign = 0;
4843                         fill = ' ';
4844                         switch (c) {
4845                         case '%':
4846                                 pbuf = "%";
4847                                 len = 1;
4848                                 break;
4849                         case 's':
4850 #ifdef Py_USING_UNICODE
4851                                 if (PyUnicode_Check(v)) {
4852                                         fmt = fmt_start;
4853                                         argidx = argidx_start;
4854                                         goto unicode;
4855                                 }
4856 #endif
4857                                 temp = _PyObject_Str(v);
4858 #ifdef Py_USING_UNICODE
4859                                 if (temp != NULL && PyUnicode_Check(temp)) {
4860                                         Py_DECREF(temp);
4861                                         fmt = fmt_start;
4862                                         argidx = argidx_start;
4863                                         goto unicode;
4864                                 }
4865 #endif
4866                                 /* Fall through */
4867                         case 'r':
4868                                 if (c == 'r')
4869                                         temp = PyObject_Repr(v);
4870                                 if (temp == NULL)
4871                                         goto error;
4872                                 if (!PyString_Check(temp)) {
4873                                         PyErr_SetString(PyExc_TypeError,
4874                                           "%s argument has non-string str()");
4875                                         Py_DECREF(temp);
4876                                         goto error;
4877                                 }
4878                                 pbuf = PyString_AS_STRING(temp);
4879                                 len = PyString_GET_SIZE(temp);
4880                                 if (prec >= 0 && len > prec)
4881                                         len = prec;
4882                                 break;
4883                         case 'i':
4884                         case 'd':
4885                         case 'u':
4886                         case 'o':
4887                         case 'x':
4888                         case 'X':
4889                                 if (c == 'i')
4890                                         c = 'd';
4891                                 isnumok = 0;
4892                                 if (PyNumber_Check(v)) {
4893                                         PyObject *iobj=NULL;
4894
4895                                         if (PyInt_Check(v) || (PyLong_Check(v))) {
4896                                                 iobj = v;
4897                                                 Py_INCREF(iobj);
4898                                         }
4899                                         else {
4900                                                 iobj = PyNumber_Int(v);
4901                                                 if (iobj==NULL) iobj = PyNumber_Long(v);
4902                                         }
4903                                         if (iobj!=NULL) {
4904                                                 if (PyInt_Check(iobj)) {
4905                                                         isnumok = 1;
4906                                                         pbuf = formatbuf;
4907                                                         len = formatint(pbuf,
4908                                                                         sizeof(formatbuf),
4909                                                                         flags, prec, c, iobj);
4910                                                         Py_DECREF(iobj);
4911                                                         if (len < 0)
4912                                                                 goto error;
4913                                                         sign = 1;
4914                                                 }
4915                                                 else if (PyLong_Check(iobj)) {
4916                                                         int ilen;
4917
4918                                                         isnumok = 1;
4919                                                         temp = _PyString_FormatLong(iobj, flags,
4920                                                                 prec, c, &pbuf, &ilen);
4921                                                         Py_DECREF(iobj);
4922                                                         len = ilen;
4923                                                         if (!temp)
4924                                                                 goto error;
4925                                                         sign = 1;
4926                                                 }
4927                                                 else {
4928                                                         Py_DECREF(iobj);
4929                                                 }
4930                                         }
4931                                 }
4932                                 if (!isnumok) {
4933                                         PyErr_Format(PyExc_TypeError,
4934                                             "%%%c format: a number is required, "
4935                                             "not %.200s", c, Py_TYPE(v)->tp_name);
4936                                         goto error;
4937                                 }
4938                                 if (flags & F_ZERO)
4939                                         fill = '0';
4940                                 break;
4941                         case 'e':
4942                         case 'E':
4943                         case 'f':
4944                         case 'F':
4945                         case 'g':
4946                         case 'G':
4947                                 if (c == 'F')
4948                                         c = 'f';
4949                                 pbuf = formatbuf;
4950                                 len = formatfloat(pbuf, sizeof(formatbuf),
4951                                                   flags, prec, c, v);
4952                                 if (len < 0)
4953                                         goto error;
4954                                 sign = 1;
4955                                 if (flags & F_ZERO)
4956                                         fill = '0';
4957                                 break;
4958                         case 'c':
4959 #ifdef Py_USING_UNICODE
4960                                 if (PyUnicode_Check(v)) {
4961                                         fmt = fmt_start;
4962                                         argidx = argidx_start;
4963                                         goto unicode;
4964                                 }
4965 #endif
4966                                 pbuf = formatbuf;
4967                                 len = formatchar(pbuf, sizeof(formatbuf), v);
4968                                 if (len < 0)
4969                                         goto error;
4970                                 break;
4971                         default:
4972                                 PyErr_Format(PyExc_ValueError,
4973                                   "unsupported format character '%c' (0x%x) "
4974                                   "at index %zd",
4975                                   c, c,
4976                                   (Py_ssize_t)(fmt - 1 -
4977                                                PyString_AsString(format)));
4978                                 goto error;
4979                         }
4980                         if (sign) {
4981                                 if (*pbuf == '-' || *pbuf == '+') {
4982                                         sign = *pbuf++;
4983                                         len--;
4984                                 }
4985                                 else if (flags & F_SIGN)
4986                                         sign = '+';
4987                                 else if (flags & F_BLANK)
4988                                         sign = ' ';
4989                                 else
4990                                         sign = 0;
4991                         }
4992                         if (width < len)
4993                                 width = len;
4994                         if (rescnt - (sign != 0) < width) {
4995                                 reslen -= rescnt;
4996                                 rescnt = width + fmtcnt + 100;
4997                                 reslen += rescnt;
4998                                 if (reslen < 0) {
4999                                         Py_DECREF(result);
5000                                         Py_XDECREF(temp);
5001                                         return PyErr_NoMemory();
5002                                 }
5003                                 if (_PyString_Resize(&result, reslen) < 0) {
5004                                         Py_XDECREF(temp);
5005                                         return NULL;
5006                                 }
5007                                 res = PyString_AS_STRING(result)
5008                                         + reslen - rescnt;
5009                         }
5010                         if (sign) {
5011                                 if (fill != ' ')
5012                                         *res++ = sign;
5013                                 rescnt--;
5014                                 if (width > len)
5015                                         width--;
5016                         }
5017                         if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5018                                 assert(pbuf[0] == '0');
5019                                 assert(pbuf[1] == c);
5020                                 if (fill != ' ') {
5021                                         *res++ = *pbuf++;
5022                                         *res++ = *pbuf++;
5023                                 }
5024                                 rescnt -= 2;
5025                                 width -= 2;
5026                                 if (width < 0)
5027                                         width = 0;
5028                                 len -= 2;
5029                         }
5030                         if (width > len && !(flags & F_LJUST)) {
5031                                 do {
5032                                         --rescnt;
5033                                         *res++ = fill;
5034                                 } while (--width > len);
5035                         }
5036                         if (fill == ' ') {
5037                                 if (sign)
5038                                         *res++ = sign;
5039                                 if ((flags & F_ALT) &&
5040                                     (c == 'x' || c == 'X')) {
5041                                         assert(pbuf[0] == '0');
5042                                         assert(pbuf[1] == c);
5043                                         *res++ = *pbuf++;
5044                                         *res++ = *pbuf++;
5045                                 }
5046                         }
5047                         Py_MEMCPY(res, pbuf, len);
5048                         res += len;
5049                         rescnt -= len;
5050                         while (--width >= len) {
5051                                 --rescnt;
5052                                 *res++ = ' ';
5053                         }
5054                         if (dict && (argidx < arglen) && c != '%') {
5055                                 PyErr_SetString(PyExc_TypeError,
5056                                            "not all arguments converted during string formatting");
5057                                 Py_XDECREF(temp);
5058                                 goto error;
5059                         }
5060                         Py_XDECREF(temp);
5061                 } /* '%' */
5062         } /* until end */
5063         if (argidx < arglen && !dict) {
5064                 PyErr_SetString(PyExc_TypeError,
5065                                 "not all arguments converted during string formatting");
5066                 goto error;
5067         }
5068         if (args_owned) {
5069                 Py_DECREF(args);
5070         }
5071         _PyString_Resize(&result, reslen - rescnt);
5072         return result;
5073
5074 #ifdef Py_USING_UNICODE
5075  unicode:
5076         if (args_owned) {
5077                 Py_DECREF(args);
5078                 args_owned = 0;
5079         }
5080         /* Fiddle args right (remove the first argidx arguments) */
5081         if (PyTuple_Check(orig_args) && argidx > 0) {
5082                 PyObject *v;
5083                 Py_ssize_t n = PyTuple_GET_SIZE(orig_args) - argidx;
5084                 v = PyTuple_New(n);
5085                 if (v == NULL)
5086                         goto error;
5087                 while (--n >= 0) {
5088                         PyObject *w = PyTuple_GET_ITEM(orig_args, n + argidx);
5089                         Py_INCREF(w);
5090                         PyTuple_SET_ITEM(v, n, w);
5091                 }
5092                 args = v;
5093         } else {
5094                 Py_INCREF(orig_args);
5095                 args = orig_args;
5096         }
5097         args_owned = 1;
5098         /* Take what we have of the result and let the Unicode formatting
5099            function format the rest of the input. */
5100         rescnt = res - PyString_AS_STRING(result);
5101         if (_PyString_Resize(&result, rescnt))
5102                 goto error;
5103         fmtcnt = PyString_GET_SIZE(format) - \
5104                  (fmt - PyString_AS_STRING(format));
5105         format = PyUnicode_Decode(fmt, fmtcnt, NULL, NULL);
5106         if (format == NULL)
5107                 goto error;
5108         v = PyUnicode_Format(format, args);
5109         Py_DECREF(format);
5110         if (v == NULL)
5111                 goto error;
5112         /* Paste what we have (result) to what the Unicode formatting
5113            function returned (v) and return the result (or error) */
5114         w = PyUnicode_Concat(result, v);
5115         Py_DECREF(result);
5116         Py_DECREF(v);
5117         Py_DECREF(args);
5118         return w;
5119 #endif /* Py_USING_UNICODE */
5120
5121  error:
5122         Py_DECREF(result);
5123         if (args_owned) {
5124                 Py_DECREF(args);
5125         }
5126         return NULL;
5127 }
5128
/* Intern the string *p in place, using the file-level `interned` dict.

   *p must be a non-NULL string object; anything else is a fatal error.
   On return, *p is one of:
     - the original object, untouched (string subclass, already interned,
       or the dict could not be created/updated -- those errors are
       cleared here and never reported to the caller);
     - the original object, now flagged SSTATE_INTERNED_MORTAL;
     - the equal string already stored in `interned`; in that case the
       reference the caller held on the old object has been dropped and
       replaced by a new reference to the stored one.
*/
5129 void
5130 PyString_InternInPlace(PyObject **p)
5131 {
5132         register PyStringObject *s = (PyStringObject *)(*p);
5133         PyObject *t;
5134         if (s == NULL || !PyString_Check(s))
5135                 Py_FatalError("PyString_InternInPlace: strings only please!");
5136         /* If it's a string subclass, we don't really know what putting
5137            it in the interned dict might do. */
5138         if (!PyString_CheckExact(s))
5139                 return;
             /* Nothing to do if this object already carries an interned state. */
5140         if (PyString_CHECK_INTERNED(s))
5141                 return;
             /* Create the interned dict on first use. */
5142         if (interned == NULL) {
5143                 interned = PyDict_New();
5144                 if (interned == NULL) {
5145                         PyErr_Clear(); /* Don't leave an exception */
5146                         return;
5147                 }
5148         }
             /* If an equal string is already in the dict, hand that one back:
                take a reference to it before releasing the one held via *p. */
5149         t = PyDict_GetItem(interned, (PyObject *)s);
5150         if (t) {
5151                 Py_INCREF(t);
5152                 Py_DECREF(*p);
5153                 *p = t;
5154                 return;
5155         }
5156
             /* First occurrence: store s as both key and value.  A failed
                insert is swallowed and leaves s un-interned. */
5157         if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
5158                 PyErr_Clear();
5159                 return;
5160         }
5161         /* The two references in interned are not counted by refcnt.
5162            The string deallocator will take care of this */
5163         Py_REFCNT(s) -= 2;
5164         PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
5165 }
5166
/* Intern *p via PyString_InternInPlace(), then flag the resulting object
   SSTATE_INTERNED_IMMORTAL and take one additional reference on it,
   unless it is already flagged immortal.

   NOTE(review): the flag is set from whatever *p is after the call, even
   when PyString_InternInPlace() returned without interning (it returns
   silently for string subclasses and when the dict cannot be created or
   updated) -- confirm that marking such objects immortal is intended.
*/
5167 void
5168 PyString_InternImmortal(PyObject **p)
5169 {
5170         PyString_InternInPlace(p);
5171         if (PyString_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
5172                 PyString_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
                     /* One extra reference per immortal string;
                        _Py_ReleaseInternedStrings() accounts for it by
                        restoring 1 instead of 2 for this state. */
5173                 Py_INCREF(*p);
5174         }
5175 }
5176
5177
/* Build a string object from the C string cp with PyString_FromString()
   and pass it through PyString_InternInPlace().

   Returns NULL if the string object could not be created; otherwise
   returns whatever object PyString_InternInPlace() left in place (the new
   string, or the equal string that was already interned).
*/
5178 PyObject *
5179 PyString_InternFromString(const char *cp)
5180 {
5181         PyObject *s = PyString_FromString(cp);
5182         if (s == NULL)
5183                 return NULL;
5184         PyString_InternInPlace(&s);
5185         return s;
5186 }
5187
/* Release the file-level string caches: drop the reference held in every
   slot of `characters` and in `nullstring`, and reset each to NULL.
   Empty slots are skipped by Py_XDECREF.

   NOTE(review): presumably `characters` caches one-byte strings and
   `nullstring` the empty string -- the code that fills them is not in
   this part of the file; confirm there.
*/
5188 void
5189 PyString_Fini(void)
5190 {
5191         int i;
5192         for (i = 0; i < UCHAR_MAX + 1; i++) {
5193                 Py_XDECREF(characters[i]);
5194                 characters[i] = NULL;
5195         }
5196         Py_XDECREF(nullstring);
5197         nullstring = NULL;
5198 }
5199
/* Undo string interning: give every string in `interned` back the
   references that interning removed from its refcount, mark it
   SSTATE_NOT_INTERNED, then clear and release the dict and reset
   `interned` to NULL.

   Writes two diagnostic lines to stderr: the number of strings released
   and the summed Py_SIZE of mortal vs. immortal strings.  Does nothing if
   `interned` is NULL or not a dict; an error from PyDict_Keys() is
   cleared and the function returns without changing anything.
*/
5200 void _Py_ReleaseInternedStrings(void)
5201 {
5202         PyObject *keys;
5203         PyStringObject *s;
5204         Py_ssize_t i, n;
5205         Py_ssize_t immortal_size = 0, mortal_size = 0;
5206
5207         if (interned == NULL || !PyDict_Check(interned))
5208                 return;
5209         keys = PyDict_Keys(interned);
             /* NOTE(review): if keys is non-NULL but not a list, it is not
                released on this path -- confirm whether that can happen. */
5210         if (keys == NULL || !PyList_Check(keys)) {
5211                 PyErr_Clear();
5212                 return;
5213         }
5214
5215         /* Since _Py_ReleaseInternedStrings() is intended to help a leak
5216            detector, interned strings are not forcibly deallocated; rather, we
5217            give them their stolen references back, and then clear and DECREF
5218            the interned dict. */
5219
5220         n = PyList_GET_SIZE(keys);
5221         fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
5222                 n);
5223         for (i = 0; i < n; i++) {
5224                 s = (PyStringObject *) PyList_GET_ITEM(keys, i);
5225                 switch (s->ob_sstate) {
5226                 case SSTATE_NOT_INTERNED:
5227                         /* XXX Shouldn't happen */
5228                         break;
5229                 case SSTATE_INTERNED_IMMORTAL:
                             /* Interning subtracted 2 and PyString_InternImmortal()
                                added 1 back, so only 1 is still missing. */
5230                         Py_REFCNT(s) += 1;
5231                         immortal_size += Py_SIZE(s);
5232                         break;
5233                 case SSTATE_INTERNED_MORTAL:
                             /* Restore the 2 subtracted by PyString_InternInPlace(). */
5234                         Py_REFCNT(s) += 2;
5235                         mortal_size += Py_SIZE(s);
5236                         break;
5237                 default:
5238                         Py_FatalError("Inconsistent interned string state.");
5239                 }
5240                 s->ob_sstate = SSTATE_NOT_INTERNED;
5241         }
5242         fprintf(stderr, "total size of all interned strings: "
5243                         "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
5244                         "mortal/immortal\n", mortal_size, immortal_size);
5245         Py_DECREF(keys);
5246         PyDict_Clear(interned);
5247         Py_DECREF(interned);
5248         interned = NULL;
5249 }