Avoid undefined behaviour due to overflow in i_divmod (Objects/intobject.c).
[python.git] / Objects / stringobject.c
blob02aabf2e4299f81b104a8e016eee7d7c29331261
1 /* String (str/bytes) object implementation */
3 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include <ctype.h>
7 #include <stddef.h>
9 #ifdef COUNT_ALLOCS
10 Py_ssize_t null_strings, one_strings;
11 #endif
13 static PyStringObject *characters[UCHAR_MAX + 1];
14 static PyStringObject *nullstring;
16 /* This dictionary holds all interned strings. Note that references to
17 strings in this dictionary are *not* counted in the string's ob_refcnt.
18 When the interned string reaches a refcnt of 0 the string deallocation
19 function will delete the reference from this dictionary.
21 Another way to look at this is to say that the actual reference
22 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
24 static PyObject *interned;
26 /* PyStringObject_SIZE gives the basic size of a string; any memory allocation
27 for a string of length n should request PyStringObject_SIZE + n bytes.
29 Using PyStringObject_SIZE instead of sizeof(PyStringObject) saves
30 3 bytes per string allocation on a typical system.
32 #define PyStringObject_SIZE (offsetof(PyStringObject, ob_sval) + 1)
35 For both PyString_FromString() and PyString_FromStringAndSize(), the
36 parameter `size' denotes number of characters to allocate, not counting any
37 null terminating character.
39 For PyString_FromString(), the parameter `str' points to a null-terminated
40 string containing exactly `size' bytes.
42 For PyString_FromStringAndSize(), the parameter `str' is
43 either NULL or else points to a string containing at least `size' bytes.
44 For PyString_FromStringAndSize(), the string in the `str' parameter does
45 not have to be null-terminated. (Therefore it is safe to construct a
46 substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
47 If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
48 bytes (setting the last byte to the null terminating character) and you can
49 fill in the data yourself. If `str' is non-NULL then the resulting
50 PyString object must be treated as immutable and you must not fill in nor
51 alter the data yourself, since the strings may be shared.
53 The PyObject member `op->ob_size', which denotes the number of "extra
54 items" in a variable-size object, will contain the number of bytes
55 allocated for string data, not counting the null terminating character. It
56 is therefore equal to the `size' parameter (for
57 PyString_FromStringAndSize()) or the length of the string in the `str'
58 parameter (for PyString_FromString()).
60 PyObject *
61 PyString_FromStringAndSize(const char *str, Py_ssize_t size)
63 register PyStringObject *op;
64 if (size < 0) {
65 PyErr_SetString(PyExc_SystemError,
66 "Negative size passed to PyString_FromStringAndSize");
67 return NULL;
69 if (size == 0 && (op = nullstring) != NULL) {
70 #ifdef COUNT_ALLOCS
71 null_strings++;
72 #endif
73 Py_INCREF(op);
74 return (PyObject *)op;
76 if (size == 1 && str != NULL &&
77 (op = characters[*str & UCHAR_MAX]) != NULL)
79 #ifdef COUNT_ALLOCS
80 one_strings++;
81 #endif
82 Py_INCREF(op);
83 return (PyObject *)op;
86 if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
87 PyErr_SetString(PyExc_OverflowError, "string is too large");
88 return NULL;
91 /* Inline PyObject_NewVar */
92 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
93 if (op == NULL)
94 return PyErr_NoMemory();
95 PyObject_INIT_VAR(op, &PyString_Type, size);
96 op->ob_shash = -1;
97 op->ob_sstate = SSTATE_NOT_INTERNED;
98 if (str != NULL)
99 Py_MEMCPY(op->ob_sval, str, size);
100 op->ob_sval[size] = '\0';
101 /* share short strings */
102 if (size == 0) {
103 PyObject *t = (PyObject *)op;
104 PyString_InternInPlace(&t);
105 op = (PyStringObject *)t;
106 nullstring = op;
107 Py_INCREF(op);
108 } else if (size == 1 && str != NULL) {
109 PyObject *t = (PyObject *)op;
110 PyString_InternInPlace(&t);
111 op = (PyStringObject *)t;
112 characters[*str & UCHAR_MAX] = op;
113 Py_INCREF(op);
115 return (PyObject *) op;
118 PyObject *
119 PyString_FromString(const char *str)
121 register size_t size;
122 register PyStringObject *op;
124 assert(str != NULL);
125 size = strlen(str);
126 if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
127 PyErr_SetString(PyExc_OverflowError,
128 "string is too long for a Python string");
129 return NULL;
131 if (size == 0 && (op = nullstring) != NULL) {
132 #ifdef COUNT_ALLOCS
133 null_strings++;
134 #endif
135 Py_INCREF(op);
136 return (PyObject *)op;
138 if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
139 #ifdef COUNT_ALLOCS
140 one_strings++;
141 #endif
142 Py_INCREF(op);
143 return (PyObject *)op;
146 /* Inline PyObject_NewVar */
147 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
148 if (op == NULL)
149 return PyErr_NoMemory();
150 PyObject_INIT_VAR(op, &PyString_Type, size);
151 op->ob_shash = -1;
152 op->ob_sstate = SSTATE_NOT_INTERNED;
153 Py_MEMCPY(op->ob_sval, str, size+1);
154 /* share short strings */
155 if (size == 0) {
156 PyObject *t = (PyObject *)op;
157 PyString_InternInPlace(&t);
158 op = (PyStringObject *)t;
159 nullstring = op;
160 Py_INCREF(op);
161 } else if (size == 1) {
162 PyObject *t = (PyObject *)op;
163 PyString_InternInPlace(&t);
164 op = (PyStringObject *)t;
165 characters[*str & UCHAR_MAX] = op;
166 Py_INCREF(op);
168 return (PyObject *) op;
171 PyObject *
172 PyString_FromFormatV(const char *format, va_list vargs)
174 va_list count;
175 Py_ssize_t n = 0;
176 const char* f;
177 char *s;
178 PyObject* string;
180 #ifdef VA_LIST_IS_ARRAY
181 Py_MEMCPY(count, vargs, sizeof(va_list));
182 #else
183 #ifdef __va_copy
184 __va_copy(count, vargs);
185 #else
186 count = vargs;
187 #endif
188 #endif
189 /* step 1: figure out how large a buffer we need */
190 for (f = format; *f; f++) {
191 if (*f == '%') {
192 #ifdef HAVE_LONG_LONG
193 int longlongflag = 0;
194 #endif
195 const char* p = f;
196 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
199 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
200 * they don't affect the amount of space we reserve.
202 if (*f == 'l') {
203 if (f[1] == 'd' || f[1] == 'u') {
204 ++f;
206 #ifdef HAVE_LONG_LONG
207 else if (f[1] == 'l' &&
208 (f[2] == 'd' || f[2] == 'u')) {
209 longlongflag = 1;
210 f += 2;
212 #endif
214 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
215 ++f;
218 switch (*f) {
219 case 'c':
220 (void)va_arg(count, int);
221 /* fall through... */
222 case '%':
223 n++;
224 break;
225 case 'd': case 'u': case 'i': case 'x':
226 (void) va_arg(count, int);
227 #ifdef HAVE_LONG_LONG
228 /* Need at most
229 ceil(log10(256)*SIZEOF_LONG_LONG) digits,
230 plus 1 for the sign. 53/22 is an upper
231 bound for log10(256). */
232 if (longlongflag)
233 n += 2 + (SIZEOF_LONG_LONG*53-1) / 22;
234 else
235 #endif
236 /* 20 bytes is enough to hold a 64-bit
237 integer. Decimal takes the most
238 space. This isn't enough for
239 octal. */
240 n += 20;
242 break;
243 case 's':
244 s = va_arg(count, char*);
245 n += strlen(s);
246 break;
247 case 'p':
248 (void) va_arg(count, int);
249 /* maximum 64-bit pointer representation:
250 * 0xffffffffffffffff
251 * so 19 characters is enough.
252 * XXX I count 18 -- what's the extra for?
254 n += 19;
255 break;
256 default:
257 /* if we stumble upon an unknown
258 formatting code, copy the rest of
259 the format string to the output
260 string. (we cannot just skip the
261 code, since there's no way to know
262 what's in the argument list) */
263 n += strlen(p);
264 goto expand;
266 } else
267 n++;
269 expand:
270 /* step 2: fill the buffer */
271 /* Since we've analyzed how much space we need for the worst case,
272 use sprintf directly instead of the slower PyOS_snprintf. */
273 string = PyString_FromStringAndSize(NULL, n);
274 if (!string)
275 return NULL;
277 s = PyString_AsString(string);
279 for (f = format; *f; f++) {
280 if (*f == '%') {
281 const char* p = f++;
282 Py_ssize_t i;
283 int longflag = 0;
284 #ifdef HAVE_LONG_LONG
285 int longlongflag = 0;
286 #endif
287 int size_tflag = 0;
288 /* parse the width.precision part (we're only
289 interested in the precision value, if any) */
290 n = 0;
291 while (isdigit(Py_CHARMASK(*f)))
292 n = (n*10) + *f++ - '0';
293 if (*f == '.') {
294 f++;
295 n = 0;
296 while (isdigit(Py_CHARMASK(*f)))
297 n = (n*10) + *f++ - '0';
299 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
300 f++;
301 /* Handle %ld, %lu, %lld and %llu. */
302 if (*f == 'l') {
303 if (f[1] == 'd' || f[1] == 'u') {
304 longflag = 1;
305 ++f;
307 #ifdef HAVE_LONG_LONG
308 else if (f[1] == 'l' &&
309 (f[2] == 'd' || f[2] == 'u')) {
310 longlongflag = 1;
311 f += 2;
313 #endif
315 /* handle the size_t flag. */
316 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
317 size_tflag = 1;
318 ++f;
321 switch (*f) {
322 case 'c':
323 *s++ = va_arg(vargs, int);
324 break;
325 case 'd':
326 if (longflag)
327 sprintf(s, "%ld", va_arg(vargs, long));
328 #ifdef HAVE_LONG_LONG
329 else if (longlongflag)
330 sprintf(s, "%" PY_FORMAT_LONG_LONG "d",
331 va_arg(vargs, PY_LONG_LONG));
332 #endif
333 else if (size_tflag)
334 sprintf(s, "%" PY_FORMAT_SIZE_T "d",
335 va_arg(vargs, Py_ssize_t));
336 else
337 sprintf(s, "%d", va_arg(vargs, int));
338 s += strlen(s);
339 break;
340 case 'u':
341 if (longflag)
342 sprintf(s, "%lu",
343 va_arg(vargs, unsigned long));
344 #ifdef HAVE_LONG_LONG
345 else if (longlongflag)
346 sprintf(s, "%" PY_FORMAT_LONG_LONG "u",
347 va_arg(vargs, PY_LONG_LONG));
348 #endif
349 else if (size_tflag)
350 sprintf(s, "%" PY_FORMAT_SIZE_T "u",
351 va_arg(vargs, size_t));
352 else
353 sprintf(s, "%u",
354 va_arg(vargs, unsigned int));
355 s += strlen(s);
356 break;
357 case 'i':
358 sprintf(s, "%i", va_arg(vargs, int));
359 s += strlen(s);
360 break;
361 case 'x':
362 sprintf(s, "%x", va_arg(vargs, int));
363 s += strlen(s);
364 break;
365 case 's':
366 p = va_arg(vargs, char*);
367 i = strlen(p);
368 if (n > 0 && i > n)
369 i = n;
370 Py_MEMCPY(s, p, i);
371 s += i;
372 break;
373 case 'p':
374 sprintf(s, "%p", va_arg(vargs, void*));
375 /* %p is ill-defined: ensure leading 0x. */
376 if (s[1] == 'X')
377 s[1] = 'x';
378 else if (s[1] != 'x') {
379 memmove(s+2, s, strlen(s)+1);
380 s[0] = '0';
381 s[1] = 'x';
383 s += strlen(s);
384 break;
385 case '%':
386 *s++ = '%';
387 break;
388 default:
389 strcpy(s, p);
390 s += strlen(s);
391 goto end;
393 } else
394 *s++ = *f;
397 end:
398 _PyString_Resize(&string, s - PyString_AS_STRING(string));
399 return string;
402 PyObject *
403 PyString_FromFormat(const char *format, ...)
405 PyObject* ret;
406 va_list vargs;
408 #ifdef HAVE_STDARG_PROTOTYPES
409 va_start(vargs, format);
410 #else
411 va_start(vargs);
412 #endif
413 ret = PyString_FromFormatV(format, vargs);
414 va_end(vargs);
415 return ret;
419 PyObject *PyString_Decode(const char *s,
420 Py_ssize_t size,
421 const char *encoding,
422 const char *errors)
424 PyObject *v, *str;
426 str = PyString_FromStringAndSize(s, size);
427 if (str == NULL)
428 return NULL;
429 v = PyString_AsDecodedString(str, encoding, errors);
430 Py_DECREF(str);
431 return v;
434 PyObject *PyString_AsDecodedObject(PyObject *str,
435 const char *encoding,
436 const char *errors)
438 PyObject *v;
440 if (!PyString_Check(str)) {
441 PyErr_BadArgument();
442 goto onError;
445 if (encoding == NULL) {
446 #ifdef Py_USING_UNICODE
447 encoding = PyUnicode_GetDefaultEncoding();
448 #else
449 PyErr_SetString(PyExc_ValueError, "no encoding specified");
450 goto onError;
451 #endif
454 /* Decode via the codec registry */
455 v = PyCodec_Decode(str, encoding, errors);
456 if (v == NULL)
457 goto onError;
459 return v;
461 onError:
462 return NULL;
465 PyObject *PyString_AsDecodedString(PyObject *str,
466 const char *encoding,
467 const char *errors)
469 PyObject *v;
471 v = PyString_AsDecodedObject(str, encoding, errors);
472 if (v == NULL)
473 goto onError;
475 #ifdef Py_USING_UNICODE
476 /* Convert Unicode to a string using the default encoding */
477 if (PyUnicode_Check(v)) {
478 PyObject *temp = v;
479 v = PyUnicode_AsEncodedString(v, NULL, NULL);
480 Py_DECREF(temp);
481 if (v == NULL)
482 goto onError;
484 #endif
485 if (!PyString_Check(v)) {
486 PyErr_Format(PyExc_TypeError,
487 "decoder did not return a string object (type=%.400s)",
488 Py_TYPE(v)->tp_name);
489 Py_DECREF(v);
490 goto onError;
493 return v;
495 onError:
496 return NULL;
499 PyObject *PyString_Encode(const char *s,
500 Py_ssize_t size,
501 const char *encoding,
502 const char *errors)
504 PyObject *v, *str;
506 str = PyString_FromStringAndSize(s, size);
507 if (str == NULL)
508 return NULL;
509 v = PyString_AsEncodedString(str, encoding, errors);
510 Py_DECREF(str);
511 return v;
514 PyObject *PyString_AsEncodedObject(PyObject *str,
515 const char *encoding,
516 const char *errors)
518 PyObject *v;
520 if (!PyString_Check(str)) {
521 PyErr_BadArgument();
522 goto onError;
525 if (encoding == NULL) {
526 #ifdef Py_USING_UNICODE
527 encoding = PyUnicode_GetDefaultEncoding();
528 #else
529 PyErr_SetString(PyExc_ValueError, "no encoding specified");
530 goto onError;
531 #endif
534 /* Encode via the codec registry */
535 v = PyCodec_Encode(str, encoding, errors);
536 if (v == NULL)
537 goto onError;
539 return v;
541 onError:
542 return NULL;
545 PyObject *PyString_AsEncodedString(PyObject *str,
546 const char *encoding,
547 const char *errors)
549 PyObject *v;
551 v = PyString_AsEncodedObject(str, encoding, errors);
552 if (v == NULL)
553 goto onError;
555 #ifdef Py_USING_UNICODE
556 /* Convert Unicode to a string using the default encoding */
557 if (PyUnicode_Check(v)) {
558 PyObject *temp = v;
559 v = PyUnicode_AsEncodedString(v, NULL, NULL);
560 Py_DECREF(temp);
561 if (v == NULL)
562 goto onError;
564 #endif
565 if (!PyString_Check(v)) {
566 PyErr_Format(PyExc_TypeError,
567 "encoder did not return a string object (type=%.400s)",
568 Py_TYPE(v)->tp_name);
569 Py_DECREF(v);
570 goto onError;
573 return v;
575 onError:
576 return NULL;
579 static void
580 string_dealloc(PyObject *op)
582 switch (PyString_CHECK_INTERNED(op)) {
583 case SSTATE_NOT_INTERNED:
584 break;
586 case SSTATE_INTERNED_MORTAL:
587 /* revive dead object temporarily for DelItem */
588 Py_REFCNT(op) = 3;
589 if (PyDict_DelItem(interned, op) != 0)
590 Py_FatalError(
591 "deletion of interned string failed");
592 break;
594 case SSTATE_INTERNED_IMMORTAL:
595 Py_FatalError("Immortal interned string died.");
597 default:
598 Py_FatalError("Inconsistent interned string state.");
600 Py_TYPE(op)->tp_free(op);
603 /* Unescape a backslash-escaped string. If unicode is non-zero,
604 the string is a u-literal. If recode_encoding is non-zero,
605 the string is UTF-8 encoded and should be re-encoded in the
606 specified encoding. */
608 PyObject *PyString_DecodeEscape(const char *s,
609 Py_ssize_t len,
610 const char *errors,
611 Py_ssize_t unicode,
612 const char *recode_encoding)
614 int c;
615 char *p, *buf;
616 const char *end;
617 PyObject *v;
618 Py_ssize_t newlen = recode_encoding ? 4*len:len;
619 v = PyString_FromStringAndSize((char *)NULL, newlen);
620 if (v == NULL)
621 return NULL;
622 p = buf = PyString_AsString(v);
623 end = s + len;
624 while (s < end) {
625 if (*s != '\\') {
626 non_esc:
627 #ifdef Py_USING_UNICODE
628 if (recode_encoding && (*s & 0x80)) {
629 PyObject *u, *w;
630 char *r;
631 const char* t;
632 Py_ssize_t rn;
633 t = s;
634 /* Decode non-ASCII bytes as UTF-8. */
635 while (t < end && (*t & 0x80)) t++;
636 u = PyUnicode_DecodeUTF8(s, t - s, errors);
637 if(!u) goto failed;
639 /* Recode them in target encoding. */
640 w = PyUnicode_AsEncodedString(
641 u, recode_encoding, errors);
642 Py_DECREF(u);
643 if (!w) goto failed;
645 /* Append bytes to output buffer. */
646 assert(PyString_Check(w));
647 r = PyString_AS_STRING(w);
648 rn = PyString_GET_SIZE(w);
649 Py_MEMCPY(p, r, rn);
650 p += rn;
651 Py_DECREF(w);
652 s = t;
653 } else {
654 *p++ = *s++;
656 #else
657 *p++ = *s++;
658 #endif
659 continue;
661 s++;
662 if (s==end) {
663 PyErr_SetString(PyExc_ValueError,
664 "Trailing \\ in string");
665 goto failed;
667 switch (*s++) {
668 /* XXX This assumes ASCII! */
669 case '\n': break;
670 case '\\': *p++ = '\\'; break;
671 case '\'': *p++ = '\''; break;
672 case '\"': *p++ = '\"'; break;
673 case 'b': *p++ = '\b'; break;
674 case 'f': *p++ = '\014'; break; /* FF */
675 case 't': *p++ = '\t'; break;
676 case 'n': *p++ = '\n'; break;
677 case 'r': *p++ = '\r'; break;
678 case 'v': *p++ = '\013'; break; /* VT */
679 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
680 case '0': case '1': case '2': case '3':
681 case '4': case '5': case '6': case '7':
682 c = s[-1] - '0';
683 if (s < end && '0' <= *s && *s <= '7') {
684 c = (c<<3) + *s++ - '0';
685 if (s < end && '0' <= *s && *s <= '7')
686 c = (c<<3) + *s++ - '0';
688 *p++ = c;
689 break;
690 case 'x':
691 if (s+1 < end &&
692 isxdigit(Py_CHARMASK(s[0])) &&
693 isxdigit(Py_CHARMASK(s[1])))
695 unsigned int x = 0;
696 c = Py_CHARMASK(*s);
697 s++;
698 if (isdigit(c))
699 x = c - '0';
700 else if (islower(c))
701 x = 10 + c - 'a';
702 else
703 x = 10 + c - 'A';
704 x = x << 4;
705 c = Py_CHARMASK(*s);
706 s++;
707 if (isdigit(c))
708 x += c - '0';
709 else if (islower(c))
710 x += 10 + c - 'a';
711 else
712 x += 10 + c - 'A';
713 *p++ = x;
714 break;
716 if (!errors || strcmp(errors, "strict") == 0) {
717 PyErr_SetString(PyExc_ValueError,
718 "invalid \\x escape");
719 goto failed;
721 if (strcmp(errors, "replace") == 0) {
722 *p++ = '?';
723 } else if (strcmp(errors, "ignore") == 0)
724 /* do nothing */;
725 else {
726 PyErr_Format(PyExc_ValueError,
727 "decoding error; "
728 "unknown error handling code: %.400s",
729 errors);
730 goto failed;
732 #ifndef Py_USING_UNICODE
733 case 'u':
734 case 'U':
735 case 'N':
736 if (unicode) {
737 PyErr_SetString(PyExc_ValueError,
738 "Unicode escapes not legal "
739 "when Unicode disabled");
740 goto failed;
742 #endif
743 default:
744 *p++ = '\\';
745 s--;
746 goto non_esc; /* an arbitry number of unescaped
747 UTF-8 bytes may follow. */
750 if (p-buf < newlen)
751 _PyString_Resize(&v, p - buf);
752 return v;
753 failed:
754 Py_DECREF(v);
755 return NULL;
758 /* -------------------------------------------------------------------- */
759 /* object api */
761 static Py_ssize_t
762 string_getsize(register PyObject *op)
764 char *s;
765 Py_ssize_t len;
766 if (PyString_AsStringAndSize(op, &s, &len))
767 return -1;
768 return len;
771 static /*const*/ char *
772 string_getbuffer(register PyObject *op)
774 char *s;
775 Py_ssize_t len;
776 if (PyString_AsStringAndSize(op, &s, &len))
777 return NULL;
778 return s;
781 Py_ssize_t
782 PyString_Size(register PyObject *op)
784 if (!PyString_Check(op))
785 return string_getsize(op);
786 return Py_SIZE(op);
789 /*const*/ char *
790 PyString_AsString(register PyObject *op)
792 if (!PyString_Check(op))
793 return string_getbuffer(op);
794 return ((PyStringObject *)op) -> ob_sval;
798 PyString_AsStringAndSize(register PyObject *obj,
799 register char **s,
800 register Py_ssize_t *len)
802 if (s == NULL) {
803 PyErr_BadInternalCall();
804 return -1;
807 if (!PyString_Check(obj)) {
808 #ifdef Py_USING_UNICODE
809 if (PyUnicode_Check(obj)) {
810 obj = _PyUnicode_AsDefaultEncodedString(obj, NULL);
811 if (obj == NULL)
812 return -1;
814 else
815 #endif
817 PyErr_Format(PyExc_TypeError,
818 "expected string or Unicode object, "
819 "%.200s found", Py_TYPE(obj)->tp_name);
820 return -1;
824 *s = PyString_AS_STRING(obj);
825 if (len != NULL)
826 *len = PyString_GET_SIZE(obj);
827 else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) {
828 PyErr_SetString(PyExc_TypeError,
829 "expected string without null bytes");
830 return -1;
832 return 0;
835 /* -------------------------------------------------------------------- */
836 /* Methods */
838 #include "stringlib/stringdefs.h"
839 #include "stringlib/fastsearch.h"
841 #include "stringlib/count.h"
842 #include "stringlib/find.h"
843 #include "stringlib/partition.h"
845 #define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
846 #include "stringlib/localeutil.h"
850 static int
851 string_print(PyStringObject *op, FILE *fp, int flags)
853 Py_ssize_t i, str_len;
854 char c;
855 int quote;
857 /* XXX Ought to check for interrupts when writing long strings */
858 if (! PyString_CheckExact(op)) {
859 int ret;
860 /* A str subclass may have its own __str__ method. */
861 op = (PyStringObject *) PyObject_Str((PyObject *)op);
862 if (op == NULL)
863 return -1;
864 ret = string_print(op, fp, flags);
865 Py_DECREF(op);
866 return ret;
868 if (flags & Py_PRINT_RAW) {
869 char *data = op->ob_sval;
870 Py_ssize_t size = Py_SIZE(op);
871 Py_BEGIN_ALLOW_THREADS
872 while (size > INT_MAX) {
873 /* Very long strings cannot be written atomically.
874 * But don't write exactly INT_MAX bytes at a time
875 * to avoid memory aligment issues.
877 const int chunk_size = INT_MAX & ~0x3FFF;
878 fwrite(data, 1, chunk_size, fp);
879 data += chunk_size;
880 size -= chunk_size;
882 #ifdef __VMS
883 if (size) fwrite(data, (int)size, 1, fp);
884 #else
885 fwrite(data, 1, (int)size, fp);
886 #endif
887 Py_END_ALLOW_THREADS
888 return 0;
891 /* figure out which quote to use; single is preferred */
892 quote = '\'';
893 if (memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
894 !memchr(op->ob_sval, '"', Py_SIZE(op)))
895 quote = '"';
897 str_len = Py_SIZE(op);
898 Py_BEGIN_ALLOW_THREADS
899 fputc(quote, fp);
900 for (i = 0; i < str_len; i++) {
901 /* Since strings are immutable and the caller should have a
902 reference, accessing the interal buffer should not be an issue
903 with the GIL released. */
904 c = op->ob_sval[i];
905 if (c == quote || c == '\\')
906 fprintf(fp, "\\%c", c);
907 else if (c == '\t')
908 fprintf(fp, "\\t");
909 else if (c == '\n')
910 fprintf(fp, "\\n");
911 else if (c == '\r')
912 fprintf(fp, "\\r");
913 else if (c < ' ' || c >= 0x7f)
914 fprintf(fp, "\\x%02x", c & 0xff);
915 else
916 fputc(c, fp);
918 fputc(quote, fp);
919 Py_END_ALLOW_THREADS
920 return 0;
923 PyObject *
924 PyString_Repr(PyObject *obj, int smartquotes)
926 register PyStringObject* op = (PyStringObject*) obj;
927 size_t newsize = 2 + 4 * Py_SIZE(op);
928 PyObject *v;
929 if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) {
930 PyErr_SetString(PyExc_OverflowError,
931 "string is too large to make repr");
932 return NULL;
934 v = PyString_FromStringAndSize((char *)NULL, newsize);
935 if (v == NULL) {
936 return NULL;
938 else {
939 register Py_ssize_t i;
940 register char c;
941 register char *p;
942 int quote;
944 /* figure out which quote to use; single is preferred */
945 quote = '\'';
946 if (smartquotes &&
947 memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
948 !memchr(op->ob_sval, '"', Py_SIZE(op)))
949 quote = '"';
951 p = PyString_AS_STRING(v);
952 *p++ = quote;
953 for (i = 0; i < Py_SIZE(op); i++) {
954 /* There's at least enough room for a hex escape
955 and a closing quote. */
956 assert(newsize - (p - PyString_AS_STRING(v)) >= 5);
957 c = op->ob_sval[i];
958 if (c == quote || c == '\\')
959 *p++ = '\\', *p++ = c;
960 else if (c == '\t')
961 *p++ = '\\', *p++ = 't';
962 else if (c == '\n')
963 *p++ = '\\', *p++ = 'n';
964 else if (c == '\r')
965 *p++ = '\\', *p++ = 'r';
966 else if (c < ' ' || c >= 0x7f) {
967 /* For performance, we don't want to call
968 PyOS_snprintf here (extra layers of
969 function call). */
970 sprintf(p, "\\x%02x", c & 0xff);
971 p += 4;
973 else
974 *p++ = c;
976 assert(newsize - (p - PyString_AS_STRING(v)) >= 1);
977 *p++ = quote;
978 *p = '\0';
979 _PyString_Resize(
980 &v, (p - PyString_AS_STRING(v)));
981 return v;
985 static PyObject *
986 string_repr(PyObject *op)
988 return PyString_Repr(op, 1);
991 static PyObject *
992 string_str(PyObject *s)
994 assert(PyString_Check(s));
995 if (PyString_CheckExact(s)) {
996 Py_INCREF(s);
997 return s;
999 else {
1000 /* Subtype -- return genuine string with the same value. */
1001 PyStringObject *t = (PyStringObject *) s;
1002 return PyString_FromStringAndSize(t->ob_sval, Py_SIZE(t));
1006 static Py_ssize_t
1007 string_length(PyStringObject *a)
1009 return Py_SIZE(a);
1012 static PyObject *
1013 string_concat(register PyStringObject *a, register PyObject *bb)
1015 register Py_ssize_t size;
1016 register PyStringObject *op;
1017 if (!PyString_Check(bb)) {
1018 #ifdef Py_USING_UNICODE
1019 if (PyUnicode_Check(bb))
1020 return PyUnicode_Concat((PyObject *)a, bb);
1021 #endif
1022 if (PyByteArray_Check(bb))
1023 return PyByteArray_Concat((PyObject *)a, bb);
1024 PyErr_Format(PyExc_TypeError,
1025 "cannot concatenate 'str' and '%.200s' objects",
1026 Py_TYPE(bb)->tp_name);
1027 return NULL;
1029 #define b ((PyStringObject *)bb)
1030 /* Optimize cases with empty left or right operand */
1031 if ((Py_SIZE(a) == 0 || Py_SIZE(b) == 0) &&
1032 PyString_CheckExact(a) && PyString_CheckExact(b)) {
1033 if (Py_SIZE(a) == 0) {
1034 Py_INCREF(bb);
1035 return bb;
1037 Py_INCREF(a);
1038 return (PyObject *)a;
1040 size = Py_SIZE(a) + Py_SIZE(b);
1041 /* Check that string sizes are not negative, to prevent an
1042 overflow in cases where we are passed incorrectly-created
1043 strings with negative lengths (due to a bug in other code).
1045 if (Py_SIZE(a) < 0 || Py_SIZE(b) < 0 ||
1046 Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b)) {
1047 PyErr_SetString(PyExc_OverflowError,
1048 "strings are too large to concat");
1049 return NULL;
1052 /* Inline PyObject_NewVar */
1053 if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
1054 PyErr_SetString(PyExc_OverflowError,
1055 "strings are too large to concat");
1056 return NULL;
1058 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
1059 if (op == NULL)
1060 return PyErr_NoMemory();
1061 PyObject_INIT_VAR(op, &PyString_Type, size);
1062 op->ob_shash = -1;
1063 op->ob_sstate = SSTATE_NOT_INTERNED;
1064 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1065 Py_MEMCPY(op->ob_sval + Py_SIZE(a), b->ob_sval, Py_SIZE(b));
1066 op->ob_sval[size] = '\0';
1067 return (PyObject *) op;
1068 #undef b
1071 static PyObject *
1072 string_repeat(register PyStringObject *a, register Py_ssize_t n)
1074 register Py_ssize_t i;
1075 register Py_ssize_t j;
1076 register Py_ssize_t size;
1077 register PyStringObject *op;
1078 size_t nbytes;
1079 if (n < 0)
1080 n = 0;
1081 /* watch out for overflows: the size can overflow int,
1082 * and the # of bytes needed can overflow size_t
1084 size = Py_SIZE(a) * n;
1085 if (n && size / n != Py_SIZE(a)) {
1086 PyErr_SetString(PyExc_OverflowError,
1087 "repeated string is too long");
1088 return NULL;
1090 if (size == Py_SIZE(a) && PyString_CheckExact(a)) {
1091 Py_INCREF(a);
1092 return (PyObject *)a;
1094 nbytes = (size_t)size;
1095 if (nbytes + PyStringObject_SIZE <= nbytes) {
1096 PyErr_SetString(PyExc_OverflowError,
1097 "repeated string is too long");
1098 return NULL;
1100 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + nbytes);
1101 if (op == NULL)
1102 return PyErr_NoMemory();
1103 PyObject_INIT_VAR(op, &PyString_Type, size);
1104 op->ob_shash = -1;
1105 op->ob_sstate = SSTATE_NOT_INTERNED;
1106 op->ob_sval[size] = '\0';
1107 if (Py_SIZE(a) == 1 && n > 0) {
1108 memset(op->ob_sval, a->ob_sval[0] , n);
1109 return (PyObject *) op;
1111 i = 0;
1112 if (i < size) {
1113 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1114 i = Py_SIZE(a);
1116 while (i < size) {
1117 j = (i <= size-i) ? i : size-i;
1118 Py_MEMCPY(op->ob_sval+i, op->ob_sval, j);
1119 i += j;
1121 return (PyObject *) op;
1124 /* String slice a[i:j] consists of characters a[i] ... a[j-1] */
1126 static PyObject *
1127 string_slice(register PyStringObject *a, register Py_ssize_t i,
1128 register Py_ssize_t j)
1129 /* j -- may be negative! */
1131 if (i < 0)
1132 i = 0;
1133 if (j < 0)
1134 j = 0; /* Avoid signed/unsigned bug in next line */
1135 if (j > Py_SIZE(a))
1136 j = Py_SIZE(a);
1137 if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) {
1138 /* It's the same as a */
1139 Py_INCREF(a);
1140 return (PyObject *)a;
1142 if (j < i)
1143 j = i;
1144 return PyString_FromStringAndSize(a->ob_sval + i, j-i);
1147 static int
1148 string_contains(PyObject *str_obj, PyObject *sub_obj)
1150 if (!PyString_CheckExact(sub_obj)) {
1151 #ifdef Py_USING_UNICODE
1152 if (PyUnicode_Check(sub_obj))
1153 return PyUnicode_Contains(str_obj, sub_obj);
1154 #endif
1155 if (!PyString_Check(sub_obj)) {
1156 PyErr_Format(PyExc_TypeError,
1157 "'in <string>' requires string as left operand, "
1158 "not %.200s", Py_TYPE(sub_obj)->tp_name);
1159 return -1;
1163 return stringlib_contains_obj(str_obj, sub_obj);
1166 static PyObject *
1167 string_item(PyStringObject *a, register Py_ssize_t i)
1169 char pchar;
1170 PyObject *v;
1171 if (i < 0 || i >= Py_SIZE(a)) {
1172 PyErr_SetString(PyExc_IndexError, "string index out of range");
1173 return NULL;
1175 pchar = a->ob_sval[i];
1176 v = (PyObject *)characters[pchar & UCHAR_MAX];
1177 if (v == NULL)
1178 v = PyString_FromStringAndSize(&pchar, 1);
1179 else {
1180 #ifdef COUNT_ALLOCS
1181 one_strings++;
1182 #endif
1183 Py_INCREF(v);
1185 return v;
1188 static PyObject*
1189 string_richcompare(PyStringObject *a, PyStringObject *b, int op)
1191 int c;
1192 Py_ssize_t len_a, len_b;
1193 Py_ssize_t min_len;
1194 PyObject *result;
1196 /* Make sure both arguments are strings. */
1197 if (!(PyString_Check(a) && PyString_Check(b))) {
1198 result = Py_NotImplemented;
1199 goto out;
1201 if (a == b) {
1202 switch (op) {
1203 case Py_EQ:case Py_LE:case Py_GE:
1204 result = Py_True;
1205 goto out;
1206 case Py_NE:case Py_LT:case Py_GT:
1207 result = Py_False;
1208 goto out;
1211 if (op == Py_EQ) {
1212 /* Supporting Py_NE here as well does not save
1213 much time, since Py_NE is rarely used. */
1214 if (Py_SIZE(a) == Py_SIZE(b)
1215 && (a->ob_sval[0] == b->ob_sval[0]
1216 && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0)) {
1217 result = Py_True;
1218 } else {
1219 result = Py_False;
1221 goto out;
1223 len_a = Py_SIZE(a); len_b = Py_SIZE(b);
1224 min_len = (len_a < len_b) ? len_a : len_b;
1225 if (min_len > 0) {
1226 c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval);
1227 if (c==0)
1228 c = memcmp(a->ob_sval, b->ob_sval, min_len);
1229 } else
1230 c = 0;
1231 if (c == 0)
1232 c = (len_a < len_b) ? -1 : (len_a > len_b) ? 1 : 0;
1233 switch (op) {
1234 case Py_LT: c = c < 0; break;
1235 case Py_LE: c = c <= 0; break;
1236 case Py_EQ: assert(0); break; /* unreachable */
1237 case Py_NE: c = c != 0; break;
1238 case Py_GT: c = c > 0; break;
1239 case Py_GE: c = c >= 0; break;
1240 default:
1241 result = Py_NotImplemented;
1242 goto out;
1244 result = c ? Py_True : Py_False;
1245 out:
1246 Py_INCREF(result);
1247 return result;
1251 _PyString_Eq(PyObject *o1, PyObject *o2)
1253 PyStringObject *a = (PyStringObject*) o1;
1254 PyStringObject *b = (PyStringObject*) o2;
1255 return Py_SIZE(a) == Py_SIZE(b)
1256 && *a->ob_sval == *b->ob_sval
1257 && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0;
1260 static long
1261 string_hash(PyStringObject *a)
1263 register Py_ssize_t len;
1264 register unsigned char *p;
1265 register long x;
1267 if (a->ob_shash != -1)
1268 return a->ob_shash;
1269 len = Py_SIZE(a);
1270 p = (unsigned char *) a->ob_sval;
1271 x = *p << 7;
1272 while (--len >= 0)
1273 x = (1000003*x) ^ *p++;
1274 x ^= Py_SIZE(a);
1275 if (x == -1)
1276 x = -2;
1277 a->ob_shash = x;
1278 return x;
1281 static PyObject*
1282 string_subscript(PyStringObject* self, PyObject* item)
1284 if (PyIndex_Check(item)) {
1285 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
1286 if (i == -1 && PyErr_Occurred())
1287 return NULL;
1288 if (i < 0)
1289 i += PyString_GET_SIZE(self);
1290 return string_item(self, i);
1292 else if (PySlice_Check(item)) {
1293 Py_ssize_t start, stop, step, slicelength, cur, i;
1294 char* source_buf;
1295 char* result_buf;
1296 PyObject* result;
1298 if (PySlice_GetIndicesEx((PySliceObject*)item,
1299 PyString_GET_SIZE(self),
1300 &start, &stop, &step, &slicelength) < 0) {
1301 return NULL;
1304 if (slicelength <= 0) {
1305 return PyString_FromStringAndSize("", 0);
1307 else if (start == 0 && step == 1 &&
1308 slicelength == PyString_GET_SIZE(self) &&
1309 PyString_CheckExact(self)) {
1310 Py_INCREF(self);
1311 return (PyObject *)self;
1313 else if (step == 1) {
1314 return PyString_FromStringAndSize(
1315 PyString_AS_STRING(self) + start,
1316 slicelength);
1318 else {
1319 source_buf = PyString_AsString((PyObject*)self);
1320 result_buf = (char *)PyMem_Malloc(slicelength);
1321 if (result_buf == NULL)
1322 return PyErr_NoMemory();
1324 for (cur = start, i = 0; i < slicelength;
1325 cur += step, i++) {
1326 result_buf[i] = source_buf[cur];
1329 result = PyString_FromStringAndSize(result_buf,
1330 slicelength);
1331 PyMem_Free(result_buf);
1332 return result;
1335 else {
1336 PyErr_Format(PyExc_TypeError,
1337 "string indices must be integers, not %.200s",
1338 Py_TYPE(item)->tp_name);
1339 return NULL;
1343 static Py_ssize_t
1344 string_buffer_getreadbuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1346 if ( index != 0 ) {
1347 PyErr_SetString(PyExc_SystemError,
1348 "accessing non-existent string segment");
1349 return -1;
1351 *ptr = (void *)self->ob_sval;
1352 return Py_SIZE(self);
1355 static Py_ssize_t
1356 string_buffer_getwritebuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1358 PyErr_SetString(PyExc_TypeError,
1359 "Cannot use string as modifiable buffer");
1360 return -1;
1363 static Py_ssize_t
1364 string_buffer_getsegcount(PyStringObject *self, Py_ssize_t *lenp)
1366 if ( lenp )
1367 *lenp = Py_SIZE(self);
1368 return 1;
1371 static Py_ssize_t
1372 string_buffer_getcharbuf(PyStringObject *self, Py_ssize_t index, const char **ptr)
1374 if ( index != 0 ) {
1375 PyErr_SetString(PyExc_SystemError,
1376 "accessing non-existent string segment");
1377 return -1;
1379 *ptr = self->ob_sval;
1380 return Py_SIZE(self);
1383 static int
1384 string_buffer_getbuffer(PyStringObject *self, Py_buffer *view, int flags)
1386 return PyBuffer_FillInfo(view, (PyObject*)self,
1387 (void *)self->ob_sval, Py_SIZE(self),
1388 1, flags);
/* Sequence-protocol slot table for the string type.  The two assignment
   slots are 0 because no item or slice assignment handler is provided. */
static PySequenceMethods string_as_sequence = {
    (lenfunc)string_length, /*sq_length*/
    (binaryfunc)string_concat, /*sq_concat*/
    (ssizeargfunc)string_repeat, /*sq_repeat*/
    (ssizeargfunc)string_item, /*sq_item*/
    (ssizessizeargfunc)string_slice, /*sq_slice*/
    0, /*sq_ass_item*/
    0, /*sq_ass_slice*/
    (objobjproc)string_contains /*sq_contains*/
};
/* Mapping-protocol slot table: length and subscript (index or slice)
   lookup; no subscript-assignment handler is provided. */
static PyMappingMethods string_as_mapping = {
    (lenfunc)string_length,
    (binaryfunc)string_subscript,
    0,
};
/* Buffer-protocol slot table: the old-style read/write/segcount/charbuf
   accessors followed by the new-style getbuffer handler.  The final slot
   is left 0 (presumably the release-buffer handler -- confirm against
   the PyBufferProcs declaration). */
static PyBufferProcs string_as_buffer = {
    (readbufferproc)string_buffer_getreadbuf,
    (writebufferproc)string_buffer_getwritebuf,
    (segcountproc)string_buffer_getsegcount,
    (charbufferproc)string_buffer_getcharbuf,
    (getbufferproc)string_buffer_getbuffer,
    0, /* XXX */
};
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is a PyArg_ParseTuple format: "|O:" followed by the method
   name that is reported in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
/* Don't call if length < 2 */
/* True when the `length` bytes of `target` starting at `offset` equal
   `pattern`.  The first and last bytes are compared directly so that most
   mismatches are rejected without calling memcmp().  Every argument is
   parenthesized so that arbitrary expressions (e.g. a conditional
   expression) can be passed safely; arguments are evaluated more than
   once, so they must be free of side effects. */
#define Py_STRING_MATCH(target, offset, pattern, length)                \
    ((target)[(offset)] == (pattern)[0] &&                              \
     (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&          \
     !memcmp((target)+(offset)+1, (pattern)+1, (length)-2) )
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things.*/

#define MAX_PREALLOC 12

/* 5 splits gives 6 elements */
/* Initial list size for a split limited to `maxsplit` splits: one slot
   per resulting piece, capped at MAX_PREALLOC.  The argument is
   parenthesized so that any expression may be passed. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit)+1)
/* Append data[left:right] to `list` as a new string.

   Relies on names supplied by the expansion site: the variables `str`
   and `list`, and an `onError` label that is jumped to on failure.  The
   temporary reference held in `str` is dropped on both paths.  The
   expansion is a bare statement sequence (not a braced block). */
#define SPLIT_APPEND(data, left, right)                         \
    str = PyString_FromStringAndSize((data) + (left),           \
                                     (right) - (left));         \
    if (str == NULL)                                            \
        goto onError;                                           \
    if (PyList_Append(list, str) != 0) {                        \
        Py_DECREF(str);                                         \
        goto onError;                                           \
    }                                                           \
    else                                                        \
        Py_DECREF(str);
/* Add data[left:right] to `list` as element number `count`, then bump
   `count`.

   Relies on names supplied by the expansion site: the variables `str`,
   `list` and `count`, and an `onError` label.  While `count` is below
   MAX_PREALLOC the new string is stored directly into the slot with
   PyList_SET_ITEM; beyond that the string is appended and the temporary
   reference dropped. */
#define SPLIT_ADD(data, left, right) {                          \
    str = PyString_FromStringAndSize((data) + (left),           \
                                     (right) - (left));         \
    if (str == NULL)                                            \
        goto onError;                                           \
    if (count >= MAX_PREALLOC) {                                \
        if (PyList_Append(list, str) != 0) {                    \
            Py_DECREF(str);                                     \
            goto onError;                                       \
        }                                                       \
        Py_DECREF(str);                                         \
    }                                                           \
    else {                                                      \
        PyList_SET_ITEM(list, count, str);                      \
    }                                                           \
    count++; }
/* Always force the list to the expected size. */
/* Uses the `count` variable of the expansion site. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count

/* Scanning helpers.  `i` must be a modifiable index variable: it is
   advanced (SKIP_*) or moved back (RSKIP_*) in place while s[i] is / is
   not whitespace according to isspace().  The forward forms stop at
   `len`; the reverse forms stop once `i` drops below 0. */
#define SKIP_SPACE(s, i, len) { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; }
#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; }
#define RSKIP_SPACE(s, i) { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; }
#define RSKIP_NONSPACE(s, i) { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; }
1486 Py_LOCAL_INLINE(PyObject *)
1487 split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1489 const char *s = PyString_AS_STRING(self);
1490 Py_ssize_t i, j, count=0;
1491 PyObject *str;
1492 PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1494 if (list == NULL)
1495 return NULL;
1497 i = j = 0;
1499 while (maxsplit-- > 0) {
1500 SKIP_SPACE(s, i, len);
1501 if (i==len) break;
1502 j = i; i++;
1503 SKIP_NONSPACE(s, i, len);
1504 if (j == 0 && i == len && PyString_CheckExact(self)) {
1505 /* No whitespace in self, so just use it as list[0] */
1506 Py_INCREF(self);
1507 PyList_SET_ITEM(list, 0, (PyObject *)self);
1508 count++;
1509 break;
1511 SPLIT_ADD(s, j, i);
1514 if (i < len) {
1515 /* Only occurs when maxsplit was reached */
1516 /* Skip any remaining whitespace and copy to end of string */
1517 SKIP_SPACE(s, i, len);
1518 if (i != len)
1519 SPLIT_ADD(s, i, len);
1521 FIX_PREALLOC_SIZE(list);
1522 return list;
1523 onError:
1524 Py_DECREF(list);
1525 return NULL;
1528 Py_LOCAL_INLINE(PyObject *)
1529 split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1531 const char *s = PyString_AS_STRING(self);
1532 register Py_ssize_t i, j, count=0;
1533 PyObject *str;
1534 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1536 if (list == NULL)
1537 return NULL;
1539 i = j = 0;
1540 while ((j < len) && (maxcount-- > 0)) {
1541 for(; j<len; j++) {
1542 /* I found that using memchr makes no difference */
1543 if (s[j] == ch) {
1544 SPLIT_ADD(s, i, j);
1545 i = j = j + 1;
1546 break;
1550 if (i == 0 && count == 0 && PyString_CheckExact(self)) {
1551 /* ch not in self, so just use self as list[0] */
1552 Py_INCREF(self);
1553 PyList_SET_ITEM(list, 0, (PyObject *)self);
1554 count++;
1556 else if (i <= len) {
1557 SPLIT_ADD(s, i, len);
1559 FIX_PREALLOC_SIZE(list);
1560 return list;
1562 onError:
1563 Py_DECREF(list);
1564 return NULL;
1567 PyDoc_STRVAR(split__doc__,
1568 "S.split([sep [,maxsplit]]) -> list of strings\n\
1570 Return a list of the words in the string S, using sep as the\n\
1571 delimiter string. If maxsplit is given, at most maxsplit\n\
1572 splits are done. If sep is not specified or is None, any\n\
1573 whitespace string is a separator and empty strings are removed\n\
1574 from the result.");
1576 static PyObject *
1577 string_split(PyStringObject *self, PyObject *args)
1579 Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1580 Py_ssize_t maxsplit = -1, count=0;
1581 const char *s = PyString_AS_STRING(self), *sub;
1582 PyObject *list, *str, *subobj = Py_None;
1583 #ifdef USE_FAST
1584 Py_ssize_t pos;
1585 #endif
1587 if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
1588 return NULL;
1589 if (maxsplit < 0)
1590 maxsplit = PY_SSIZE_T_MAX;
1591 if (subobj == Py_None)
1592 return split_whitespace(self, len, maxsplit);
1593 if (PyString_Check(subobj)) {
1594 sub = PyString_AS_STRING(subobj);
1595 n = PyString_GET_SIZE(subobj);
1597 #ifdef Py_USING_UNICODE
1598 else if (PyUnicode_Check(subobj))
1599 return PyUnicode_Split((PyObject *)self, subobj, maxsplit);
1600 #endif
1601 else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1602 return NULL;
1604 if (n == 0) {
1605 PyErr_SetString(PyExc_ValueError, "empty separator");
1606 return NULL;
1608 else if (n == 1)
1609 return split_char(self, len, sub[0], maxsplit);
1611 list = PyList_New(PREALLOC_SIZE(maxsplit));
1612 if (list == NULL)
1613 return NULL;
1615 #ifdef USE_FAST
1616 i = j = 0;
1617 while (maxsplit-- > 0) {
1618 pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
1619 if (pos < 0)
1620 break;
1621 j = i+pos;
1622 SPLIT_ADD(s, i, j);
1623 i = j + n;
1625 #else
1626 i = j = 0;
1627 while ((j+n <= len) && (maxsplit-- > 0)) {
1628 for (; j+n <= len; j++) {
1629 if (Py_STRING_MATCH(s, j, sub, n)) {
1630 SPLIT_ADD(s, i, j);
1631 i = j = j + n;
1632 break;
1636 #endif
1637 SPLIT_ADD(s, i, len);
1638 FIX_PREALLOC_SIZE(list);
1639 return list;
1641 onError:
1642 Py_DECREF(list);
1643 return NULL;
1646 PyDoc_STRVAR(partition__doc__,
1647 "S.partition(sep) -> (head, sep, tail)\n\
1649 Search for the separator sep in S, and return the part before it,\n\
1650 the separator itself, and the part after it. If the separator is not\n\
1651 found, return S and two empty strings.");
1653 static PyObject *
1654 string_partition(PyStringObject *self, PyObject *sep_obj)
1656 const char *sep;
1657 Py_ssize_t sep_len;
1659 if (PyString_Check(sep_obj)) {
1660 sep = PyString_AS_STRING(sep_obj);
1661 sep_len = PyString_GET_SIZE(sep_obj);
1663 #ifdef Py_USING_UNICODE
1664 else if (PyUnicode_Check(sep_obj))
1665 return PyUnicode_Partition((PyObject *) self, sep_obj);
1666 #endif
1667 else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1668 return NULL;
1670 return stringlib_partition(
1671 (PyObject*) self,
1672 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1673 sep_obj, sep, sep_len
1677 PyDoc_STRVAR(rpartition__doc__,
1678 "S.rpartition(sep) -> (tail, sep, head)\n\
1680 Search for the separator sep in S, starting at the end of S, and return\n\
1681 the part before it, the separator itself, and the part after it. If the\n\
1682 separator is not found, return two empty strings and S.");
1684 static PyObject *
1685 string_rpartition(PyStringObject *self, PyObject *sep_obj)
1687 const char *sep;
1688 Py_ssize_t sep_len;
1690 if (PyString_Check(sep_obj)) {
1691 sep = PyString_AS_STRING(sep_obj);
1692 sep_len = PyString_GET_SIZE(sep_obj);
1694 #ifdef Py_USING_UNICODE
1695 else if (PyUnicode_Check(sep_obj))
1696 return PyUnicode_RPartition((PyObject *) self, sep_obj);
1697 #endif
1698 else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1699 return NULL;
1701 return stringlib_rpartition(
1702 (PyObject*) self,
1703 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1704 sep_obj, sep, sep_len
1708 Py_LOCAL_INLINE(PyObject *)
1709 rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1711 const char *s = PyString_AS_STRING(self);
1712 Py_ssize_t i, j, count=0;
1713 PyObject *str;
1714 PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1716 if (list == NULL)
1717 return NULL;
1719 i = j = len-1;
1721 while (maxsplit-- > 0) {
1722 RSKIP_SPACE(s, i);
1723 if (i<0) break;
1724 j = i; i--;
1725 RSKIP_NONSPACE(s, i);
1726 if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
1727 /* No whitespace in self, so just use it as list[0] */
1728 Py_INCREF(self);
1729 PyList_SET_ITEM(list, 0, (PyObject *)self);
1730 count++;
1731 break;
1733 SPLIT_ADD(s, i + 1, j + 1);
1735 if (i >= 0) {
1736 /* Only occurs when maxsplit was reached */
1737 /* Skip any remaining whitespace and copy to beginning of string */
1738 RSKIP_SPACE(s, i);
1739 if (i >= 0)
1740 SPLIT_ADD(s, 0, i + 1);
1743 FIX_PREALLOC_SIZE(list);
1744 if (PyList_Reverse(list) < 0)
1745 goto onError;
1746 return list;
1747 onError:
1748 Py_DECREF(list);
1749 return NULL;
1752 Py_LOCAL_INLINE(PyObject *)
1753 rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1755 const char *s = PyString_AS_STRING(self);
1756 register Py_ssize_t i, j, count=0;
1757 PyObject *str;
1758 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1760 if (list == NULL)
1761 return NULL;
1763 i = j = len - 1;
1764 while ((i >= 0) && (maxcount-- > 0)) {
1765 for (; i >= 0; i--) {
1766 if (s[i] == ch) {
1767 SPLIT_ADD(s, i + 1, j + 1);
1768 j = i = i - 1;
1769 break;
1773 if (i < 0 && count == 0 && PyString_CheckExact(self)) {
1774 /* ch not in self, so just use self as list[0] */
1775 Py_INCREF(self);
1776 PyList_SET_ITEM(list, 0, (PyObject *)self);
1777 count++;
1779 else if (j >= -1) {
1780 SPLIT_ADD(s, 0, j + 1);
1782 FIX_PREALLOC_SIZE(list);
1783 if (PyList_Reverse(list) < 0)
1784 goto onError;
1785 return list;
1787 onError:
1788 Py_DECREF(list);
1789 return NULL;
1792 PyDoc_STRVAR(rsplit__doc__,
1793 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
1795 Return a list of the words in the string S, using sep as the\n\
1796 delimiter string, starting at the end of the string and working\n\
1797 to the front. If maxsplit is given, at most maxsplit splits are\n\
1798 done. If sep is not specified or is None, any whitespace string\n\
1799 is a separator.");
1801 static PyObject *
1802 string_rsplit(PyStringObject *self, PyObject *args)
1804 Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1805 Py_ssize_t maxsplit = -1, count=0;
1806 const char *s, *sub;
1807 PyObject *list, *str, *subobj = Py_None;
1809 if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
1810 return NULL;
1811 if (maxsplit < 0)
1812 maxsplit = PY_SSIZE_T_MAX;
1813 if (subobj == Py_None)
1814 return rsplit_whitespace(self, len, maxsplit);
1815 if (PyString_Check(subobj)) {
1816 sub = PyString_AS_STRING(subobj);
1817 n = PyString_GET_SIZE(subobj);
1819 #ifdef Py_USING_UNICODE
1820 else if (PyUnicode_Check(subobj))
1821 return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
1822 #endif
1823 else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1824 return NULL;
1826 if (n == 0) {
1827 PyErr_SetString(PyExc_ValueError, "empty separator");
1828 return NULL;
1830 else if (n == 1)
1831 return rsplit_char(self, len, sub[0], maxsplit);
1833 list = PyList_New(PREALLOC_SIZE(maxsplit));
1834 if (list == NULL)
1835 return NULL;
1837 j = len;
1838 i = j - n;
1840 s = PyString_AS_STRING(self);
1841 while ( (i >= 0) && (maxsplit-- > 0) ) {
1842 for (; i>=0; i--) {
1843 if (Py_STRING_MATCH(s, i, sub, n)) {
1844 SPLIT_ADD(s, i + n, j);
1845 j = i;
1846 i -= n;
1847 break;
1851 SPLIT_ADD(s, 0, j);
1852 FIX_PREALLOC_SIZE(list);
1853 if (PyList_Reverse(list) < 0)
1854 goto onError;
1855 return list;
1857 onError:
1858 Py_DECREF(list);
1859 return NULL;
1863 PyDoc_STRVAR(join__doc__,
1864 "S.join(iterable) -> string\n\
1866 Return a string which is the concatenation of the strings in the\n\
1867 iterable. The separator between elements is S.");
1869 static PyObject *
1870 string_join(PyStringObject *self, PyObject *orig)
1872 char *sep = PyString_AS_STRING(self);
1873 const Py_ssize_t seplen = PyString_GET_SIZE(self);
1874 PyObject *res = NULL;
1875 char *p;
1876 Py_ssize_t seqlen = 0;
1877 size_t sz = 0;
1878 Py_ssize_t i;
1879 PyObject *seq, *item;
1881 seq = PySequence_Fast(orig, "");
1882 if (seq == NULL) {
1883 return NULL;
1886 seqlen = PySequence_Size(seq);
1887 if (seqlen == 0) {
1888 Py_DECREF(seq);
1889 return PyString_FromString("");
1891 if (seqlen == 1) {
1892 item = PySequence_Fast_GET_ITEM(seq, 0);
1893 if (PyString_CheckExact(item) || PyUnicode_CheckExact(item)) {
1894 Py_INCREF(item);
1895 Py_DECREF(seq);
1896 return item;
1900 /* There are at least two things to join, or else we have a subclass
1901 * of the builtin types in the sequence.
1902 * Do a pre-pass to figure out the total amount of space we'll
1903 * need (sz), see whether any argument is absurd, and defer to
1904 * the Unicode join if appropriate.
1906 for (i = 0; i < seqlen; i++) {
1907 const size_t old_sz = sz;
1908 item = PySequence_Fast_GET_ITEM(seq, i);
1909 if (!PyString_Check(item)){
1910 #ifdef Py_USING_UNICODE
1911 if (PyUnicode_Check(item)) {
1912 /* Defer to Unicode join.
1913 * CAUTION: There's no gurantee that the
1914 * original sequence can be iterated over
1915 * again, so we must pass seq here.
1917 PyObject *result;
1918 result = PyUnicode_Join((PyObject *)self, seq);
1919 Py_DECREF(seq);
1920 return result;
1922 #endif
1923 PyErr_Format(PyExc_TypeError,
1924 "sequence item %zd: expected string,"
1925 " %.80s found",
1926 i, Py_TYPE(item)->tp_name);
1927 Py_DECREF(seq);
1928 return NULL;
1930 sz += PyString_GET_SIZE(item);
1931 if (i != 0)
1932 sz += seplen;
1933 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
1934 PyErr_SetString(PyExc_OverflowError,
1935 "join() result is too long for a Python string");
1936 Py_DECREF(seq);
1937 return NULL;
1941 /* Allocate result space. */
1942 res = PyString_FromStringAndSize((char*)NULL, sz);
1943 if (res == NULL) {
1944 Py_DECREF(seq);
1945 return NULL;
1948 /* Catenate everything. */
1949 p = PyString_AS_STRING(res);
1950 for (i = 0; i < seqlen; ++i) {
1951 size_t n;
1952 item = PySequence_Fast_GET_ITEM(seq, i);
1953 n = PyString_GET_SIZE(item);
1954 Py_MEMCPY(p, PyString_AS_STRING(item), n);
1955 p += n;
1956 if (i < seqlen - 1) {
1957 Py_MEMCPY(p, sep, seplen);
1958 p += seplen;
1962 Py_DECREF(seq);
1963 return res;
1966 PyObject *
1967 _PyString_Join(PyObject *sep, PyObject *x)
1969 assert(sep != NULL && PyString_Check(sep));
1970 assert(x != NULL);
1971 return string_join((PyStringObject *)sep, x);
1974 Py_LOCAL_INLINE(void)
1975 string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
1977 if (*end > len)
1978 *end = len;
1979 else if (*end < 0)
1980 *end += len;
1981 if (*end < 0)
1982 *end = 0;
1983 if (*start < 0)
1984 *start += len;
1985 if (*start < 0)
1986 *start = 0;
1989 Py_LOCAL_INLINE(Py_ssize_t)
1990 string_find_internal(PyStringObject *self, PyObject *args, int dir)
1992 PyObject *subobj;
1993 const char *sub;
1994 Py_ssize_t sub_len;
1995 Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
1996 PyObject *obj_start=Py_None, *obj_end=Py_None;
1998 if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj,
1999 &obj_start, &obj_end))
2000 return -2;
2001 /* To support None in "start" and "end" arguments, meaning
2002 the same as if they were not passed.
2004 if (obj_start != Py_None)
2005 if (!_PyEval_SliceIndex(obj_start, &start))
2006 return -2;
2007 if (obj_end != Py_None)
2008 if (!_PyEval_SliceIndex(obj_end, &end))
2009 return -2;
2011 if (PyString_Check(subobj)) {
2012 sub = PyString_AS_STRING(subobj);
2013 sub_len = PyString_GET_SIZE(subobj);
2015 #ifdef Py_USING_UNICODE
2016 else if (PyUnicode_Check(subobj))
2017 return PyUnicode_Find(
2018 (PyObject *)self, subobj, start, end, dir);
2019 #endif
2020 else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
2021 /* XXX - the "expected a character buffer object" is pretty
2022 confusing for a non-expert. remap to something else ? */
2023 return -2;
2025 if (dir > 0)
2026 return stringlib_find_slice(
2027 PyString_AS_STRING(self), PyString_GET_SIZE(self),
2028 sub, sub_len, start, end);
2029 else
2030 return stringlib_rfind_slice(
2031 PyString_AS_STRING(self), PyString_GET_SIZE(self),
2032 sub, sub_len, start, end);
2036 PyDoc_STRVAR(find__doc__,
2037 "S.find(sub [,start [,end]]) -> int\n\
2039 Return the lowest index in S where substring sub is found,\n\
2040 such that sub is contained within s[start:end]. Optional\n\
2041 arguments start and end are interpreted as in slice notation.\n\
2043 Return -1 on failure.");
2045 static PyObject *
2046 string_find(PyStringObject *self, PyObject *args)
2048 Py_ssize_t result = string_find_internal(self, args, +1);
2049 if (result == -2)
2050 return NULL;
2051 return PyInt_FromSsize_t(result);
2055 PyDoc_STRVAR(index__doc__,
2056 "S.index(sub [,start [,end]]) -> int\n\
2058 Like S.find() but raise ValueError when the substring is not found.");
2060 static PyObject *
2061 string_index(PyStringObject *self, PyObject *args)
2063 Py_ssize_t result = string_find_internal(self, args, +1);
2064 if (result == -2)
2065 return NULL;
2066 if (result == -1) {
2067 PyErr_SetString(PyExc_ValueError,
2068 "substring not found");
2069 return NULL;
2071 return PyInt_FromSsize_t(result);
2075 PyDoc_STRVAR(rfind__doc__,
2076 "S.rfind(sub [,start [,end]]) -> int\n\
2078 Return the highest index in S where substring sub is found,\n\
2079 such that sub is contained within s[start:end]. Optional\n\
2080 arguments start and end are interpreted as in slice notation.\n\
2082 Return -1 on failure.");
2084 static PyObject *
2085 string_rfind(PyStringObject *self, PyObject *args)
2087 Py_ssize_t result = string_find_internal(self, args, -1);
2088 if (result == -2)
2089 return NULL;
2090 return PyInt_FromSsize_t(result);
2094 PyDoc_STRVAR(rindex__doc__,
2095 "S.rindex(sub [,start [,end]]) -> int\n\
2097 Like S.rfind() but raise ValueError when the substring is not found.");
2099 static PyObject *
2100 string_rindex(PyStringObject *self, PyObject *args)
2102 Py_ssize_t result = string_find_internal(self, args, -1);
2103 if (result == -2)
2104 return NULL;
2105 if (result == -1) {
2106 PyErr_SetString(PyExc_ValueError,
2107 "substring not found");
2108 return NULL;
2110 return PyInt_FromSsize_t(result);
2114 Py_LOCAL_INLINE(PyObject *)
2115 do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
2117 char *s = PyString_AS_STRING(self);
2118 Py_ssize_t len = PyString_GET_SIZE(self);
2119 char *sep = PyString_AS_STRING(sepobj);
2120 Py_ssize_t seplen = PyString_GET_SIZE(sepobj);
2121 Py_ssize_t i, j;
2123 i = 0;
2124 if (striptype != RIGHTSTRIP) {
2125 while (i < len && memchr(sep, Py_CHARMASK(s[i]), seplen)) {
2126 i++;
2130 j = len;
2131 if (striptype != LEFTSTRIP) {
2132 do {
2133 j--;
2134 } while (j >= i && memchr(sep, Py_CHARMASK(s[j]), seplen));
2135 j++;
2138 if (i == 0 && j == len && PyString_CheckExact(self)) {
2139 Py_INCREF(self);
2140 return (PyObject*)self;
2142 else
2143 return PyString_FromStringAndSize(s+i, j-i);
2147 Py_LOCAL_INLINE(PyObject *)
2148 do_strip(PyStringObject *self, int striptype)
2150 char *s = PyString_AS_STRING(self);
2151 Py_ssize_t len = PyString_GET_SIZE(self), i, j;
2153 i = 0;
2154 if (striptype != RIGHTSTRIP) {
2155 while (i < len && isspace(Py_CHARMASK(s[i]))) {
2156 i++;
2160 j = len;
2161 if (striptype != LEFTSTRIP) {
2162 do {
2163 j--;
2164 } while (j >= i && isspace(Py_CHARMASK(s[j])));
2165 j++;
2168 if (i == 0 && j == len && PyString_CheckExact(self)) {
2169 Py_INCREF(self);
2170 return (PyObject*)self;
2172 else
2173 return PyString_FromStringAndSize(s+i, j-i);
2177 Py_LOCAL_INLINE(PyObject *)
2178 do_argstrip(PyStringObject *self, int striptype, PyObject *args)
2180 PyObject *sep = NULL;
2182 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
2183 return NULL;
2185 if (sep != NULL && sep != Py_None) {
2186 if (PyString_Check(sep))
2187 return do_xstrip(self, striptype, sep);
2188 #ifdef Py_USING_UNICODE
2189 else if (PyUnicode_Check(sep)) {
2190 PyObject *uniself = PyUnicode_FromObject((PyObject *)self);
2191 PyObject *res;
2192 if (uniself==NULL)
2193 return NULL;
2194 res = _PyUnicode_XStrip((PyUnicodeObject *)uniself,
2195 striptype, sep);
2196 Py_DECREF(uniself);
2197 return res;
2199 #endif
2200 PyErr_Format(PyExc_TypeError,
2201 #ifdef Py_USING_UNICODE
2202 "%s arg must be None, str or unicode",
2203 #else
2204 "%s arg must be None or str",
2205 #endif
2206 STRIPNAME(striptype));
2207 return NULL;
2210 return do_strip(self, striptype);
2214 PyDoc_STRVAR(strip__doc__,
2215 "S.strip([chars]) -> string or unicode\n\
2217 Return a copy of the string S with leading and trailing\n\
2218 whitespace removed.\n\
2219 If chars is given and not None, remove characters in chars instead.\n\
2220 If chars is unicode, S will be converted to unicode before stripping");
2222 static PyObject *
2223 string_strip(PyStringObject *self, PyObject *args)
2225 if (PyTuple_GET_SIZE(args) == 0)
2226 return do_strip(self, BOTHSTRIP); /* Common case */
2227 else
2228 return do_argstrip(self, BOTHSTRIP, args);
2232 PyDoc_STRVAR(lstrip__doc__,
2233 "S.lstrip([chars]) -> string or unicode\n\
2235 Return a copy of the string S with leading whitespace removed.\n\
2236 If chars is given and not None, remove characters in chars instead.\n\
2237 If chars is unicode, S will be converted to unicode before stripping");
2239 static PyObject *
2240 string_lstrip(PyStringObject *self, PyObject *args)
2242 if (PyTuple_GET_SIZE(args) == 0)
2243 return do_strip(self, LEFTSTRIP); /* Common case */
2244 else
2245 return do_argstrip(self, LEFTSTRIP, args);
2249 PyDoc_STRVAR(rstrip__doc__,
2250 "S.rstrip([chars]) -> string or unicode\n\
2252 Return a copy of the string S with trailing whitespace removed.\n\
2253 If chars is given and not None, remove characters in chars instead.\n\
2254 If chars is unicode, S will be converted to unicode before stripping");
2256 static PyObject *
2257 string_rstrip(PyStringObject *self, PyObject *args)
2259 if (PyTuple_GET_SIZE(args) == 0)
2260 return do_strip(self, RIGHTSTRIP); /* Common case */
2261 else
2262 return do_argstrip(self, RIGHTSTRIP, args);
2266 PyDoc_STRVAR(lower__doc__,
2267 "S.lower() -> string\n\
2269 Return a copy of the string S converted to lowercase.");
2271 /* _tolower and _toupper are defined by SUSv2, but they're not ISO C */
2272 #ifndef _tolower
2273 #define _tolower tolower
2274 #endif
2276 static PyObject *
2277 string_lower(PyStringObject *self)
2279 char *s;
2280 Py_ssize_t i, n = PyString_GET_SIZE(self);
2281 PyObject *newobj;
2283 newobj = PyString_FromStringAndSize(NULL, n);
2284 if (!newobj)
2285 return NULL;
2287 s = PyString_AS_STRING(newobj);
2289 Py_MEMCPY(s, PyString_AS_STRING(self), n);
2291 for (i = 0; i < n; i++) {
2292 int c = Py_CHARMASK(s[i]);
2293 if (isupper(c))
2294 s[i] = _tolower(c);
2297 return newobj;
2300 PyDoc_STRVAR(upper__doc__,
2301 "S.upper() -> string\n\
2303 Return a copy of the string S converted to uppercase.");
2305 #ifndef _toupper
2306 #define _toupper toupper
2307 #endif
2309 static PyObject *
2310 string_upper(PyStringObject *self)
2312 char *s;
2313 Py_ssize_t i, n = PyString_GET_SIZE(self);
2314 PyObject *newobj;
2316 newobj = PyString_FromStringAndSize(NULL, n);
2317 if (!newobj)
2318 return NULL;
2320 s = PyString_AS_STRING(newobj);
2322 Py_MEMCPY(s, PyString_AS_STRING(self), n);
2324 for (i = 0; i < n; i++) {
2325 int c = Py_CHARMASK(s[i]);
2326 if (islower(c))
2327 s[i] = _toupper(c);
2330 return newobj;
2333 PyDoc_STRVAR(title__doc__,
2334 "S.title() -> string\n\
2336 Return a titlecased version of S, i.e. words start with uppercase\n\
2337 characters, all remaining cased characters have lowercase.");
2339 static PyObject*
2340 string_title(PyStringObject *self)
2342 char *s = PyString_AS_STRING(self), *s_new;
2343 Py_ssize_t i, n = PyString_GET_SIZE(self);
2344 int previous_is_cased = 0;
2345 PyObject *newobj;
2347 newobj = PyString_FromStringAndSize(NULL, n);
2348 if (newobj == NULL)
2349 return NULL;
2350 s_new = PyString_AsString(newobj);
2351 for (i = 0; i < n; i++) {
2352 int c = Py_CHARMASK(*s++);
2353 if (islower(c)) {
2354 if (!previous_is_cased)
2355 c = toupper(c);
2356 previous_is_cased = 1;
2357 } else if (isupper(c)) {
2358 if (previous_is_cased)
2359 c = tolower(c);
2360 previous_is_cased = 1;
2361 } else
2362 previous_is_cased = 0;
2363 *s_new++ = c;
2365 return newobj;
2368 PyDoc_STRVAR(capitalize__doc__,
2369 "S.capitalize() -> string\n\
2371 Return a copy of the string S with only its first character\n\
2372 capitalized.");
2374 static PyObject *
2375 string_capitalize(PyStringObject *self)
2377 char *s = PyString_AS_STRING(self), *s_new;
2378 Py_ssize_t i, n = PyString_GET_SIZE(self);
2379 PyObject *newobj;
2381 newobj = PyString_FromStringAndSize(NULL, n);
2382 if (newobj == NULL)
2383 return NULL;
2384 s_new = PyString_AsString(newobj);
2385 if (0 < n) {
2386 int c = Py_CHARMASK(*s++);
2387 if (islower(c))
2388 *s_new = toupper(c);
2389 else
2390 *s_new = c;
2391 s_new++;
2393 for (i = 1; i < n; i++) {
2394 int c = Py_CHARMASK(*s++);
2395 if (isupper(c))
2396 *s_new = tolower(c);
2397 else
2398 *s_new = c;
2399 s_new++;
2401 return newobj;
2405 PyDoc_STRVAR(count__doc__,
2406 "S.count(sub[, start[, end]]) -> int\n\
2408 Return the number of non-overlapping occurrences of substring sub in\n\
2409 string S[start:end]. Optional arguments start and end are interpreted\n\
2410 as in slice notation.");
2412 static PyObject *
2413 string_count(PyStringObject *self, PyObject *args)
2415 PyObject *sub_obj;
2416 const char *str = PyString_AS_STRING(self), *sub;
2417 Py_ssize_t sub_len;
2418 Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
2420 if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj,
2421 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
2422 return NULL;
2424 if (PyString_Check(sub_obj)) {
2425 sub = PyString_AS_STRING(sub_obj);
2426 sub_len = PyString_GET_SIZE(sub_obj);
2428 #ifdef Py_USING_UNICODE
2429 else if (PyUnicode_Check(sub_obj)) {
2430 Py_ssize_t count;
2431 count = PyUnicode_Count((PyObject *)self, sub_obj, start, end);
2432 if (count == -1)
2433 return NULL;
2434 else
2435 return PyInt_FromSsize_t(count);
2437 #endif
2438 else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
2439 return NULL;
2441 string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
2443 return PyInt_FromSsize_t(
2444 stringlib_count(str + start, end - start, sub, sub_len)
2448 PyDoc_STRVAR(swapcase__doc__,
2449 "S.swapcase() -> string\n\
2451 Return a copy of the string S with uppercase characters\n\
2452 converted to lowercase and vice versa.");
2454 static PyObject *
2455 string_swapcase(PyStringObject *self)
2457 char *s = PyString_AS_STRING(self), *s_new;
2458 Py_ssize_t i, n = PyString_GET_SIZE(self);
2459 PyObject *newobj;
2461 newobj = PyString_FromStringAndSize(NULL, n);
2462 if (newobj == NULL)
2463 return NULL;
2464 s_new = PyString_AsString(newobj);
2465 for (i = 0; i < n; i++) {
2466 int c = Py_CHARMASK(*s++);
2467 if (islower(c)) {
2468 *s_new = toupper(c);
2470 else if (isupper(c)) {
2471 *s_new = tolower(c);
2473 else
2474 *s_new = c;
2475 s_new++;
2477 return newobj;
PyDoc_STRVAR(translate__doc__,
"S.translate(table [,deletechars]) -> string\n\
\n\
Return a copy of the string S, where all characters occurring\n\
in the optional argument deletechars are removed, and the\n\
remaining characters have been mapped through the given\n\
translation table, which must be a string of length 256.");

/* str.translate() implementation.

   args holds the table and an optional deletechars object.  The table
   may be a str, None (identity mapping), a unicode object (the call is
   then delegated to PyUnicode_Translate and deletechars is rejected), or
   any char-buffer object; it must be exactly 256 bytes long.
   deletechars may be a str or char-buffer object; unicode is rejected.

   Returns a new reference, or NULL with an exception set.  When nothing
   changed and self is an exact str, self itself is returned. */
static PyObject *
string_translate(PyStringObject *self, PyObject *args)
{
    register char *input, *output;
    const char *table;
    register Py_ssize_t i, c, changed = 0;
    PyObject *input_obj = (PyObject*)self;
    const char *output_start, *del_table=NULL;
    Py_ssize_t inlen, tablen, dellen = 0;
    PyObject *result;
    /* Per-byte mapping used on the deletion path; -1 marks "delete". */
    int trans_table[256];
    PyObject *tableobj, *delobj = NULL;

    if (!PyArg_UnpackTuple(args, "translate", 1, 2,
                          &tableobj, &delobj))
        return NULL;

    if (PyString_Check(tableobj)) {
        table = PyString_AS_STRING(tableobj);
        tablen = PyString_GET_SIZE(tableobj);
    }
    else if (tableobj == Py_None) {
        /* No table: tablen is set so the length check below passes. */
        table = NULL;
        tablen = 256;
    }
#ifdef Py_USING_UNICODE
    else if (PyUnicode_Check(tableobj)) {
        /* Unicode .translate() does not support the deletechars
           parameter; instead a mapping to None will cause characters
           to be deleted. */
        if (delobj != NULL) {
            PyErr_SetString(PyExc_TypeError,
            "deletions are implemented differently for unicode");
            return NULL;
        }
        return PyUnicode_Translate((PyObject *)self, tableobj, NULL);
    }
#endif
    else if (PyObject_AsCharBuffer(tableobj, &table, &tablen))
        return NULL;

    if (tablen != 256) {
        PyErr_SetString(PyExc_ValueError,
          "translation table must be 256 characters long");
        return NULL;
    }

    if (delobj != NULL) {
        if (PyString_Check(delobj)) {
            del_table = PyString_AS_STRING(delobj);
            dellen = PyString_GET_SIZE(delobj);
        }
#ifdef Py_USING_UNICODE
        else if (PyUnicode_Check(delobj)) {
            PyErr_SetString(PyExc_TypeError,
            "deletions are implemented differently for unicode");
            return NULL;
        }
#endif
        else if (PyObject_AsCharBuffer(delobj, &del_table, &dellen))
            return NULL;
    }
    else {
        del_table = NULL;
        dellen = 0;
    }

    /* The result starts out as long as the input; the deletion path
       shrinks it at the end. */
    inlen = PyString_GET_SIZE(input_obj);
    result = PyString_FromStringAndSize((char *)NULL, inlen);
    if (result == NULL)
        return NULL;
    output_start = output = PyString_AsString(result);
    input = PyString_AS_STRING(input_obj);

    if (dellen == 0 && table != NULL) {
        /* If no deletions are required, use faster code */
        for (i = inlen; --i >= 0; ) {
            c = Py_CHARMASK(*input++);
            if (Py_CHARMASK((*output++ = table[c])) != c)
                changed = 1;
        }
        if (changed || !PyString_CheckExact(input_obj))
            return result;
        /* Nothing changed: discard the copy and hand back the input. */
        Py_DECREF(result);
        Py_INCREF(input_obj);
        return input_obj;
    }

    /* Build the int mapping: identity when no table was given. */
    if (table == NULL) {
        for (i = 0; i < 256; i++)
            trans_table[i] = Py_CHARMASK(i);
    } else {
        for (i = 0; i < 256; i++)
            trans_table[i] = Py_CHARMASK(table[i]);
    }

    for (i = 0; i < dellen; i++)
        trans_table[(int) Py_CHARMASK(del_table[i])] = -1;

    /* A deleted byte writes nothing (output is not advanced) and falls
       through to set `changed`; so does a byte whose mapping differs. */
    for (i = inlen; --i >= 0; ) {
        c = Py_CHARMASK(*input++);
        if (trans_table[c] != -1)
            if (Py_CHARMASK(*output++ = (char)trans_table[c]) == c)
                continue;
        changed = 1;
    }
    if (!changed && PyString_CheckExact(input_obj)) {
        Py_DECREF(result);
        Py_INCREF(input_obj);
        return input_obj;
    }
    /* Fix the size of the resulting string */
    if (inlen > 0)
        _PyString_Resize(&result, output - output_start);
    return result;
}
2607 #define FORWARD 1
2608 #define REVERSE -1
2610 /* find and count characters and substrings */
2612 #define findchar(target, target_len, c) \
2613 ((char *)memchr((const void *)(target), c, target_len))
2615 /* String ops must return a string. */
2616 /* If the object is subclass of string, create a copy */
2617 Py_LOCAL(PyStringObject *)
2618 return_self(PyStringObject *self)
2620 if (PyString_CheckExact(self)) {
2621 Py_INCREF(self);
2622 return self;
2624 return (PyStringObject *)PyString_FromStringAndSize(
2625 PyString_AS_STRING(self),
2626 PyString_GET_SIZE(self));
2629 Py_LOCAL_INLINE(Py_ssize_t)
2630 countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
2632 Py_ssize_t count=0;
2633 const char *start=target;
2634 const char *end=target+target_len;
2636 while ( (start=findchar(start, end-start, c)) != NULL ) {
2637 count++;
2638 if (count >= maxcount)
2639 break;
2640 start += 1;
2642 return count;
2645 Py_LOCAL(Py_ssize_t)
2646 findstring(const char *target, Py_ssize_t target_len,
2647 const char *pattern, Py_ssize_t pattern_len,
2648 Py_ssize_t start,
2649 Py_ssize_t end,
2650 int direction)
2652 if (start < 0) {
2653 start += target_len;
2654 if (start < 0)
2655 start = 0;
2657 if (end > target_len) {
2658 end = target_len;
2659 } else if (end < 0) {
2660 end += target_len;
2661 if (end < 0)
2662 end = 0;
2665 /* zero-length substrings always match at the first attempt */
2666 if (pattern_len == 0)
2667 return (direction > 0) ? start : end;
2669 end -= pattern_len;
2671 if (direction < 0) {
2672 for (; end >= start; end--)
2673 if (Py_STRING_MATCH(target, end, pattern, pattern_len))
2674 return end;
2675 } else {
2676 for (; start <= end; start++)
2677 if (Py_STRING_MATCH(target, start, pattern, pattern_len))
2678 return start;
2680 return -1;
2683 Py_LOCAL_INLINE(Py_ssize_t)
2684 countstring(const char *target, Py_ssize_t target_len,
2685 const char *pattern, Py_ssize_t pattern_len,
2686 Py_ssize_t start,
2687 Py_ssize_t end,
2688 int direction, Py_ssize_t maxcount)
2690 Py_ssize_t count=0;
2692 if (start < 0) {
2693 start += target_len;
2694 if (start < 0)
2695 start = 0;
2697 if (end > target_len) {
2698 end = target_len;
2699 } else if (end < 0) {
2700 end += target_len;
2701 if (end < 0)
2702 end = 0;
2705 /* zero-length substrings match everywhere */
2706 if (pattern_len == 0 || maxcount == 0) {
2707 if (target_len+1 < maxcount)
2708 return target_len+1;
2709 return maxcount;
2712 end -= pattern_len;
2713 if (direction < 0) {
2714 for (; (end >= start); end--)
2715 if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
2716 count++;
2717 if (--maxcount <= 0) break;
2718 end -= pattern_len-1;
2720 } else {
2721 for (; (start <= end); start++)
2722 if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
2723 count++;
2724 if (--maxcount <= 0)
2725 break;
2726 start += pattern_len-1;
2729 return count;
2733 /* Algorithms for different cases of string replacement */
2735 /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
2736 Py_LOCAL(PyStringObject *)
2737 replace_interleave(PyStringObject *self,
2738 const char *to_s, Py_ssize_t to_len,
2739 Py_ssize_t maxcount)
2741 char *self_s, *result_s;
2742 Py_ssize_t self_len, result_len;
2743 Py_ssize_t count, i, product;
2744 PyStringObject *result;
2746 self_len = PyString_GET_SIZE(self);
2748 /* 1 at the end plus 1 after every character */
2749 count = self_len+1;
2750 if (maxcount < count)
2751 count = maxcount;
2753 /* Check for overflow */
2754 /* result_len = count * to_len + self_len; */
2755 product = count * to_len;
2756 if (product / to_len != count) {
2757 PyErr_SetString(PyExc_OverflowError,
2758 "replace string is too long");
2759 return NULL;
2761 result_len = product + self_len;
2762 if (result_len < 0) {
2763 PyErr_SetString(PyExc_OverflowError,
2764 "replace string is too long");
2765 return NULL;
2768 if (! (result = (PyStringObject *)
2769 PyString_FromStringAndSize(NULL, result_len)) )
2770 return NULL;
2772 self_s = PyString_AS_STRING(self);
2773 result_s = PyString_AS_STRING(result);
2775 /* TODO: special case single character, which doesn't need memcpy */
2777 /* Lay the first one down (guaranteed this will occur) */
2778 Py_MEMCPY(result_s, to_s, to_len);
2779 result_s += to_len;
2780 count -= 1;
2782 for (i=0; i<count; i++) {
2783 *result_s++ = *self_s++;
2784 Py_MEMCPY(result_s, to_s, to_len);
2785 result_s += to_len;
2788 /* Copy the rest of the original string */
2789 Py_MEMCPY(result_s, self_s, self_len-i);
2791 return result;
2794 /* Special case for deleting a single character */
2795 /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
2796 Py_LOCAL(PyStringObject *)
2797 replace_delete_single_character(PyStringObject *self,
2798 char from_c, Py_ssize_t maxcount)
2800 char *self_s, *result_s;
2801 char *start, *next, *end;
2802 Py_ssize_t self_len, result_len;
2803 Py_ssize_t count;
2804 PyStringObject *result;
2806 self_len = PyString_GET_SIZE(self);
2807 self_s = PyString_AS_STRING(self);
2809 count = countchar(self_s, self_len, from_c, maxcount);
2810 if (count == 0) {
2811 return return_self(self);
2814 result_len = self_len - count; /* from_len == 1 */
2815 assert(result_len>=0);
2817 if ( (result = (PyStringObject *)
2818 PyString_FromStringAndSize(NULL, result_len)) == NULL)
2819 return NULL;
2820 result_s = PyString_AS_STRING(result);
2822 start = self_s;
2823 end = self_s + self_len;
2824 while (count-- > 0) {
2825 next = findchar(start, end-start, from_c);
2826 if (next == NULL)
2827 break;
2828 Py_MEMCPY(result_s, start, next-start);
2829 result_s += (next-start);
2830 start = next+1;
2832 Py_MEMCPY(result_s, start, end-start);
2834 return result;
2837 /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
2839 Py_LOCAL(PyStringObject *)
2840 replace_delete_substring(PyStringObject *self,
2841 const char *from_s, Py_ssize_t from_len,
2842 Py_ssize_t maxcount) {
2843 char *self_s, *result_s;
2844 char *start, *next, *end;
2845 Py_ssize_t self_len, result_len;
2846 Py_ssize_t count, offset;
2847 PyStringObject *result;
2849 self_len = PyString_GET_SIZE(self);
2850 self_s = PyString_AS_STRING(self);
2852 count = countstring(self_s, self_len,
2853 from_s, from_len,
2854 0, self_len, 1,
2855 maxcount);
2857 if (count == 0) {
2858 /* no matches */
2859 return return_self(self);
2862 result_len = self_len - (count * from_len);
2863 assert (result_len>=0);
2865 if ( (result = (PyStringObject *)
2866 PyString_FromStringAndSize(NULL, result_len)) == NULL )
2867 return NULL;
2869 result_s = PyString_AS_STRING(result);
2871 start = self_s;
2872 end = self_s + self_len;
2873 while (count-- > 0) {
2874 offset = findstring(start, end-start,
2875 from_s, from_len,
2876 0, end-start, FORWARD);
2877 if (offset == -1)
2878 break;
2879 next = start + offset;
2881 Py_MEMCPY(result_s, start, next-start);
2883 result_s += (next-start);
2884 start = next+from_len;
2886 Py_MEMCPY(result_s, start, end-start);
2887 return result;
2890 /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
2891 Py_LOCAL(PyStringObject *)
2892 replace_single_character_in_place(PyStringObject *self,
2893 char from_c, char to_c,
2894 Py_ssize_t maxcount)
2896 char *self_s, *result_s, *start, *end, *next;
2897 Py_ssize_t self_len;
2898 PyStringObject *result;
2900 /* The result string will be the same size */
2901 self_s = PyString_AS_STRING(self);
2902 self_len = PyString_GET_SIZE(self);
2904 next = findchar(self_s, self_len, from_c);
2906 if (next == NULL) {
2907 /* No matches; return the original string */
2908 return return_self(self);
2911 /* Need to make a new string */
2912 result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2913 if (result == NULL)
2914 return NULL;
2915 result_s = PyString_AS_STRING(result);
2916 Py_MEMCPY(result_s, self_s, self_len);
2918 /* change everything in-place, starting with this one */
2919 start = result_s + (next-self_s);
2920 *start = to_c;
2921 start++;
2922 end = result_s + self_len;
2924 while (--maxcount > 0) {
2925 next = findchar(start, end-start, from_c);
2926 if (next == NULL)
2927 break;
2928 *next = to_c;
2929 start = next+1;
2932 return result;
2935 /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
2936 Py_LOCAL(PyStringObject *)
2937 replace_substring_in_place(PyStringObject *self,
2938 const char *from_s, Py_ssize_t from_len,
2939 const char *to_s, Py_ssize_t to_len,
2940 Py_ssize_t maxcount)
2942 char *result_s, *start, *end;
2943 char *self_s;
2944 Py_ssize_t self_len, offset;
2945 PyStringObject *result;
2947 /* The result string will be the same size */
2949 self_s = PyString_AS_STRING(self);
2950 self_len = PyString_GET_SIZE(self);
2952 offset = findstring(self_s, self_len,
2953 from_s, from_len,
2954 0, self_len, FORWARD);
2955 if (offset == -1) {
2956 /* No matches; return the original string */
2957 return return_self(self);
2960 /* Need to make a new string */
2961 result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2962 if (result == NULL)
2963 return NULL;
2964 result_s = PyString_AS_STRING(result);
2965 Py_MEMCPY(result_s, self_s, self_len);
2967 /* change everything in-place, starting with this one */
2968 start = result_s + offset;
2969 Py_MEMCPY(start, to_s, from_len);
2970 start += from_len;
2971 end = result_s + self_len;
2973 while ( --maxcount > 0) {
2974 offset = findstring(start, end-start,
2975 from_s, from_len,
2976 0, end-start, FORWARD);
2977 if (offset==-1)
2978 break;
2979 Py_MEMCPY(start+offset, to_s, from_len);
2980 start += offset+from_len;
2983 return result;
2986 /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
2987 Py_LOCAL(PyStringObject *)
2988 replace_single_character(PyStringObject *self,
2989 char from_c,
2990 const char *to_s, Py_ssize_t to_len,
2991 Py_ssize_t maxcount)
2993 char *self_s, *result_s;
2994 char *start, *next, *end;
2995 Py_ssize_t self_len, result_len;
2996 Py_ssize_t count, product;
2997 PyStringObject *result;
2999 self_s = PyString_AS_STRING(self);
3000 self_len = PyString_GET_SIZE(self);
3002 count = countchar(self_s, self_len, from_c, maxcount);
3003 if (count == 0) {
3004 /* no matches, return unchanged */
3005 return return_self(self);
3008 /* use the difference between current and new, hence the "-1" */
3009 /* result_len = self_len + count * (to_len-1) */
3010 product = count * (to_len-1);
3011 if (product / (to_len-1) != count) {
3012 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3013 return NULL;
3015 result_len = self_len + product;
3016 if (result_len < 0) {
3017 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3018 return NULL;
3021 if ( (result = (PyStringObject *)
3022 PyString_FromStringAndSize(NULL, result_len)) == NULL)
3023 return NULL;
3024 result_s = PyString_AS_STRING(result);
3026 start = self_s;
3027 end = self_s + self_len;
3028 while (count-- > 0) {
3029 next = findchar(start, end-start, from_c);
3030 if (next == NULL)
3031 break;
3033 if (next == start) {
3034 /* replace with the 'to' */
3035 Py_MEMCPY(result_s, to_s, to_len);
3036 result_s += to_len;
3037 start += 1;
3038 } else {
3039 /* copy the unchanged old then the 'to' */
3040 Py_MEMCPY(result_s, start, next-start);
3041 result_s += (next-start);
3042 Py_MEMCPY(result_s, to_s, to_len);
3043 result_s += to_len;
3044 start = next+1;
3047 /* Copy the remainder of the remaining string */
3048 Py_MEMCPY(result_s, start, end-start);
3050 return result;
3053 /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
3054 Py_LOCAL(PyStringObject *)
3055 replace_substring(PyStringObject *self,
3056 const char *from_s, Py_ssize_t from_len,
3057 const char *to_s, Py_ssize_t to_len,
3058 Py_ssize_t maxcount) {
3059 char *self_s, *result_s;
3060 char *start, *next, *end;
3061 Py_ssize_t self_len, result_len;
3062 Py_ssize_t count, offset, product;
3063 PyStringObject *result;
3065 self_s = PyString_AS_STRING(self);
3066 self_len = PyString_GET_SIZE(self);
3068 count = countstring(self_s, self_len,
3069 from_s, from_len,
3070 0, self_len, FORWARD, maxcount);
3071 if (count == 0) {
3072 /* no matches, return unchanged */
3073 return return_self(self);
3076 /* Check for overflow */
3077 /* result_len = self_len + count * (to_len-from_len) */
3078 product = count * (to_len-from_len);
3079 if (product / (to_len-from_len) != count) {
3080 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3081 return NULL;
3083 result_len = self_len + product;
3084 if (result_len < 0) {
3085 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3086 return NULL;
3089 if ( (result = (PyStringObject *)
3090 PyString_FromStringAndSize(NULL, result_len)) == NULL)
3091 return NULL;
3092 result_s = PyString_AS_STRING(result);
3094 start = self_s;
3095 end = self_s + self_len;
3096 while (count-- > 0) {
3097 offset = findstring(start, end-start,
3098 from_s, from_len,
3099 0, end-start, FORWARD);
3100 if (offset == -1)
3101 break;
3102 next = start+offset;
3103 if (next == start) {
3104 /* replace with the 'to' */
3105 Py_MEMCPY(result_s, to_s, to_len);
3106 result_s += to_len;
3107 start += from_len;
3108 } else {
3109 /* copy the unchanged old then the 'to' */
3110 Py_MEMCPY(result_s, start, next-start);
3111 result_s += (next-start);
3112 Py_MEMCPY(result_s, to_s, to_len);
3113 result_s += to_len;
3114 start = next+from_len;
3117 /* Copy the remainder of the remaining string */
3118 Py_MEMCPY(result_s, start, end-start);
3120 return result;
3124 Py_LOCAL(PyStringObject *)
3125 replace(PyStringObject *self,
3126 const char *from_s, Py_ssize_t from_len,
3127 const char *to_s, Py_ssize_t to_len,
3128 Py_ssize_t maxcount)
3130 if (maxcount < 0) {
3131 maxcount = PY_SSIZE_T_MAX;
3132 } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
3133 /* nothing to do; return the original string */
3134 return return_self(self);
3137 if (maxcount == 0 ||
3138 (from_len == 0 && to_len == 0)) {
3139 /* nothing to do; return the original string */
3140 return return_self(self);
3143 /* Handle zero-length special cases */
3145 if (from_len == 0) {
3146 /* insert the 'to' string everywhere. */
3147 /* >>> "Python".replace("", ".") */
3148 /* '.P.y.t.h.o.n.' */
3149 return replace_interleave(self, to_s, to_len, maxcount);
3152 /* Except for "".replace("", "A") == "A" there is no way beyond this */
3153 /* point for an empty self string to generate a non-empty string */
3154 /* Special case so the remaining code always gets a non-empty string */
3155 if (PyString_GET_SIZE(self) == 0) {
3156 return return_self(self);
3159 if (to_len == 0) {
3160 /* delete all occurances of 'from' string */
3161 if (from_len == 1) {
3162 return replace_delete_single_character(
3163 self, from_s[0], maxcount);
3164 } else {
3165 return replace_delete_substring(self, from_s, from_len, maxcount);
3169 /* Handle special case where both strings have the same length */
3171 if (from_len == to_len) {
3172 if (from_len == 1) {
3173 return replace_single_character_in_place(
3174 self,
3175 from_s[0],
3176 to_s[0],
3177 maxcount);
3178 } else {
3179 return replace_substring_in_place(
3180 self, from_s, from_len, to_s, to_len, maxcount);
3184 /* Otherwise use the more generic algorithms */
3185 if (from_len == 1) {
3186 return replace_single_character(self, from_s[0],
3187 to_s, to_len, maxcount);
3188 } else {
3189 /* len('from')>=2, len('to')>=1 */
3190 return replace_substring(self, from_s, from_len, to_s, to_len, maxcount);
3194 PyDoc_STRVAR(replace__doc__,
3195 "S.replace (old, new[, count]) -> string\n\
3197 Return a copy of string S with all occurrences of substring\n\
3198 old replaced by new. If the optional argument count is\n\
3199 given, only the first count occurrences are replaced.");
3201 static PyObject *
3202 string_replace(PyStringObject *self, PyObject *args)
3204 Py_ssize_t count = -1;
3205 PyObject *from, *to;
3206 const char *from_s, *to_s;
3207 Py_ssize_t from_len, to_len;
3209 if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
3210 return NULL;
3212 if (PyString_Check(from)) {
3213 from_s = PyString_AS_STRING(from);
3214 from_len = PyString_GET_SIZE(from);
3216 #ifdef Py_USING_UNICODE
3217 if (PyUnicode_Check(from))
3218 return PyUnicode_Replace((PyObject *)self,
3219 from, to, count);
3220 #endif
3221 else if (PyObject_AsCharBuffer(from, &from_s, &from_len))
3222 return NULL;
3224 if (PyString_Check(to)) {
3225 to_s = PyString_AS_STRING(to);
3226 to_len = PyString_GET_SIZE(to);
3228 #ifdef Py_USING_UNICODE
3229 else if (PyUnicode_Check(to))
3230 return PyUnicode_Replace((PyObject *)self,
3231 from, to, count);
3232 #endif
3233 else if (PyObject_AsCharBuffer(to, &to_s, &to_len))
3234 return NULL;
3236 return (PyObject *)replace((PyStringObject *) self,
3237 from_s, from_len,
3238 to_s, to_len, count);
3241 /** End DALKE **/
3243 /* Matches the end (direction >= 0) or start (direction < 0) of self
3244 * against substr, using the start and end arguments. Returns
3245 * -1 on error, 0 if not found and 1 if found.
3247 Py_LOCAL(int)
3248 _string_tailmatch(PyStringObject *self, PyObject *substr, Py_ssize_t start,
3249 Py_ssize_t end, int direction)
3251 Py_ssize_t len = PyString_GET_SIZE(self);
3252 Py_ssize_t slen;
3253 const char* sub;
3254 const char* str;
3256 if (PyString_Check(substr)) {
3257 sub = PyString_AS_STRING(substr);
3258 slen = PyString_GET_SIZE(substr);
3260 #ifdef Py_USING_UNICODE
3261 else if (PyUnicode_Check(substr))
3262 return PyUnicode_Tailmatch((PyObject *)self,
3263 substr, start, end, direction);
3264 #endif
3265 else if (PyObject_AsCharBuffer(substr, &sub, &slen))
3266 return -1;
3267 str = PyString_AS_STRING(self);
3269 string_adjust_indices(&start, &end, len);
3271 if (direction < 0) {
3272 /* startswith */
3273 if (start+slen > len)
3274 return 0;
3275 } else {
3276 /* endswith */
3277 if (end-start < slen || start > len)
3278 return 0;
3280 if (end-slen > start)
3281 start = end - slen;
3283 if (end-start >= slen)
3284 return ! memcmp(str+start, sub, slen);
3285 return 0;
3289 PyDoc_STRVAR(startswith__doc__,
3290 "S.startswith(prefix[, start[, end]]) -> bool\n\
3292 Return True if S starts with the specified prefix, False otherwise.\n\
3293 With optional start, test S beginning at that position.\n\
3294 With optional end, stop comparing S at that position.\n\
3295 prefix can also be a tuple of strings to try.");
3297 static PyObject *
3298 string_startswith(PyStringObject *self, PyObject *args)
3300 Py_ssize_t start = 0;
3301 Py_ssize_t end = PY_SSIZE_T_MAX;
3302 PyObject *subobj;
3303 int result;
3305 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
3306 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3307 return NULL;
3308 if (PyTuple_Check(subobj)) {
3309 Py_ssize_t i;
3310 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3311 result = _string_tailmatch(self,
3312 PyTuple_GET_ITEM(subobj, i),
3313 start, end, -1);
3314 if (result == -1)
3315 return NULL;
3316 else if (result) {
3317 Py_RETURN_TRUE;
3320 Py_RETURN_FALSE;
3322 result = _string_tailmatch(self, subobj, start, end, -1);
3323 if (result == -1)
3324 return NULL;
3325 else
3326 return PyBool_FromLong(result);
3330 PyDoc_STRVAR(endswith__doc__,
3331 "S.endswith(suffix[, start[, end]]) -> bool\n\
3333 Return True if S ends with the specified suffix, False otherwise.\n\
3334 With optional start, test S beginning at that position.\n\
3335 With optional end, stop comparing S at that position.\n\
3336 suffix can also be a tuple of strings to try.");
3338 static PyObject *
3339 string_endswith(PyStringObject *self, PyObject *args)
3341 Py_ssize_t start = 0;
3342 Py_ssize_t end = PY_SSIZE_T_MAX;
3343 PyObject *subobj;
3344 int result;
3346 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
3347 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3348 return NULL;
3349 if (PyTuple_Check(subobj)) {
3350 Py_ssize_t i;
3351 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3352 result = _string_tailmatch(self,
3353 PyTuple_GET_ITEM(subobj, i),
3354 start, end, +1);
3355 if (result == -1)
3356 return NULL;
3357 else if (result) {
3358 Py_RETURN_TRUE;
3361 Py_RETURN_FALSE;
3363 result = _string_tailmatch(self, subobj, start, end, +1);
3364 if (result == -1)
3365 return NULL;
3366 else
3367 return PyBool_FromLong(result);
3371 PyDoc_STRVAR(encode__doc__,
3372 "S.encode([encoding[,errors]]) -> object\n\
3374 Encodes S using the codec registered for encoding. encoding defaults\n\
3375 to the default encoding. errors may be given to set a different error\n\
3376 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3377 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
3378 'xmlcharrefreplace' as well as any other name registered with\n\
3379 codecs.register_error that is able to handle UnicodeEncodeErrors.");
3381 static PyObject *
3382 string_encode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3384 static char *kwlist[] = {"encoding", "errors", 0};
3385 char *encoding = NULL;
3386 char *errors = NULL;
3387 PyObject *v;
3389 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
3390 kwlist, &encoding, &errors))
3391 return NULL;
3392 v = PyString_AsEncodedObject((PyObject *)self, encoding, errors);
3393 if (v == NULL)
3394 goto onError;
3395 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3396 PyErr_Format(PyExc_TypeError,
3397 "encoder did not return a string/unicode object "
3398 "(type=%.400s)",
3399 Py_TYPE(v)->tp_name);
3400 Py_DECREF(v);
3401 return NULL;
3403 return v;
3405 onError:
3406 return NULL;
3410 PyDoc_STRVAR(decode__doc__,
3411 "S.decode([encoding[,errors]]) -> object\n\
3413 Decodes S using the codec registered for encoding. encoding defaults\n\
3414 to the default encoding. errors may be given to set a different error\n\
3415 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3416 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
3417 as well as any other name registered with codecs.register_error that is\n\
3418 able to handle UnicodeDecodeErrors.");
3420 static PyObject *
3421 string_decode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3423 static char *kwlist[] = {"encoding", "errors", 0};
3424 char *encoding = NULL;
3425 char *errors = NULL;
3426 PyObject *v;
3428 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
3429 kwlist, &encoding, &errors))
3430 return NULL;
3431 v = PyString_AsDecodedObject((PyObject *)self, encoding, errors);
3432 if (v == NULL)
3433 goto onError;
3434 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3435 PyErr_Format(PyExc_TypeError,
3436 "decoder did not return a string/unicode object "
3437 "(type=%.400s)",
3438 Py_TYPE(v)->tp_name);
3439 Py_DECREF(v);
3440 return NULL;
3442 return v;
3444 onError:
3445 return NULL;
3449 PyDoc_STRVAR(expandtabs__doc__,
3450 "S.expandtabs([tabsize]) -> string\n\
3452 Return a copy of S where all tab characters are expanded using spaces.\n\
3453 If tabsize is not given, a tab size of 8 characters is assumed.");
3455 static PyObject*
3456 string_expandtabs(PyStringObject *self, PyObject *args)
3458 const char *e, *p, *qe;
3459 char *q;
3460 Py_ssize_t i, j, incr;
3461 PyObject *u;
3462 int tabsize = 8;
3464 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3465 return NULL;
3467 /* First pass: determine size of output string */
3468 i = 0; /* chars up to and including most recent \n or \r */
3469 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
3470 e = PyString_AS_STRING(self) + PyString_GET_SIZE(self); /* end of input */
3471 for (p = PyString_AS_STRING(self); p < e; p++)
3472 if (*p == '\t') {
3473 if (tabsize > 0) {
3474 incr = tabsize - (j % tabsize);
3475 if (j > PY_SSIZE_T_MAX - incr)
3476 goto overflow1;
3477 j += incr;
3480 else {
3481 if (j > PY_SSIZE_T_MAX - 1)
3482 goto overflow1;
3483 j++;
3484 if (*p == '\n' || *p == '\r') {
3485 if (i > PY_SSIZE_T_MAX - j)
3486 goto overflow1;
3487 i += j;
3488 j = 0;
3492 if (i > PY_SSIZE_T_MAX - j)
3493 goto overflow1;
3495 /* Second pass: create output string and fill it */
3496 u = PyString_FromStringAndSize(NULL, i + j);
3497 if (!u)
3498 return NULL;
3500 j = 0; /* same as in first pass */
3501 q = PyString_AS_STRING(u); /* next output char */
3502 qe = PyString_AS_STRING(u) + PyString_GET_SIZE(u); /* end of output */
3504 for (p = PyString_AS_STRING(self); p < e; p++)
3505 if (*p == '\t') {
3506 if (tabsize > 0) {
3507 i = tabsize - (j % tabsize);
3508 j += i;
3509 while (i--) {
3510 if (q >= qe)
3511 goto overflow2;
3512 *q++ = ' ';
3516 else {
3517 if (q >= qe)
3518 goto overflow2;
3519 *q++ = *p;
3520 j++;
3521 if (*p == '\n' || *p == '\r')
3522 j = 0;
3525 return u;
3527 overflow2:
3528 Py_DECREF(u);
3529 overflow1:
3530 PyErr_SetString(PyExc_OverflowError, "new string is too long");
3531 return NULL;
3534 Py_LOCAL_INLINE(PyObject *)
3535 pad(PyStringObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
3537 PyObject *u;
3539 if (left < 0)
3540 left = 0;
3541 if (right < 0)
3542 right = 0;
3544 if (left == 0 && right == 0 && PyString_CheckExact(self)) {
3545 Py_INCREF(self);
3546 return (PyObject *)self;
3549 u = PyString_FromStringAndSize(NULL,
3550 left + PyString_GET_SIZE(self) + right);
3551 if (u) {
3552 if (left)
3553 memset(PyString_AS_STRING(u), fill, left);
3554 Py_MEMCPY(PyString_AS_STRING(u) + left,
3555 PyString_AS_STRING(self),
3556 PyString_GET_SIZE(self));
3557 if (right)
3558 memset(PyString_AS_STRING(u) + left + PyString_GET_SIZE(self),
3559 fill, right);
3562 return u;
3565 PyDoc_STRVAR(ljust__doc__,
3566 "S.ljust(width[, fillchar]) -> string\n"
3567 "\n"
3568 "Return S left-justified in a string of length width. Padding is\n"
3569 "done using the specified fill character (default is a space).");
3571 static PyObject *
3572 string_ljust(PyStringObject *self, PyObject *args)
3574 Py_ssize_t width;
3575 char fillchar = ' ';
3577 if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))
3578 return NULL;
3580 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3581 Py_INCREF(self);
3582 return (PyObject*) self;
3585 return pad(self, 0, width - PyString_GET_SIZE(self), fillchar);
3589 PyDoc_STRVAR(rjust__doc__,
3590 "S.rjust(width[, fillchar]) -> string\n"
3591 "\n"
3592 "Return S right-justified in a string of length width. Padding is\n"
3593 "done using the specified fill character (default is a space)");
3595 static PyObject *
3596 string_rjust(PyStringObject *self, PyObject *args)
3598 Py_ssize_t width;
3599 char fillchar = ' ';
3601 if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))
3602 return NULL;
3604 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3605 Py_INCREF(self);
3606 return (PyObject*) self;
3609 return pad(self, width - PyString_GET_SIZE(self), 0, fillchar);
3613 PyDoc_STRVAR(center__doc__,
3614 "S.center(width[, fillchar]) -> string\n"
3615 "\n"
3616 "Return S centered in a string of length width. Padding is\n"
3617 "done using the specified fill character (default is a space)");
3619 static PyObject *
3620 string_center(PyStringObject *self, PyObject *args)
3622 Py_ssize_t marg, left;
3623 Py_ssize_t width;
3624 char fillchar = ' ';
3626 if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))
3627 return NULL;
3629 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3630 Py_INCREF(self);
3631 return (PyObject*) self;
3634 marg = width - PyString_GET_SIZE(self);
3635 left = marg / 2 + (marg & width & 1);
3637 return pad(self, left, marg - left, fillchar);
3640 PyDoc_STRVAR(zfill__doc__,
3641 "S.zfill(width) -> string\n"
3642 "\n"
3643 "Pad a numeric string S with zeros on the left, to fill a field\n"
3644 "of the specified width. The string S is never truncated.");
3646 static PyObject *
3647 string_zfill(PyStringObject *self, PyObject *args)
3649 Py_ssize_t fill;
3650 PyObject *s;
3651 char *p;
3652 Py_ssize_t width;
3654 if (!PyArg_ParseTuple(args, "n:zfill", &width))
3655 return NULL;
3657 if (PyString_GET_SIZE(self) >= width) {
3658 if (PyString_CheckExact(self)) {
3659 Py_INCREF(self);
3660 return (PyObject*) self;
3662 else
3663 return PyString_FromStringAndSize(
3664 PyString_AS_STRING(self),
3665 PyString_GET_SIZE(self)
3669 fill = width - PyString_GET_SIZE(self);
3671 s = pad(self, fill, 0, '0');
3673 if (s == NULL)
3674 return NULL;
3676 p = PyString_AS_STRING(s);
3677 if (p[fill] == '+' || p[fill] == '-') {
3678 /* move sign to beginning of string */
3679 p[0] = p[fill];
3680 p[fill] = '0';
3683 return (PyObject*) s;
3686 PyDoc_STRVAR(isspace__doc__,
3687 "S.isspace() -> bool\n\
3689 Return True if all characters in S are whitespace\n\
3690 and there is at least one character in S, False otherwise.");
3692 static PyObject*
3693 string_isspace(PyStringObject *self)
3695 register const unsigned char *p
3696 = (unsigned char *) PyString_AS_STRING(self);
3697 register const unsigned char *e;
3699 /* Shortcut for single character strings */
3700 if (PyString_GET_SIZE(self) == 1 &&
3701 isspace(*p))
3702 return PyBool_FromLong(1);
3704 /* Special case for empty strings */
3705 if (PyString_GET_SIZE(self) == 0)
3706 return PyBool_FromLong(0);
3708 e = p + PyString_GET_SIZE(self);
3709 for (; p < e; p++) {
3710 if (!isspace(*p))
3711 return PyBool_FromLong(0);
3713 return PyBool_FromLong(1);
3717 PyDoc_STRVAR(isalpha__doc__,
3718 "S.isalpha() -> bool\n\
3720 Return True if all characters in S are alphabetic\n\
3721 and there is at least one character in S, False otherwise.");
3723 static PyObject*
3724 string_isalpha(PyStringObject *self)
3726 register const unsigned char *p
3727 = (unsigned char *) PyString_AS_STRING(self);
3728 register const unsigned char *e;
3730 /* Shortcut for single character strings */
3731 if (PyString_GET_SIZE(self) == 1 &&
3732 isalpha(*p))
3733 return PyBool_FromLong(1);
3735 /* Special case for empty strings */
3736 if (PyString_GET_SIZE(self) == 0)
3737 return PyBool_FromLong(0);
3739 e = p + PyString_GET_SIZE(self);
3740 for (; p < e; p++) {
3741 if (!isalpha(*p))
3742 return PyBool_FromLong(0);
3744 return PyBool_FromLong(1);
3748 PyDoc_STRVAR(isalnum__doc__,
3749 "S.isalnum() -> bool\n\
3751 Return True if all characters in S are alphanumeric\n\
3752 and there is at least one character in S, False otherwise.");
3754 static PyObject*
3755 string_isalnum(PyStringObject *self)
3757 register const unsigned char *p
3758 = (unsigned char *) PyString_AS_STRING(self);
3759 register const unsigned char *e;
3761 /* Shortcut for single character strings */
3762 if (PyString_GET_SIZE(self) == 1 &&
3763 isalnum(*p))
3764 return PyBool_FromLong(1);
3766 /* Special case for empty strings */
3767 if (PyString_GET_SIZE(self) == 0)
3768 return PyBool_FromLong(0);
3770 e = p + PyString_GET_SIZE(self);
3771 for (; p < e; p++) {
3772 if (!isalnum(*p))
3773 return PyBool_FromLong(0);
3775 return PyBool_FromLong(1);
3779 PyDoc_STRVAR(isdigit__doc__,
3780 "S.isdigit() -> bool\n\
3782 Return True if all characters in S are digits\n\
3783 and there is at least one character in S, False otherwise.");
3785 static PyObject*
3786 string_isdigit(PyStringObject *self)
3788 register const unsigned char *p
3789 = (unsigned char *) PyString_AS_STRING(self);
3790 register const unsigned char *e;
3792 /* Shortcut for single character strings */
3793 if (PyString_GET_SIZE(self) == 1 &&
3794 isdigit(*p))
3795 return PyBool_FromLong(1);
3797 /* Special case for empty strings */
3798 if (PyString_GET_SIZE(self) == 0)
3799 return PyBool_FromLong(0);
3801 e = p + PyString_GET_SIZE(self);
3802 for (; p < e; p++) {
3803 if (!isdigit(*p))
3804 return PyBool_FromLong(0);
3806 return PyBool_FromLong(1);
3810 PyDoc_STRVAR(islower__doc__,
3811 "S.islower() -> bool\n\
3813 Return True if all cased characters in S are lowercase and there is\n\
3814 at least one cased character in S, False otherwise.");
3816 static PyObject*
3817 string_islower(PyStringObject *self)
3819 register const unsigned char *p
3820 = (unsigned char *) PyString_AS_STRING(self);
3821 register const unsigned char *e;
3822 int cased;
3824 /* Shortcut for single character strings */
3825 if (PyString_GET_SIZE(self) == 1)
3826 return PyBool_FromLong(islower(*p) != 0);
3828 /* Special case for empty strings */
3829 if (PyString_GET_SIZE(self) == 0)
3830 return PyBool_FromLong(0);
3832 e = p + PyString_GET_SIZE(self);
3833 cased = 0;
3834 for (; p < e; p++) {
3835 if (isupper(*p))
3836 return PyBool_FromLong(0);
3837 else if (!cased && islower(*p))
3838 cased = 1;
3840 return PyBool_FromLong(cased);
3844 PyDoc_STRVAR(isupper__doc__,
3845 "S.isupper() -> bool\n\
3847 Return True if all cased characters in S are uppercase and there is\n\
3848 at least one cased character in S, False otherwise.");
3850 static PyObject*
3851 string_isupper(PyStringObject *self)
3853 register const unsigned char *p
3854 = (unsigned char *) PyString_AS_STRING(self);
3855 register const unsigned char *e;
3856 int cased;
3858 /* Shortcut for single character strings */
3859 if (PyString_GET_SIZE(self) == 1)
3860 return PyBool_FromLong(isupper(*p) != 0);
3862 /* Special case for empty strings */
3863 if (PyString_GET_SIZE(self) == 0)
3864 return PyBool_FromLong(0);
3866 e = p + PyString_GET_SIZE(self);
3867 cased = 0;
3868 for (; p < e; p++) {
3869 if (islower(*p))
3870 return PyBool_FromLong(0);
3871 else if (!cased && isupper(*p))
3872 cased = 1;
3874 return PyBool_FromLong(cased);
3878 PyDoc_STRVAR(istitle__doc__,
3879 "S.istitle() -> bool\n\
3881 Return True if S is a titlecased string and there is at least one\n\
3882 character in S, i.e. uppercase characters may only follow uncased\n\
3883 characters and lowercase characters only cased ones. Return False\n\
3884 otherwise.");
3886 static PyObject*
3887 string_istitle(PyStringObject *self, PyObject *uncased)
3889 register const unsigned char *p
3890 = (unsigned char *) PyString_AS_STRING(self);
3891 register const unsigned char *e;
3892 int cased, previous_is_cased;
3894 /* Shortcut for single character strings */
3895 if (PyString_GET_SIZE(self) == 1)
3896 return PyBool_FromLong(isupper(*p) != 0);
3898 /* Special case for empty strings */
3899 if (PyString_GET_SIZE(self) == 0)
3900 return PyBool_FromLong(0);
3902 e = p + PyString_GET_SIZE(self);
3903 cased = 0;
3904 previous_is_cased = 0;
3905 for (; p < e; p++) {
3906 register const unsigned char ch = *p;
3908 if (isupper(ch)) {
3909 if (previous_is_cased)
3910 return PyBool_FromLong(0);
3911 previous_is_cased = 1;
3912 cased = 1;
3914 else if (islower(ch)) {
3915 if (!previous_is_cased)
3916 return PyBool_FromLong(0);
3917 previous_is_cased = 1;
3918 cased = 1;
3920 else
3921 previous_is_cased = 0;
3923 return PyBool_FromLong(cased);
3927 PyDoc_STRVAR(splitlines__doc__,
3928 "S.splitlines([keepends]) -> list of strings\n\
3930 Return a list of the lines in S, breaking at line boundaries.\n\
3931 Line breaks are not included in the resulting list unless keepends\n\
3932 is given and true.");
3934 static PyObject*
3935 string_splitlines(PyStringObject *self, PyObject *args)
3937 register Py_ssize_t i;
3938 register Py_ssize_t j;
3939 Py_ssize_t len;
3940 int keepends = 0;
3941 PyObject *list;
3942 PyObject *str;
3943 char *data;
3945 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
3946 return NULL;
3948 data = PyString_AS_STRING(self);
3949 len = PyString_GET_SIZE(self);
3951 /* This does not use the preallocated list because splitlines is
3952 usually run with hundreds of newlines. The overhead of
3953 switching between PyList_SET_ITEM and append causes about a
3954 2-3% slowdown for that common case. A smarter implementation
3955 could move the if check out, so the SET_ITEMs are done first
3956 and the appends only done when the prealloc buffer is full.
3957 That's too much work for little gain.*/
3959 list = PyList_New(0);
3960 if (!list)
3961 goto onError;
3963 for (i = j = 0; i < len; ) {
3964 Py_ssize_t eol;
3966 /* Find a line and append it */
3967 while (i < len && data[i] != '\n' && data[i] != '\r')
3968 i++;
3970 /* Skip the line break reading CRLF as one line break */
3971 eol = i;
3972 if (i < len) {
3973 if (data[i] == '\r' && i + 1 < len &&
3974 data[i+1] == '\n')
3975 i += 2;
3976 else
3977 i++;
3978 if (keepends)
3979 eol = i;
3981 SPLIT_APPEND(data, j, eol);
3982 j = i;
3984 if (j < len) {
3985 SPLIT_APPEND(data, j, len);
3988 return list;
3990 onError:
3991 Py_XDECREF(list);
3992 return NULL;
3995 PyDoc_STRVAR(sizeof__doc__,
3996 "S.__sizeof__() -> size of S in memory, in bytes");
3998 static PyObject *
3999 string_sizeof(PyStringObject *v)
4001 Py_ssize_t res;
4002 res = PyStringObject_SIZE + PyString_GET_SIZE(v) * Py_TYPE(v)->tp_itemsize;
4003 return PyInt_FromSsize_t(res);
4006 #undef SPLIT_APPEND
4007 #undef SPLIT_ADD
4008 #undef MAX_PREALLOC
4009 #undef PREALLOC_SIZE
4011 static PyObject *
4012 string_getnewargs(PyStringObject *v)
4014 return Py_BuildValue("(s#)", v->ob_sval, Py_SIZE(v));
4018 #include "stringlib/string_format.h"
4020 PyDoc_STRVAR(format__doc__,
4021 "S.format(*args, **kwargs) -> unicode\n\
4025 static PyObject *
4026 string__format__(PyObject* self, PyObject* args)
4028 PyObject *format_spec;
4029 PyObject *result = NULL;
4030 PyObject *tmp = NULL;
4032 /* If 2.x, convert format_spec to the same type as value */
4033 /* This is to allow things like u''.format('') */
4034 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
4035 goto done;
4036 if (!(PyString_Check(format_spec) || PyUnicode_Check(format_spec))) {
4037 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
4038 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
4039 goto done;
4041 tmp = PyObject_Str(format_spec);
4042 if (tmp == NULL)
4043 goto done;
4044 format_spec = tmp;
4046 result = _PyBytes_FormatAdvanced(self,
4047 PyString_AS_STRING(format_spec),
4048 PyString_GET_SIZE(format_spec));
4049 done:
4050 Py_XDECREF(tmp);
4051 return result;
4054 PyDoc_STRVAR(p_format__doc__,
4055 "S.__format__(format_spec) -> unicode\n\
4060 static PyMethodDef
4061 string_methods[] = {
4062 /* Counterparts of the obsolete stropmodule functions; except
4063 string.maketrans(). */
4064 {"join", (PyCFunction)string_join, METH_O, join__doc__},
4065 {"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
4066 {"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
4067 {"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
4068 {"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
4069 {"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
4070 {"isupper", (PyCFunction)string_isupper, METH_NOARGS, isupper__doc__},
4071 {"isspace", (PyCFunction)string_isspace, METH_NOARGS, isspace__doc__},
4072 {"isdigit", (PyCFunction)string_isdigit, METH_NOARGS, isdigit__doc__},
4073 {"istitle", (PyCFunction)string_istitle, METH_NOARGS, istitle__doc__},
4074 {"isalpha", (PyCFunction)string_isalpha, METH_NOARGS, isalpha__doc__},
4075 {"isalnum", (PyCFunction)string_isalnum, METH_NOARGS, isalnum__doc__},
4076 {"capitalize", (PyCFunction)string_capitalize, METH_NOARGS,
4077 capitalize__doc__},
4078 {"count", (PyCFunction)string_count, METH_VARARGS, count__doc__},
4079 {"endswith", (PyCFunction)string_endswith, METH_VARARGS,
4080 endswith__doc__},
4081 {"partition", (PyCFunction)string_partition, METH_O, partition__doc__},
4082 {"find", (PyCFunction)string_find, METH_VARARGS, find__doc__},
4083 {"index", (PyCFunction)string_index, METH_VARARGS, index__doc__},
4084 {"lstrip", (PyCFunction)string_lstrip, METH_VARARGS, lstrip__doc__},
4085 {"replace", (PyCFunction)string_replace, METH_VARARGS, replace__doc__},
4086 {"rfind", (PyCFunction)string_rfind, METH_VARARGS, rfind__doc__},
4087 {"rindex", (PyCFunction)string_rindex, METH_VARARGS, rindex__doc__},
4088 {"rstrip", (PyCFunction)string_rstrip, METH_VARARGS, rstrip__doc__},
4089 {"rpartition", (PyCFunction)string_rpartition, METH_O,
4090 rpartition__doc__},
4091 {"startswith", (PyCFunction)string_startswith, METH_VARARGS,
4092 startswith__doc__},
4093 {"strip", (PyCFunction)string_strip, METH_VARARGS, strip__doc__},
4094 {"swapcase", (PyCFunction)string_swapcase, METH_NOARGS,
4095 swapcase__doc__},
4096 {"translate", (PyCFunction)string_translate, METH_VARARGS,
4097 translate__doc__},
4098 {"title", (PyCFunction)string_title, METH_NOARGS, title__doc__},
4099 {"ljust", (PyCFunction)string_ljust, METH_VARARGS, ljust__doc__},
4100 {"rjust", (PyCFunction)string_rjust, METH_VARARGS, rjust__doc__},
4101 {"center", (PyCFunction)string_center, METH_VARARGS, center__doc__},
4102 {"zfill", (PyCFunction)string_zfill, METH_VARARGS, zfill__doc__},
4103 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
4104 {"__format__", (PyCFunction) string__format__, METH_VARARGS, p_format__doc__},
4105 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
4106 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
4107 {"encode", (PyCFunction)string_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
4108 {"decode", (PyCFunction)string_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
4109 {"expandtabs", (PyCFunction)string_expandtabs, METH_VARARGS,
4110 expandtabs__doc__},
4111 {"splitlines", (PyCFunction)string_splitlines, METH_VARARGS,
4112 splitlines__doc__},
4113 {"__sizeof__", (PyCFunction)string_sizeof, METH_NOARGS,
4114 sizeof__doc__},
4115 {"__getnewargs__", (PyCFunction)string_getnewargs, METH_NOARGS},
4116 {NULL, NULL} /* sentinel */
4119 static PyObject *
4120 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
4122 static PyObject *
4123 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4125 PyObject *x = NULL;
4126 static char *kwlist[] = {"object", 0};
4128 if (type != &PyString_Type)
4129 return str_subtype_new(type, args, kwds);
4130 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str", kwlist, &x))
4131 return NULL;
4132 if (x == NULL)
4133 return PyString_FromString("");
4134 return PyObject_Str(x);
4137 static PyObject *
4138 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4140 PyObject *tmp, *pnew;
4141 Py_ssize_t n;
4143 assert(PyType_IsSubtype(type, &PyString_Type));
4144 tmp = string_new(&PyString_Type, args, kwds);
4145 if (tmp == NULL)
4146 return NULL;
4147 assert(PyString_CheckExact(tmp));
4148 n = PyString_GET_SIZE(tmp);
4149 pnew = type->tp_alloc(type, n);
4150 if (pnew != NULL) {
4151 Py_MEMCPY(PyString_AS_STRING(pnew), PyString_AS_STRING(tmp), n+1);
4152 ((PyStringObject *)pnew)->ob_shash =
4153 ((PyStringObject *)tmp)->ob_shash;
4154 ((PyStringObject *)pnew)->ob_sstate = SSTATE_NOT_INTERNED;
4156 Py_DECREF(tmp);
4157 return pnew;
4160 static PyObject *
4161 basestring_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4163 PyErr_SetString(PyExc_TypeError,
4164 "The basestring type cannot be instantiated");
4165 return NULL;
4168 static PyObject *
4169 string_mod(PyObject *v, PyObject *w)
4171 if (!PyString_Check(v)) {
4172 Py_INCREF(Py_NotImplemented);
4173 return Py_NotImplemented;
4175 return PyString_Format(v, w);
4178 PyDoc_STRVAR(basestring_doc,
4179 "Type basestring cannot be instantiated; it is the base for str and unicode.");
4181 static PyNumberMethods string_as_number = {
4182 0, /*nb_add*/
4183 0, /*nb_subtract*/
4184 0, /*nb_multiply*/
4185 0, /*nb_divide*/
4186 string_mod, /*nb_remainder*/
4190 PyTypeObject PyBaseString_Type = {
4191 PyVarObject_HEAD_INIT(&PyType_Type, 0)
4192 "basestring",
4195 0, /* tp_dealloc */
4196 0, /* tp_print */
4197 0, /* tp_getattr */
4198 0, /* tp_setattr */
4199 0, /* tp_compare */
4200 0, /* tp_repr */
4201 0, /* tp_as_number */
4202 0, /* tp_as_sequence */
4203 0, /* tp_as_mapping */
4204 0, /* tp_hash */
4205 0, /* tp_call */
4206 0, /* tp_str */
4207 0, /* tp_getattro */
4208 0, /* tp_setattro */
4209 0, /* tp_as_buffer */
4210 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
4211 basestring_doc, /* tp_doc */
4212 0, /* tp_traverse */
4213 0, /* tp_clear */
4214 0, /* tp_richcompare */
4215 0, /* tp_weaklistoffset */
4216 0, /* tp_iter */
4217 0, /* tp_iternext */
4218 0, /* tp_methods */
4219 0, /* tp_members */
4220 0, /* tp_getset */
4221 &PyBaseObject_Type, /* tp_base */
4222 0, /* tp_dict */
4223 0, /* tp_descr_get */
4224 0, /* tp_descr_set */
4225 0, /* tp_dictoffset */
4226 0, /* tp_init */
4227 0, /* tp_alloc */
4228 basestring_new, /* tp_new */
4229 0, /* tp_free */
4232 PyDoc_STRVAR(string_doc,
4233 "str(object) -> string\n\
4235 Return a nice string representation of the object.\n\
4236 If the argument is a string, the return value is the same object.");
4238 PyTypeObject PyString_Type = {
4239 PyVarObject_HEAD_INIT(&PyType_Type, 0)
4240 "str",
4241 PyStringObject_SIZE,
4242 sizeof(char),
4243 string_dealloc, /* tp_dealloc */
4244 (printfunc)string_print, /* tp_print */
4245 0, /* tp_getattr */
4246 0, /* tp_setattr */
4247 0, /* tp_compare */
4248 string_repr, /* tp_repr */
4249 &string_as_number, /* tp_as_number */
4250 &string_as_sequence, /* tp_as_sequence */
4251 &string_as_mapping, /* tp_as_mapping */
4252 (hashfunc)string_hash, /* tp_hash */
4253 0, /* tp_call */
4254 string_str, /* tp_str */
4255 PyObject_GenericGetAttr, /* tp_getattro */
4256 0, /* tp_setattro */
4257 &string_as_buffer, /* tp_as_buffer */
4258 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
4259 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_STRING_SUBCLASS |
4260 Py_TPFLAGS_HAVE_NEWBUFFER, /* tp_flags */
4261 string_doc, /* tp_doc */
4262 0, /* tp_traverse */
4263 0, /* tp_clear */
4264 (richcmpfunc)string_richcompare, /* tp_richcompare */
4265 0, /* tp_weaklistoffset */
4266 0, /* tp_iter */
4267 0, /* tp_iternext */
4268 string_methods, /* tp_methods */
4269 0, /* tp_members */
4270 0, /* tp_getset */
4271 &PyBaseString_Type, /* tp_base */
4272 0, /* tp_dict */
4273 0, /* tp_descr_get */
4274 0, /* tp_descr_set */
4275 0, /* tp_dictoffset */
4276 0, /* tp_init */
4277 0, /* tp_alloc */
4278 string_new, /* tp_new */
4279 PyObject_Del, /* tp_free */
4282 void
4283 PyString_Concat(register PyObject **pv, register PyObject *w)
4285 register PyObject *v;
4286 if (*pv == NULL)
4287 return;
4288 if (w == NULL || !PyString_Check(*pv)) {
4289 Py_DECREF(*pv);
4290 *pv = NULL;
4291 return;
4293 v = string_concat((PyStringObject *) *pv, w);
4294 Py_DECREF(*pv);
4295 *pv = v;
4298 void
4299 PyString_ConcatAndDel(register PyObject **pv, register PyObject *w)
4301 PyString_Concat(pv, w);
4302 Py_XDECREF(w);
4306 /* The following function breaks the notion that strings are immutable:
4307 it changes the size of a string. We get away with this only if there
4308 is only one module referencing the object. You can also think of it
4309 as creating a new string object and destroying the old one, only
4310 more efficiently. In any case, don't use this if the string may
4311 already be known to some other part of the code...
4312 Note that if there's not enough memory to resize the string, the original
4313 string object at *pv is deallocated, *pv is set to NULL, an "out of
4314 memory" exception is set, and -1 is returned. Else (on success) 0 is
4315 returned, and the value in *pv may or may not be the same as on input.
4316 As always, an extra byte is allocated for a trailing \0 byte (newsize
4317 does *not* include that), and a trailing \0 byte is stored.
4321 _PyString_Resize(PyObject **pv, Py_ssize_t newsize)
4323 register PyObject *v;
4324 register PyStringObject *sv;
4325 v = *pv;
4326 if (!PyString_Check(v) || Py_REFCNT(v) != 1 || newsize < 0 ||
4327 PyString_CHECK_INTERNED(v)) {
4328 *pv = 0;
4329 Py_DECREF(v);
4330 PyErr_BadInternalCall();
4331 return -1;
4333 /* XXX UNREF/NEWREF interface should be more symmetrical */
4334 _Py_DEC_REFTOTAL;
4335 _Py_ForgetReference(v);
4336 *pv = (PyObject *)
4337 PyObject_REALLOC((char *)v, PyStringObject_SIZE + newsize);
4338 if (*pv == NULL) {
4339 PyObject_Del(v);
4340 PyErr_NoMemory();
4341 return -1;
4343 _Py_NewReference(*pv);
4344 sv = (PyStringObject *) *pv;
4345 Py_SIZE(sv) = newsize;
4346 sv->ob_sval[newsize] = '\0';
4347 sv->ob_shash = -1; /* invalidate cached hash value */
4348 return 0;
4351 /* Helpers for formatstring */
4353 Py_LOCAL_INLINE(PyObject *)
4354 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
4356 Py_ssize_t argidx = *p_argidx;
4357 if (argidx < arglen) {
4358 (*p_argidx)++;
4359 if (arglen < 0)
4360 return args;
4361 else
4362 return PyTuple_GetItem(args, argidx);
4364 PyErr_SetString(PyExc_TypeError,
4365 "not enough arguments for format string");
4366 return NULL;
/* Format flags: one bit for each flag character that may follow '%'.
 *
 *   F_LJUST  '-'
 *   F_SIGN   '+'
 *   F_BLANK  ' '
 *   F_ALT    '#'
 *   F_ZERO   '0'
 */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4382 /* Returns a new reference to a PyString object, or NULL on failure. */
4384 static PyObject *
4385 formatfloat(PyObject *v, int flags, int prec, int type)
4387 char *p;
4388 PyObject *result;
4389 double x;
4391 x = PyFloat_AsDouble(v);
4392 if (x == -1.0 && PyErr_Occurred()) {
4393 PyErr_Format(PyExc_TypeError, "float argument required, "
4394 "not %.200s", Py_TYPE(v)->tp_name);
4395 return NULL;
4398 if (prec < 0)
4399 prec = 6;
4401 p = PyOS_double_to_string(x, type, prec,
4402 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
4404 if (p == NULL)
4405 return NULL;
4406 result = PyString_FromStringAndSize(p, strlen(p));
4407 PyMem_Free(p);
4408 return result;
4411 /* _PyString_FormatLong emulates the format codes d, u, o, x and X, and
4412 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
4413 * Python's regular ints.
4414 * Return value: a new PyString*, or NULL if error.
4415 * . *pbuf is set to point into it,
4416 * *plen set to the # of chars following that.
4417 * Caller must decref it when done using pbuf.
4418 * The string starting at *pbuf is of the form
4419 * "-"? ("0x" | "0X")? digit+
4420 * "0x"/"0X" are present only for x and X conversions, with F_ALT
4421 * set in flags. The case of hex digits will be correct,
4422 * There will be at least prec digits, zero-filled on the left if
4423 * necessary to get that many.
4424 * val object to be converted
4425 * flags bitmask of format flags; only F_ALT is looked at
4426 * prec minimum number of digits; 0-fill on left if needed
4427 * type a character in [duoxX]; u acts the same as d
4429 * CAUTION: o, x and X conversions on regular ints can never
4430 * produce a '-' sign, but can for Python's unbounded ints.
4432 PyObject*
4433 _PyString_FormatLong(PyObject *val, int flags, int prec, int type,
4434 char **pbuf, int *plen)
4436 PyObject *result = NULL;
4437 char *buf;
4438 Py_ssize_t i;
4439 int sign; /* 1 if '-', else 0 */
4440 int len; /* number of characters */
4441 Py_ssize_t llen;
4442 int numdigits; /* len == numnondigits + numdigits */
4443 int numnondigits = 0;
4445 switch (type) {
4446 case 'd':
4447 case 'u':
4448 result = Py_TYPE(val)->tp_str(val);
4449 break;
4450 case 'o':
4451 result = Py_TYPE(val)->tp_as_number->nb_oct(val);
4452 break;
4453 case 'x':
4454 case 'X':
4455 numnondigits = 2;
4456 result = Py_TYPE(val)->tp_as_number->nb_hex(val);
4457 break;
4458 default:
4459 assert(!"'type' not in [duoxX]");
4461 if (!result)
4462 return NULL;
4464 buf = PyString_AsString(result);
4465 if (!buf) {
4466 Py_DECREF(result);
4467 return NULL;
4470 /* To modify the string in-place, there can only be one reference. */
4471 if (Py_REFCNT(result) != 1) {
4472 PyErr_BadInternalCall();
4473 return NULL;
4475 llen = PyString_Size(result);
4476 if (llen > INT_MAX) {
4477 PyErr_SetString(PyExc_ValueError, "string too large in _PyString_FormatLong");
4478 return NULL;
4480 len = (int)llen;
4481 if (buf[len-1] == 'L') {
4482 --len;
4483 buf[len] = '\0';
4485 sign = buf[0] == '-';
4486 numnondigits += sign;
4487 numdigits = len - numnondigits;
4488 assert(numdigits > 0);
4490 /* Get rid of base marker unless F_ALT */
4491 if ((flags & F_ALT) == 0) {
4492 /* Need to skip 0x, 0X or 0. */
4493 int skipped = 0;
4494 switch (type) {
4495 case 'o':
4496 assert(buf[sign] == '0');
4497 /* If 0 is only digit, leave it alone. */
4498 if (numdigits > 1) {
4499 skipped = 1;
4500 --numdigits;
4502 break;
4503 case 'x':
4504 case 'X':
4505 assert(buf[sign] == '0');
4506 assert(buf[sign + 1] == 'x');
4507 skipped = 2;
4508 numnondigits -= 2;
4509 break;
4511 if (skipped) {
4512 buf += skipped;
4513 len -= skipped;
4514 if (sign)
4515 buf[0] = '-';
4517 assert(len == numnondigits + numdigits);
4518 assert(numdigits > 0);
4521 /* Fill with leading zeroes to meet minimum width. */
4522 if (prec > numdigits) {
4523 PyObject *r1 = PyString_FromStringAndSize(NULL,
4524 numnondigits + prec);
4525 char *b1;
4526 if (!r1) {
4527 Py_DECREF(result);
4528 return NULL;
4530 b1 = PyString_AS_STRING(r1);
4531 for (i = 0; i < numnondigits; ++i)
4532 *b1++ = *buf++;
4533 for (i = 0; i < prec - numdigits; i++)
4534 *b1++ = '0';
4535 for (i = 0; i < numdigits; i++)
4536 *b1++ = *buf++;
4537 *b1 = '\0';
4538 Py_DECREF(result);
4539 result = r1;
4540 buf = PyString_AS_STRING(result);
4541 len = numnondigits + prec;
4544 /* Fix up case for hex conversions. */
4545 if (type == 'X') {
4546 /* Need to convert all lower case letters to upper case.
4547 and need to convert 0x to 0X (and -0x to -0X). */
4548 for (i = 0; i < len; i++)
4549 if (buf[i] >= 'a' && buf[i] <= 'x')
4550 buf[i] -= 'a'-'A';
4552 *pbuf = buf;
4553 *plen = len;
4554 return result;
4557 Py_LOCAL_INLINE(int)
4558 formatint(char *buf, size_t buflen, int flags,
4559 int prec, int type, PyObject *v)
4561 /* fmt = '%#.' + `prec` + 'l' + `type`
4562 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4563 + 1 + 1 = 24 */
4564 char fmt[64]; /* plenty big enough! */
4565 char *sign;
4566 long x;
4568 x = PyInt_AsLong(v);
4569 if (x == -1 && PyErr_Occurred()) {
4570 PyErr_Format(PyExc_TypeError, "int argument required, not %.200s",
4571 Py_TYPE(v)->tp_name);
4572 return -1;
4574 if (x < 0 && type == 'u') {
4575 type = 'd';
4577 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
4578 sign = "-";
4579 else
4580 sign = "";
4581 if (prec < 0)
4582 prec = 1;
4584 if ((flags & F_ALT) &&
4585 (type == 'x' || type == 'X')) {
4586 /* When converting under %#x or %#X, there are a number
4587 * of issues that cause pain:
4588 * - when 0 is being converted, the C standard leaves off
4589 * the '0x' or '0X', which is inconsistent with other
4590 * %#x/%#X conversions and inconsistent with Python's
4591 * hex() function
4592 * - there are platforms that violate the standard and
4593 * convert 0 with the '0x' or '0X'
4594 * (Metrowerks, Compaq Tru64)
4595 * - there are platforms that give '0x' when converting
4596 * under %#X, but convert 0 in accordance with the
4597 * standard (OS/2 EMX)
4599 * We can achieve the desired consistency by inserting our
4600 * own '0x' or '0X' prefix, and substituting %x/%X in place
4601 * of %#x/%#X.
4603 * Note that this is the same approach as used in
4604 * formatint() in unicodeobject.c
4606 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
4607 sign, type, prec, type);
4609 else {
4610 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
4611 sign, (flags&F_ALT) ? "#" : "",
4612 prec, type);
4615 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
4616 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
4618 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
4619 PyErr_SetString(PyExc_OverflowError,
4620 "formatted integer is too long (precision too large?)");
4621 return -1;
4623 if (sign[0])
4624 PyOS_snprintf(buf, buflen, fmt, -x);
4625 else
4626 PyOS_snprintf(buf, buflen, fmt, x);
4627 return (int)strlen(buf);
4630 Py_LOCAL_INLINE(int)
4631 formatchar(char *buf, size_t buflen, PyObject *v)
4633 /* presume that the buffer is at least 2 characters long */
4634 if (PyString_Check(v)) {
4635 if (!PyArg_Parse(v, "c;%c requires int or char", &buf[0]))
4636 return -1;
4638 else {
4639 if (!PyArg_Parse(v, "b;%c requires int or char", &buf[0]))
4640 return -1;
4642 buf[1] = '\0';
4643 return 1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
4656 PyObject *
4657 PyString_Format(PyObject *format, PyObject *args)
4659 char *fmt, *res;
4660 Py_ssize_t arglen, argidx;
4661 Py_ssize_t reslen, rescnt, fmtcnt;
4662 int args_owned = 0;
4663 PyObject *result, *orig_args;
4664 #ifdef Py_USING_UNICODE
4665 PyObject *v, *w;
4666 #endif
4667 PyObject *dict = NULL;
4668 if (format == NULL || !PyString_Check(format) || args == NULL) {
4669 PyErr_BadInternalCall();
4670 return NULL;
4672 orig_args = args;
4673 fmt = PyString_AS_STRING(format);
4674 fmtcnt = PyString_GET_SIZE(format);
4675 reslen = rescnt = fmtcnt + 100;
4676 result = PyString_FromStringAndSize((char *)NULL, reslen);
4677 if (result == NULL)
4678 return NULL;
4679 res = PyString_AsString(result);
4680 if (PyTuple_Check(args)) {
4681 arglen = PyTuple_GET_SIZE(args);
4682 argidx = 0;
4684 else {
4685 arglen = -1;
4686 argidx = -2;
4688 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
4689 !PyObject_TypeCheck(args, &PyBaseString_Type))
4690 dict = args;
4691 while (--fmtcnt >= 0) {
4692 if (*fmt != '%') {
4693 if (--rescnt < 0) {
4694 rescnt = fmtcnt + 100;
4695 reslen += rescnt;
4696 if (_PyString_Resize(&result, reslen) < 0)
4697 return NULL;
4698 res = PyString_AS_STRING(result)
4699 + reslen - rescnt;
4700 --rescnt;
4702 *res++ = *fmt++;
4704 else {
4705 /* Got a format specifier */
4706 int flags = 0;
4707 Py_ssize_t width = -1;
4708 int prec = -1;
4709 int c = '\0';
4710 int fill;
4711 int isnumok;
4712 PyObject *v = NULL;
4713 PyObject *temp = NULL;
4714 char *pbuf;
4715 int sign;
4716 Py_ssize_t len;
4717 char formatbuf[FORMATBUFLEN];
4718 /* For format{int,char}() */
4719 #ifdef Py_USING_UNICODE
4720 char *fmt_start = fmt;
4721 Py_ssize_t argidx_start = argidx;
4722 #endif
4724 fmt++;
4725 if (*fmt == '(') {
4726 char *keystart;
4727 Py_ssize_t keylen;
4728 PyObject *key;
4729 int pcount = 1;
4731 if (dict == NULL) {
4732 PyErr_SetString(PyExc_TypeError,
4733 "format requires a mapping");
4734 goto error;
4736 ++fmt;
4737 --fmtcnt;
4738 keystart = fmt;
4739 /* Skip over balanced parentheses */
4740 while (pcount > 0 && --fmtcnt >= 0) {
4741 if (*fmt == ')')
4742 --pcount;
4743 else if (*fmt == '(')
4744 ++pcount;
4745 fmt++;
4747 keylen = fmt - keystart - 1;
4748 if (fmtcnt < 0 || pcount > 0) {
4749 PyErr_SetString(PyExc_ValueError,
4750 "incomplete format key");
4751 goto error;
4753 key = PyString_FromStringAndSize(keystart,
4754 keylen);
4755 if (key == NULL)
4756 goto error;
4757 if (args_owned) {
4758 Py_DECREF(args);
4759 args_owned = 0;
4761 args = PyObject_GetItem(dict, key);
4762 Py_DECREF(key);
4763 if (args == NULL) {
4764 goto error;
4766 args_owned = 1;
4767 arglen = -1;
4768 argidx = -2;
4770 while (--fmtcnt >= 0) {
4771 switch (c = *fmt++) {
4772 case '-': flags |= F_LJUST; continue;
4773 case '+': flags |= F_SIGN; continue;
4774 case ' ': flags |= F_BLANK; continue;
4775 case '#': flags |= F_ALT; continue;
4776 case '0': flags |= F_ZERO; continue;
4778 break;
4780 if (c == '*') {
4781 v = getnextarg(args, arglen, &argidx);
4782 if (v == NULL)
4783 goto error;
4784 if (!PyInt_Check(v)) {
4785 PyErr_SetString(PyExc_TypeError,
4786 "* wants int");
4787 goto error;
4789 width = PyInt_AsLong(v);
4790 if (width < 0) {
4791 flags |= F_LJUST;
4792 width = -width;
4794 if (--fmtcnt >= 0)
4795 c = *fmt++;
4797 else if (c >= 0 && isdigit(c)) {
4798 width = c - '0';
4799 while (--fmtcnt >= 0) {
4800 c = Py_CHARMASK(*fmt++);
4801 if (!isdigit(c))
4802 break;
4803 if ((width*10) / 10 != width) {
4804 PyErr_SetString(
4805 PyExc_ValueError,
4806 "width too big");
4807 goto error;
4809 width = width*10 + (c - '0');
4812 if (c == '.') {
4813 prec = 0;
4814 if (--fmtcnt >= 0)
4815 c = *fmt++;
4816 if (c == '*') {
4817 v = getnextarg(args, arglen, &argidx);
4818 if (v == NULL)
4819 goto error;
4820 if (!PyInt_Check(v)) {
4821 PyErr_SetString(
4822 PyExc_TypeError,
4823 "* wants int");
4824 goto error;
4826 prec = PyInt_AsLong(v);
4827 if (prec < 0)
4828 prec = 0;
4829 if (--fmtcnt >= 0)
4830 c = *fmt++;
4832 else if (c >= 0 && isdigit(c)) {
4833 prec = c - '0';
4834 while (--fmtcnt >= 0) {
4835 c = Py_CHARMASK(*fmt++);
4836 if (!isdigit(c))
4837 break;
4838 if ((prec*10) / 10 != prec) {
4839 PyErr_SetString(
4840 PyExc_ValueError,
4841 "prec too big");
4842 goto error;
4844 prec = prec*10 + (c - '0');
4847 } /* prec */
4848 if (fmtcnt >= 0) {
4849 if (c == 'h' || c == 'l' || c == 'L') {
4850 if (--fmtcnt >= 0)
4851 c = *fmt++;
4854 if (fmtcnt < 0) {
4855 PyErr_SetString(PyExc_ValueError,
4856 "incomplete format");
4857 goto error;
4859 if (c != '%') {
4860 v = getnextarg(args, arglen, &argidx);
4861 if (v == NULL)
4862 goto error;
4864 sign = 0;
4865 fill = ' ';
4866 switch (c) {
4867 case '%':
4868 pbuf = "%";
4869 len = 1;
4870 break;
4871 case 's':
4872 #ifdef Py_USING_UNICODE
4873 if (PyUnicode_Check(v)) {
4874 fmt = fmt_start;
4875 argidx = argidx_start;
4876 goto unicode;
4878 #endif
4879 temp = _PyObject_Str(v);
4880 #ifdef Py_USING_UNICODE
4881 if (temp != NULL && PyUnicode_Check(temp)) {
4882 Py_DECREF(temp);
4883 fmt = fmt_start;
4884 argidx = argidx_start;
4885 goto unicode;
4887 #endif
4888 /* Fall through */
4889 case 'r':
4890 if (c == 'r')
4891 temp = PyObject_Repr(v);
4892 if (temp == NULL)
4893 goto error;
4894 if (!PyString_Check(temp)) {
4895 PyErr_SetString(PyExc_TypeError,
4896 "%s argument has non-string str()");
4897 Py_DECREF(temp);
4898 goto error;
4900 pbuf = PyString_AS_STRING(temp);
4901 len = PyString_GET_SIZE(temp);
4902 if (prec >= 0 && len > prec)
4903 len = prec;
4904 break;
4905 case 'i':
4906 case 'd':
4907 case 'u':
4908 case 'o':
4909 case 'x':
4910 case 'X':
4911 if (c == 'i')
4912 c = 'd';
4913 isnumok = 0;
4914 if (PyNumber_Check(v)) {
4915 PyObject *iobj=NULL;
4917 if (PyInt_Check(v) || (PyLong_Check(v))) {
4918 iobj = v;
4919 Py_INCREF(iobj);
4921 else {
4922 iobj = PyNumber_Int(v);
4923 if (iobj==NULL) iobj = PyNumber_Long(v);
4925 if (iobj!=NULL) {
4926 if (PyInt_Check(iobj)) {
4927 isnumok = 1;
4928 pbuf = formatbuf;
4929 len = formatint(pbuf,
4930 sizeof(formatbuf),
4931 flags, prec, c, iobj);
4932 Py_DECREF(iobj);
4933 if (len < 0)
4934 goto error;
4935 sign = 1;
4937 else if (PyLong_Check(iobj)) {
4938 int ilen;
4940 isnumok = 1;
4941 temp = _PyString_FormatLong(iobj, flags,
4942 prec, c, &pbuf, &ilen);
4943 Py_DECREF(iobj);
4944 len = ilen;
4945 if (!temp)
4946 goto error;
4947 sign = 1;
4949 else {
4950 Py_DECREF(iobj);
4954 if (!isnumok) {
4955 PyErr_Format(PyExc_TypeError,
4956 "%%%c format: a number is required, "
4957 "not %.200s", c, Py_TYPE(v)->tp_name);
4958 goto error;
4960 if (flags & F_ZERO)
4961 fill = '0';
4962 break;
4963 case 'e':
4964 case 'E':
4965 case 'f':
4966 case 'F':
4967 case 'g':
4968 case 'G':
4969 temp = formatfloat(v, flags, prec, c);
4970 if (temp == NULL)
4971 goto error;
4972 pbuf = PyString_AS_STRING(temp);
4973 len = PyString_GET_SIZE(temp);
4974 sign = 1;
4975 if (flags & F_ZERO)
4976 fill = '0';
4977 break;
4978 case 'c':
4979 #ifdef Py_USING_UNICODE
4980 if (PyUnicode_Check(v)) {
4981 fmt = fmt_start;
4982 argidx = argidx_start;
4983 goto unicode;
4985 #endif
4986 pbuf = formatbuf;
4987 len = formatchar(pbuf, sizeof(formatbuf), v);
4988 if (len < 0)
4989 goto error;
4990 break;
4991 default:
4992 PyErr_Format(PyExc_ValueError,
4993 "unsupported format character '%c' (0x%x) "
4994 "at index %zd",
4995 c, c,
4996 (Py_ssize_t)(fmt - 1 -
4997 PyString_AsString(format)));
4998 goto error;
5000 if (sign) {
5001 if (*pbuf == '-' || *pbuf == '+') {
5002 sign = *pbuf++;
5003 len--;
5005 else if (flags & F_SIGN)
5006 sign = '+';
5007 else if (flags & F_BLANK)
5008 sign = ' ';
5009 else
5010 sign = 0;
5012 if (width < len)
5013 width = len;
5014 if (rescnt - (sign != 0) < width) {
5015 reslen -= rescnt;
5016 rescnt = width + fmtcnt + 100;
5017 reslen += rescnt;
5018 if (reslen < 0) {
5019 Py_DECREF(result);
5020 Py_XDECREF(temp);
5021 return PyErr_NoMemory();
5023 if (_PyString_Resize(&result, reslen) < 0) {
5024 Py_XDECREF(temp);
5025 return NULL;
5027 res = PyString_AS_STRING(result)
5028 + reslen - rescnt;
5030 if (sign) {
5031 if (fill != ' ')
5032 *res++ = sign;
5033 rescnt--;
5034 if (width > len)
5035 width--;
5037 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5038 assert(pbuf[0] == '0');
5039 assert(pbuf[1] == c);
5040 if (fill != ' ') {
5041 *res++ = *pbuf++;
5042 *res++ = *pbuf++;
5044 rescnt -= 2;
5045 width -= 2;
5046 if (width < 0)
5047 width = 0;
5048 len -= 2;
5050 if (width > len && !(flags & F_LJUST)) {
5051 do {
5052 --rescnt;
5053 *res++ = fill;
5054 } while (--width > len);
5056 if (fill == ' ') {
5057 if (sign)
5058 *res++ = sign;
5059 if ((flags & F_ALT) &&
5060 (c == 'x' || c == 'X')) {
5061 assert(pbuf[0] == '0');
5062 assert(pbuf[1] == c);
5063 *res++ = *pbuf++;
5064 *res++ = *pbuf++;
5067 Py_MEMCPY(res, pbuf, len);
5068 res += len;
5069 rescnt -= len;
5070 while (--width >= len) {
5071 --rescnt;
5072 *res++ = ' ';
5074 if (dict && (argidx < arglen) && c != '%') {
5075 PyErr_SetString(PyExc_TypeError,
5076 "not all arguments converted during string formatting");
5077 Py_XDECREF(temp);
5078 goto error;
5080 Py_XDECREF(temp);
5081 } /* '%' */
5082 } /* until end */
5083 if (argidx < arglen && !dict) {
5084 PyErr_SetString(PyExc_TypeError,
5085 "not all arguments converted during string formatting");
5086 goto error;
5088 if (args_owned) {
5089 Py_DECREF(args);
5091 _PyString_Resize(&result, reslen - rescnt);
5092 return result;
5094 #ifdef Py_USING_UNICODE
5095 unicode:
5096 if (args_owned) {
5097 Py_DECREF(args);
5098 args_owned = 0;
5100 /* Fiddle args right (remove the first argidx arguments) */
5101 if (PyTuple_Check(orig_args) && argidx > 0) {
5102 PyObject *v;
5103 Py_ssize_t n = PyTuple_GET_SIZE(orig_args) - argidx;
5104 v = PyTuple_New(n);
5105 if (v == NULL)
5106 goto error;
5107 while (--n >= 0) {
5108 PyObject *w = PyTuple_GET_ITEM(orig_args, n + argidx);
5109 Py_INCREF(w);
5110 PyTuple_SET_ITEM(v, n, w);
5112 args = v;
5113 } else {
5114 Py_INCREF(orig_args);
5115 args = orig_args;
5117 args_owned = 1;
5118 /* Take what we have of the result and let the Unicode formatting
5119 function format the rest of the input. */
5120 rescnt = res - PyString_AS_STRING(result);
5121 if (_PyString_Resize(&result, rescnt))
5122 goto error;
5123 fmtcnt = PyString_GET_SIZE(format) - \
5124 (fmt - PyString_AS_STRING(format));
5125 format = PyUnicode_Decode(fmt, fmtcnt, NULL, NULL);
5126 if (format == NULL)
5127 goto error;
5128 v = PyUnicode_Format(format, args);
5129 Py_DECREF(format);
5130 if (v == NULL)
5131 goto error;
5132 /* Paste what we have (result) to what the Unicode formatting
5133 function returned (v) and return the result (or error) */
5134 w = PyUnicode_Concat(result, v);
5135 Py_DECREF(result);
5136 Py_DECREF(v);
5137 Py_DECREF(args);
5138 return w;
5139 #endif /* Py_USING_UNICODE */
5141 error:
5142 Py_DECREF(result);
5143 if (args_owned) {
5144 Py_DECREF(args);
5146 return NULL;
5149 void
5150 PyString_InternInPlace(PyObject **p)
5152 register PyStringObject *s = (PyStringObject *)(*p);
5153 PyObject *t;
5154 if (s == NULL || !PyString_Check(s))
5155 Py_FatalError("PyString_InternInPlace: strings only please!");
5156 /* If it's a string subclass, we don't really know what putting
5157 it in the interned dict might do. */
5158 if (!PyString_CheckExact(s))
5159 return;
5160 if (PyString_CHECK_INTERNED(s))
5161 return;
5162 if (interned == NULL) {
5163 interned = PyDict_New();
5164 if (interned == NULL) {
5165 PyErr_Clear(); /* Don't leave an exception */
5166 return;
5169 t = PyDict_GetItem(interned, (PyObject *)s);
5170 if (t) {
5171 Py_INCREF(t);
5172 Py_DECREF(*p);
5173 *p = t;
5174 return;
5177 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
5178 PyErr_Clear();
5179 return;
5181 /* The two references in interned are not counted by refcnt.
5182 The string deallocator will take care of this */
5183 Py_REFCNT(s) -= 2;
5184 PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
5187 void
5188 PyString_InternImmortal(PyObject **p)
5190 PyString_InternInPlace(p);
5191 if (PyString_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
5192 PyString_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
5193 Py_INCREF(*p);
5198 PyObject *
5199 PyString_InternFromString(const char *cp)
5201 PyObject *s = PyString_FromString(cp);
5202 if (s == NULL)
5203 return NULL;
5204 PyString_InternInPlace(&s);
5205 return s;
5208 void
5209 PyString_Fini(void)
5211 int i;
5212 for (i = 0; i < UCHAR_MAX + 1; i++) {
5213 Py_XDECREF(characters[i]);
5214 characters[i] = NULL;
5216 Py_XDECREF(nullstring);
5217 nullstring = NULL;
5220 void _Py_ReleaseInternedStrings(void)
5222 PyObject *keys;
5223 PyStringObject *s;
5224 Py_ssize_t i, n;
5225 Py_ssize_t immortal_size = 0, mortal_size = 0;
5227 if (interned == NULL || !PyDict_Check(interned))
5228 return;
5229 keys = PyDict_Keys(interned);
5230 if (keys == NULL || !PyList_Check(keys)) {
5231 PyErr_Clear();
5232 return;
5235 /* Since _Py_ReleaseInternedStrings() is intended to help a leak
5236 detector, interned strings are not forcibly deallocated; rather, we
5237 give them their stolen references back, and then clear and DECREF
5238 the interned dict. */
5240 n = PyList_GET_SIZE(keys);
5241 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
5243 for (i = 0; i < n; i++) {
5244 s = (PyStringObject *) PyList_GET_ITEM(keys, i);
5245 switch (s->ob_sstate) {
5246 case SSTATE_NOT_INTERNED:
5247 /* XXX Shouldn't happen */
5248 break;
5249 case SSTATE_INTERNED_IMMORTAL:
5250 Py_REFCNT(s) += 1;
5251 immortal_size += Py_SIZE(s);
5252 break;
5253 case SSTATE_INTERNED_MORTAL:
5254 Py_REFCNT(s) += 2;
5255 mortal_size += Py_SIZE(s);
5256 break;
5257 default:
5258 Py_FatalError("Inconsistent interned string state.");
5260 s->ob_sstate = SSTATE_NOT_INTERNED;
5262 fprintf(stderr, "total size of all interned strings: "
5263 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
5264 "mortal/immortal\n", mortal_size, immortal_size);
5265 Py_DECREF(keys);
5266 PyDict_Clear(interned);
5267 Py_DECREF(interned);
5268 interned = NULL;