Updated with fix for #3126.
[python.git] / Objects / stringobject.c
blob793cc8879360aa5594d9c2a9c094b2d6e29c50ac
1 /* String (str/bytes) object implementation */
3 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include <ctype.h>
8 #ifdef COUNT_ALLOCS
9 int null_strings, one_strings;
10 #endif
12 static PyStringObject *characters[UCHAR_MAX + 1];
13 static PyStringObject *nullstring;
15 /* This dictionary holds all interned strings. Note that references to
16 strings in this dictionary are *not* counted in the string's ob_refcnt.
17 When the interned string reaches a refcnt of 0 the string deallocation
18 function will delete the reference from this dictionary.
20 Another way to look at this is to say that the actual reference
21 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
23 static PyObject *interned;
26 For both PyString_FromString() and PyString_FromStringAndSize(), the
27 parameter `size' denotes number of characters to allocate, not counting any
28 null terminating character.
30 For PyString_FromString(), the parameter `str' points to a null-terminated
31 string containing exactly `size' bytes.
33 For PyString_FromStringAndSize(), the parameter `str' is
34 either NULL or else points to a string containing at least `size' bytes.
35 For PyString_FromStringAndSize(), the string in the `str' parameter does
36 not have to be null-terminated. (Therefore it is safe to construct a
37 substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
38 If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
39 bytes (setting the last byte to the null terminating character) and you can
40 fill in the data yourself. If `str' is non-NULL then the resulting
41 PyString object must be treated as immutable and you must not fill in nor
42 alter the data yourself, since the strings may be shared.
44 The PyObject member `op->ob_size', which denotes the number of "extra
45 items" in a variable-size object, will contain the number of bytes
46 allocated for string data, not counting the null terminating character. It
47 is therefore equal to the `size' parameter (for
48 PyString_FromStringAndSize()) or the length of the string in the `str'
49 parameter (for PyString_FromString()).
51 PyObject *
52 PyString_FromStringAndSize(const char *str, Py_ssize_t size)
54 register PyStringObject *op;
55 if (size < 0) {
56 PyErr_SetString(PyExc_SystemError,
57 "Negative size passed to PyString_FromStringAndSize");
58 return NULL;
60 if (size == 0 && (op = nullstring) != NULL) {
61 #ifdef COUNT_ALLOCS
62 null_strings++;
63 #endif
64 Py_INCREF(op);
65 return (PyObject *)op;
67 if (size == 1 && str != NULL &&
68 (op = characters[*str & UCHAR_MAX]) != NULL)
70 #ifdef COUNT_ALLOCS
71 one_strings++;
72 #endif
73 Py_INCREF(op);
74 return (PyObject *)op;
77 /* Inline PyObject_NewVar */
78 op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
79 if (op == NULL)
80 return PyErr_NoMemory();
81 PyObject_INIT_VAR(op, &PyString_Type, size);
82 op->ob_shash = -1;
83 op->ob_sstate = SSTATE_NOT_INTERNED;
84 if (str != NULL)
85 Py_MEMCPY(op->ob_sval, str, size);
86 op->ob_sval[size] = '\0';
87 /* share short strings */
88 if (size == 0) {
89 PyObject *t = (PyObject *)op;
90 PyString_InternInPlace(&t);
91 op = (PyStringObject *)t;
92 nullstring = op;
93 Py_INCREF(op);
94 } else if (size == 1 && str != NULL) {
95 PyObject *t = (PyObject *)op;
96 PyString_InternInPlace(&t);
97 op = (PyStringObject *)t;
98 characters[*str & UCHAR_MAX] = op;
99 Py_INCREF(op);
101 return (PyObject *) op;
104 PyObject *
105 PyString_FromString(const char *str)
107 register size_t size;
108 register PyStringObject *op;
110 assert(str != NULL);
111 size = strlen(str);
112 if (size > PY_SSIZE_T_MAX) {
113 PyErr_SetString(PyExc_OverflowError,
114 "string is too long for a Python string");
115 return NULL;
117 if (size == 0 && (op = nullstring) != NULL) {
118 #ifdef COUNT_ALLOCS
119 null_strings++;
120 #endif
121 Py_INCREF(op);
122 return (PyObject *)op;
124 if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
125 #ifdef COUNT_ALLOCS
126 one_strings++;
127 #endif
128 Py_INCREF(op);
129 return (PyObject *)op;
132 /* Inline PyObject_NewVar */
133 op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
134 if (op == NULL)
135 return PyErr_NoMemory();
136 PyObject_INIT_VAR(op, &PyString_Type, size);
137 op->ob_shash = -1;
138 op->ob_sstate = SSTATE_NOT_INTERNED;
139 Py_MEMCPY(op->ob_sval, str, size+1);
140 /* share short strings */
141 if (size == 0) {
142 PyObject *t = (PyObject *)op;
143 PyString_InternInPlace(&t);
144 op = (PyStringObject *)t;
145 nullstring = op;
146 Py_INCREF(op);
147 } else if (size == 1) {
148 PyObject *t = (PyObject *)op;
149 PyString_InternInPlace(&t);
150 op = (PyStringObject *)t;
151 characters[*str & UCHAR_MAX] = op;
152 Py_INCREF(op);
154 return (PyObject *) op;
157 PyObject *
158 PyString_FromFormatV(const char *format, va_list vargs)
160 va_list count;
161 Py_ssize_t n = 0;
162 const char* f;
163 char *s;
164 PyObject* string;
166 #ifdef VA_LIST_IS_ARRAY
167 Py_MEMCPY(count, vargs, sizeof(va_list));
168 #else
169 #ifdef __va_copy
170 __va_copy(count, vargs);
171 #else
172 count = vargs;
173 #endif
174 #endif
175 /* step 1: figure out how large a buffer we need */
176 for (f = format; *f; f++) {
177 if (*f == '%') {
178 const char* p = f;
179 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
182 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
183 * they don't affect the amount of space we reserve.
185 if ((*f == 'l' || *f == 'z') &&
186 (f[1] == 'd' || f[1] == 'u'))
187 ++f;
189 switch (*f) {
190 case 'c':
191 (void)va_arg(count, int);
192 /* fall through... */
193 case '%':
194 n++;
195 break;
196 case 'd': case 'u': case 'i': case 'x':
197 (void) va_arg(count, int);
198 /* 20 bytes is enough to hold a 64-bit
199 integer. Decimal takes the most space.
200 This isn't enough for octal. */
201 n += 20;
202 break;
203 case 's':
204 s = va_arg(count, char*);
205 n += strlen(s);
206 break;
207 case 'p':
208 (void) va_arg(count, int);
209 /* maximum 64-bit pointer representation:
210 * 0xffffffffffffffff
211 * so 19 characters is enough.
212 * XXX I count 18 -- what's the extra for?
214 n += 19;
215 break;
216 default:
217 /* if we stumble upon an unknown
218 formatting code, copy the rest of
219 the format string to the output
220 string. (we cannot just skip the
221 code, since there's no way to know
222 what's in the argument list) */
223 n += strlen(p);
224 goto expand;
226 } else
227 n++;
229 expand:
230 /* step 2: fill the buffer */
231 /* Since we've analyzed how much space we need for the worst case,
232 use sprintf directly instead of the slower PyOS_snprintf. */
233 string = PyString_FromStringAndSize(NULL, n);
234 if (!string)
235 return NULL;
237 s = PyString_AsString(string);
239 for (f = format; *f; f++) {
240 if (*f == '%') {
241 const char* p = f++;
242 Py_ssize_t i;
243 int longflag = 0;
244 int size_tflag = 0;
245 /* parse the width.precision part (we're only
246 interested in the precision value, if any) */
247 n = 0;
248 while (isdigit(Py_CHARMASK(*f)))
249 n = (n*10) + *f++ - '0';
250 if (*f == '.') {
251 f++;
252 n = 0;
253 while (isdigit(Py_CHARMASK(*f)))
254 n = (n*10) + *f++ - '0';
256 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
257 f++;
258 /* handle the long flag, but only for %ld and %lu.
259 others can be added when necessary. */
260 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
261 longflag = 1;
262 ++f;
264 /* handle the size_t flag. */
265 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
266 size_tflag = 1;
267 ++f;
270 switch (*f) {
271 case 'c':
272 *s++ = va_arg(vargs, int);
273 break;
274 case 'd':
275 if (longflag)
276 sprintf(s, "%ld", va_arg(vargs, long));
277 else if (size_tflag)
278 sprintf(s, "%" PY_FORMAT_SIZE_T "d",
279 va_arg(vargs, Py_ssize_t));
280 else
281 sprintf(s, "%d", va_arg(vargs, int));
282 s += strlen(s);
283 break;
284 case 'u':
285 if (longflag)
286 sprintf(s, "%lu",
287 va_arg(vargs, unsigned long));
288 else if (size_tflag)
289 sprintf(s, "%" PY_FORMAT_SIZE_T "u",
290 va_arg(vargs, size_t));
291 else
292 sprintf(s, "%u",
293 va_arg(vargs, unsigned int));
294 s += strlen(s);
295 break;
296 case 'i':
297 sprintf(s, "%i", va_arg(vargs, int));
298 s += strlen(s);
299 break;
300 case 'x':
301 sprintf(s, "%x", va_arg(vargs, int));
302 s += strlen(s);
303 break;
304 case 's':
305 p = va_arg(vargs, char*);
306 i = strlen(p);
307 if (n > 0 && i > n)
308 i = n;
309 Py_MEMCPY(s, p, i);
310 s += i;
311 break;
312 case 'p':
313 sprintf(s, "%p", va_arg(vargs, void*));
314 /* %p is ill-defined: ensure leading 0x. */
315 if (s[1] == 'X')
316 s[1] = 'x';
317 else if (s[1] != 'x') {
318 memmove(s+2, s, strlen(s)+1);
319 s[0] = '0';
320 s[1] = 'x';
322 s += strlen(s);
323 break;
324 case '%':
325 *s++ = '%';
326 break;
327 default:
328 strcpy(s, p);
329 s += strlen(s);
330 goto end;
332 } else
333 *s++ = *f;
336 end:
337 _PyString_Resize(&string, s - PyString_AS_STRING(string));
338 return string;
341 PyObject *
342 PyString_FromFormat(const char *format, ...)
344 PyObject* ret;
345 va_list vargs;
347 #ifdef HAVE_STDARG_PROTOTYPES
348 va_start(vargs, format);
349 #else
350 va_start(vargs);
351 #endif
352 ret = PyString_FromFormatV(format, vargs);
353 va_end(vargs);
354 return ret;
358 PyObject *PyString_Decode(const char *s,
359 Py_ssize_t size,
360 const char *encoding,
361 const char *errors)
363 PyObject *v, *str;
365 str = PyString_FromStringAndSize(s, size);
366 if (str == NULL)
367 return NULL;
368 v = PyString_AsDecodedString(str, encoding, errors);
369 Py_DECREF(str);
370 return v;
373 PyObject *PyString_AsDecodedObject(PyObject *str,
374 const char *encoding,
375 const char *errors)
377 PyObject *v;
379 if (!PyString_Check(str)) {
380 PyErr_BadArgument();
381 goto onError;
384 if (encoding == NULL) {
385 #ifdef Py_USING_UNICODE
386 encoding = PyUnicode_GetDefaultEncoding();
387 #else
388 PyErr_SetString(PyExc_ValueError, "no encoding specified");
389 goto onError;
390 #endif
393 /* Decode via the codec registry */
394 v = PyCodec_Decode(str, encoding, errors);
395 if (v == NULL)
396 goto onError;
398 return v;
400 onError:
401 return NULL;
404 PyObject *PyString_AsDecodedString(PyObject *str,
405 const char *encoding,
406 const char *errors)
408 PyObject *v;
410 v = PyString_AsDecodedObject(str, encoding, errors);
411 if (v == NULL)
412 goto onError;
414 #ifdef Py_USING_UNICODE
415 /* Convert Unicode to a string using the default encoding */
416 if (PyUnicode_Check(v)) {
417 PyObject *temp = v;
418 v = PyUnicode_AsEncodedString(v, NULL, NULL);
419 Py_DECREF(temp);
420 if (v == NULL)
421 goto onError;
423 #endif
424 if (!PyString_Check(v)) {
425 PyErr_Format(PyExc_TypeError,
426 "decoder did not return a string object (type=%.400s)",
427 Py_TYPE(v)->tp_name);
428 Py_DECREF(v);
429 goto onError;
432 return v;
434 onError:
435 return NULL;
438 PyObject *PyString_Encode(const char *s,
439 Py_ssize_t size,
440 const char *encoding,
441 const char *errors)
443 PyObject *v, *str;
445 str = PyString_FromStringAndSize(s, size);
446 if (str == NULL)
447 return NULL;
448 v = PyString_AsEncodedString(str, encoding, errors);
449 Py_DECREF(str);
450 return v;
453 PyObject *PyString_AsEncodedObject(PyObject *str,
454 const char *encoding,
455 const char *errors)
457 PyObject *v;
459 if (!PyString_Check(str)) {
460 PyErr_BadArgument();
461 goto onError;
464 if (encoding == NULL) {
465 #ifdef Py_USING_UNICODE
466 encoding = PyUnicode_GetDefaultEncoding();
467 #else
468 PyErr_SetString(PyExc_ValueError, "no encoding specified");
469 goto onError;
470 #endif
473 /* Encode via the codec registry */
474 v = PyCodec_Encode(str, encoding, errors);
475 if (v == NULL)
476 goto onError;
478 return v;
480 onError:
481 return NULL;
484 PyObject *PyString_AsEncodedString(PyObject *str,
485 const char *encoding,
486 const char *errors)
488 PyObject *v;
490 v = PyString_AsEncodedObject(str, encoding, errors);
491 if (v == NULL)
492 goto onError;
494 #ifdef Py_USING_UNICODE
495 /* Convert Unicode to a string using the default encoding */
496 if (PyUnicode_Check(v)) {
497 PyObject *temp = v;
498 v = PyUnicode_AsEncodedString(v, NULL, NULL);
499 Py_DECREF(temp);
500 if (v == NULL)
501 goto onError;
503 #endif
504 if (!PyString_Check(v)) {
505 PyErr_Format(PyExc_TypeError,
506 "encoder did not return a string object (type=%.400s)",
507 Py_TYPE(v)->tp_name);
508 Py_DECREF(v);
509 goto onError;
512 return v;
514 onError:
515 return NULL;
518 static void
519 string_dealloc(PyObject *op)
521 switch (PyString_CHECK_INTERNED(op)) {
522 case SSTATE_NOT_INTERNED:
523 break;
525 case SSTATE_INTERNED_MORTAL:
526 /* revive dead object temporarily for DelItem */
527 Py_REFCNT(op) = 3;
528 if (PyDict_DelItem(interned, op) != 0)
529 Py_FatalError(
530 "deletion of interned string failed");
531 break;
533 case SSTATE_INTERNED_IMMORTAL:
534 Py_FatalError("Immortal interned string died.");
536 default:
537 Py_FatalError("Inconsistent interned string state.");
539 Py_TYPE(op)->tp_free(op);
542 /* Unescape a backslash-escaped string. If unicode is non-zero,
543 the string is a u-literal. If recode_encoding is non-zero,
544 the string is UTF-8 encoded and should be re-encoded in the
545 specified encoding. */
547 PyObject *PyString_DecodeEscape(const char *s,
548 Py_ssize_t len,
549 const char *errors,
550 Py_ssize_t unicode,
551 const char *recode_encoding)
553 int c;
554 char *p, *buf;
555 const char *end;
556 PyObject *v;
557 Py_ssize_t newlen = recode_encoding ? 4*len:len;
558 v = PyString_FromStringAndSize((char *)NULL, newlen);
559 if (v == NULL)
560 return NULL;
561 p = buf = PyString_AsString(v);
562 end = s + len;
563 while (s < end) {
564 if (*s != '\\') {
565 non_esc:
566 #ifdef Py_USING_UNICODE
567 if (recode_encoding && (*s & 0x80)) {
568 PyObject *u, *w;
569 char *r;
570 const char* t;
571 Py_ssize_t rn;
572 t = s;
573 /* Decode non-ASCII bytes as UTF-8. */
574 while (t < end && (*t & 0x80)) t++;
575 u = PyUnicode_DecodeUTF8(s, t - s, errors);
576 if(!u) goto failed;
578 /* Recode them in target encoding. */
579 w = PyUnicode_AsEncodedString(
580 u, recode_encoding, errors);
581 Py_DECREF(u);
582 if (!w) goto failed;
584 /* Append bytes to output buffer. */
585 assert(PyString_Check(w));
586 r = PyString_AS_STRING(w);
587 rn = PyString_GET_SIZE(w);
588 Py_MEMCPY(p, r, rn);
589 p += rn;
590 Py_DECREF(w);
591 s = t;
592 } else {
593 *p++ = *s++;
595 #else
596 *p++ = *s++;
597 #endif
598 continue;
600 s++;
601 if (s==end) {
602 PyErr_SetString(PyExc_ValueError,
603 "Trailing \\ in string");
604 goto failed;
606 switch (*s++) {
607 /* XXX This assumes ASCII! */
608 case '\n': break;
609 case '\\': *p++ = '\\'; break;
610 case '\'': *p++ = '\''; break;
611 case '\"': *p++ = '\"'; break;
612 case 'b': *p++ = '\b'; break;
613 case 'f': *p++ = '\014'; break; /* FF */
614 case 't': *p++ = '\t'; break;
615 case 'n': *p++ = '\n'; break;
616 case 'r': *p++ = '\r'; break;
617 case 'v': *p++ = '\013'; break; /* VT */
618 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
619 case '0': case '1': case '2': case '3':
620 case '4': case '5': case '6': case '7':
621 c = s[-1] - '0';
622 if (s < end && '0' <= *s && *s <= '7') {
623 c = (c<<3) + *s++ - '0';
624 if (s < end && '0' <= *s && *s <= '7')
625 c = (c<<3) + *s++ - '0';
627 *p++ = c;
628 break;
629 case 'x':
630 if (s+1 < end &&
631 isxdigit(Py_CHARMASK(s[0])) &&
632 isxdigit(Py_CHARMASK(s[1])))
634 unsigned int x = 0;
635 c = Py_CHARMASK(*s);
636 s++;
637 if (isdigit(c))
638 x = c - '0';
639 else if (islower(c))
640 x = 10 + c - 'a';
641 else
642 x = 10 + c - 'A';
643 x = x << 4;
644 c = Py_CHARMASK(*s);
645 s++;
646 if (isdigit(c))
647 x += c - '0';
648 else if (islower(c))
649 x += 10 + c - 'a';
650 else
651 x += 10 + c - 'A';
652 *p++ = x;
653 break;
655 if (!errors || strcmp(errors, "strict") == 0) {
656 PyErr_SetString(PyExc_ValueError,
657 "invalid \\x escape");
658 goto failed;
660 if (strcmp(errors, "replace") == 0) {
661 *p++ = '?';
662 } else if (strcmp(errors, "ignore") == 0)
663 /* do nothing */;
664 else {
665 PyErr_Format(PyExc_ValueError,
666 "decoding error; "
667 "unknown error handling code: %.400s",
668 errors);
669 goto failed;
671 #ifndef Py_USING_UNICODE
672 case 'u':
673 case 'U':
674 case 'N':
675 if (unicode) {
676 PyErr_SetString(PyExc_ValueError,
677 "Unicode escapes not legal "
678 "when Unicode disabled");
679 goto failed;
681 #endif
682 default:
683 *p++ = '\\';
684 s--;
685 goto non_esc; /* an arbitry number of unescaped
686 UTF-8 bytes may follow. */
689 if (p-buf < newlen)
690 _PyString_Resize(&v, p - buf);
691 return v;
692 failed:
693 Py_DECREF(v);
694 return NULL;
697 /* -------------------------------------------------------------------- */
698 /* object api */
700 static Py_ssize_t
701 string_getsize(register PyObject *op)
703 char *s;
704 Py_ssize_t len;
705 if (PyString_AsStringAndSize(op, &s, &len))
706 return -1;
707 return len;
710 static /*const*/ char *
711 string_getbuffer(register PyObject *op)
713 char *s;
714 Py_ssize_t len;
715 if (PyString_AsStringAndSize(op, &s, &len))
716 return NULL;
717 return s;
720 Py_ssize_t
721 PyString_Size(register PyObject *op)
723 if (!PyString_Check(op))
724 return string_getsize(op);
725 return Py_SIZE(op);
728 /*const*/ char *
729 PyString_AsString(register PyObject *op)
731 if (!PyString_Check(op))
732 return string_getbuffer(op);
733 return ((PyStringObject *)op) -> ob_sval;
737 PyString_AsStringAndSize(register PyObject *obj,
738 register char **s,
739 register Py_ssize_t *len)
741 if (s == NULL) {
742 PyErr_BadInternalCall();
743 return -1;
746 if (!PyString_Check(obj)) {
747 #ifdef Py_USING_UNICODE
748 if (PyUnicode_Check(obj)) {
749 obj = _PyUnicode_AsDefaultEncodedString(obj, NULL);
750 if (obj == NULL)
751 return -1;
753 else
754 #endif
756 PyErr_Format(PyExc_TypeError,
757 "expected string or Unicode object, "
758 "%.200s found", Py_TYPE(obj)->tp_name);
759 return -1;
763 *s = PyString_AS_STRING(obj);
764 if (len != NULL)
765 *len = PyString_GET_SIZE(obj);
766 else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) {
767 PyErr_SetString(PyExc_TypeError,
768 "expected string without null bytes");
769 return -1;
771 return 0;
774 /* -------------------------------------------------------------------- */
775 /* Methods */
777 #include "stringlib/stringdefs.h"
778 #include "stringlib/fastsearch.h"
780 #include "stringlib/count.h"
781 #include "stringlib/find.h"
782 #include "stringlib/partition.h"
784 #define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
785 #include "stringlib/localeutil.h"
789 static int
790 string_print(PyStringObject *op, FILE *fp, int flags)
792 Py_ssize_t i, str_len;
793 char c;
794 int quote;
796 /* XXX Ought to check for interrupts when writing long strings */
797 if (! PyString_CheckExact(op)) {
798 int ret;
799 /* A str subclass may have its own __str__ method. */
800 op = (PyStringObject *) PyObject_Str((PyObject *)op);
801 if (op == NULL)
802 return -1;
803 ret = string_print(op, fp, flags);
804 Py_DECREF(op);
805 return ret;
807 if (flags & Py_PRINT_RAW) {
808 char *data = op->ob_sval;
809 Py_ssize_t size = Py_SIZE(op);
810 Py_BEGIN_ALLOW_THREADS
811 while (size > INT_MAX) {
812 /* Very long strings cannot be written atomically.
813 * But don't write exactly INT_MAX bytes at a time
814 * to avoid memory aligment issues.
816 const int chunk_size = INT_MAX & ~0x3FFF;
817 fwrite(data, 1, chunk_size, fp);
818 data += chunk_size;
819 size -= chunk_size;
821 #ifdef __VMS
822 if (size) fwrite(data, (int)size, 1, fp);
823 #else
824 fwrite(data, 1, (int)size, fp);
825 #endif
826 Py_END_ALLOW_THREADS
827 return 0;
830 /* figure out which quote to use; single is preferred */
831 quote = '\'';
832 if (memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
833 !memchr(op->ob_sval, '"', Py_SIZE(op)))
834 quote = '"';
836 str_len = Py_SIZE(op);
837 Py_BEGIN_ALLOW_THREADS
838 fputc(quote, fp);
839 for (i = 0; i < str_len; i++) {
840 /* Since strings are immutable and the caller should have a
841 reference, accessing the interal buffer should not be an issue
842 with the GIL released. */
843 c = op->ob_sval[i];
844 if (c == quote || c == '\\')
845 fprintf(fp, "\\%c", c);
846 else if (c == '\t')
847 fprintf(fp, "\\t");
848 else if (c == '\n')
849 fprintf(fp, "\\n");
850 else if (c == '\r')
851 fprintf(fp, "\\r");
852 else if (c < ' ' || c >= 0x7f)
853 fprintf(fp, "\\x%02x", c & 0xff);
854 else
855 fputc(c, fp);
857 fputc(quote, fp);
858 Py_END_ALLOW_THREADS
859 return 0;
862 PyObject *
863 PyString_Repr(PyObject *obj, int smartquotes)
865 register PyStringObject* op = (PyStringObject*) obj;
866 size_t newsize = 2 + 4 * Py_SIZE(op);
867 PyObject *v;
868 if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) {
869 PyErr_SetString(PyExc_OverflowError,
870 "string is too large to make repr");
871 return NULL;
873 v = PyString_FromStringAndSize((char *)NULL, newsize);
874 if (v == NULL) {
875 return NULL;
877 else {
878 register Py_ssize_t i;
879 register char c;
880 register char *p;
881 int quote;
883 /* figure out which quote to use; single is preferred */
884 quote = '\'';
885 if (smartquotes &&
886 memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
887 !memchr(op->ob_sval, '"', Py_SIZE(op)))
888 quote = '"';
890 p = PyString_AS_STRING(v);
891 *p++ = quote;
892 for (i = 0; i < Py_SIZE(op); i++) {
893 /* There's at least enough room for a hex escape
894 and a closing quote. */
895 assert(newsize - (p - PyString_AS_STRING(v)) >= 5);
896 c = op->ob_sval[i];
897 if (c == quote || c == '\\')
898 *p++ = '\\', *p++ = c;
899 else if (c == '\t')
900 *p++ = '\\', *p++ = 't';
901 else if (c == '\n')
902 *p++ = '\\', *p++ = 'n';
903 else if (c == '\r')
904 *p++ = '\\', *p++ = 'r';
905 else if (c < ' ' || c >= 0x7f) {
906 /* For performance, we don't want to call
907 PyOS_snprintf here (extra layers of
908 function call). */
909 sprintf(p, "\\x%02x", c & 0xff);
910 p += 4;
912 else
913 *p++ = c;
915 assert(newsize - (p - PyString_AS_STRING(v)) >= 1);
916 *p++ = quote;
917 *p = '\0';
918 _PyString_Resize(
919 &v, (p - PyString_AS_STRING(v)));
920 return v;
924 static PyObject *
925 string_repr(PyObject *op)
927 return PyString_Repr(op, 1);
930 static PyObject *
931 string_str(PyObject *s)
933 assert(PyString_Check(s));
934 if (PyString_CheckExact(s)) {
935 Py_INCREF(s);
936 return s;
938 else {
939 /* Subtype -- return genuine string with the same value. */
940 PyStringObject *t = (PyStringObject *) s;
941 return PyString_FromStringAndSize(t->ob_sval, Py_SIZE(t));
945 static Py_ssize_t
946 string_length(PyStringObject *a)
948 return Py_SIZE(a);
951 static PyObject *
952 string_concat(register PyStringObject *a, register PyObject *bb)
954 register Py_ssize_t size;
955 register PyStringObject *op;
956 if (!PyString_Check(bb)) {
957 #ifdef Py_USING_UNICODE
958 if (PyUnicode_Check(bb))
959 return PyUnicode_Concat((PyObject *)a, bb);
960 #endif
961 if (PyByteArray_Check(bb))
962 return PyByteArray_Concat((PyObject *)a, bb);
963 PyErr_Format(PyExc_TypeError,
964 "cannot concatenate 'str' and '%.200s' objects",
965 Py_TYPE(bb)->tp_name);
966 return NULL;
968 #define b ((PyStringObject *)bb)
969 /* Optimize cases with empty left or right operand */
970 if ((Py_SIZE(a) == 0 || Py_SIZE(b) == 0) &&
971 PyString_CheckExact(a) && PyString_CheckExact(b)) {
972 if (Py_SIZE(a) == 0) {
973 Py_INCREF(bb);
974 return bb;
976 Py_INCREF(a);
977 return (PyObject *)a;
979 size = Py_SIZE(a) + Py_SIZE(b);
980 if (size < 0) {
981 PyErr_SetString(PyExc_OverflowError,
982 "strings are too large to concat");
983 return NULL;
986 /* Inline PyObject_NewVar */
987 op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
988 if (op == NULL)
989 return PyErr_NoMemory();
990 PyObject_INIT_VAR(op, &PyString_Type, size);
991 op->ob_shash = -1;
992 op->ob_sstate = SSTATE_NOT_INTERNED;
993 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
994 Py_MEMCPY(op->ob_sval + Py_SIZE(a), b->ob_sval, Py_SIZE(b));
995 op->ob_sval[size] = '\0';
996 return (PyObject *) op;
997 #undef b
1000 static PyObject *
1001 string_repeat(register PyStringObject *a, register Py_ssize_t n)
1003 register Py_ssize_t i;
1004 register Py_ssize_t j;
1005 register Py_ssize_t size;
1006 register PyStringObject *op;
1007 size_t nbytes;
1008 if (n < 0)
1009 n = 0;
1010 /* watch out for overflows: the size can overflow int,
1011 * and the # of bytes needed can overflow size_t
1013 size = Py_SIZE(a) * n;
1014 if (n && size / n != Py_SIZE(a)) {
1015 PyErr_SetString(PyExc_OverflowError,
1016 "repeated string is too long");
1017 return NULL;
1019 if (size == Py_SIZE(a) && PyString_CheckExact(a)) {
1020 Py_INCREF(a);
1021 return (PyObject *)a;
1023 nbytes = (size_t)size;
1024 if (nbytes + sizeof(PyStringObject) <= nbytes) {
1025 PyErr_SetString(PyExc_OverflowError,
1026 "repeated string is too long");
1027 return NULL;
1029 op = (PyStringObject *)
1030 PyObject_MALLOC(sizeof(PyStringObject) + nbytes);
1031 if (op == NULL)
1032 return PyErr_NoMemory();
1033 PyObject_INIT_VAR(op, &PyString_Type, size);
1034 op->ob_shash = -1;
1035 op->ob_sstate = SSTATE_NOT_INTERNED;
1036 op->ob_sval[size] = '\0';
1037 if (Py_SIZE(a) == 1 && n > 0) {
1038 memset(op->ob_sval, a->ob_sval[0] , n);
1039 return (PyObject *) op;
1041 i = 0;
1042 if (i < size) {
1043 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1044 i = Py_SIZE(a);
1046 while (i < size) {
1047 j = (i <= size-i) ? i : size-i;
1048 Py_MEMCPY(op->ob_sval+i, op->ob_sval, j);
1049 i += j;
1051 return (PyObject *) op;
1054 /* String slice a[i:j] consists of characters a[i] ... a[j-1] */
1056 static PyObject *
1057 string_slice(register PyStringObject *a, register Py_ssize_t i,
1058 register Py_ssize_t j)
1059 /* j -- may be negative! */
1061 if (i < 0)
1062 i = 0;
1063 if (j < 0)
1064 j = 0; /* Avoid signed/unsigned bug in next line */
1065 if (j > Py_SIZE(a))
1066 j = Py_SIZE(a);
1067 if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) {
1068 /* It's the same as a */
1069 Py_INCREF(a);
1070 return (PyObject *)a;
1072 if (j < i)
1073 j = i;
1074 return PyString_FromStringAndSize(a->ob_sval + i, j-i);
1077 static int
1078 string_contains(PyObject *str_obj, PyObject *sub_obj)
1080 if (!PyString_CheckExact(sub_obj)) {
1081 #ifdef Py_USING_UNICODE
1082 if (PyUnicode_Check(sub_obj))
1083 return PyUnicode_Contains(str_obj, sub_obj);
1084 #endif
1085 if (!PyString_Check(sub_obj)) {
1086 PyErr_Format(PyExc_TypeError,
1087 "'in <string>' requires string as left operand, "
1088 "not %.200s", Py_TYPE(sub_obj)->tp_name);
1089 return -1;
1093 return stringlib_contains_obj(str_obj, sub_obj);
1096 static PyObject *
1097 string_item(PyStringObject *a, register Py_ssize_t i)
1099 char pchar;
1100 PyObject *v;
1101 if (i < 0 || i >= Py_SIZE(a)) {
1102 PyErr_SetString(PyExc_IndexError, "string index out of range");
1103 return NULL;
1105 pchar = a->ob_sval[i];
1106 v = (PyObject *)characters[pchar & UCHAR_MAX];
1107 if (v == NULL)
1108 v = PyString_FromStringAndSize(&pchar, 1);
1109 else {
1110 #ifdef COUNT_ALLOCS
1111 one_strings++;
1112 #endif
1113 Py_INCREF(v);
1115 return v;
1118 static PyObject*
1119 string_richcompare(PyStringObject *a, PyStringObject *b, int op)
1121 int c;
1122 Py_ssize_t len_a, len_b;
1123 Py_ssize_t min_len;
1124 PyObject *result;
1126 /* Make sure both arguments are strings. */
1127 if (!(PyString_Check(a) && PyString_Check(b))) {
1128 result = Py_NotImplemented;
1129 goto out;
1131 if (a == b) {
1132 switch (op) {
1133 case Py_EQ:case Py_LE:case Py_GE:
1134 result = Py_True;
1135 goto out;
1136 case Py_NE:case Py_LT:case Py_GT:
1137 result = Py_False;
1138 goto out;
1141 if (op == Py_EQ) {
1142 /* Supporting Py_NE here as well does not save
1143 much time, since Py_NE is rarely used. */
1144 if (Py_SIZE(a) == Py_SIZE(b)
1145 && (a->ob_sval[0] == b->ob_sval[0]
1146 && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0)) {
1147 result = Py_True;
1148 } else {
1149 result = Py_False;
1151 goto out;
1153 len_a = Py_SIZE(a); len_b = Py_SIZE(b);
1154 min_len = (len_a < len_b) ? len_a : len_b;
1155 if (min_len > 0) {
1156 c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval);
1157 if (c==0)
1158 c = memcmp(a->ob_sval, b->ob_sval, min_len);
1159 } else
1160 c = 0;
1161 if (c == 0)
1162 c = (len_a < len_b) ? -1 : (len_a > len_b) ? 1 : 0;
1163 switch (op) {
1164 case Py_LT: c = c < 0; break;
1165 case Py_LE: c = c <= 0; break;
1166 case Py_EQ: assert(0); break; /* unreachable */
1167 case Py_NE: c = c != 0; break;
1168 case Py_GT: c = c > 0; break;
1169 case Py_GE: c = c >= 0; break;
1170 default:
1171 result = Py_NotImplemented;
1172 goto out;
1174 result = c ? Py_True : Py_False;
1175 out:
1176 Py_INCREF(result);
1177 return result;
1181 _PyString_Eq(PyObject *o1, PyObject *o2)
1183 PyStringObject *a = (PyStringObject*) o1;
1184 PyStringObject *b = (PyStringObject*) o2;
1185 return Py_SIZE(a) == Py_SIZE(b)
1186 && *a->ob_sval == *b->ob_sval
1187 && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0;
1190 static long
1191 string_hash(PyStringObject *a)
1193 register Py_ssize_t len;
1194 register unsigned char *p;
1195 register long x;
1197 if (a->ob_shash != -1)
1198 return a->ob_shash;
1199 len = Py_SIZE(a);
1200 p = (unsigned char *) a->ob_sval;
1201 x = *p << 7;
1202 while (--len >= 0)
1203 x = (1000003*x) ^ *p++;
1204 x ^= Py_SIZE(a);
1205 if (x == -1)
1206 x = -2;
1207 a->ob_shash = x;
1208 return x;
1211 static PyObject*
1212 string_subscript(PyStringObject* self, PyObject* item)
1214 if (PyIndex_Check(item)) {
1215 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
1216 if (i == -1 && PyErr_Occurred())
1217 return NULL;
1218 if (i < 0)
1219 i += PyString_GET_SIZE(self);
1220 return string_item(self, i);
1222 else if (PySlice_Check(item)) {
1223 Py_ssize_t start, stop, step, slicelength, cur, i;
1224 char* source_buf;
1225 char* result_buf;
1226 PyObject* result;
1228 if (PySlice_GetIndicesEx((PySliceObject*)item,
1229 PyString_GET_SIZE(self),
1230 &start, &stop, &step, &slicelength) < 0) {
1231 return NULL;
1234 if (slicelength <= 0) {
1235 return PyString_FromStringAndSize("", 0);
1237 else if (start == 0 && step == 1 &&
1238 slicelength == PyString_GET_SIZE(self) &&
1239 PyString_CheckExact(self)) {
1240 Py_INCREF(self);
1241 return (PyObject *)self;
1243 else if (step == 1) {
1244 return PyString_FromStringAndSize(
1245 PyString_AS_STRING(self) + start,
1246 slicelength);
1248 else {
1249 source_buf = PyString_AsString((PyObject*)self);
1250 result_buf = (char *)PyMem_Malloc(slicelength);
1251 if (result_buf == NULL)
1252 return PyErr_NoMemory();
1254 for (cur = start, i = 0; i < slicelength;
1255 cur += step, i++) {
1256 result_buf[i] = source_buf[cur];
1259 result = PyString_FromStringAndSize(result_buf,
1260 slicelength);
1261 PyMem_Free(result_buf);
1262 return result;
1265 else {
1266 PyErr_Format(PyExc_TypeError,
1267 "string indices must be integers, not %.200s",
1268 Py_TYPE(item)->tp_name);
1269 return NULL;
1273 static Py_ssize_t
1274 string_buffer_getreadbuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1276 if ( index != 0 ) {
1277 PyErr_SetString(PyExc_SystemError,
1278 "accessing non-existent string segment");
1279 return -1;
1281 *ptr = (void *)self->ob_sval;
1282 return Py_SIZE(self);
1285 static Py_ssize_t
1286 string_buffer_getwritebuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1288 PyErr_SetString(PyExc_TypeError,
1289 "Cannot use string as modifiable buffer");
1290 return -1;
1293 static Py_ssize_t
1294 string_buffer_getsegcount(PyStringObject *self, Py_ssize_t *lenp)
1296 if ( lenp )
1297 *lenp = Py_SIZE(self);
1298 return 1;
1301 static Py_ssize_t
1302 string_buffer_getcharbuf(PyStringObject *self, Py_ssize_t index, const char **ptr)
1304 if ( index != 0 ) {
1305 PyErr_SetString(PyExc_SystemError,
1306 "accessing non-existent string segment");
1307 return -1;
1309 *ptr = self->ob_sval;
1310 return Py_SIZE(self);
1313 static int
1314 string_buffer_getbuffer(PyStringObject *self, Py_buffer *view, int flags)
1316 return PyBuffer_FillInfo(view, (void *)self->ob_sval, Py_SIZE(self),
1317 0, flags);
1320 static PySequenceMethods string_as_sequence = {
1321 (lenfunc)string_length, /*sq_length*/
1322 (binaryfunc)string_concat, /*sq_concat*/
1323 (ssizeargfunc)string_repeat, /*sq_repeat*/
1324 (ssizeargfunc)string_item, /*sq_item*/
1325 (ssizessizeargfunc)string_slice, /*sq_slice*/
1326 0, /*sq_ass_item*/
1327 0, /*sq_ass_slice*/
1328 (objobjproc)string_contains /*sq_contains*/
1331 static PyMappingMethods string_as_mapping = {
1332 (lenfunc)string_length,
1333 (binaryfunc)string_subscript,
1337 static PyBufferProcs string_as_buffer = {
1338 (readbufferproc)string_buffer_getreadbuf,
1339 (writebufferproc)string_buffer_getwritebuf,
1340 (segcountproc)string_buffer_getsegcount,
1341 (charbufferproc)string_buffer_getcharbuf,
1342 (getbufferproc)string_buffer_getbuffer,
1343 0, /* XXX */
/* Selector values for the strip helpers.  They double as indices into
   stripformat[] below, so the numbering must stay in sync with it. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
/* Test whether `pattern` (of `length` bytes) occurs in `target` at
   `offset`.  The first and last bytes are compared directly before
   falling back to memcmp() on the interior bytes.

   Don't call if length < 2: the memcmp() size is length-2, which would
   go negative and wrap to a huge size_t.

   Every argument is parenthesized so that expression arguments (for
   example `a + b` or `cond ? x : y`) expand correctly. */
#define Py_STRING_MATCH(target, offset, pattern, length)                 \
    ((target)[(offset)] == (pattern)[0] &&                               \
     (target)[(offset) + (length) - 1] == (pattern)[(length) - 1] &&     \
     !memcmp((target) + (offset) + 1, (pattern) + 1, (length) - 2))
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things. */

#define MAX_PREALLOC 12

/* Number of list slots to preallocate for at most `maxsplit` splits,
   capped at MAX_PREALLOC.  5 splits gives 6 elements.  The argument is
   parenthesized so that expression arguments expand correctly. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)
/* Create the substring data[left:right] and append it to `list`.
   Jumps to the caller's `onError` label on failure.

   Relies on the enclosing function declaring `str` and `list` and
   providing an `onError` label.

   The body is wrapped in a brace block, like SPLIT_ADD below, so the
   macro behaves as one statement even under an unbraced `if`. */
#define SPLIT_APPEND(data, left, right) {                       \
    str = PyString_FromStringAndSize((data) + (left),           \
                                     (right) - (left));         \
    if (str == NULL)                                            \
        goto onError;                                           \
    if (PyList_Append(list, str)) {                             \
        Py_DECREF(str);                                         \
        goto onError;                                           \
    }                                                           \
    else                                                        \
        Py_DECREF(str);                                         \
    }
/* Create the substring data[left:right] and store it as element number
   `count` of `list`, then increment `count`.

   While count is below MAX_PREALLOC the new string is stored directly
   with PyList_SET_ITEM, which takes over the reference.  Beyond that it
   is appended, and the reference PyList_Append added is balanced by a
   Py_DECREF.  Any failure jumps to the caller's `onError` label.

   Relies on the enclosing function declaring `str`, `list` and `count`
   and providing an `onError` label. */
#define SPLIT_ADD(data, left, right) {                          \
    str = PyString_FromStringAndSize((data) + (left),           \
                                     (right) - (left));         \
    if (str == NULL)                                            \
        goto onError;                                           \
    if (count >= MAX_PREALLOC) {                                \
        int split_add_failed = PyList_Append(list, str);        \
        Py_DECREF(str);                                         \
        if (split_add_failed)                                   \
            goto onError;                                       \
    }                                                           \
    else {                                                      \
        PyList_SET_ITEM(list, count, str);                      \
    }                                                           \
    count++; }
/* Always force the list to the expected size.  Uses the caller's
   `count` variable as the final length. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count

/* Cursor helpers for whitespace splitting.  Each advances (or, for the
   R* variants, retreats) index `i` over string `s` while the byte under
   the cursor is, or is not, whitespace according to isspace().  The
   forward variants stop at `len`; the reverse variants stop once `i`
   drops below 0. */
#define SKIP_SPACE(s, i, len) \
    { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; }
#define SKIP_NONSPACE(s, i, len) \
    { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; }
#define RSKIP_SPACE(s, i) \
    { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; }
#define RSKIP_NONSPACE(s, i) \
    { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; }
1415 Py_LOCAL_INLINE(PyObject *)
1416 split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1418 const char *s = PyString_AS_STRING(self);
1419 Py_ssize_t i, j, count=0;
1420 PyObject *str;
1421 PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1423 if (list == NULL)
1424 return NULL;
1426 i = j = 0;
1428 while (maxsplit-- > 0) {
1429 SKIP_SPACE(s, i, len);
1430 if (i==len) break;
1431 j = i; i++;
1432 SKIP_NONSPACE(s, i, len);
1433 if (j == 0 && i == len && PyString_CheckExact(self)) {
1434 /* No whitespace in self, so just use it as list[0] */
1435 Py_INCREF(self);
1436 PyList_SET_ITEM(list, 0, (PyObject *)self);
1437 count++;
1438 break;
1440 SPLIT_ADD(s, j, i);
1443 if (i < len) {
1444 /* Only occurs when maxsplit was reached */
1445 /* Skip any remaining whitespace and copy to end of string */
1446 SKIP_SPACE(s, i, len);
1447 if (i != len)
1448 SPLIT_ADD(s, i, len);
1450 FIX_PREALLOC_SIZE(list);
1451 return list;
1452 onError:
1453 Py_DECREF(list);
1454 return NULL;
1457 Py_LOCAL_INLINE(PyObject *)
1458 split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1460 const char *s = PyString_AS_STRING(self);
1461 register Py_ssize_t i, j, count=0;
1462 PyObject *str;
1463 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1465 if (list == NULL)
1466 return NULL;
1468 i = j = 0;
1469 while ((j < len) && (maxcount-- > 0)) {
1470 for(; j<len; j++) {
1471 /* I found that using memchr makes no difference */
1472 if (s[j] == ch) {
1473 SPLIT_ADD(s, i, j);
1474 i = j = j + 1;
1475 break;
1479 if (i == 0 && count == 0 && PyString_CheckExact(self)) {
1480 /* ch not in self, so just use self as list[0] */
1481 Py_INCREF(self);
1482 PyList_SET_ITEM(list, 0, (PyObject *)self);
1483 count++;
1485 else if (i <= len) {
1486 SPLIT_ADD(s, i, len);
1488 FIX_PREALLOC_SIZE(list);
1489 return list;
1491 onError:
1492 Py_DECREF(list);
1493 return NULL;
1496 PyDoc_STRVAR(split__doc__,
1497 "S.split([sep [,maxsplit]]) -> list of strings\n\
1499 Return a list of the words in the string S, using sep as the\n\
1500 delimiter string. If maxsplit is given, at most maxsplit\n\
1501 splits are done. If sep is not specified or is None, any\n\
1502 whitespace string is a separator and empty strings are removed\n\
1503 from the result.");
1505 static PyObject *
1506 string_split(PyStringObject *self, PyObject *args)
1508 Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1509 Py_ssize_t maxsplit = -1, count=0;
1510 const char *s = PyString_AS_STRING(self), *sub;
1511 PyObject *list, *str, *subobj = Py_None;
1512 #ifdef USE_FAST
1513 Py_ssize_t pos;
1514 #endif
1516 if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
1517 return NULL;
1518 if (maxsplit < 0)
1519 maxsplit = PY_SSIZE_T_MAX;
1520 if (subobj == Py_None)
1521 return split_whitespace(self, len, maxsplit);
1522 if (PyString_Check(subobj)) {
1523 sub = PyString_AS_STRING(subobj);
1524 n = PyString_GET_SIZE(subobj);
1526 #ifdef Py_USING_UNICODE
1527 else if (PyUnicode_Check(subobj))
1528 return PyUnicode_Split((PyObject *)self, subobj, maxsplit);
1529 #endif
1530 else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1531 return NULL;
1533 if (n == 0) {
1534 PyErr_SetString(PyExc_ValueError, "empty separator");
1535 return NULL;
1537 else if (n == 1)
1538 return split_char(self, len, sub[0], maxsplit);
1540 list = PyList_New(PREALLOC_SIZE(maxsplit));
1541 if (list == NULL)
1542 return NULL;
1544 #ifdef USE_FAST
1545 i = j = 0;
1546 while (maxsplit-- > 0) {
1547 pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
1548 if (pos < 0)
1549 break;
1550 j = i+pos;
1551 SPLIT_ADD(s, i, j);
1552 i = j + n;
1554 #else
1555 i = j = 0;
1556 while ((j+n <= len) && (maxsplit-- > 0)) {
1557 for (; j+n <= len; j++) {
1558 if (Py_STRING_MATCH(s, j, sub, n)) {
1559 SPLIT_ADD(s, i, j);
1560 i = j = j + n;
1561 break;
1565 #endif
1566 SPLIT_ADD(s, i, len);
1567 FIX_PREALLOC_SIZE(list);
1568 return list;
1570 onError:
1571 Py_DECREF(list);
1572 return NULL;
1575 PyDoc_STRVAR(partition__doc__,
1576 "S.partition(sep) -> (head, sep, tail)\n\
1578 Searches for the separator sep in S, and returns the part before it,\n\
1579 the separator itself, and the part after it. If the separator is not\n\
1580 found, returns S and two empty strings.");
1582 static PyObject *
1583 string_partition(PyStringObject *self, PyObject *sep_obj)
1585 const char *sep;
1586 Py_ssize_t sep_len;
1588 if (PyString_Check(sep_obj)) {
1589 sep = PyString_AS_STRING(sep_obj);
1590 sep_len = PyString_GET_SIZE(sep_obj);
1592 #ifdef Py_USING_UNICODE
1593 else if (PyUnicode_Check(sep_obj))
1594 return PyUnicode_Partition((PyObject *) self, sep_obj);
1595 #endif
1596 else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1597 return NULL;
1599 return stringlib_partition(
1600 (PyObject*) self,
1601 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1602 sep_obj, sep, sep_len
1606 PyDoc_STRVAR(rpartition__doc__,
1607 "S.rpartition(sep) -> (tail, sep, head)\n\
1609 Searches for the separator sep in S, starting at the end of S, and returns\n\
1610 the part before it, the separator itself, and the part after it. If the\n\
1611 separator is not found, returns two empty strings and S.");
1613 static PyObject *
1614 string_rpartition(PyStringObject *self, PyObject *sep_obj)
1616 const char *sep;
1617 Py_ssize_t sep_len;
1619 if (PyString_Check(sep_obj)) {
1620 sep = PyString_AS_STRING(sep_obj);
1621 sep_len = PyString_GET_SIZE(sep_obj);
1623 #ifdef Py_USING_UNICODE
1624 else if (PyUnicode_Check(sep_obj))
1625 return PyUnicode_Partition((PyObject *) self, sep_obj);
1626 #endif
1627 else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1628 return NULL;
1630 return stringlib_rpartition(
1631 (PyObject*) self,
1632 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1633 sep_obj, sep, sep_len
1637 Py_LOCAL_INLINE(PyObject *)
1638 rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1640 const char *s = PyString_AS_STRING(self);
1641 Py_ssize_t i, j, count=0;
1642 PyObject *str;
1643 PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1645 if (list == NULL)
1646 return NULL;
1648 i = j = len-1;
1650 while (maxsplit-- > 0) {
1651 RSKIP_SPACE(s, i);
1652 if (i<0) break;
1653 j = i; i--;
1654 RSKIP_NONSPACE(s, i);
1655 if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
1656 /* No whitespace in self, so just use it as list[0] */
1657 Py_INCREF(self);
1658 PyList_SET_ITEM(list, 0, (PyObject *)self);
1659 count++;
1660 break;
1662 SPLIT_ADD(s, i + 1, j + 1);
1664 if (i >= 0) {
1665 /* Only occurs when maxsplit was reached */
1666 /* Skip any remaining whitespace and copy to beginning of string */
1667 RSKIP_SPACE(s, i);
1668 if (i >= 0)
1669 SPLIT_ADD(s, 0, i + 1);
1672 FIX_PREALLOC_SIZE(list);
1673 if (PyList_Reverse(list) < 0)
1674 goto onError;
1675 return list;
1676 onError:
1677 Py_DECREF(list);
1678 return NULL;
1681 Py_LOCAL_INLINE(PyObject *)
1682 rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1684 const char *s = PyString_AS_STRING(self);
1685 register Py_ssize_t i, j, count=0;
1686 PyObject *str;
1687 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1689 if (list == NULL)
1690 return NULL;
1692 i = j = len - 1;
1693 while ((i >= 0) && (maxcount-- > 0)) {
1694 for (; i >= 0; i--) {
1695 if (s[i] == ch) {
1696 SPLIT_ADD(s, i + 1, j + 1);
1697 j = i = i - 1;
1698 break;
1702 if (i < 0 && count == 0 && PyString_CheckExact(self)) {
1703 /* ch not in self, so just use self as list[0] */
1704 Py_INCREF(self);
1705 PyList_SET_ITEM(list, 0, (PyObject *)self);
1706 count++;
1708 else if (j >= -1) {
1709 SPLIT_ADD(s, 0, j + 1);
1711 FIX_PREALLOC_SIZE(list);
1712 if (PyList_Reverse(list) < 0)
1713 goto onError;
1714 return list;
1716 onError:
1717 Py_DECREF(list);
1718 return NULL;
1721 PyDoc_STRVAR(rsplit__doc__,
1722 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
1724 Return a list of the words in the string S, using sep as the\n\
1725 delimiter string, starting at the end of the string and working\n\
1726 to the front. If maxsplit is given, at most maxsplit splits are\n\
1727 done. If sep is not specified or is None, any whitespace string\n\
1728 is a separator.");
1730 static PyObject *
1731 string_rsplit(PyStringObject *self, PyObject *args)
1733 Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1734 Py_ssize_t maxsplit = -1, count=0;
1735 const char *s, *sub;
1736 PyObject *list, *str, *subobj = Py_None;
1738 if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
1739 return NULL;
1740 if (maxsplit < 0)
1741 maxsplit = PY_SSIZE_T_MAX;
1742 if (subobj == Py_None)
1743 return rsplit_whitespace(self, len, maxsplit);
1744 if (PyString_Check(subobj)) {
1745 sub = PyString_AS_STRING(subobj);
1746 n = PyString_GET_SIZE(subobj);
1748 #ifdef Py_USING_UNICODE
1749 else if (PyUnicode_Check(subobj))
1750 return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
1751 #endif
1752 else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1753 return NULL;
1755 if (n == 0) {
1756 PyErr_SetString(PyExc_ValueError, "empty separator");
1757 return NULL;
1759 else if (n == 1)
1760 return rsplit_char(self, len, sub[0], maxsplit);
1762 list = PyList_New(PREALLOC_SIZE(maxsplit));
1763 if (list == NULL)
1764 return NULL;
1766 j = len;
1767 i = j - n;
1769 s = PyString_AS_STRING(self);
1770 while ( (i >= 0) && (maxsplit-- > 0) ) {
1771 for (; i>=0; i--) {
1772 if (Py_STRING_MATCH(s, i, sub, n)) {
1773 SPLIT_ADD(s, i + n, j);
1774 j = i;
1775 i -= n;
1776 break;
1780 SPLIT_ADD(s, 0, j);
1781 FIX_PREALLOC_SIZE(list);
1782 if (PyList_Reverse(list) < 0)
1783 goto onError;
1784 return list;
1786 onError:
1787 Py_DECREF(list);
1788 return NULL;
1792 PyDoc_STRVAR(join__doc__,
1793 "S.join(sequence) -> string\n\
1795 Return a string which is the concatenation of the strings in the\n\
1796 sequence. The separator between elements is S.");
1798 static PyObject *
1799 string_join(PyStringObject *self, PyObject *orig)
1801 char *sep = PyString_AS_STRING(self);
1802 const Py_ssize_t seplen = PyString_GET_SIZE(self);
1803 PyObject *res = NULL;
1804 char *p;
1805 Py_ssize_t seqlen = 0;
1806 size_t sz = 0;
1807 Py_ssize_t i;
1808 PyObject *seq, *item;
1810 seq = PySequence_Fast(orig, "");
1811 if (seq == NULL) {
1812 return NULL;
1815 seqlen = PySequence_Size(seq);
1816 if (seqlen == 0) {
1817 Py_DECREF(seq);
1818 return PyString_FromString("");
1820 if (seqlen == 1) {
1821 item = PySequence_Fast_GET_ITEM(seq, 0);
1822 if (PyString_CheckExact(item) || PyUnicode_CheckExact(item)) {
1823 Py_INCREF(item);
1824 Py_DECREF(seq);
1825 return item;
1829 /* There are at least two things to join, or else we have a subclass
1830 * of the builtin types in the sequence.
1831 * Do a pre-pass to figure out the total amount of space we'll
1832 * need (sz), see whether any argument is absurd, and defer to
1833 * the Unicode join if appropriate.
1835 for (i = 0; i < seqlen; i++) {
1836 const size_t old_sz = sz;
1837 item = PySequence_Fast_GET_ITEM(seq, i);
1838 if (!PyString_Check(item)){
1839 #ifdef Py_USING_UNICODE
1840 if (PyUnicode_Check(item)) {
1841 /* Defer to Unicode join.
1842 * CAUTION: There's no gurantee that the
1843 * original sequence can be iterated over
1844 * again, so we must pass seq here.
1846 PyObject *result;
1847 result = PyUnicode_Join((PyObject *)self, seq);
1848 Py_DECREF(seq);
1849 return result;
1851 #endif
1852 PyErr_Format(PyExc_TypeError,
1853 "sequence item %zd: expected string,"
1854 " %.80s found",
1855 i, Py_TYPE(item)->tp_name);
1856 Py_DECREF(seq);
1857 return NULL;
1859 sz += PyString_GET_SIZE(item);
1860 if (i != 0)
1861 sz += seplen;
1862 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
1863 PyErr_SetString(PyExc_OverflowError,
1864 "join() result is too long for a Python string");
1865 Py_DECREF(seq);
1866 return NULL;
1870 /* Allocate result space. */
1871 res = PyString_FromStringAndSize((char*)NULL, sz);
1872 if (res == NULL) {
1873 Py_DECREF(seq);
1874 return NULL;
1877 /* Catenate everything. */
1878 p = PyString_AS_STRING(res);
1879 for (i = 0; i < seqlen; ++i) {
1880 size_t n;
1881 item = PySequence_Fast_GET_ITEM(seq, i);
1882 n = PyString_GET_SIZE(item);
1883 Py_MEMCPY(p, PyString_AS_STRING(item), n);
1884 p += n;
1885 if (i < seqlen - 1) {
1886 Py_MEMCPY(p, sep, seplen);
1887 p += seplen;
1891 Py_DECREF(seq);
1892 return res;
1895 PyObject *
1896 _PyString_Join(PyObject *sep, PyObject *x)
1898 assert(sep != NULL && PyString_Check(sep));
1899 assert(x != NULL);
1900 return string_join((PyStringObject *)sep, x);
1903 Py_LOCAL_INLINE(void)
1904 string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
1906 if (*end > len)
1907 *end = len;
1908 else if (*end < 0)
1909 *end += len;
1910 if (*end < 0)
1911 *end = 0;
1912 if (*start < 0)
1913 *start += len;
1914 if (*start < 0)
1915 *start = 0;
1918 Py_LOCAL_INLINE(Py_ssize_t)
1919 string_find_internal(PyStringObject *self, PyObject *args, int dir)
1921 PyObject *subobj;
1922 const char *sub;
1923 Py_ssize_t sub_len;
1924 Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
1925 PyObject *obj_start=Py_None, *obj_end=Py_None;
1927 if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj,
1928 &obj_start, &obj_end))
1929 return -2;
1930 /* To support None in "start" and "end" arguments, meaning
1931 the same as if they were not passed.
1933 if (obj_start != Py_None)
1934 if (!_PyEval_SliceIndex(obj_start, &start))
1935 return -2;
1936 if (obj_end != Py_None)
1937 if (!_PyEval_SliceIndex(obj_end, &end))
1938 return -2;
1940 if (PyString_Check(subobj)) {
1941 sub = PyString_AS_STRING(subobj);
1942 sub_len = PyString_GET_SIZE(subobj);
1944 #ifdef Py_USING_UNICODE
1945 else if (PyUnicode_Check(subobj))
1946 return PyUnicode_Find(
1947 (PyObject *)self, subobj, start, end, dir);
1948 #endif
1949 else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
1950 /* XXX - the "expected a character buffer object" is pretty
1951 confusing for a non-expert. remap to something else ? */
1952 return -2;
1954 if (dir > 0)
1955 return stringlib_find_slice(
1956 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1957 sub, sub_len, start, end);
1958 else
1959 return stringlib_rfind_slice(
1960 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1961 sub, sub_len, start, end);
1965 PyDoc_STRVAR(find__doc__,
1966 "S.find(sub [,start [,end]]) -> int\n\
1968 Return the lowest index in S where substring sub is found,\n\
1969 such that sub is contained within s[start:end]. Optional\n\
1970 arguments start and end are interpreted as in slice notation.\n\
1972 Return -1 on failure.");
1974 static PyObject *
1975 string_find(PyStringObject *self, PyObject *args)
1977 Py_ssize_t result = string_find_internal(self, args, +1);
1978 if (result == -2)
1979 return NULL;
1980 return PyInt_FromSsize_t(result);
1984 PyDoc_STRVAR(index__doc__,
1985 "S.index(sub [,start [,end]]) -> int\n\
1987 Like S.find() but raise ValueError when the substring is not found.");
1989 static PyObject *
1990 string_index(PyStringObject *self, PyObject *args)
1992 Py_ssize_t result = string_find_internal(self, args, +1);
1993 if (result == -2)
1994 return NULL;
1995 if (result == -1) {
1996 PyErr_SetString(PyExc_ValueError,
1997 "substring not found");
1998 return NULL;
2000 return PyInt_FromSsize_t(result);
2004 PyDoc_STRVAR(rfind__doc__,
2005 "S.rfind(sub [,start [,end]]) -> int\n\
2007 Return the highest index in S where substring sub is found,\n\
2008 such that sub is contained within s[start:end]. Optional\n\
2009 arguments start and end are interpreted as in slice notation.\n\
2011 Return -1 on failure.");
2013 static PyObject *
2014 string_rfind(PyStringObject *self, PyObject *args)
2016 Py_ssize_t result = string_find_internal(self, args, -1);
2017 if (result == -2)
2018 return NULL;
2019 return PyInt_FromSsize_t(result);
2023 PyDoc_STRVAR(rindex__doc__,
2024 "S.rindex(sub [,start [,end]]) -> int\n\
2026 Like S.rfind() but raise ValueError when the substring is not found.");
2028 static PyObject *
2029 string_rindex(PyStringObject *self, PyObject *args)
2031 Py_ssize_t result = string_find_internal(self, args, -1);
2032 if (result == -2)
2033 return NULL;
2034 if (result == -1) {
2035 PyErr_SetString(PyExc_ValueError,
2036 "substring not found");
2037 return NULL;
2039 return PyInt_FromSsize_t(result);
2043 Py_LOCAL_INLINE(PyObject *)
2044 do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
2046 char *s = PyString_AS_STRING(self);
2047 Py_ssize_t len = PyString_GET_SIZE(self);
2048 char *sep = PyString_AS_STRING(sepobj);
2049 Py_ssize_t seplen = PyString_GET_SIZE(sepobj);
2050 Py_ssize_t i, j;
2052 i = 0;
2053 if (striptype != RIGHTSTRIP) {
2054 while (i < len && memchr(sep, Py_CHARMASK(s[i]), seplen)) {
2055 i++;
2059 j = len;
2060 if (striptype != LEFTSTRIP) {
2061 do {
2062 j--;
2063 } while (j >= i && memchr(sep, Py_CHARMASK(s[j]), seplen));
2064 j++;
2067 if (i == 0 && j == len && PyString_CheckExact(self)) {
2068 Py_INCREF(self);
2069 return (PyObject*)self;
2071 else
2072 return PyString_FromStringAndSize(s+i, j-i);
2076 Py_LOCAL_INLINE(PyObject *)
2077 do_strip(PyStringObject *self, int striptype)
2079 char *s = PyString_AS_STRING(self);
2080 Py_ssize_t len = PyString_GET_SIZE(self), i, j;
2082 i = 0;
2083 if (striptype != RIGHTSTRIP) {
2084 while (i < len && isspace(Py_CHARMASK(s[i]))) {
2085 i++;
2089 j = len;
2090 if (striptype != LEFTSTRIP) {
2091 do {
2092 j--;
2093 } while (j >= i && isspace(Py_CHARMASK(s[j])));
2094 j++;
2097 if (i == 0 && j == len && PyString_CheckExact(self)) {
2098 Py_INCREF(self);
2099 return (PyObject*)self;
2101 else
2102 return PyString_FromStringAndSize(s+i, j-i);
2106 Py_LOCAL_INLINE(PyObject *)
2107 do_argstrip(PyStringObject *self, int striptype, PyObject *args)
2109 PyObject *sep = NULL;
2111 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
2112 return NULL;
2114 if (sep != NULL && sep != Py_None) {
2115 if (PyString_Check(sep))
2116 return do_xstrip(self, striptype, sep);
2117 #ifdef Py_USING_UNICODE
2118 else if (PyUnicode_Check(sep)) {
2119 PyObject *uniself = PyUnicode_FromObject((PyObject *)self);
2120 PyObject *res;
2121 if (uniself==NULL)
2122 return NULL;
2123 res = _PyUnicode_XStrip((PyUnicodeObject *)uniself,
2124 striptype, sep);
2125 Py_DECREF(uniself);
2126 return res;
2128 #endif
2129 PyErr_Format(PyExc_TypeError,
2130 #ifdef Py_USING_UNICODE
2131 "%s arg must be None, str or unicode",
2132 #else
2133 "%s arg must be None or str",
2134 #endif
2135 STRIPNAME(striptype));
2136 return NULL;
2139 return do_strip(self, striptype);
2143 PyDoc_STRVAR(strip__doc__,
2144 "S.strip([chars]) -> string or unicode\n\
2146 Return a copy of the string S with leading and trailing\n\
2147 whitespace removed.\n\
2148 If chars is given and not None, remove characters in chars instead.\n\
2149 If chars is unicode, S will be converted to unicode before stripping");
2151 static PyObject *
2152 string_strip(PyStringObject *self, PyObject *args)
2154 if (PyTuple_GET_SIZE(args) == 0)
2155 return do_strip(self, BOTHSTRIP); /* Common case */
2156 else
2157 return do_argstrip(self, BOTHSTRIP, args);
2161 PyDoc_STRVAR(lstrip__doc__,
2162 "S.lstrip([chars]) -> string or unicode\n\
2164 Return a copy of the string S with leading whitespace removed.\n\
2165 If chars is given and not None, remove characters in chars instead.\n\
2166 If chars is unicode, S will be converted to unicode before stripping");
2168 static PyObject *
2169 string_lstrip(PyStringObject *self, PyObject *args)
2171 if (PyTuple_GET_SIZE(args) == 0)
2172 return do_strip(self, LEFTSTRIP); /* Common case */
2173 else
2174 return do_argstrip(self, LEFTSTRIP, args);
2178 PyDoc_STRVAR(rstrip__doc__,
2179 "S.rstrip([chars]) -> string or unicode\n\
2181 Return a copy of the string S with trailing whitespace removed.\n\
2182 If chars is given and not None, remove characters in chars instead.\n\
2183 If chars is unicode, S will be converted to unicode before stripping");
2185 static PyObject *
2186 string_rstrip(PyStringObject *self, PyObject *args)
2188 if (PyTuple_GET_SIZE(args) == 0)
2189 return do_strip(self, RIGHTSTRIP); /* Common case */
2190 else
2191 return do_argstrip(self, RIGHTSTRIP, args);
2195 PyDoc_STRVAR(lower__doc__,
2196 "S.lower() -> string\n\
2198 Return a copy of the string S converted to lowercase.");
2200 /* _tolower and _toupper are defined by SUSv2, but they're not ISO C */
2201 #ifndef _tolower
2202 #define _tolower tolower
2203 #endif
2205 static PyObject *
2206 string_lower(PyStringObject *self)
2208 char *s;
2209 Py_ssize_t i, n = PyString_GET_SIZE(self);
2210 PyObject *newobj;
2212 newobj = PyString_FromStringAndSize(NULL, n);
2213 if (!newobj)
2214 return NULL;
2216 s = PyString_AS_STRING(newobj);
2218 Py_MEMCPY(s, PyString_AS_STRING(self), n);
2220 for (i = 0; i < n; i++) {
2221 int c = Py_CHARMASK(s[i]);
2222 if (isupper(c))
2223 s[i] = _tolower(c);
2226 return newobj;
2229 PyDoc_STRVAR(upper__doc__,
2230 "S.upper() -> string\n\
2232 Return a copy of the string S converted to uppercase.");
2234 #ifndef _toupper
2235 #define _toupper toupper
2236 #endif
2238 static PyObject *
2239 string_upper(PyStringObject *self)
2241 char *s;
2242 Py_ssize_t i, n = PyString_GET_SIZE(self);
2243 PyObject *newobj;
2245 newobj = PyString_FromStringAndSize(NULL, n);
2246 if (!newobj)
2247 return NULL;
2249 s = PyString_AS_STRING(newobj);
2251 Py_MEMCPY(s, PyString_AS_STRING(self), n);
2253 for (i = 0; i < n; i++) {
2254 int c = Py_CHARMASK(s[i]);
2255 if (islower(c))
2256 s[i] = _toupper(c);
2259 return newobj;
2262 PyDoc_STRVAR(title__doc__,
2263 "S.title() -> string\n\
2265 Return a titlecased version of S, i.e. words start with uppercase\n\
2266 characters, all remaining cased characters have lowercase.");
2268 static PyObject*
2269 string_title(PyStringObject *self)
2271 char *s = PyString_AS_STRING(self), *s_new;
2272 Py_ssize_t i, n = PyString_GET_SIZE(self);
2273 int previous_is_cased = 0;
2274 PyObject *newobj;
2276 newobj = PyString_FromStringAndSize(NULL, n);
2277 if (newobj == NULL)
2278 return NULL;
2279 s_new = PyString_AsString(newobj);
2280 for (i = 0; i < n; i++) {
2281 int c = Py_CHARMASK(*s++);
2282 if (islower(c)) {
2283 if (!previous_is_cased)
2284 c = toupper(c);
2285 previous_is_cased = 1;
2286 } else if (isupper(c)) {
2287 if (previous_is_cased)
2288 c = tolower(c);
2289 previous_is_cased = 1;
2290 } else
2291 previous_is_cased = 0;
2292 *s_new++ = c;
2294 return newobj;
2297 PyDoc_STRVAR(capitalize__doc__,
2298 "S.capitalize() -> string\n\
2300 Return a copy of the string S with only its first character\n\
2301 capitalized.");
2303 static PyObject *
2304 string_capitalize(PyStringObject *self)
2306 char *s = PyString_AS_STRING(self), *s_new;
2307 Py_ssize_t i, n = PyString_GET_SIZE(self);
2308 PyObject *newobj;
2310 newobj = PyString_FromStringAndSize(NULL, n);
2311 if (newobj == NULL)
2312 return NULL;
2313 s_new = PyString_AsString(newobj);
2314 if (0 < n) {
2315 int c = Py_CHARMASK(*s++);
2316 if (islower(c))
2317 *s_new = toupper(c);
2318 else
2319 *s_new = c;
2320 s_new++;
2322 for (i = 1; i < n; i++) {
2323 int c = Py_CHARMASK(*s++);
2324 if (isupper(c))
2325 *s_new = tolower(c);
2326 else
2327 *s_new = c;
2328 s_new++;
2330 return newobj;
2334 PyDoc_STRVAR(count__doc__,
2335 "S.count(sub[, start[, end]]) -> int\n\
2337 Return the number of non-overlapping occurrences of substring sub in\n\
2338 string S[start:end]. Optional arguments start and end are interpreted\n\
2339 as in slice notation.");
2341 static PyObject *
2342 string_count(PyStringObject *self, PyObject *args)
2344 PyObject *sub_obj;
2345 const char *str = PyString_AS_STRING(self), *sub;
2346 Py_ssize_t sub_len;
2347 Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
2349 if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj,
2350 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
2351 return NULL;
2353 if (PyString_Check(sub_obj)) {
2354 sub = PyString_AS_STRING(sub_obj);
2355 sub_len = PyString_GET_SIZE(sub_obj);
2357 #ifdef Py_USING_UNICODE
2358 else if (PyUnicode_Check(sub_obj)) {
2359 Py_ssize_t count;
2360 count = PyUnicode_Count((PyObject *)self, sub_obj, start, end);
2361 if (count == -1)
2362 return NULL;
2363 else
2364 return PyInt_FromSsize_t(count);
2366 #endif
2367 else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
2368 return NULL;
2370 string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
2372 return PyInt_FromSsize_t(
2373 stringlib_count(str + start, end - start, sub, sub_len)
2377 PyDoc_STRVAR(swapcase__doc__,
2378 "S.swapcase() -> string\n\
2380 Return a copy of the string S with uppercase characters\n\
2381 converted to lowercase and vice versa.");
2383 static PyObject *
2384 string_swapcase(PyStringObject *self)
2386 char *s = PyString_AS_STRING(self), *s_new;
2387 Py_ssize_t i, n = PyString_GET_SIZE(self);
2388 PyObject *newobj;
2390 newobj = PyString_FromStringAndSize(NULL, n);
2391 if (newobj == NULL)
2392 return NULL;
2393 s_new = PyString_AsString(newobj);
2394 for (i = 0; i < n; i++) {
2395 int c = Py_CHARMASK(*s++);
2396 if (islower(c)) {
2397 *s_new = toupper(c);
2399 else if (isupper(c)) {
2400 *s_new = tolower(c);
2402 else
2403 *s_new = c;
2404 s_new++;
2406 return newobj;
2410 PyDoc_STRVAR(translate__doc__,
2411 "S.translate(table [,deletechars]) -> string\n\
2413 Return a copy of the string S, where all characters occurring\n\
2414 in the optional argument deletechars are removed, and the\n\
2415 remaining characters have been mapped through the given\n\
2416 translation table, which must be a string of length 256.");
2418 static PyObject *
2419 string_translate(PyStringObject *self, PyObject *args)
2421 register char *input, *output;
2422 const char *table;
2423 register Py_ssize_t i, c, changed = 0;
2424 PyObject *input_obj = (PyObject*)self;
2425 const char *output_start, *del_table=NULL;
2426 Py_ssize_t inlen, tablen, dellen = 0;
2427 PyObject *result;
2428 int trans_table[256];
2429 PyObject *tableobj, *delobj = NULL;
2431 if (!PyArg_UnpackTuple(args, "translate", 1, 2,
2432 &tableobj, &delobj))
2433 return NULL;
2435 if (PyString_Check(tableobj)) {
2436 table = PyString_AS_STRING(tableobj);
2437 tablen = PyString_GET_SIZE(tableobj);
2439 else if (tableobj == Py_None) {
2440 table = NULL;
2441 tablen = 256;
2443 #ifdef Py_USING_UNICODE
2444 else if (PyUnicode_Check(tableobj)) {
2445 /* Unicode .translate() does not support the deletechars
2446 parameter; instead a mapping to None will cause characters
2447 to be deleted. */
2448 if (delobj != NULL) {
2449 PyErr_SetString(PyExc_TypeError,
2450 "deletions are implemented differently for unicode");
2451 return NULL;
2453 return PyUnicode_Translate((PyObject *)self, tableobj, NULL);
2455 #endif
2456 else if (PyObject_AsCharBuffer(tableobj, &table, &tablen))
2457 return NULL;
2459 if (tablen != 256) {
2460 PyErr_SetString(PyExc_ValueError,
2461 "translation table must be 256 characters long");
2462 return NULL;
2465 if (delobj != NULL) {
2466 if (PyString_Check(delobj)) {
2467 del_table = PyString_AS_STRING(delobj);
2468 dellen = PyString_GET_SIZE(delobj);
2470 #ifdef Py_USING_UNICODE
2471 else if (PyUnicode_Check(delobj)) {
2472 PyErr_SetString(PyExc_TypeError,
2473 "deletions are implemented differently for unicode");
2474 return NULL;
2476 #endif
2477 else if (PyObject_AsCharBuffer(delobj, &del_table, &dellen))
2478 return NULL;
2480 else {
2481 del_table = NULL;
2482 dellen = 0;
2485 inlen = PyString_GET_SIZE(input_obj);
2486 result = PyString_FromStringAndSize((char *)NULL, inlen);
2487 if (result == NULL)
2488 return NULL;
2489 output_start = output = PyString_AsString(result);
2490 input = PyString_AS_STRING(input_obj);
2492 if (dellen == 0 && table != NULL) {
2493 /* If no deletions are required, use faster code */
2494 for (i = inlen; --i >= 0; ) {
2495 c = Py_CHARMASK(*input++);
2496 if (Py_CHARMASK((*output++ = table[c])) != c)
2497 changed = 1;
2499 if (changed || !PyString_CheckExact(input_obj))
2500 return result;
2501 Py_DECREF(result);
2502 Py_INCREF(input_obj);
2503 return input_obj;
2506 if (table == NULL) {
2507 for (i = 0; i < 256; i++)
2508 trans_table[i] = Py_CHARMASK(i);
2509 } else {
2510 for (i = 0; i < 256; i++)
2511 trans_table[i] = Py_CHARMASK(table[i]);
2514 for (i = 0; i < dellen; i++)
2515 trans_table[(int) Py_CHARMASK(del_table[i])] = -1;
2517 for (i = inlen; --i >= 0; ) {
2518 c = Py_CHARMASK(*input++);
2519 if (trans_table[c] != -1)
2520 if (Py_CHARMASK(*output++ = (char)trans_table[c]) == c)
2521 continue;
2522 changed = 1;
2524 if (!changed && PyString_CheckExact(input_obj)) {
2525 Py_DECREF(result);
2526 Py_INCREF(input_obj);
2527 return input_obj;
2529 /* Fix the size of the resulting string */
2530 if (inlen > 0)
2531 _PyString_Resize(&result, output - output_start);
2532 return result;
/* Search directions passed as the `direction` argument of findstring()
   and countstring().  REVERSE is parenthesized so that it stays a single
   operand wherever it is expanded. */
#define FORWARD 1
#define REVERSE (-1)

/* find and count characters and substrings */

/* Locate the first byte equal to `c` within the first `target_len` bytes
   of `target`.  Evaluates to a char* pointing at that byte, or to NULL
   when no such byte exists.  Each argument is parenthesized so that
   expression arguments (e.g. `end-start`) expand safely. */
#define findchar(target, target_len, c)                         \
    ((char *)memchr((const void *)(target), (c), (target_len)))
2544 /* String ops must return a string. */
2545 /* If the object is subclass of string, create a copy */
2546 Py_LOCAL(PyStringObject *)
2547 return_self(PyStringObject *self)
2549 if (PyString_CheckExact(self)) {
2550 Py_INCREF(self);
2551 return self;
2553 return (PyStringObject *)PyString_FromStringAndSize(
2554 PyString_AS_STRING(self),
2555 PyString_GET_SIZE(self));
2558 Py_LOCAL_INLINE(Py_ssize_t)
2559 countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
2561 Py_ssize_t count=0;
2562 const char *start=target;
2563 const char *end=target+target_len;
2565 while ( (start=findchar(start, end-start, c)) != NULL ) {
2566 count++;
2567 if (count >= maxcount)
2568 break;
2569 start += 1;
2571 return count;
2574 Py_LOCAL(Py_ssize_t)
2575 findstring(const char *target, Py_ssize_t target_len,
2576 const char *pattern, Py_ssize_t pattern_len,
2577 Py_ssize_t start,
2578 Py_ssize_t end,
2579 int direction)
2581 if (start < 0) {
2582 start += target_len;
2583 if (start < 0)
2584 start = 0;
2586 if (end > target_len) {
2587 end = target_len;
2588 } else if (end < 0) {
2589 end += target_len;
2590 if (end < 0)
2591 end = 0;
2594 /* zero-length substrings always match at the first attempt */
2595 if (pattern_len == 0)
2596 return (direction > 0) ? start : end;
2598 end -= pattern_len;
2600 if (direction < 0) {
2601 for (; end >= start; end--)
2602 if (Py_STRING_MATCH(target, end, pattern, pattern_len))
2603 return end;
2604 } else {
2605 for (; start <= end; start++)
2606 if (Py_STRING_MATCH(target, start, pattern, pattern_len))
2607 return start;
2609 return -1;
2612 Py_LOCAL_INLINE(Py_ssize_t)
2613 countstring(const char *target, Py_ssize_t target_len,
2614 const char *pattern, Py_ssize_t pattern_len,
2615 Py_ssize_t start,
2616 Py_ssize_t end,
2617 int direction, Py_ssize_t maxcount)
2619 Py_ssize_t count=0;
2621 if (start < 0) {
2622 start += target_len;
2623 if (start < 0)
2624 start = 0;
2626 if (end > target_len) {
2627 end = target_len;
2628 } else if (end < 0) {
2629 end += target_len;
2630 if (end < 0)
2631 end = 0;
2634 /* zero-length substrings match everywhere */
2635 if (pattern_len == 0 || maxcount == 0) {
2636 if (target_len+1 < maxcount)
2637 return target_len+1;
2638 return maxcount;
2641 end -= pattern_len;
2642 if (direction < 0) {
2643 for (; (end >= start); end--)
2644 if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
2645 count++;
2646 if (--maxcount <= 0) break;
2647 end -= pattern_len-1;
2649 } else {
2650 for (; (start <= end); start++)
2651 if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
2652 count++;
2653 if (--maxcount <= 0)
2654 break;
2655 start += pattern_len-1;
2658 return count;
2662 /* Algorithms for different cases of string replacement */
2664 /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
2665 Py_LOCAL(PyStringObject *)
2666 replace_interleave(PyStringObject *self,
2667 const char *to_s, Py_ssize_t to_len,
2668 Py_ssize_t maxcount)
2670 char *self_s, *result_s;
2671 Py_ssize_t self_len, result_len;
2672 Py_ssize_t count, i, product;
2673 PyStringObject *result;
2675 self_len = PyString_GET_SIZE(self);
2677 /* 1 at the end plus 1 after every character */
2678 count = self_len+1;
2679 if (maxcount < count)
2680 count = maxcount;
2682 /* Check for overflow */
2683 /* result_len = count * to_len + self_len; */
2684 product = count * to_len;
2685 if (product / to_len != count) {
2686 PyErr_SetString(PyExc_OverflowError,
2687 "replace string is too long");
2688 return NULL;
2690 result_len = product + self_len;
2691 if (result_len < 0) {
2692 PyErr_SetString(PyExc_OverflowError,
2693 "replace string is too long");
2694 return NULL;
2697 if (! (result = (PyStringObject *)
2698 PyString_FromStringAndSize(NULL, result_len)) )
2699 return NULL;
2701 self_s = PyString_AS_STRING(self);
2702 result_s = PyString_AS_STRING(result);
2704 /* TODO: special case single character, which doesn't need memcpy */
2706 /* Lay the first one down (guaranteed this will occur) */
2707 Py_MEMCPY(result_s, to_s, to_len);
2708 result_s += to_len;
2709 count -= 1;
2711 for (i=0; i<count; i++) {
2712 *result_s++ = *self_s++;
2713 Py_MEMCPY(result_s, to_s, to_len);
2714 result_s += to_len;
2717 /* Copy the rest of the original string */
2718 Py_MEMCPY(result_s, self_s, self_len-i);
2720 return result;
2723 /* Special case for deleting a single character */
2724 /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
2725 Py_LOCAL(PyStringObject *)
2726 replace_delete_single_character(PyStringObject *self,
2727 char from_c, Py_ssize_t maxcount)
2729 char *self_s, *result_s;
2730 char *start, *next, *end;
2731 Py_ssize_t self_len, result_len;
2732 Py_ssize_t count;
2733 PyStringObject *result;
2735 self_len = PyString_GET_SIZE(self);
2736 self_s = PyString_AS_STRING(self);
2738 count = countchar(self_s, self_len, from_c, maxcount);
2739 if (count == 0) {
2740 return return_self(self);
2743 result_len = self_len - count; /* from_len == 1 */
2744 assert(result_len>=0);
2746 if ( (result = (PyStringObject *)
2747 PyString_FromStringAndSize(NULL, result_len)) == NULL)
2748 return NULL;
2749 result_s = PyString_AS_STRING(result);
2751 start = self_s;
2752 end = self_s + self_len;
2753 while (count-- > 0) {
2754 next = findchar(start, end-start, from_c);
2755 if (next == NULL)
2756 break;
2757 Py_MEMCPY(result_s, start, next-start);
2758 result_s += (next-start);
2759 start = next+1;
2761 Py_MEMCPY(result_s, start, end-start);
2763 return result;
2766 /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
2768 Py_LOCAL(PyStringObject *)
2769 replace_delete_substring(PyStringObject *self,
2770 const char *from_s, Py_ssize_t from_len,
2771 Py_ssize_t maxcount) {
2772 char *self_s, *result_s;
2773 char *start, *next, *end;
2774 Py_ssize_t self_len, result_len;
2775 Py_ssize_t count, offset;
2776 PyStringObject *result;
2778 self_len = PyString_GET_SIZE(self);
2779 self_s = PyString_AS_STRING(self);
2781 count = countstring(self_s, self_len,
2782 from_s, from_len,
2783 0, self_len, 1,
2784 maxcount);
2786 if (count == 0) {
2787 /* no matches */
2788 return return_self(self);
2791 result_len = self_len - (count * from_len);
2792 assert (result_len>=0);
2794 if ( (result = (PyStringObject *)
2795 PyString_FromStringAndSize(NULL, result_len)) == NULL )
2796 return NULL;
2798 result_s = PyString_AS_STRING(result);
2800 start = self_s;
2801 end = self_s + self_len;
2802 while (count-- > 0) {
2803 offset = findstring(start, end-start,
2804 from_s, from_len,
2805 0, end-start, FORWARD);
2806 if (offset == -1)
2807 break;
2808 next = start + offset;
2810 Py_MEMCPY(result_s, start, next-start);
2812 result_s += (next-start);
2813 start = next+from_len;
2815 Py_MEMCPY(result_s, start, end-start);
2816 return result;
2819 /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
2820 Py_LOCAL(PyStringObject *)
2821 replace_single_character_in_place(PyStringObject *self,
2822 char from_c, char to_c,
2823 Py_ssize_t maxcount)
2825 char *self_s, *result_s, *start, *end, *next;
2826 Py_ssize_t self_len;
2827 PyStringObject *result;
2829 /* The result string will be the same size */
2830 self_s = PyString_AS_STRING(self);
2831 self_len = PyString_GET_SIZE(self);
2833 next = findchar(self_s, self_len, from_c);
2835 if (next == NULL) {
2836 /* No matches; return the original string */
2837 return return_self(self);
2840 /* Need to make a new string */
2841 result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2842 if (result == NULL)
2843 return NULL;
2844 result_s = PyString_AS_STRING(result);
2845 Py_MEMCPY(result_s, self_s, self_len);
2847 /* change everything in-place, starting with this one */
2848 start = result_s + (next-self_s);
2849 *start = to_c;
2850 start++;
2851 end = result_s + self_len;
2853 while (--maxcount > 0) {
2854 next = findchar(start, end-start, from_c);
2855 if (next == NULL)
2856 break;
2857 *next = to_c;
2858 start = next+1;
2861 return result;
2864 /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
2865 Py_LOCAL(PyStringObject *)
2866 replace_substring_in_place(PyStringObject *self,
2867 const char *from_s, Py_ssize_t from_len,
2868 const char *to_s, Py_ssize_t to_len,
2869 Py_ssize_t maxcount)
2871 char *result_s, *start, *end;
2872 char *self_s;
2873 Py_ssize_t self_len, offset;
2874 PyStringObject *result;
2876 /* The result string will be the same size */
2878 self_s = PyString_AS_STRING(self);
2879 self_len = PyString_GET_SIZE(self);
2881 offset = findstring(self_s, self_len,
2882 from_s, from_len,
2883 0, self_len, FORWARD);
2884 if (offset == -1) {
2885 /* No matches; return the original string */
2886 return return_self(self);
2889 /* Need to make a new string */
2890 result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2891 if (result == NULL)
2892 return NULL;
2893 result_s = PyString_AS_STRING(result);
2894 Py_MEMCPY(result_s, self_s, self_len);
2896 /* change everything in-place, starting with this one */
2897 start = result_s + offset;
2898 Py_MEMCPY(start, to_s, from_len);
2899 start += from_len;
2900 end = result_s + self_len;
2902 while ( --maxcount > 0) {
2903 offset = findstring(start, end-start,
2904 from_s, from_len,
2905 0, end-start, FORWARD);
2906 if (offset==-1)
2907 break;
2908 Py_MEMCPY(start+offset, to_s, from_len);
2909 start += offset+from_len;
2912 return result;
2915 /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
2916 Py_LOCAL(PyStringObject *)
2917 replace_single_character(PyStringObject *self,
2918 char from_c,
2919 const char *to_s, Py_ssize_t to_len,
2920 Py_ssize_t maxcount)
2922 char *self_s, *result_s;
2923 char *start, *next, *end;
2924 Py_ssize_t self_len, result_len;
2925 Py_ssize_t count, product;
2926 PyStringObject *result;
2928 self_s = PyString_AS_STRING(self);
2929 self_len = PyString_GET_SIZE(self);
2931 count = countchar(self_s, self_len, from_c, maxcount);
2932 if (count == 0) {
2933 /* no matches, return unchanged */
2934 return return_self(self);
2937 /* use the difference between current and new, hence the "-1" */
2938 /* result_len = self_len + count * (to_len-1) */
2939 product = count * (to_len-1);
2940 if (product / (to_len-1) != count) {
2941 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
2942 return NULL;
2944 result_len = self_len + product;
2945 if (result_len < 0) {
2946 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
2947 return NULL;
2950 if ( (result = (PyStringObject *)
2951 PyString_FromStringAndSize(NULL, result_len)) == NULL)
2952 return NULL;
2953 result_s = PyString_AS_STRING(result);
2955 start = self_s;
2956 end = self_s + self_len;
2957 while (count-- > 0) {
2958 next = findchar(start, end-start, from_c);
2959 if (next == NULL)
2960 break;
2962 if (next == start) {
2963 /* replace with the 'to' */
2964 Py_MEMCPY(result_s, to_s, to_len);
2965 result_s += to_len;
2966 start += 1;
2967 } else {
2968 /* copy the unchanged old then the 'to' */
2969 Py_MEMCPY(result_s, start, next-start);
2970 result_s += (next-start);
2971 Py_MEMCPY(result_s, to_s, to_len);
2972 result_s += to_len;
2973 start = next+1;
2976 /* Copy the remainder of the remaining string */
2977 Py_MEMCPY(result_s, start, end-start);
2979 return result;
2982 /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
2983 Py_LOCAL(PyStringObject *)
2984 replace_substring(PyStringObject *self,
2985 const char *from_s, Py_ssize_t from_len,
2986 const char *to_s, Py_ssize_t to_len,
2987 Py_ssize_t maxcount) {
2988 char *self_s, *result_s;
2989 char *start, *next, *end;
2990 Py_ssize_t self_len, result_len;
2991 Py_ssize_t count, offset, product;
2992 PyStringObject *result;
2994 self_s = PyString_AS_STRING(self);
2995 self_len = PyString_GET_SIZE(self);
2997 count = countstring(self_s, self_len,
2998 from_s, from_len,
2999 0, self_len, FORWARD, maxcount);
3000 if (count == 0) {
3001 /* no matches, return unchanged */
3002 return return_self(self);
3005 /* Check for overflow */
3006 /* result_len = self_len + count * (to_len-from_len) */
3007 product = count * (to_len-from_len);
3008 if (product / (to_len-from_len) != count) {
3009 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3010 return NULL;
3012 result_len = self_len + product;
3013 if (result_len < 0) {
3014 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3015 return NULL;
3018 if ( (result = (PyStringObject *)
3019 PyString_FromStringAndSize(NULL, result_len)) == NULL)
3020 return NULL;
3021 result_s = PyString_AS_STRING(result);
3023 start = self_s;
3024 end = self_s + self_len;
3025 while (count-- > 0) {
3026 offset = findstring(start, end-start,
3027 from_s, from_len,
3028 0, end-start, FORWARD);
3029 if (offset == -1)
3030 break;
3031 next = start+offset;
3032 if (next == start) {
3033 /* replace with the 'to' */
3034 Py_MEMCPY(result_s, to_s, to_len);
3035 result_s += to_len;
3036 start += from_len;
3037 } else {
3038 /* copy the unchanged old then the 'to' */
3039 Py_MEMCPY(result_s, start, next-start);
3040 result_s += (next-start);
3041 Py_MEMCPY(result_s, to_s, to_len);
3042 result_s += to_len;
3043 start = next+from_len;
3046 /* Copy the remainder of the remaining string */
3047 Py_MEMCPY(result_s, start, end-start);
3049 return result;
3053 Py_LOCAL(PyStringObject *)
3054 replace(PyStringObject *self,
3055 const char *from_s, Py_ssize_t from_len,
3056 const char *to_s, Py_ssize_t to_len,
3057 Py_ssize_t maxcount)
3059 if (maxcount < 0) {
3060 maxcount = PY_SSIZE_T_MAX;
3061 } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
3062 /* nothing to do; return the original string */
3063 return return_self(self);
3066 if (maxcount == 0 ||
3067 (from_len == 0 && to_len == 0)) {
3068 /* nothing to do; return the original string */
3069 return return_self(self);
3072 /* Handle zero-length special cases */
3074 if (from_len == 0) {
3075 /* insert the 'to' string everywhere. */
3076 /* >>> "Python".replace("", ".") */
3077 /* '.P.y.t.h.o.n.' */
3078 return replace_interleave(self, to_s, to_len, maxcount);
3081 /* Except for "".replace("", "A") == "A" there is no way beyond this */
3082 /* point for an empty self string to generate a non-empty string */
3083 /* Special case so the remaining code always gets a non-empty string */
3084 if (PyString_GET_SIZE(self) == 0) {
3085 return return_self(self);
3088 if (to_len == 0) {
3089 /* delete all occurances of 'from' string */
3090 if (from_len == 1) {
3091 return replace_delete_single_character(
3092 self, from_s[0], maxcount);
3093 } else {
3094 return replace_delete_substring(self, from_s, from_len, maxcount);
3098 /* Handle special case where both strings have the same length */
3100 if (from_len == to_len) {
3101 if (from_len == 1) {
3102 return replace_single_character_in_place(
3103 self,
3104 from_s[0],
3105 to_s[0],
3106 maxcount);
3107 } else {
3108 return replace_substring_in_place(
3109 self, from_s, from_len, to_s, to_len, maxcount);
3113 /* Otherwise use the more generic algorithms */
3114 if (from_len == 1) {
3115 return replace_single_character(self, from_s[0],
3116 to_s, to_len, maxcount);
3117 } else {
3118 /* len('from')>=2, len('to')>=1 */
3119 return replace_substring(self, from_s, from_len, to_s, to_len, maxcount);
3123 PyDoc_STRVAR(replace__doc__,
3124 "S.replace (old, new[, count]) -> string\n\
3126 Return a copy of string S with all occurrences of substring\n\
3127 old replaced by new. If the optional argument count is\n\
3128 given, only the first count occurrences are replaced.");
3130 static PyObject *
3131 string_replace(PyStringObject *self, PyObject *args)
3133 Py_ssize_t count = -1;
3134 PyObject *from, *to;
3135 const char *from_s, *to_s;
3136 Py_ssize_t from_len, to_len;
3138 if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
3139 return NULL;
3141 if (PyString_Check(from)) {
3142 from_s = PyString_AS_STRING(from);
3143 from_len = PyString_GET_SIZE(from);
3145 #ifdef Py_USING_UNICODE
3146 if (PyUnicode_Check(from))
3147 return PyUnicode_Replace((PyObject *)self,
3148 from, to, count);
3149 #endif
3150 else if (PyObject_AsCharBuffer(from, &from_s, &from_len))
3151 return NULL;
3153 if (PyString_Check(to)) {
3154 to_s = PyString_AS_STRING(to);
3155 to_len = PyString_GET_SIZE(to);
3157 #ifdef Py_USING_UNICODE
3158 else if (PyUnicode_Check(to))
3159 return PyUnicode_Replace((PyObject *)self,
3160 from, to, count);
3161 #endif
3162 else if (PyObject_AsCharBuffer(to, &to_s, &to_len))
3163 return NULL;
3165 return (PyObject *)replace((PyStringObject *) self,
3166 from_s, from_len,
3167 to_s, to_len, count);
3170 /** End DALKE **/
3172 /* Matches the end (direction >= 0) or start (direction < 0) of self
3173 * against substr, using the start and end arguments. Returns
3174 * -1 on error, 0 if not found and 1 if found.
3176 Py_LOCAL(int)
3177 _string_tailmatch(PyStringObject *self, PyObject *substr, Py_ssize_t start,
3178 Py_ssize_t end, int direction)
3180 Py_ssize_t len = PyString_GET_SIZE(self);
3181 Py_ssize_t slen;
3182 const char* sub;
3183 const char* str;
3185 if (PyString_Check(substr)) {
3186 sub = PyString_AS_STRING(substr);
3187 slen = PyString_GET_SIZE(substr);
3189 #ifdef Py_USING_UNICODE
3190 else if (PyUnicode_Check(substr))
3191 return PyUnicode_Tailmatch((PyObject *)self,
3192 substr, start, end, direction);
3193 #endif
3194 else if (PyObject_AsCharBuffer(substr, &sub, &slen))
3195 return -1;
3196 str = PyString_AS_STRING(self);
3198 string_adjust_indices(&start, &end, len);
3200 if (direction < 0) {
3201 /* startswith */
3202 if (start+slen > len)
3203 return 0;
3204 } else {
3205 /* endswith */
3206 if (end-start < slen || start > len)
3207 return 0;
3209 if (end-slen > start)
3210 start = end - slen;
3212 if (end-start >= slen)
3213 return ! memcmp(str+start, sub, slen);
3214 return 0;
3218 PyDoc_STRVAR(startswith__doc__,
3219 "S.startswith(prefix[, start[, end]]) -> bool\n\
3221 Return True if S starts with the specified prefix, False otherwise.\n\
3222 With optional start, test S beginning at that position.\n\
3223 With optional end, stop comparing S at that position.\n\
3224 prefix can also be a tuple of strings to try.");
3226 static PyObject *
3227 string_startswith(PyStringObject *self, PyObject *args)
3229 Py_ssize_t start = 0;
3230 Py_ssize_t end = PY_SSIZE_T_MAX;
3231 PyObject *subobj;
3232 int result;
3234 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
3235 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3236 return NULL;
3237 if (PyTuple_Check(subobj)) {
3238 Py_ssize_t i;
3239 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3240 result = _string_tailmatch(self,
3241 PyTuple_GET_ITEM(subobj, i),
3242 start, end, -1);
3243 if (result == -1)
3244 return NULL;
3245 else if (result) {
3246 Py_RETURN_TRUE;
3249 Py_RETURN_FALSE;
3251 result = _string_tailmatch(self, subobj, start, end, -1);
3252 if (result == -1)
3253 return NULL;
3254 else
3255 return PyBool_FromLong(result);
3259 PyDoc_STRVAR(endswith__doc__,
3260 "S.endswith(suffix[, start[, end]]) -> bool\n\
3262 Return True if S ends with the specified suffix, False otherwise.\n\
3263 With optional start, test S beginning at that position.\n\
3264 With optional end, stop comparing S at that position.\n\
3265 suffix can also be a tuple of strings to try.");
3267 static PyObject *
3268 string_endswith(PyStringObject *self, PyObject *args)
3270 Py_ssize_t start = 0;
3271 Py_ssize_t end = PY_SSIZE_T_MAX;
3272 PyObject *subobj;
3273 int result;
3275 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
3276 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3277 return NULL;
3278 if (PyTuple_Check(subobj)) {
3279 Py_ssize_t i;
3280 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3281 result = _string_tailmatch(self,
3282 PyTuple_GET_ITEM(subobj, i),
3283 start, end, +1);
3284 if (result == -1)
3285 return NULL;
3286 else if (result) {
3287 Py_RETURN_TRUE;
3290 Py_RETURN_FALSE;
3292 result = _string_tailmatch(self, subobj, start, end, +1);
3293 if (result == -1)
3294 return NULL;
3295 else
3296 return PyBool_FromLong(result);
3300 PyDoc_STRVAR(encode__doc__,
3301 "S.encode([encoding[,errors]]) -> object\n\
3303 Encodes S using the codec registered for encoding. encoding defaults\n\
3304 to the default encoding. errors may be given to set a different error\n\
3305 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3306 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
3307 'xmlcharrefreplace' as well as any other name registered with\n\
3308 codecs.register_error that is able to handle UnicodeEncodeErrors.");
3310 static PyObject *
3311 string_encode(PyStringObject *self, PyObject *args)
3313 char *encoding = NULL;
3314 char *errors = NULL;
3315 PyObject *v;
3317 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3318 return NULL;
3319 v = PyString_AsEncodedObject((PyObject *)self, encoding, errors);
3320 if (v == NULL)
3321 goto onError;
3322 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3323 PyErr_Format(PyExc_TypeError,
3324 "encoder did not return a string/unicode object "
3325 "(type=%.400s)",
3326 Py_TYPE(v)->tp_name);
3327 Py_DECREF(v);
3328 return NULL;
3330 return v;
3332 onError:
3333 return NULL;
3337 PyDoc_STRVAR(decode__doc__,
3338 "S.decode([encoding[,errors]]) -> object\n\
3340 Decodes S using the codec registered for encoding. encoding defaults\n\
3341 to the default encoding. errors may be given to set a different error\n\
3342 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3343 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
3344 as well as any other name registerd with codecs.register_error that is\n\
3345 able to handle UnicodeDecodeErrors.");
3347 static PyObject *
3348 string_decode(PyStringObject *self, PyObject *args)
3350 char *encoding = NULL;
3351 char *errors = NULL;
3352 PyObject *v;
3354 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
3355 return NULL;
3356 v = PyString_AsDecodedObject((PyObject *)self, encoding, errors);
3357 if (v == NULL)
3358 goto onError;
3359 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3360 PyErr_Format(PyExc_TypeError,
3361 "decoder did not return a string/unicode object "
3362 "(type=%.400s)",
3363 Py_TYPE(v)->tp_name);
3364 Py_DECREF(v);
3365 return NULL;
3367 return v;
3369 onError:
3370 return NULL;
3374 PyDoc_STRVAR(expandtabs__doc__,
3375 "S.expandtabs([tabsize]) -> string\n\
3377 Return a copy of S where all tab characters are expanded using spaces.\n\
3378 If tabsize is not given, a tab size of 8 characters is assumed.");
3380 static PyObject*
3381 string_expandtabs(PyStringObject *self, PyObject *args)
3383 const char *e, *p, *qe;
3384 char *q;
3385 Py_ssize_t i, j, incr;
3386 PyObject *u;
3387 int tabsize = 8;
3389 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3390 return NULL;
3392 /* First pass: determine size of output string */
3393 i = 0; /* chars up to and including most recent \n or \r */
3394 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
3395 e = PyString_AS_STRING(self) + PyString_GET_SIZE(self); /* end of input */
3396 for (p = PyString_AS_STRING(self); p < e; p++)
3397 if (*p == '\t') {
3398 if (tabsize > 0) {
3399 incr = tabsize - (j % tabsize);
3400 if (j > PY_SSIZE_T_MAX - incr)
3401 goto overflow1;
3402 j += incr;
3405 else {
3406 if (j > PY_SSIZE_T_MAX - 1)
3407 goto overflow1;
3408 j++;
3409 if (*p == '\n' || *p == '\r') {
3410 if (i > PY_SSIZE_T_MAX - j)
3411 goto overflow1;
3412 i += j;
3413 j = 0;
3417 if (i > PY_SSIZE_T_MAX - j)
3418 goto overflow1;
3420 /* Second pass: create output string and fill it */
3421 u = PyString_FromStringAndSize(NULL, i + j);
3422 if (!u)
3423 return NULL;
3425 j = 0; /* same as in first pass */
3426 q = PyString_AS_STRING(u); /* next output char */
3427 qe = PyString_AS_STRING(u) + PyString_GET_SIZE(u); /* end of output */
3429 for (p = PyString_AS_STRING(self); p < e; p++)
3430 if (*p == '\t') {
3431 if (tabsize > 0) {
3432 i = tabsize - (j % tabsize);
3433 j += i;
3434 while (i--) {
3435 if (q >= qe)
3436 goto overflow2;
3437 *q++ = ' ';
3441 else {
3442 if (q >= qe)
3443 goto overflow2;
3444 *q++ = *p;
3445 j++;
3446 if (*p == '\n' || *p == '\r')
3447 j = 0;
3450 return u;
3452 overflow2:
3453 Py_DECREF(u);
3454 overflow1:
3455 PyErr_SetString(PyExc_OverflowError, "new string is too long");
3456 return NULL;
3459 Py_LOCAL_INLINE(PyObject *)
3460 pad(PyStringObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
3462 PyObject *u;
3464 if (left < 0)
3465 left = 0;
3466 if (right < 0)
3467 right = 0;
3469 if (left == 0 && right == 0 && PyString_CheckExact(self)) {
3470 Py_INCREF(self);
3471 return (PyObject *)self;
3474 u = PyString_FromStringAndSize(NULL,
3475 left + PyString_GET_SIZE(self) + right);
3476 if (u) {
3477 if (left)
3478 memset(PyString_AS_STRING(u), fill, left);
3479 Py_MEMCPY(PyString_AS_STRING(u) + left,
3480 PyString_AS_STRING(self),
3481 PyString_GET_SIZE(self));
3482 if (right)
3483 memset(PyString_AS_STRING(u) + left + PyString_GET_SIZE(self),
3484 fill, right);
3487 return u;
3490 PyDoc_STRVAR(ljust__doc__,
3491 "S.ljust(width[, fillchar]) -> string\n"
3492 "\n"
3493 "Return S left justified in a string of length width. Padding is\n"
3494 "done using the specified fill character (default is a space).");
3496 static PyObject *
3497 string_ljust(PyStringObject *self, PyObject *args)
3499 Py_ssize_t width;
3500 char fillchar = ' ';
3502 if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))
3503 return NULL;
3505 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3506 Py_INCREF(self);
3507 return (PyObject*) self;
3510 return pad(self, 0, width - PyString_GET_SIZE(self), fillchar);
3514 PyDoc_STRVAR(rjust__doc__,
3515 "S.rjust(width[, fillchar]) -> string\n"
3516 "\n"
3517 "Return S right justified in a string of length width. Padding is\n"
3518 "done using the specified fill character (default is a space)");
3520 static PyObject *
3521 string_rjust(PyStringObject *self, PyObject *args)
3523 Py_ssize_t width;
3524 char fillchar = ' ';
3526 if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))
3527 return NULL;
3529 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3530 Py_INCREF(self);
3531 return (PyObject*) self;
3534 return pad(self, width - PyString_GET_SIZE(self), 0, fillchar);
3538 PyDoc_STRVAR(center__doc__,
3539 "S.center(width[, fillchar]) -> string\n"
3540 "\n"
3541 "Return S centered in a string of length width. Padding is\n"
3542 "done using the specified fill character (default is a space)");
3544 static PyObject *
3545 string_center(PyStringObject *self, PyObject *args)
3547 Py_ssize_t marg, left;
3548 Py_ssize_t width;
3549 char fillchar = ' ';
3551 if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))
3552 return NULL;
3554 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3555 Py_INCREF(self);
3556 return (PyObject*) self;
3559 marg = width - PyString_GET_SIZE(self);
3560 left = marg / 2 + (marg & width & 1);
3562 return pad(self, left, marg - left, fillchar);
3565 PyDoc_STRVAR(zfill__doc__,
3566 "S.zfill(width) -> string\n"
3567 "\n"
3568 "Pad a numeric string S with zeros on the left, to fill a field\n"
3569 "of the specified width. The string S is never truncated.");
3571 static PyObject *
3572 string_zfill(PyStringObject *self, PyObject *args)
3574 Py_ssize_t fill;
3575 PyObject *s;
3576 char *p;
3577 Py_ssize_t width;
3579 if (!PyArg_ParseTuple(args, "n:zfill", &width))
3580 return NULL;
3582 if (PyString_GET_SIZE(self) >= width) {
3583 if (PyString_CheckExact(self)) {
3584 Py_INCREF(self);
3585 return (PyObject*) self;
3587 else
3588 return PyString_FromStringAndSize(
3589 PyString_AS_STRING(self),
3590 PyString_GET_SIZE(self)
3594 fill = width - PyString_GET_SIZE(self);
3596 s = pad(self, fill, 0, '0');
3598 if (s == NULL)
3599 return NULL;
3601 p = PyString_AS_STRING(s);
3602 if (p[fill] == '+' || p[fill] == '-') {
3603 /* move sign to beginning of string */
3604 p[0] = p[fill];
3605 p[fill] = '0';
3608 return (PyObject*) s;
3611 PyDoc_STRVAR(isspace__doc__,
3612 "S.isspace() -> bool\n\
3614 Return True if all characters in S are whitespace\n\
3615 and there is at least one character in S, False otherwise.");
3617 static PyObject*
3618 string_isspace(PyStringObject *self)
3620 register const unsigned char *p
3621 = (unsigned char *) PyString_AS_STRING(self);
3622 register const unsigned char *e;
3624 /* Shortcut for single character strings */
3625 if (PyString_GET_SIZE(self) == 1 &&
3626 isspace(*p))
3627 return PyBool_FromLong(1);
3629 /* Special case for empty strings */
3630 if (PyString_GET_SIZE(self) == 0)
3631 return PyBool_FromLong(0);
3633 e = p + PyString_GET_SIZE(self);
3634 for (; p < e; p++) {
3635 if (!isspace(*p))
3636 return PyBool_FromLong(0);
3638 return PyBool_FromLong(1);
3642 PyDoc_STRVAR(isalpha__doc__,
3643 "S.isalpha() -> bool\n\
3645 Return True if all characters in S are alphabetic\n\
3646 and there is at least one character in S, False otherwise.");
3648 static PyObject*
3649 string_isalpha(PyStringObject *self)
3651 register const unsigned char *p
3652 = (unsigned char *) PyString_AS_STRING(self);
3653 register const unsigned char *e;
3655 /* Shortcut for single character strings */
3656 if (PyString_GET_SIZE(self) == 1 &&
3657 isalpha(*p))
3658 return PyBool_FromLong(1);
3660 /* Special case for empty strings */
3661 if (PyString_GET_SIZE(self) == 0)
3662 return PyBool_FromLong(0);
3664 e = p + PyString_GET_SIZE(self);
3665 for (; p < e; p++) {
3666 if (!isalpha(*p))
3667 return PyBool_FromLong(0);
3669 return PyBool_FromLong(1);
3673 PyDoc_STRVAR(isalnum__doc__,
3674 "S.isalnum() -> bool\n\
3676 Return True if all characters in S are alphanumeric\n\
3677 and there is at least one character in S, False otherwise.");
3679 static PyObject*
3680 string_isalnum(PyStringObject *self)
3682 register const unsigned char *p
3683 = (unsigned char *) PyString_AS_STRING(self);
3684 register const unsigned char *e;
3686 /* Shortcut for single character strings */
3687 if (PyString_GET_SIZE(self) == 1 &&
3688 isalnum(*p))
3689 return PyBool_FromLong(1);
3691 /* Special case for empty strings */
3692 if (PyString_GET_SIZE(self) == 0)
3693 return PyBool_FromLong(0);
3695 e = p + PyString_GET_SIZE(self);
3696 for (; p < e; p++) {
3697 if (!isalnum(*p))
3698 return PyBool_FromLong(0);
3700 return PyBool_FromLong(1);
3704 PyDoc_STRVAR(isdigit__doc__,
3705 "S.isdigit() -> bool\n\
3707 Return True if all characters in S are digits\n\
3708 and there is at least one character in S, False otherwise.");
3710 static PyObject*
3711 string_isdigit(PyStringObject *self)
3713 register const unsigned char *p
3714 = (unsigned char *) PyString_AS_STRING(self);
3715 register const unsigned char *e;
3717 /* Shortcut for single character strings */
3718 if (PyString_GET_SIZE(self) == 1 &&
3719 isdigit(*p))
3720 return PyBool_FromLong(1);
3722 /* Special case for empty strings */
3723 if (PyString_GET_SIZE(self) == 0)
3724 return PyBool_FromLong(0);
3726 e = p + PyString_GET_SIZE(self);
3727 for (; p < e; p++) {
3728 if (!isdigit(*p))
3729 return PyBool_FromLong(0);
3731 return PyBool_FromLong(1);
3735 PyDoc_STRVAR(islower__doc__,
3736 "S.islower() -> bool\n\
3738 Return True if all cased characters in S are lowercase and there is\n\
3739 at least one cased character in S, False otherwise.");
3741 static PyObject*
3742 string_islower(PyStringObject *self)
3744 register const unsigned char *p
3745 = (unsigned char *) PyString_AS_STRING(self);
3746 register const unsigned char *e;
3747 int cased;
3749 /* Shortcut for single character strings */
3750 if (PyString_GET_SIZE(self) == 1)
3751 return PyBool_FromLong(islower(*p) != 0);
3753 /* Special case for empty strings */
3754 if (PyString_GET_SIZE(self) == 0)
3755 return PyBool_FromLong(0);
3757 e = p + PyString_GET_SIZE(self);
3758 cased = 0;
3759 for (; p < e; p++) {
3760 if (isupper(*p))
3761 return PyBool_FromLong(0);
3762 else if (!cased && islower(*p))
3763 cased = 1;
3765 return PyBool_FromLong(cased);
/* Docstring attached to the "isupper" entry of string_methods. */
3769 PyDoc_STRVAR(isupper__doc__,
3770 "S.isupper() -> bool\n\
3772 Return True if all cased characters in S are uppercase and there is\n\
3773 at least one cased character in S, False otherwise.");
3775 static PyObject*
3776 string_isupper(PyStringObject *self)
3778 register const unsigned char *p
3779 = (unsigned char *) PyString_AS_STRING(self);
3780 register const unsigned char *e;
3781 int cased;
3783 /* Shortcut for single character strings */
3784 if (PyString_GET_SIZE(self) == 1)
3785 return PyBool_FromLong(isupper(*p) != 0);
3787 /* Special case for empty strings */
3788 if (PyString_GET_SIZE(self) == 0)
3789 return PyBool_FromLong(0);
3791 e = p + PyString_GET_SIZE(self);
3792 cased = 0;
3793 for (; p < e; p++) {
3794 if (islower(*p))
3795 return PyBool_FromLong(0);
3796 else if (!cased && isupper(*p))
3797 cased = 1;
3799 return PyBool_FromLong(cased);
/* Docstring attached to the "istitle" entry of string_methods. */
3803 PyDoc_STRVAR(istitle__doc__,
3804 "S.istitle() -> bool\n\
3806 Return True if S is a titlecased string and there is at least one\n\
3807 character in S, i.e. uppercase characters may only follow uncased\n\
3808 characters and lowercase characters only cased ones. Return False\n\
3809 otherwise.");
3811 static PyObject*
3812 string_istitle(PyStringObject *self, PyObject *uncased)
3814 register const unsigned char *p
3815 = (unsigned char *) PyString_AS_STRING(self);
3816 register const unsigned char *e;
3817 int cased, previous_is_cased;
3819 /* Shortcut for single character strings */
3820 if (PyString_GET_SIZE(self) == 1)
3821 return PyBool_FromLong(isupper(*p) != 0);
3823 /* Special case for empty strings */
3824 if (PyString_GET_SIZE(self) == 0)
3825 return PyBool_FromLong(0);
3827 e = p + PyString_GET_SIZE(self);
3828 cased = 0;
3829 previous_is_cased = 0;
3830 for (; p < e; p++) {
3831 register const unsigned char ch = *p;
3833 if (isupper(ch)) {
3834 if (previous_is_cased)
3835 return PyBool_FromLong(0);
3836 previous_is_cased = 1;
3837 cased = 1;
3839 else if (islower(ch)) {
3840 if (!previous_is_cased)
3841 return PyBool_FromLong(0);
3842 previous_is_cased = 1;
3843 cased = 1;
3845 else
3846 previous_is_cased = 0;
3848 return PyBool_FromLong(cased);
/* Docstring attached to the "splitlines" entry of string_methods. */
3852 PyDoc_STRVAR(splitlines__doc__,
3853 "S.splitlines([keepends]) -> list of strings\n\
3855 Return a list of the lines in S, breaking at line boundaries.\n\
3856 Line breaks are not included in the resulting list unless keepends\n\
3857 is given and true.");
3859 static PyObject*
3860 string_splitlines(PyStringObject *self, PyObject *args)
3862 register Py_ssize_t i;
3863 register Py_ssize_t j;
3864 Py_ssize_t len;
3865 int keepends = 0;
3866 PyObject *list;
3867 PyObject *str;
3868 char *data;
3870 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
3871 return NULL;
3873 data = PyString_AS_STRING(self);
3874 len = PyString_GET_SIZE(self);
3876 /* This does not use the preallocated list because splitlines is
3877 usually run with hundreds of newlines. The overhead of
3878 switching between PyList_SET_ITEM and append causes about a
3879 2-3% slowdown for that common case. A smarter implementation
3880 could move the if check out, so the SET_ITEMs are done first
3881 and the appends only done when the prealloc buffer is full.
3882 That's too much work for little gain.*/
3884 list = PyList_New(0);
3885 if (!list)
3886 goto onError;
3888 for (i = j = 0; i < len; ) {
3889 Py_ssize_t eol;
3891 /* Find a line and append it */
3892 while (i < len && data[i] != '\n' && data[i] != '\r')
3893 i++;
3895 /* Skip the line break reading CRLF as one line break */
3896 eol = i;
3897 if (i < len) {
3898 if (data[i] == '\r' && i + 1 < len &&
3899 data[i+1] == '\n')
3900 i += 2;
3901 else
3902 i++;
3903 if (keepends)
3904 eol = i;
3906 SPLIT_APPEND(data, j, eol);
3907 j = i;
3909 if (j < len) {
3910 SPLIT_APPEND(data, j, len);
3913 return list;
3915 onError:
3916 Py_XDECREF(list);
3917 return NULL;
/* Docstring attached to the "__sizeof__" entry of string_methods. */
3920 PyDoc_STRVAR(sizeof__doc__,
3921 "S.__sizeof__() -> size of S in memory, in bytes");
3923 static PyObject *
3924 string_sizeof(PyStringObject *v)
3926 Py_ssize_t res;
3927 res = sizeof(PyStringObject) + v->ob_size * v->ob_type->tp_itemsize;
3928 return PyInt_FromSsize_t(res);
/* Retire the helper macros of the split family; nothing below this point
   in the listing refers to them. */
3931 #undef SPLIT_APPEND
3932 #undef SPLIT_ADD
3933 #undef MAX_PREALLOC
3934 #undef PREALLOC_SIZE
3936 static PyObject *
3937 string_getnewargs(PyStringObject *v)
3939 return Py_BuildValue("(s#)", v->ob_sval, Py_SIZE(v));
3943 #include "stringlib/string_format.h"
/* Docstring attached to the "format" entry of string_methods.
   NOTE(review): the closing lines of this literal are not visible in
   this listing -- confirm against the repository. */
3945 PyDoc_STRVAR(format__doc__,
3946 "S.format(*args, **kwargs) -> unicode\n\
3950 static PyObject *
3951 string__format__(PyObject* self, PyObject* args)
3953 PyObject *format_spec;
3954 PyObject *result = NULL;
3955 PyObject *tmp = NULL;
3957 /* If 2.x, convert format_spec to the same type as value */
3958 /* This is to allow things like u''.format('') */
3959 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
3960 goto done;
3961 if (!(PyString_Check(format_spec) || PyUnicode_Check(format_spec))) {
3962 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
3963 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
3964 goto done;
3966 tmp = PyObject_Str(format_spec);
3967 if (tmp == NULL)
3968 goto done;
3969 format_spec = tmp;
3971 result = _PyBytes_FormatAdvanced(self,
3972 PyString_AS_STRING(format_spec),
3973 PyString_GET_SIZE(format_spec));
3974 done:
3975 Py_XDECREF(tmp);
3976 return result;
/* Docstring attached to the "__format__" entry of string_methods.
   NOTE(review): the closing lines of this literal are not visible in
   this listing -- confirm against the repository. */
3979 PyDoc_STRVAR(p_format__doc__,
3980 "S.__format__(format_spec) -> unicode\n\
/* Method table for str objects; PyString_Type installs it as tp_methods.
   Each entry gives the Python-level name, the C implementation, the
   calling-convention flags and, where present, the docstring.  The table
   ends with a {NULL, NULL} sentinel. */
3985 static PyMethodDef
3986 string_methods[] = {
3987 /* Counterparts of the obsolete stropmodule functions; except
3988 string.maketrans(). */
3989 {"join", (PyCFunction)string_join, METH_O, join__doc__},
3990 {"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
3991 {"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
3992 {"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
3993 {"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
3994 {"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
3995 {"isupper", (PyCFunction)string_isupper, METH_NOARGS, isupper__doc__},
3996 {"isspace", (PyCFunction)string_isspace, METH_NOARGS, isspace__doc__},
3997 {"isdigit", (PyCFunction)string_isdigit, METH_NOARGS, isdigit__doc__},
3998 {"istitle", (PyCFunction)string_istitle, METH_NOARGS, istitle__doc__},
3999 {"isalpha", (PyCFunction)string_isalpha, METH_NOARGS, isalpha__doc__},
4000 {"isalnum", (PyCFunction)string_isalnum, METH_NOARGS, isalnum__doc__},
4001 {"capitalize", (PyCFunction)string_capitalize, METH_NOARGS,
4002 capitalize__doc__},
4003 {"count", (PyCFunction)string_count, METH_VARARGS, count__doc__},
4004 {"endswith", (PyCFunction)string_endswith, METH_VARARGS,
4005 endswith__doc__},
4006 {"partition", (PyCFunction)string_partition, METH_O, partition__doc__},
4007 {"find", (PyCFunction)string_find, METH_VARARGS, find__doc__},
4008 {"index", (PyCFunction)string_index, METH_VARARGS, index__doc__},
4009 {"lstrip", (PyCFunction)string_lstrip, METH_VARARGS, lstrip__doc__},
4010 {"replace", (PyCFunction)string_replace, METH_VARARGS, replace__doc__},
4011 {"rfind", (PyCFunction)string_rfind, METH_VARARGS, rfind__doc__},
4012 {"rindex", (PyCFunction)string_rindex, METH_VARARGS, rindex__doc__},
4013 {"rstrip", (PyCFunction)string_rstrip, METH_VARARGS, rstrip__doc__},
4014 {"rpartition", (PyCFunction)string_rpartition, METH_O,
4015 rpartition__doc__},
4016 {"startswith", (PyCFunction)string_startswith, METH_VARARGS,
4017 startswith__doc__},
4018 {"strip", (PyCFunction)string_strip, METH_VARARGS, strip__doc__},
4019 {"swapcase", (PyCFunction)string_swapcase, METH_NOARGS,
4020 swapcase__doc__},
4021 {"translate", (PyCFunction)string_translate, METH_VARARGS,
4022 translate__doc__},
4023 {"title", (PyCFunction)string_title, METH_NOARGS, title__doc__},
4024 {"ljust", (PyCFunction)string_ljust, METH_VARARGS, ljust__doc__},
4025 {"rjust", (PyCFunction)string_rjust, METH_VARARGS, rjust__doc__},
4026 {"center", (PyCFunction)string_center, METH_VARARGS, center__doc__},
4027 {"zfill", (PyCFunction)string_zfill, METH_VARARGS, zfill__doc__},
4028 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
4029 {"__format__", (PyCFunction) string__format__, METH_VARARGS, p_format__doc__},
4030 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
4031 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
4032 {"encode", (PyCFunction)string_encode, METH_VARARGS, encode__doc__},
4033 {"decode", (PyCFunction)string_decode, METH_VARARGS, decode__doc__},
4034 {"expandtabs", (PyCFunction)string_expandtabs, METH_VARARGS,
4035 expandtabs__doc__},
4036 {"splitlines", (PyCFunction)string_splitlines, METH_VARARGS,
4037 splitlines__doc__},
4038 {"__sizeof__", (PyCFunction)string_sizeof, METH_NOARGS,
4039 sizeof__doc__},
4040 {"__getnewargs__", (PyCFunction)string_getnewargs, METH_NOARGS},
4041 {NULL, NULL} /* sentinel */
/* Forward declaration: string_new() calls str_subtype_new() before the
   latter's definition appears. */
4044 static PyObject *
4045 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
4047 static PyObject *
4048 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4050 PyObject *x = NULL;
4051 static char *kwlist[] = {"object", 0};
4053 if (type != &PyString_Type)
4054 return str_subtype_new(type, args, kwds);
4055 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str", kwlist, &x))
4056 return NULL;
4057 if (x == NULL)
4058 return PyString_FromString("");
4059 return PyObject_Str(x);
4062 static PyObject *
4063 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4065 PyObject *tmp, *pnew;
4066 Py_ssize_t n;
4068 assert(PyType_IsSubtype(type, &PyString_Type));
4069 tmp = string_new(&PyString_Type, args, kwds);
4070 if (tmp == NULL)
4071 return NULL;
4072 assert(PyString_CheckExact(tmp));
4073 n = PyString_GET_SIZE(tmp);
4074 pnew = type->tp_alloc(type, n);
4075 if (pnew != NULL) {
4076 Py_MEMCPY(PyString_AS_STRING(pnew), PyString_AS_STRING(tmp), n+1);
4077 ((PyStringObject *)pnew)->ob_shash =
4078 ((PyStringObject *)tmp)->ob_shash;
4079 ((PyStringObject *)pnew)->ob_sstate = SSTATE_NOT_INTERNED;
4081 Py_DECREF(tmp);
4082 return pnew;
4085 static PyObject *
4086 basestring_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4088 PyErr_SetString(PyExc_TypeError,
4089 "The basestring type cannot be instantiated");
4090 return NULL;
4093 static PyObject *
4094 string_mod(PyObject *v, PyObject *w)
4096 if (!PyString_Check(v)) {
4097 Py_INCREF(Py_NotImplemented);
4098 return Py_NotImplemented;
4100 return PyString_Format(v, w);
/* tp_doc text used by PyBaseString_Type. */
4103 PyDoc_STRVAR(basestring_doc,
4104 "Type basestring cannot be instantiated; it is the base for str and unicode.");
/* Number-protocol table referenced by PyString_Type's tp_as_number.  Of
   the slots shown, only nb_remainder is filled (with string_mod).
   NOTE(review): the remainder of this initializer is not visible in this
   listing -- confirm against the repository. */
4106 static PyNumberMethods string_as_number = {
4107 0, /*nb_add*/
4108 0, /*nb_subtract*/
4109 0, /*nb_multiply*/
4110 0, /*nb_divide*/
4111 string_mod, /*nb_remainder*/
/* Type object for "basestring".  Every behaviour slot shown is 0 except
   tp_doc (basestring_doc), tp_base (&PyBaseObject_Type) and tp_new
   (basestring_new, which always raises TypeError).
   NOTE(review): the listing goes from the type name straight to
   tp_dealloc; the tp_basicsize/tp_itemsize initializers and the closing
   brace do not appear in this view -- confirm against the repository. */
4115 PyTypeObject PyBaseString_Type = {
4116 PyVarObject_HEAD_INIT(&PyType_Type, 0)
4117 "basestring",
4120 0, /* tp_dealloc */
4121 0, /* tp_print */
4122 0, /* tp_getattr */
4123 0, /* tp_setattr */
4124 0, /* tp_compare */
4125 0, /* tp_repr */
4126 0, /* tp_as_number */
4127 0, /* tp_as_sequence */
4128 0, /* tp_as_mapping */
4129 0, /* tp_hash */
4130 0, /* tp_call */
4131 0, /* tp_str */
4132 0, /* tp_getattro */
4133 0, /* tp_setattro */
4134 0, /* tp_as_buffer */
4135 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
4136 basestring_doc, /* tp_doc */
4137 0, /* tp_traverse */
4138 0, /* tp_clear */
4139 0, /* tp_richcompare */
4140 0, /* tp_weaklistoffset */
4141 0, /* tp_iter */
4142 0, /* tp_iternext */
4143 0, /* tp_methods */
4144 0, /* tp_members */
4145 0, /* tp_getset */
4146 &PyBaseObject_Type, /* tp_base */
4147 0, /* tp_dict */
4148 0, /* tp_descr_get */
4149 0, /* tp_descr_set */
4150 0, /* tp_dictoffset */
4151 0, /* tp_init */
4152 0, /* tp_alloc */
4153 basestring_new, /* tp_new */
4154 0, /* tp_free */
/* tp_doc text used by PyString_Type. */
4157 PyDoc_STRVAR(string_doc,
4158 "str(object) -> string\n\
4160 Return a nice string representation of the object.\n\
4161 If the argument is a string, the return value is the same object.");
/* Type object for "str".  Instances are PyStringObject headers followed
   by sizeof(char) items.  The type derives from PyBaseString_Type, takes
   its methods from string_methods, is created through string_new and is
   freed with PyObject_Del.
   NOTE(review): the closing brace of this initializer does not appear in
   this listing -- confirm against the repository. */
4163 PyTypeObject PyString_Type = {
4164 PyVarObject_HEAD_INIT(&PyType_Type, 0)
4165 "str",
4166 sizeof(PyStringObject),
4167 sizeof(char),
4168 string_dealloc, /* tp_dealloc */
4169 (printfunc)string_print, /* tp_print */
4170 0, /* tp_getattr */
4171 0, /* tp_setattr */
4172 0, /* tp_compare */
4173 string_repr, /* tp_repr */
4174 &string_as_number, /* tp_as_number */
4175 &string_as_sequence, /* tp_as_sequence */
4176 &string_as_mapping, /* tp_as_mapping */
4177 (hashfunc)string_hash, /* tp_hash */
4178 0, /* tp_call */
4179 string_str, /* tp_str */
4180 PyObject_GenericGetAttr, /* tp_getattro */
4181 0, /* tp_setattro */
4182 &string_as_buffer, /* tp_as_buffer */
4183 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
4184 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_STRING_SUBCLASS |
4185 Py_TPFLAGS_HAVE_NEWBUFFER, /* tp_flags */
4186 string_doc, /* tp_doc */
4187 0, /* tp_traverse */
4188 0, /* tp_clear */
4189 (richcmpfunc)string_richcompare, /* tp_richcompare */
4190 0, /* tp_weaklistoffset */
4191 0, /* tp_iter */
4192 0, /* tp_iternext */
4193 string_methods, /* tp_methods */
4194 0, /* tp_members */
4195 0, /* tp_getset */
4196 &PyBaseString_Type, /* tp_base */
4197 0, /* tp_dict */
4198 0, /* tp_descr_get */
4199 0, /* tp_descr_set */
4200 0, /* tp_dictoffset */
4201 0, /* tp_init */
4202 0, /* tp_alloc */
4203 string_new, /* tp_new */
4204 PyObject_Del, /* tp_free */
4207 void
4208 PyString_Concat(register PyObject **pv, register PyObject *w)
4210 register PyObject *v;
4211 if (*pv == NULL)
4212 return;
4213 if (w == NULL || !PyString_Check(*pv)) {
4214 Py_DECREF(*pv);
4215 *pv = NULL;
4216 return;
4218 v = string_concat((PyStringObject *) *pv, w);
4219 Py_DECREF(*pv);
4220 *pv = v;
4223 void
4224 PyString_ConcatAndDel(register PyObject **pv, register PyObject *w)
4226 PyString_Concat(pv, w);
4227 Py_XDECREF(w);
4231 /* The following function breaks the notion that strings are immutable:
4232 it changes the size of a string. We get away with this only if there
4233 is only one module referencing the object. You can also think of it
4234 as creating a new string object and destroying the old one, only
4235 more efficiently. In any case, don't use this if the string may
4236 already be known to some other part of the code...
4237 Note that if there's not enough memory to resize the string, the original
4238 string object at *pv is deallocated, *pv is set to NULL, an "out of
4239 memory" exception is set, and -1 is returned. Else (on success) 0 is
4240 returned, and the value in *pv may or may not be the same as on input.
4241 As always, an extra byte is allocated for a trailing \0 byte (newsize
4242 does *not* include that), and a trailing \0 byte is stored.
4246 _PyString_Resize(PyObject **pv, Py_ssize_t newsize)
4248 register PyObject *v;
4249 register PyStringObject *sv;
4250 v = *pv;
4251 if (!PyString_Check(v) || Py_REFCNT(v) != 1 || newsize < 0 ||
4252 PyString_CHECK_INTERNED(v)) {
4253 *pv = 0;
4254 Py_DECREF(v);
4255 PyErr_BadInternalCall();
4256 return -1;
4258 /* XXX UNREF/NEWREF interface should be more symmetrical */
4259 _Py_DEC_REFTOTAL;
4260 _Py_ForgetReference(v);
4261 *pv = (PyObject *)
4262 PyObject_REALLOC((char *)v, sizeof(PyStringObject) + newsize);
4263 if (*pv == NULL) {
4264 PyObject_Del(v);
4265 PyErr_NoMemory();
4266 return -1;
4268 _Py_NewReference(*pv);
4269 sv = (PyStringObject *) *pv;
4270 Py_SIZE(sv) = newsize;
4271 sv->ob_sval[newsize] = '\0';
4272 sv->ob_shash = -1; /* invalidate cached hash value */
4273 return 0;
4276 /* Helpers for formatstring */
4278 Py_LOCAL_INLINE(PyObject *)
4279 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
4281 Py_ssize_t argidx = *p_argidx;
4282 if (argidx < arglen) {
4283 (*p_argidx)++;
4284 if (arglen < 0)
4285 return args;
4286 else
4287 return PyTuple_GetItem(args, argidx);
4289 PyErr_SetString(PyExc_TypeError,
4290 "not enough arguments for format string");
4291 return NULL;
/* Bit flags OR-ed into `flags` while a % conversion specification is
   parsed; each one records the flag character listed next to it below. */
4294 /* Format codes
4295 * F_LJUST '-'
4296 * F_SIGN '+'
4297 * F_BLANK ' '
4298 * F_ALT '#'
4299 * F_ZERO '0'
4301 #define F_LJUST (1<<0)
4302 #define F_SIGN (1<<1)
4303 #define F_BLANK (1<<2)
4304 #define F_ALT (1<<3)
4305 #define F_ZERO (1<<4)
4307 Py_LOCAL_INLINE(int)
4308 formatfloat(char *buf, size_t buflen, int flags,
4309 int prec, int type, PyObject *v)
4311 /* fmt = '%#.' + `prec` + `type`
4312 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4313 char fmt[20];
4314 double x;
4315 x = PyFloat_AsDouble(v);
4316 if (x == -1.0 && PyErr_Occurred()) {
4317 PyErr_Format(PyExc_TypeError, "float argument required, "
4318 "not %.200s", Py_TYPE(v)->tp_name);
4319 return -1;
4321 if (prec < 0)
4322 prec = 6;
4323 if (type == 'f' && fabs(x)/1e25 >= 1e25)
4324 type = 'g';
4325 /* Worst case length calc to ensure no buffer overrun:
4327 'g' formats:
4328 fmt = %#.<prec>g
4329 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4330 for any double rep.)
4331 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4333 'f' formats:
4334 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
4335 len = 1 + 50 + 1 + prec = 52 + prec
4337 If prec=0 the effective precision is 1 (the leading digit is
4338 always given), therefore increase the length by one.
4341 if (((type == 'g' || type == 'G') &&
4342 buflen <= (size_t)10 + (size_t)prec) ||
4343 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
4344 PyErr_SetString(PyExc_OverflowError,
4345 "formatted float is too long (precision too large?)");
4346 return -1;
4348 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
4349 (flags&F_ALT) ? "#" : "",
4350 prec, type);
4351 PyOS_ascii_formatd(buf, buflen, fmt, x);
4352 return (int)strlen(buf);
4355 /* _PyString_FormatLong emulates the format codes d, u, o, x and X, and
4356 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
4357 * Python's regular ints.
4358 * Return value: a new PyString*, or NULL if error.
4359 * . *pbuf is set to point into it,
4360 * *plen set to the # of chars following that.
4361 * Caller must decref it when done using pbuf.
4362 * The string starting at *pbuf is of the form
4363 * "-"? ("0x" | "0X")? digit+
4364 * "0x"/"0X" are present only for x and X conversions, with F_ALT
4365 * set in flags. The case of hex digits will be correct,
4366 * There will be at least prec digits, zero-filled on the left if
4367 * necessary to get that many.
4368 * val object to be converted
4369 * flags bitmask of format flags; only F_ALT is looked at
4370 * prec minimum number of digits; 0-fill on left if needed
4371 * type a character in [duoxX]; u acts the same as d
4373 * CAUTION: o, x and X conversions on regular ints can never
4374 * produce a '-' sign, but can for Python's unbounded ints.
4376 PyObject*
4377 _PyString_FormatLong(PyObject *val, int flags, int prec, int type,
4378 char **pbuf, int *plen)
4380 PyObject *result = NULL;
4381 char *buf;
4382 Py_ssize_t i;
4383 int sign; /* 1 if '-', else 0 */
4384 int len; /* number of characters */
4385 Py_ssize_t llen;
4386 int numdigits; /* len == numnondigits + numdigits */
4387 int numnondigits = 0;
4389 switch (type) {
4390 case 'd':
4391 case 'u':
4392 result = Py_TYPE(val)->tp_str(val);
4393 break;
4394 case 'o':
4395 result = Py_TYPE(val)->tp_as_number->nb_oct(val);
4396 break;
4397 case 'x':
4398 case 'X':
4399 numnondigits = 2;
4400 result = Py_TYPE(val)->tp_as_number->nb_hex(val);
4401 break;
4402 default:
4403 assert(!"'type' not in [duoxX]");
4405 if (!result)
4406 return NULL;
4408 buf = PyString_AsString(result);
4409 if (!buf) {
4410 Py_DECREF(result);
4411 return NULL;
4414 /* To modify the string in-place, there can only be one reference. */
4415 if (Py_REFCNT(result) != 1) {
4416 PyErr_BadInternalCall();
4417 return NULL;
4419 llen = PyString_Size(result);
4420 if (llen > INT_MAX) {
4421 PyErr_SetString(PyExc_ValueError, "string too large in _PyString_FormatLong");
4422 return NULL;
4424 len = (int)llen;
4425 if (buf[len-1] == 'L') {
4426 --len;
4427 buf[len] = '\0';
4429 sign = buf[0] == '-';
4430 numnondigits += sign;
4431 numdigits = len - numnondigits;
4432 assert(numdigits > 0);
4434 /* Get rid of base marker unless F_ALT */
4435 if ((flags & F_ALT) == 0) {
4436 /* Need to skip 0x, 0X or 0. */
4437 int skipped = 0;
4438 switch (type) {
4439 case 'o':
4440 assert(buf[sign] == '0');
4441 /* If 0 is only digit, leave it alone. */
4442 if (numdigits > 1) {
4443 skipped = 1;
4444 --numdigits;
4446 break;
4447 case 'x':
4448 case 'X':
4449 assert(buf[sign] == '0');
4450 assert(buf[sign + 1] == 'x');
4451 skipped = 2;
4452 numnondigits -= 2;
4453 break;
4455 if (skipped) {
4456 buf += skipped;
4457 len -= skipped;
4458 if (sign)
4459 buf[0] = '-';
4461 assert(len == numnondigits + numdigits);
4462 assert(numdigits > 0);
4465 /* Fill with leading zeroes to meet minimum width. */
4466 if (prec > numdigits) {
4467 PyObject *r1 = PyString_FromStringAndSize(NULL,
4468 numnondigits + prec);
4469 char *b1;
4470 if (!r1) {
4471 Py_DECREF(result);
4472 return NULL;
4474 b1 = PyString_AS_STRING(r1);
4475 for (i = 0; i < numnondigits; ++i)
4476 *b1++ = *buf++;
4477 for (i = 0; i < prec - numdigits; i++)
4478 *b1++ = '0';
4479 for (i = 0; i < numdigits; i++)
4480 *b1++ = *buf++;
4481 *b1 = '\0';
4482 Py_DECREF(result);
4483 result = r1;
4484 buf = PyString_AS_STRING(result);
4485 len = numnondigits + prec;
4488 /* Fix up case for hex conversions. */
4489 if (type == 'X') {
4490 /* Need to convert all lower case letters to upper case.
4491 and need to convert 0x to 0X (and -0x to -0X). */
4492 for (i = 0; i < len; i++)
4493 if (buf[i] >= 'a' && buf[i] <= 'x')
4494 buf[i] -= 'a'-'A';
4496 *pbuf = buf;
4497 *plen = len;
4498 return result;
4501 Py_LOCAL_INLINE(int)
4502 formatint(char *buf, size_t buflen, int flags,
4503 int prec, int type, PyObject *v)
4505 /* fmt = '%#.' + `prec` + 'l' + `type`
4506 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4507 + 1 + 1 = 24 */
4508 char fmt[64]; /* plenty big enough! */
4509 char *sign;
4510 long x;
4512 x = PyInt_AsLong(v);
4513 if (x == -1 && PyErr_Occurred()) {
4514 PyErr_Format(PyExc_TypeError, "int argument required, not %.200s",
4515 Py_TYPE(v)->tp_name);
4516 return -1;
4518 if (x < 0 && type == 'u') {
4519 type = 'd';
4521 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
4522 sign = "-";
4523 else
4524 sign = "";
4525 if (prec < 0)
4526 prec = 1;
4528 if ((flags & F_ALT) &&
4529 (type == 'x' || type == 'X')) {
4530 /* When converting under %#x or %#X, there are a number
4531 * of issues that cause pain:
4532 * - when 0 is being converted, the C standard leaves off
4533 * the '0x' or '0X', which is inconsistent with other
4534 * %#x/%#X conversions and inconsistent with Python's
4535 * hex() function
4536 * - there are platforms that violate the standard and
4537 * convert 0 with the '0x' or '0X'
4538 * (Metrowerks, Compaq Tru64)
4539 * - there are platforms that give '0x' when converting
4540 * under %#X, but convert 0 in accordance with the
4541 * standard (OS/2 EMX)
4543 * We can achieve the desired consistency by inserting our
4544 * own '0x' or '0X' prefix, and substituting %x/%X in place
4545 * of %#x/%#X.
4547 * Note that this is the same approach as used in
4548 * formatint() in unicodeobject.c
4550 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
4551 sign, type, prec, type);
4553 else {
4554 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
4555 sign, (flags&F_ALT) ? "#" : "",
4556 prec, type);
4559 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
4560 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
4562 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
4563 PyErr_SetString(PyExc_OverflowError,
4564 "formatted integer is too long (precision too large?)");
4565 return -1;
4567 if (sign[0])
4568 PyOS_snprintf(buf, buflen, fmt, -x);
4569 else
4570 PyOS_snprintf(buf, buflen, fmt, x);
4571 return (int)strlen(buf);
4574 Py_LOCAL_INLINE(int)
4575 formatchar(char *buf, size_t buflen, PyObject *v)
4577 /* presume that the buffer is at least 2 characters long */
4578 if (PyString_Check(v)) {
4579 if (!PyArg_Parse(v, "c;%c requires int or char", &buf[0]))
4580 return -1;
4582 else {
4583 if (!PyArg_Parse(v, "b;%c requires int or char", &buf[0]))
4584 return -1;
4586 buf[1] = '\0';
4587 return 1;
/* FORMATBUFLEN sizes the on-stack `formatbuf` array that PyString_Format()
   passes to formatint(), formatfloat() and formatchar(). */
4590 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4592 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4593 chars are formatted. XXX This is a magic number. Each formatting
4594 routine does bounds checking to ensure no overflow, but a better
4595 solution may be to malloc a buffer of appropriate size for each
4596 format. For now, the current solution is sufficient.
4598 #define FORMATBUFLEN (size_t)120
4600 PyObject *
4601 PyString_Format(PyObject *format, PyObject *args)
4603 char *fmt, *res;
4604 Py_ssize_t arglen, argidx;
4605 Py_ssize_t reslen, rescnt, fmtcnt;
4606 int args_owned = 0;
4607 PyObject *result, *orig_args;
4608 #ifdef Py_USING_UNICODE
4609 PyObject *v, *w;
4610 #endif
4611 PyObject *dict = NULL;
4612 if (format == NULL || !PyString_Check(format) || args == NULL) {
4613 PyErr_BadInternalCall();
4614 return NULL;
4616 orig_args = args;
4617 fmt = PyString_AS_STRING(format);
4618 fmtcnt = PyString_GET_SIZE(format);
4619 reslen = rescnt = fmtcnt + 100;
4620 result = PyString_FromStringAndSize((char *)NULL, reslen);
4621 if (result == NULL)
4622 return NULL;
4623 res = PyString_AsString(result);
4624 if (PyTuple_Check(args)) {
4625 arglen = PyTuple_GET_SIZE(args);
4626 argidx = 0;
4628 else {
4629 arglen = -1;
4630 argidx = -2;
4632 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
4633 !PyObject_TypeCheck(args, &PyBaseString_Type))
4634 dict = args;
4635 while (--fmtcnt >= 0) {
4636 if (*fmt != '%') {
4637 if (--rescnt < 0) {
4638 rescnt = fmtcnt + 100;
4639 reslen += rescnt;
4640 if (_PyString_Resize(&result, reslen) < 0)
4641 return NULL;
4642 res = PyString_AS_STRING(result)
4643 + reslen - rescnt;
4644 --rescnt;
4646 *res++ = *fmt++;
4648 else {
4649 /* Got a format specifier */
4650 int flags = 0;
4651 Py_ssize_t width = -1;
4652 int prec = -1;
4653 int c = '\0';
4654 int fill;
4655 int isnumok;
4656 PyObject *v = NULL;
4657 PyObject *temp = NULL;
4658 char *pbuf;
4659 int sign;
4660 Py_ssize_t len;
4661 char formatbuf[FORMATBUFLEN];
4662 /* For format{float,int,char}() */
4663 #ifdef Py_USING_UNICODE
4664 char *fmt_start = fmt;
4665 Py_ssize_t argidx_start = argidx;
4666 #endif
4668 fmt++;
4669 if (*fmt == '(') {
4670 char *keystart;
4671 Py_ssize_t keylen;
4672 PyObject *key;
4673 int pcount = 1;
4675 if (dict == NULL) {
4676 PyErr_SetString(PyExc_TypeError,
4677 "format requires a mapping");
4678 goto error;
4680 ++fmt;
4681 --fmtcnt;
4682 keystart = fmt;
4683 /* Skip over balanced parentheses */
4684 while (pcount > 0 && --fmtcnt >= 0) {
4685 if (*fmt == ')')
4686 --pcount;
4687 else if (*fmt == '(')
4688 ++pcount;
4689 fmt++;
4691 keylen = fmt - keystart - 1;
4692 if (fmtcnt < 0 || pcount > 0) {
4693 PyErr_SetString(PyExc_ValueError,
4694 "incomplete format key");
4695 goto error;
4697 key = PyString_FromStringAndSize(keystart,
4698 keylen);
4699 if (key == NULL)
4700 goto error;
4701 if (args_owned) {
4702 Py_DECREF(args);
4703 args_owned = 0;
4705 args = PyObject_GetItem(dict, key);
4706 Py_DECREF(key);
4707 if (args == NULL) {
4708 goto error;
4710 args_owned = 1;
4711 arglen = -1;
4712 argidx = -2;
4714 while (--fmtcnt >= 0) {
4715 switch (c = *fmt++) {
4716 case '-': flags |= F_LJUST; continue;
4717 case '+': flags |= F_SIGN; continue;
4718 case ' ': flags |= F_BLANK; continue;
4719 case '#': flags |= F_ALT; continue;
4720 case '0': flags |= F_ZERO; continue;
4722 break;
4724 if (c == '*') {
4725 v = getnextarg(args, arglen, &argidx);
4726 if (v == NULL)
4727 goto error;
4728 if (!PyInt_Check(v)) {
4729 PyErr_SetString(PyExc_TypeError,
4730 "* wants int");
4731 goto error;
4733 width = PyInt_AsLong(v);
4734 if (width < 0) {
4735 flags |= F_LJUST;
4736 width = -width;
4738 if (--fmtcnt >= 0)
4739 c = *fmt++;
4741 else if (c >= 0 && isdigit(c)) {
4742 width = c - '0';
4743 while (--fmtcnt >= 0) {
4744 c = Py_CHARMASK(*fmt++);
4745 if (!isdigit(c))
4746 break;
4747 if ((width*10) / 10 != width) {
4748 PyErr_SetString(
4749 PyExc_ValueError,
4750 "width too big");
4751 goto error;
4753 width = width*10 + (c - '0');
4756 if (c == '.') {
4757 prec = 0;
4758 if (--fmtcnt >= 0)
4759 c = *fmt++;
4760 if (c == '*') {
4761 v = getnextarg(args, arglen, &argidx);
4762 if (v == NULL)
4763 goto error;
4764 if (!PyInt_Check(v)) {
4765 PyErr_SetString(
4766 PyExc_TypeError,
4767 "* wants int");
4768 goto error;
4770 prec = PyInt_AsLong(v);
4771 if (prec < 0)
4772 prec = 0;
4773 if (--fmtcnt >= 0)
4774 c = *fmt++;
4776 else if (c >= 0 && isdigit(c)) {
4777 prec = c - '0';
4778 while (--fmtcnt >= 0) {
4779 c = Py_CHARMASK(*fmt++);
4780 if (!isdigit(c))
4781 break;
4782 if ((prec*10) / 10 != prec) {
4783 PyErr_SetString(
4784 PyExc_ValueError,
4785 "prec too big");
4786 goto error;
4788 prec = prec*10 + (c - '0');
4791 } /* prec */
4792 if (fmtcnt >= 0) {
4793 if (c == 'h' || c == 'l' || c == 'L') {
4794 if (--fmtcnt >= 0)
4795 c = *fmt++;
4798 if (fmtcnt < 0) {
4799 PyErr_SetString(PyExc_ValueError,
4800 "incomplete format");
4801 goto error;
4803 if (c != '%') {
4804 v = getnextarg(args, arglen, &argidx);
4805 if (v == NULL)
4806 goto error;
4808 sign = 0;
4809 fill = ' ';
4810 switch (c) {
4811 case '%':
4812 pbuf = "%";
4813 len = 1;
4814 break;
4815 case 's':
4816 #ifdef Py_USING_UNICODE
4817 if (PyUnicode_Check(v)) {
4818 fmt = fmt_start;
4819 argidx = argidx_start;
4820 goto unicode;
4822 #endif
4823 temp = _PyObject_Str(v);
4824 #ifdef Py_USING_UNICODE
4825 if (temp != NULL && PyUnicode_Check(temp)) {
4826 Py_DECREF(temp);
4827 fmt = fmt_start;
4828 argidx = argidx_start;
4829 goto unicode;
4831 #endif
4832 /* Fall through */
4833 case 'r':
4834 if (c == 'r')
4835 temp = PyObject_Repr(v);
4836 if (temp == NULL)
4837 goto error;
4838 if (!PyString_Check(temp)) {
4839 PyErr_SetString(PyExc_TypeError,
4840 "%s argument has non-string str()");
4841 Py_DECREF(temp);
4842 goto error;
4844 pbuf = PyString_AS_STRING(temp);
4845 len = PyString_GET_SIZE(temp);
4846 if (prec >= 0 && len > prec)
4847 len = prec;
4848 break;
4849 case 'i':
4850 case 'd':
4851 case 'u':
4852 case 'o':
4853 case 'x':
4854 case 'X':
4855 if (c == 'i')
4856 c = 'd';
4857 isnumok = 0;
4858 if (PyNumber_Check(v)) {
4859 PyObject *iobj=NULL;
4861 if (PyInt_Check(v) || (PyLong_Check(v))) {
4862 iobj = v;
4863 Py_INCREF(iobj);
4865 else {
4866 iobj = PyNumber_Int(v);
4867 if (iobj==NULL) iobj = PyNumber_Long(v);
4869 if (iobj!=NULL) {
4870 if (PyInt_Check(iobj)) {
4871 isnumok = 1;
4872 pbuf = formatbuf;
4873 len = formatint(pbuf,
4874 sizeof(formatbuf),
4875 flags, prec, c, iobj);
4876 Py_DECREF(iobj);
4877 if (len < 0)
4878 goto error;
4879 sign = 1;
4881 else if (PyLong_Check(iobj)) {
4882 int ilen;
4884 isnumok = 1;
4885 temp = _PyString_FormatLong(iobj, flags,
4886 prec, c, &pbuf, &ilen);
4887 Py_DECREF(iobj);
4888 len = ilen;
4889 if (!temp)
4890 goto error;
4891 sign = 1;
4893 else {
4894 Py_DECREF(iobj);
4898 if (!isnumok) {
4899 PyErr_Format(PyExc_TypeError,
4900 "%%%c format: a number is required, "
4901 "not %.200s", c, Py_TYPE(v)->tp_name);
4902 goto error;
4904 if (flags & F_ZERO)
4905 fill = '0';
4906 break;
4907 case 'e':
4908 case 'E':
4909 case 'f':
4910 case 'F':
4911 case 'g':
4912 case 'G':
4913 if (c == 'F')
4914 c = 'f';
4915 pbuf = formatbuf;
4916 len = formatfloat(pbuf, sizeof(formatbuf),
4917 flags, prec, c, v);
4918 if (len < 0)
4919 goto error;
4920 sign = 1;
4921 if (flags & F_ZERO)
4922 fill = '0';
4923 break;
4924 case 'c':
4925 #ifdef Py_USING_UNICODE
4926 if (PyUnicode_Check(v)) {
4927 fmt = fmt_start;
4928 argidx = argidx_start;
4929 goto unicode;
4931 #endif
4932 pbuf = formatbuf;
4933 len = formatchar(pbuf, sizeof(formatbuf), v);
4934 if (len < 0)
4935 goto error;
4936 break;
4937 default:
4938 PyErr_Format(PyExc_ValueError,
4939 "unsupported format character '%c' (0x%x) "
4940 "at index %zd",
4941 c, c,
4942 (Py_ssize_t)(fmt - 1 -
4943 PyString_AsString(format)));
4944 goto error;
4946 if (sign) {
4947 if (*pbuf == '-' || *pbuf == '+') {
4948 sign = *pbuf++;
4949 len--;
4951 else if (flags & F_SIGN)
4952 sign = '+';
4953 else if (flags & F_BLANK)
4954 sign = ' ';
4955 else
4956 sign = 0;
4958 if (width < len)
4959 width = len;
4960 if (rescnt - (sign != 0) < width) {
4961 reslen -= rescnt;
4962 rescnt = width + fmtcnt + 100;
4963 reslen += rescnt;
4964 if (reslen < 0) {
4965 Py_DECREF(result);
4966 Py_XDECREF(temp);
4967 return PyErr_NoMemory();
4969 if (_PyString_Resize(&result, reslen) < 0) {
4970 Py_XDECREF(temp);
4971 return NULL;
4973 res = PyString_AS_STRING(result)
4974 + reslen - rescnt;
4976 if (sign) {
4977 if (fill != ' ')
4978 *res++ = sign;
4979 rescnt--;
4980 if (width > len)
4981 width--;
4983 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
4984 assert(pbuf[0] == '0');
4985 assert(pbuf[1] == c);
4986 if (fill != ' ') {
4987 *res++ = *pbuf++;
4988 *res++ = *pbuf++;
4990 rescnt -= 2;
4991 width -= 2;
4992 if (width < 0)
4993 width = 0;
4994 len -= 2;
4996 if (width > len && !(flags & F_LJUST)) {
4997 do {
4998 --rescnt;
4999 *res++ = fill;
5000 } while (--width > len);
5002 if (fill == ' ') {
5003 if (sign)
5004 *res++ = sign;
5005 if ((flags & F_ALT) &&
5006 (c == 'x' || c == 'X')) {
5007 assert(pbuf[0] == '0');
5008 assert(pbuf[1] == c);
5009 *res++ = *pbuf++;
5010 *res++ = *pbuf++;
5013 Py_MEMCPY(res, pbuf, len);
5014 res += len;
5015 rescnt -= len;
5016 while (--width >= len) {
5017 --rescnt;
5018 *res++ = ' ';
5020 if (dict && (argidx < arglen) && c != '%') {
5021 PyErr_SetString(PyExc_TypeError,
5022 "not all arguments converted during string formatting");
5023 Py_XDECREF(temp);
5024 goto error;
5026 Py_XDECREF(temp);
5027 } /* '%' */
5028 } /* until end */
5029 if (argidx < arglen && !dict) {
5030 PyErr_SetString(PyExc_TypeError,
5031 "not all arguments converted during string formatting");
5032 goto error;
5034 if (args_owned) {
5035 Py_DECREF(args);
5037 _PyString_Resize(&result, reslen - rescnt);
5038 return result;
5040 #ifdef Py_USING_UNICODE
5041 unicode:
5042 if (args_owned) {
5043 Py_DECREF(args);
5044 args_owned = 0;
5046 /* Fiddle args right (remove the first argidx arguments) */
5047 if (PyTuple_Check(orig_args) && argidx > 0) {
5048 PyObject *v;
5049 Py_ssize_t n = PyTuple_GET_SIZE(orig_args) - argidx;
5050 v = PyTuple_New(n);
5051 if (v == NULL)
5052 goto error;
5053 while (--n >= 0) {
5054 PyObject *w = PyTuple_GET_ITEM(orig_args, n + argidx);
5055 Py_INCREF(w);
5056 PyTuple_SET_ITEM(v, n, w);
5058 args = v;
5059 } else {
5060 Py_INCREF(orig_args);
5061 args = orig_args;
5063 args_owned = 1;
5064 /* Take what we have of the result and let the Unicode formatting
5065 function format the rest of the input. */
5066 rescnt = res - PyString_AS_STRING(result);
5067 if (_PyString_Resize(&result, rescnt))
5068 goto error;
5069 fmtcnt = PyString_GET_SIZE(format) - \
5070 (fmt - PyString_AS_STRING(format));
5071 format = PyUnicode_Decode(fmt, fmtcnt, NULL, NULL);
5072 if (format == NULL)
5073 goto error;
5074 v = PyUnicode_Format(format, args);
5075 Py_DECREF(format);
5076 if (v == NULL)
5077 goto error;
5078 /* Paste what we have (result) to what the Unicode formatting
5079 function returned (v) and return the result (or error) */
5080 w = PyUnicode_Concat(result, v);
5081 Py_DECREF(result);
5082 Py_DECREF(v);
5083 Py_DECREF(args);
5084 return w;
5085 #endif /* Py_USING_UNICODE */
5087 error:
5088 Py_DECREF(result);
5089 if (args_owned) {
5090 Py_DECREF(args);
5092 return NULL;
5095 void
5096 PyString_InternInPlace(PyObject **p)
5098 register PyStringObject *s = (PyStringObject *)(*p);
5099 PyObject *t;
5100 if (s == NULL || !PyString_Check(s))
5101 Py_FatalError("PyString_InternInPlace: strings only please!");
5102 /* If it's a string subclass, we don't really know what putting
5103 it in the interned dict might do. */
5104 if (!PyString_CheckExact(s))
5105 return;
5106 if (PyString_CHECK_INTERNED(s))
5107 return;
5108 if (interned == NULL) {
5109 interned = PyDict_New();
5110 if (interned == NULL) {
5111 PyErr_Clear(); /* Don't leave an exception */
5112 return;
5115 t = PyDict_GetItem(interned, (PyObject *)s);
5116 if (t) {
5117 Py_INCREF(t);
5118 Py_DECREF(*p);
5119 *p = t;
5120 return;
5123 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
5124 PyErr_Clear();
5125 return;
5127 /* The two references in interned are not counted by refcnt.
5128 The string deallocator will take care of this */
5129 Py_REFCNT(s) -= 2;
5130 PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
5133 void
5134 PyString_InternImmortal(PyObject **p)
5136 PyString_InternInPlace(p);
5137 if (PyString_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
5138 PyString_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
5139 Py_INCREF(*p);
5144 PyObject *
5145 PyString_InternFromString(const char *cp)
5147 PyObject *s = PyString_FromString(cp);
5148 if (s == NULL)
5149 return NULL;
5150 PyString_InternInPlace(&s);
5151 return s;
5154 void
5155 PyString_Fini(void)
5157 int i;
5158 for (i = 0; i < UCHAR_MAX + 1; i++) {
5159 Py_XDECREF(characters[i]);
5160 characters[i] = NULL;
5162 Py_XDECREF(nullstring);
5163 nullstring = NULL;
5166 void _Py_ReleaseInternedStrings(void)
5168 PyObject *keys;
5169 PyStringObject *s;
5170 Py_ssize_t i, n;
5171 Py_ssize_t immortal_size = 0, mortal_size = 0;
5173 if (interned == NULL || !PyDict_Check(interned))
5174 return;
5175 keys = PyDict_Keys(interned);
5176 if (keys == NULL || !PyList_Check(keys)) {
5177 PyErr_Clear();
5178 return;
5181 /* Since _Py_ReleaseInternedStrings() is intended to help a leak
5182 detector, interned strings are not forcibly deallocated; rather, we
5183 give them their stolen references back, and then clear and DECREF
5184 the interned dict. */
5186 n = PyList_GET_SIZE(keys);
5187 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
5189 for (i = 0; i < n; i++) {
5190 s = (PyStringObject *) PyList_GET_ITEM(keys, i);
5191 switch (s->ob_sstate) {
5192 case SSTATE_NOT_INTERNED:
5193 /* XXX Shouldn't happen */
5194 break;
5195 case SSTATE_INTERNED_IMMORTAL:
5196 Py_REFCNT(s) += 1;
5197 immortal_size += Py_SIZE(s);
5198 break;
5199 case SSTATE_INTERNED_MORTAL:
5200 Py_REFCNT(s) += 2;
5201 mortal_size += Py_SIZE(s);
5202 break;
5203 default:
5204 Py_FatalError("Inconsistent interned string state.");
5206 s->ob_sstate = SSTATE_NOT_INTERNED;
5208 fprintf(stderr, "total size of all interned strings: "
5209 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
5210 "mortal/immortal\n", mortal_size, immortal_size);
5211 Py_DECREF(keys);
5212 PyDict_Clear(interned);
5213 Py_DECREF(interned);
5214 interned = NULL;