Applying patches backported from 3.1, by Gregor Lingl.
[python.git] / Objects / stringobject.c
blob1233fc0cc909e70aceed00de396faf88b700cac4
1 /* String (str/bytes) object implementation */
3 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include <ctype.h>
7 #include <stddef.h>
9 #ifdef COUNT_ALLOCS
10 Py_ssize_t null_strings, one_strings;
11 #endif
13 static PyStringObject *characters[UCHAR_MAX + 1];
14 static PyStringObject *nullstring;
16 /* This dictionary holds all interned strings. Note that references to
17 strings in this dictionary are *not* counted in the string's ob_refcnt.
18 When the interned string reaches a refcnt of 0 the string deallocation
19 function will delete the reference from this dictionary.
21 Another way to look at this is that to say that the actual reference
22 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
24 static PyObject *interned;
26 /* PyStringObject_SIZE gives the basic size of a string; any memory allocation
27 for a string of length n should request PyStringObject_SIZE + n bytes.
29 Using PyStringObject_SIZE instead of sizeof(PyStringObject) saves
30 3 bytes per string allocation on a typical system.
32 #define PyStringObject_SIZE (offsetof(PyStringObject, ob_sval) + 1)
/*
   For both PyString_FromString() and PyString_FromStringAndSize(), the
   parameter `size' denotes number of characters to allocate, not counting any
   null terminating character.

   For PyString_FromString(), the parameter `str' points to a null-terminated
   string containing exactly `size' bytes.

   For PyString_FromStringAndSize(), the parameter `str' is
   either NULL or else points to a string containing at least `size' bytes.
   For PyString_FromStringAndSize(), the string in the `str' parameter does
   not have to be null-terminated.  (Therefore it is safe to construct a
   substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
   If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
   bytes (setting the last byte to the null terminating character) and you can
   fill in the data yourself.  If `str' is non-NULL then the resulting
   PyString object must be treated as immutable and you must not fill in nor
   alter the data yourself, since the strings may be shared.

   The PyObject member `op->ob_size', which denotes the number of "extra
   items" in a variable-size object, will contain the number of bytes
   allocated for string data, not counting the null terminating character.  It
   is therefore equal to the `size' parameter (for
   PyString_FromStringAndSize()) or the length of the string in the `str'
   parameter (for PyString_FromString()).
*/
60 PyObject *
61 PyString_FromStringAndSize(const char *str, Py_ssize_t size)
63 register PyStringObject *op;
64 if (size < 0) {
65 PyErr_SetString(PyExc_SystemError,
66 "Negative size passed to PyString_FromStringAndSize");
67 return NULL;
69 if (size == 0 && (op = nullstring) != NULL) {
70 #ifdef COUNT_ALLOCS
71 null_strings++;
72 #endif
73 Py_INCREF(op);
74 return (PyObject *)op;
76 if (size == 1 && str != NULL &&
77 (op = characters[*str & UCHAR_MAX]) != NULL)
79 #ifdef COUNT_ALLOCS
80 one_strings++;
81 #endif
82 Py_INCREF(op);
83 return (PyObject *)op;
86 if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
87 PyErr_SetString(PyExc_OverflowError, "string is too large");
88 return NULL;
91 /* Inline PyObject_NewVar */
92 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
93 if (op == NULL)
94 return PyErr_NoMemory();
95 PyObject_INIT_VAR(op, &PyString_Type, size);
96 op->ob_shash = -1;
97 op->ob_sstate = SSTATE_NOT_INTERNED;
98 if (str != NULL)
99 Py_MEMCPY(op->ob_sval, str, size);
100 op->ob_sval[size] = '\0';
101 /* share short strings */
102 if (size == 0) {
103 PyObject *t = (PyObject *)op;
104 PyString_InternInPlace(&t);
105 op = (PyStringObject *)t;
106 nullstring = op;
107 Py_INCREF(op);
108 } else if (size == 1 && str != NULL) {
109 PyObject *t = (PyObject *)op;
110 PyString_InternInPlace(&t);
111 op = (PyStringObject *)t;
112 characters[*str & UCHAR_MAX] = op;
113 Py_INCREF(op);
115 return (PyObject *) op;
118 PyObject *
119 PyString_FromString(const char *str)
121 register size_t size;
122 register PyStringObject *op;
124 assert(str != NULL);
125 size = strlen(str);
126 if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
127 PyErr_SetString(PyExc_OverflowError,
128 "string is too long for a Python string");
129 return NULL;
131 if (size == 0 && (op = nullstring) != NULL) {
132 #ifdef COUNT_ALLOCS
133 null_strings++;
134 #endif
135 Py_INCREF(op);
136 return (PyObject *)op;
138 if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
139 #ifdef COUNT_ALLOCS
140 one_strings++;
141 #endif
142 Py_INCREF(op);
143 return (PyObject *)op;
146 /* Inline PyObject_NewVar */
147 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
148 if (op == NULL)
149 return PyErr_NoMemory();
150 PyObject_INIT_VAR(op, &PyString_Type, size);
151 op->ob_shash = -1;
152 op->ob_sstate = SSTATE_NOT_INTERNED;
153 Py_MEMCPY(op->ob_sval, str, size+1);
154 /* share short strings */
155 if (size == 0) {
156 PyObject *t = (PyObject *)op;
157 PyString_InternInPlace(&t);
158 op = (PyStringObject *)t;
159 nullstring = op;
160 Py_INCREF(op);
161 } else if (size == 1) {
162 PyObject *t = (PyObject *)op;
163 PyString_InternInPlace(&t);
164 op = (PyStringObject *)t;
165 characters[*str & UCHAR_MAX] = op;
166 Py_INCREF(op);
168 return (PyObject *) op;
171 PyObject *
172 PyString_FromFormatV(const char *format, va_list vargs)
174 va_list count;
175 Py_ssize_t n = 0;
176 const char* f;
177 char *s;
178 PyObject* string;
180 #ifdef VA_LIST_IS_ARRAY
181 Py_MEMCPY(count, vargs, sizeof(va_list));
182 #else
183 #ifdef __va_copy
184 __va_copy(count, vargs);
185 #else
186 count = vargs;
187 #endif
188 #endif
189 /* step 1: figure out how large a buffer we need */
190 for (f = format; *f; f++) {
191 if (*f == '%') {
192 const char* p = f;
193 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
196 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
197 * they don't affect the amount of space we reserve.
199 if ((*f == 'l' || *f == 'z') &&
200 (f[1] == 'd' || f[1] == 'u'))
201 ++f;
203 switch (*f) {
204 case 'c':
205 (void)va_arg(count, int);
206 /* fall through... */
207 case '%':
208 n++;
209 break;
210 case 'd': case 'u': case 'i': case 'x':
211 (void) va_arg(count, int);
212 /* 20 bytes is enough to hold a 64-bit
213 integer. Decimal takes the most space.
214 This isn't enough for octal. */
215 n += 20;
216 break;
217 case 's':
218 s = va_arg(count, char*);
219 n += strlen(s);
220 break;
221 case 'p':
222 (void) va_arg(count, int);
223 /* maximum 64-bit pointer representation:
224 * 0xffffffffffffffff
225 * so 19 characters is enough.
226 * XXX I count 18 -- what's the extra for?
228 n += 19;
229 break;
230 default:
231 /* if we stumble upon an unknown
232 formatting code, copy the rest of
233 the format string to the output
234 string. (we cannot just skip the
235 code, since there's no way to know
236 what's in the argument list) */
237 n += strlen(p);
238 goto expand;
240 } else
241 n++;
243 expand:
244 /* step 2: fill the buffer */
245 /* Since we've analyzed how much space we need for the worst case,
246 use sprintf directly instead of the slower PyOS_snprintf. */
247 string = PyString_FromStringAndSize(NULL, n);
248 if (!string)
249 return NULL;
251 s = PyString_AsString(string);
253 for (f = format; *f; f++) {
254 if (*f == '%') {
255 const char* p = f++;
256 Py_ssize_t i;
257 int longflag = 0;
258 int size_tflag = 0;
259 /* parse the width.precision part (we're only
260 interested in the precision value, if any) */
261 n = 0;
262 while (isdigit(Py_CHARMASK(*f)))
263 n = (n*10) + *f++ - '0';
264 if (*f == '.') {
265 f++;
266 n = 0;
267 while (isdigit(Py_CHARMASK(*f)))
268 n = (n*10) + *f++ - '0';
270 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
271 f++;
272 /* handle the long flag, but only for %ld and %lu.
273 others can be added when necessary. */
274 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
275 longflag = 1;
276 ++f;
278 /* handle the size_t flag. */
279 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
280 size_tflag = 1;
281 ++f;
284 switch (*f) {
285 case 'c':
286 *s++ = va_arg(vargs, int);
287 break;
288 case 'd':
289 if (longflag)
290 sprintf(s, "%ld", va_arg(vargs, long));
291 else if (size_tflag)
292 sprintf(s, "%" PY_FORMAT_SIZE_T "d",
293 va_arg(vargs, Py_ssize_t));
294 else
295 sprintf(s, "%d", va_arg(vargs, int));
296 s += strlen(s);
297 break;
298 case 'u':
299 if (longflag)
300 sprintf(s, "%lu",
301 va_arg(vargs, unsigned long));
302 else if (size_tflag)
303 sprintf(s, "%" PY_FORMAT_SIZE_T "u",
304 va_arg(vargs, size_t));
305 else
306 sprintf(s, "%u",
307 va_arg(vargs, unsigned int));
308 s += strlen(s);
309 break;
310 case 'i':
311 sprintf(s, "%i", va_arg(vargs, int));
312 s += strlen(s);
313 break;
314 case 'x':
315 sprintf(s, "%x", va_arg(vargs, int));
316 s += strlen(s);
317 break;
318 case 's':
319 p = va_arg(vargs, char*);
320 i = strlen(p);
321 if (n > 0 && i > n)
322 i = n;
323 Py_MEMCPY(s, p, i);
324 s += i;
325 break;
326 case 'p':
327 sprintf(s, "%p", va_arg(vargs, void*));
328 /* %p is ill-defined: ensure leading 0x. */
329 if (s[1] == 'X')
330 s[1] = 'x';
331 else if (s[1] != 'x') {
332 memmove(s+2, s, strlen(s)+1);
333 s[0] = '0';
334 s[1] = 'x';
336 s += strlen(s);
337 break;
338 case '%':
339 *s++ = '%';
340 break;
341 default:
342 strcpy(s, p);
343 s += strlen(s);
344 goto end;
346 } else
347 *s++ = *f;
350 end:
351 _PyString_Resize(&string, s - PyString_AS_STRING(string));
352 return string;
355 PyObject *
356 PyString_FromFormat(const char *format, ...)
358 PyObject* ret;
359 va_list vargs;
361 #ifdef HAVE_STDARG_PROTOTYPES
362 va_start(vargs, format);
363 #else
364 va_start(vargs);
365 #endif
366 ret = PyString_FromFormatV(format, vargs);
367 va_end(vargs);
368 return ret;
372 PyObject *PyString_Decode(const char *s,
373 Py_ssize_t size,
374 const char *encoding,
375 const char *errors)
377 PyObject *v, *str;
379 str = PyString_FromStringAndSize(s, size);
380 if (str == NULL)
381 return NULL;
382 v = PyString_AsDecodedString(str, encoding, errors);
383 Py_DECREF(str);
384 return v;
387 PyObject *PyString_AsDecodedObject(PyObject *str,
388 const char *encoding,
389 const char *errors)
391 PyObject *v;
393 if (!PyString_Check(str)) {
394 PyErr_BadArgument();
395 goto onError;
398 if (encoding == NULL) {
399 #ifdef Py_USING_UNICODE
400 encoding = PyUnicode_GetDefaultEncoding();
401 #else
402 PyErr_SetString(PyExc_ValueError, "no encoding specified");
403 goto onError;
404 #endif
407 /* Decode via the codec registry */
408 v = PyCodec_Decode(str, encoding, errors);
409 if (v == NULL)
410 goto onError;
412 return v;
414 onError:
415 return NULL;
418 PyObject *PyString_AsDecodedString(PyObject *str,
419 const char *encoding,
420 const char *errors)
422 PyObject *v;
424 v = PyString_AsDecodedObject(str, encoding, errors);
425 if (v == NULL)
426 goto onError;
428 #ifdef Py_USING_UNICODE
429 /* Convert Unicode to a string using the default encoding */
430 if (PyUnicode_Check(v)) {
431 PyObject *temp = v;
432 v = PyUnicode_AsEncodedString(v, NULL, NULL);
433 Py_DECREF(temp);
434 if (v == NULL)
435 goto onError;
437 #endif
438 if (!PyString_Check(v)) {
439 PyErr_Format(PyExc_TypeError,
440 "decoder did not return a string object (type=%.400s)",
441 Py_TYPE(v)->tp_name);
442 Py_DECREF(v);
443 goto onError;
446 return v;
448 onError:
449 return NULL;
452 PyObject *PyString_Encode(const char *s,
453 Py_ssize_t size,
454 const char *encoding,
455 const char *errors)
457 PyObject *v, *str;
459 str = PyString_FromStringAndSize(s, size);
460 if (str == NULL)
461 return NULL;
462 v = PyString_AsEncodedString(str, encoding, errors);
463 Py_DECREF(str);
464 return v;
467 PyObject *PyString_AsEncodedObject(PyObject *str,
468 const char *encoding,
469 const char *errors)
471 PyObject *v;
473 if (!PyString_Check(str)) {
474 PyErr_BadArgument();
475 goto onError;
478 if (encoding == NULL) {
479 #ifdef Py_USING_UNICODE
480 encoding = PyUnicode_GetDefaultEncoding();
481 #else
482 PyErr_SetString(PyExc_ValueError, "no encoding specified");
483 goto onError;
484 #endif
487 /* Encode via the codec registry */
488 v = PyCodec_Encode(str, encoding, errors);
489 if (v == NULL)
490 goto onError;
492 return v;
494 onError:
495 return NULL;
498 PyObject *PyString_AsEncodedString(PyObject *str,
499 const char *encoding,
500 const char *errors)
502 PyObject *v;
504 v = PyString_AsEncodedObject(str, encoding, errors);
505 if (v == NULL)
506 goto onError;
508 #ifdef Py_USING_UNICODE
509 /* Convert Unicode to a string using the default encoding */
510 if (PyUnicode_Check(v)) {
511 PyObject *temp = v;
512 v = PyUnicode_AsEncodedString(v, NULL, NULL);
513 Py_DECREF(temp);
514 if (v == NULL)
515 goto onError;
517 #endif
518 if (!PyString_Check(v)) {
519 PyErr_Format(PyExc_TypeError,
520 "encoder did not return a string object (type=%.400s)",
521 Py_TYPE(v)->tp_name);
522 Py_DECREF(v);
523 goto onError;
526 return v;
528 onError:
529 return NULL;
532 static void
533 string_dealloc(PyObject *op)
535 switch (PyString_CHECK_INTERNED(op)) {
536 case SSTATE_NOT_INTERNED:
537 break;
539 case SSTATE_INTERNED_MORTAL:
540 /* revive dead object temporarily for DelItem */
541 Py_REFCNT(op) = 3;
542 if (PyDict_DelItem(interned, op) != 0)
543 Py_FatalError(
544 "deletion of interned string failed");
545 break;
547 case SSTATE_INTERNED_IMMORTAL:
548 Py_FatalError("Immortal interned string died.");
550 default:
551 Py_FatalError("Inconsistent interned string state.");
553 Py_TYPE(op)->tp_free(op);
556 /* Unescape a backslash-escaped string. If unicode is non-zero,
557 the string is a u-literal. If recode_encoding is non-zero,
558 the string is UTF-8 encoded and should be re-encoded in the
559 specified encoding. */
561 PyObject *PyString_DecodeEscape(const char *s,
562 Py_ssize_t len,
563 const char *errors,
564 Py_ssize_t unicode,
565 const char *recode_encoding)
567 int c;
568 char *p, *buf;
569 const char *end;
570 PyObject *v;
571 Py_ssize_t newlen = recode_encoding ? 4*len:len;
572 v = PyString_FromStringAndSize((char *)NULL, newlen);
573 if (v == NULL)
574 return NULL;
575 p = buf = PyString_AsString(v);
576 end = s + len;
577 while (s < end) {
578 if (*s != '\\') {
579 non_esc:
580 #ifdef Py_USING_UNICODE
581 if (recode_encoding && (*s & 0x80)) {
582 PyObject *u, *w;
583 char *r;
584 const char* t;
585 Py_ssize_t rn;
586 t = s;
587 /* Decode non-ASCII bytes as UTF-8. */
588 while (t < end && (*t & 0x80)) t++;
589 u = PyUnicode_DecodeUTF8(s, t - s, errors);
590 if(!u) goto failed;
592 /* Recode them in target encoding. */
593 w = PyUnicode_AsEncodedString(
594 u, recode_encoding, errors);
595 Py_DECREF(u);
596 if (!w) goto failed;
598 /* Append bytes to output buffer. */
599 assert(PyString_Check(w));
600 r = PyString_AS_STRING(w);
601 rn = PyString_GET_SIZE(w);
602 Py_MEMCPY(p, r, rn);
603 p += rn;
604 Py_DECREF(w);
605 s = t;
606 } else {
607 *p++ = *s++;
609 #else
610 *p++ = *s++;
611 #endif
612 continue;
614 s++;
615 if (s==end) {
616 PyErr_SetString(PyExc_ValueError,
617 "Trailing \\ in string");
618 goto failed;
620 switch (*s++) {
621 /* XXX This assumes ASCII! */
622 case '\n': break;
623 case '\\': *p++ = '\\'; break;
624 case '\'': *p++ = '\''; break;
625 case '\"': *p++ = '\"'; break;
626 case 'b': *p++ = '\b'; break;
627 case 'f': *p++ = '\014'; break; /* FF */
628 case 't': *p++ = '\t'; break;
629 case 'n': *p++ = '\n'; break;
630 case 'r': *p++ = '\r'; break;
631 case 'v': *p++ = '\013'; break; /* VT */
632 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
633 case '0': case '1': case '2': case '3':
634 case '4': case '5': case '6': case '7':
635 c = s[-1] - '0';
636 if (s < end && '0' <= *s && *s <= '7') {
637 c = (c<<3) + *s++ - '0';
638 if (s < end && '0' <= *s && *s <= '7')
639 c = (c<<3) + *s++ - '0';
641 *p++ = c;
642 break;
643 case 'x':
644 if (s+1 < end &&
645 isxdigit(Py_CHARMASK(s[0])) &&
646 isxdigit(Py_CHARMASK(s[1])))
648 unsigned int x = 0;
649 c = Py_CHARMASK(*s);
650 s++;
651 if (isdigit(c))
652 x = c - '0';
653 else if (islower(c))
654 x = 10 + c - 'a';
655 else
656 x = 10 + c - 'A';
657 x = x << 4;
658 c = Py_CHARMASK(*s);
659 s++;
660 if (isdigit(c))
661 x += c - '0';
662 else if (islower(c))
663 x += 10 + c - 'a';
664 else
665 x += 10 + c - 'A';
666 *p++ = x;
667 break;
669 if (!errors || strcmp(errors, "strict") == 0) {
670 PyErr_SetString(PyExc_ValueError,
671 "invalid \\x escape");
672 goto failed;
674 if (strcmp(errors, "replace") == 0) {
675 *p++ = '?';
676 } else if (strcmp(errors, "ignore") == 0)
677 /* do nothing */;
678 else {
679 PyErr_Format(PyExc_ValueError,
680 "decoding error; "
681 "unknown error handling code: %.400s",
682 errors);
683 goto failed;
685 #ifndef Py_USING_UNICODE
686 case 'u':
687 case 'U':
688 case 'N':
689 if (unicode) {
690 PyErr_SetString(PyExc_ValueError,
691 "Unicode escapes not legal "
692 "when Unicode disabled");
693 goto failed;
695 #endif
696 default:
697 *p++ = '\\';
698 s--;
699 goto non_esc; /* an arbitry number of unescaped
700 UTF-8 bytes may follow. */
703 if (p-buf < newlen)
704 _PyString_Resize(&v, p - buf);
705 return v;
706 failed:
707 Py_DECREF(v);
708 return NULL;
711 /* -------------------------------------------------------------------- */
712 /* object api */
714 static Py_ssize_t
715 string_getsize(register PyObject *op)
717 char *s;
718 Py_ssize_t len;
719 if (PyString_AsStringAndSize(op, &s, &len))
720 return -1;
721 return len;
724 static /*const*/ char *
725 string_getbuffer(register PyObject *op)
727 char *s;
728 Py_ssize_t len;
729 if (PyString_AsStringAndSize(op, &s, &len))
730 return NULL;
731 return s;
734 Py_ssize_t
735 PyString_Size(register PyObject *op)
737 if (!PyString_Check(op))
738 return string_getsize(op);
739 return Py_SIZE(op);
742 /*const*/ char *
743 PyString_AsString(register PyObject *op)
745 if (!PyString_Check(op))
746 return string_getbuffer(op);
747 return ((PyStringObject *)op) -> ob_sval;
751 PyString_AsStringAndSize(register PyObject *obj,
752 register char **s,
753 register Py_ssize_t *len)
755 if (s == NULL) {
756 PyErr_BadInternalCall();
757 return -1;
760 if (!PyString_Check(obj)) {
761 #ifdef Py_USING_UNICODE
762 if (PyUnicode_Check(obj)) {
763 obj = _PyUnicode_AsDefaultEncodedString(obj, NULL);
764 if (obj == NULL)
765 return -1;
767 else
768 #endif
770 PyErr_Format(PyExc_TypeError,
771 "expected string or Unicode object, "
772 "%.200s found", Py_TYPE(obj)->tp_name);
773 return -1;
777 *s = PyString_AS_STRING(obj);
778 if (len != NULL)
779 *len = PyString_GET_SIZE(obj);
780 else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) {
781 PyErr_SetString(PyExc_TypeError,
782 "expected string without null bytes");
783 return -1;
785 return 0;
788 /* -------------------------------------------------------------------- */
789 /* Methods */
791 #include "stringlib/stringdefs.h"
792 #include "stringlib/fastsearch.h"
794 #include "stringlib/count.h"
795 #include "stringlib/find.h"
796 #include "stringlib/partition.h"
798 #define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
799 #include "stringlib/localeutil.h"
803 static int
804 string_print(PyStringObject *op, FILE *fp, int flags)
806 Py_ssize_t i, str_len;
807 char c;
808 int quote;
810 /* XXX Ought to check for interrupts when writing long strings */
811 if (! PyString_CheckExact(op)) {
812 int ret;
813 /* A str subclass may have its own __str__ method. */
814 op = (PyStringObject *) PyObject_Str((PyObject *)op);
815 if (op == NULL)
816 return -1;
817 ret = string_print(op, fp, flags);
818 Py_DECREF(op);
819 return ret;
821 if (flags & Py_PRINT_RAW) {
822 char *data = op->ob_sval;
823 Py_ssize_t size = Py_SIZE(op);
824 Py_BEGIN_ALLOW_THREADS
825 while (size > INT_MAX) {
826 /* Very long strings cannot be written atomically.
827 * But don't write exactly INT_MAX bytes at a time
828 * to avoid memory aligment issues.
830 const int chunk_size = INT_MAX & ~0x3FFF;
831 fwrite(data, 1, chunk_size, fp);
832 data += chunk_size;
833 size -= chunk_size;
835 #ifdef __VMS
836 if (size) fwrite(data, (int)size, 1, fp);
837 #else
838 fwrite(data, 1, (int)size, fp);
839 #endif
840 Py_END_ALLOW_THREADS
841 return 0;
844 /* figure out which quote to use; single is preferred */
845 quote = '\'';
846 if (memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
847 !memchr(op->ob_sval, '"', Py_SIZE(op)))
848 quote = '"';
850 str_len = Py_SIZE(op);
851 Py_BEGIN_ALLOW_THREADS
852 fputc(quote, fp);
853 for (i = 0; i < str_len; i++) {
854 /* Since strings are immutable and the caller should have a
855 reference, accessing the interal buffer should not be an issue
856 with the GIL released. */
857 c = op->ob_sval[i];
858 if (c == quote || c == '\\')
859 fprintf(fp, "\\%c", c);
860 else if (c == '\t')
861 fprintf(fp, "\\t");
862 else if (c == '\n')
863 fprintf(fp, "\\n");
864 else if (c == '\r')
865 fprintf(fp, "\\r");
866 else if (c < ' ' || c >= 0x7f)
867 fprintf(fp, "\\x%02x", c & 0xff);
868 else
869 fputc(c, fp);
871 fputc(quote, fp);
872 Py_END_ALLOW_THREADS
873 return 0;
876 PyObject *
877 PyString_Repr(PyObject *obj, int smartquotes)
879 register PyStringObject* op = (PyStringObject*) obj;
880 size_t newsize = 2 + 4 * Py_SIZE(op);
881 PyObject *v;
882 if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) {
883 PyErr_SetString(PyExc_OverflowError,
884 "string is too large to make repr");
885 return NULL;
887 v = PyString_FromStringAndSize((char *)NULL, newsize);
888 if (v == NULL) {
889 return NULL;
891 else {
892 register Py_ssize_t i;
893 register char c;
894 register char *p;
895 int quote;
897 /* figure out which quote to use; single is preferred */
898 quote = '\'';
899 if (smartquotes &&
900 memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
901 !memchr(op->ob_sval, '"', Py_SIZE(op)))
902 quote = '"';
904 p = PyString_AS_STRING(v);
905 *p++ = quote;
906 for (i = 0; i < Py_SIZE(op); i++) {
907 /* There's at least enough room for a hex escape
908 and a closing quote. */
909 assert(newsize - (p - PyString_AS_STRING(v)) >= 5);
910 c = op->ob_sval[i];
911 if (c == quote || c == '\\')
912 *p++ = '\\', *p++ = c;
913 else if (c == '\t')
914 *p++ = '\\', *p++ = 't';
915 else if (c == '\n')
916 *p++ = '\\', *p++ = 'n';
917 else if (c == '\r')
918 *p++ = '\\', *p++ = 'r';
919 else if (c < ' ' || c >= 0x7f) {
920 /* For performance, we don't want to call
921 PyOS_snprintf here (extra layers of
922 function call). */
923 sprintf(p, "\\x%02x", c & 0xff);
924 p += 4;
926 else
927 *p++ = c;
929 assert(newsize - (p - PyString_AS_STRING(v)) >= 1);
930 *p++ = quote;
931 *p = '\0';
932 _PyString_Resize(
933 &v, (p - PyString_AS_STRING(v)));
934 return v;
938 static PyObject *
939 string_repr(PyObject *op)
941 return PyString_Repr(op, 1);
944 static PyObject *
945 string_str(PyObject *s)
947 assert(PyString_Check(s));
948 if (PyString_CheckExact(s)) {
949 Py_INCREF(s);
950 return s;
952 else {
953 /* Subtype -- return genuine string with the same value. */
954 PyStringObject *t = (PyStringObject *) s;
955 return PyString_FromStringAndSize(t->ob_sval, Py_SIZE(t));
959 static Py_ssize_t
960 string_length(PyStringObject *a)
962 return Py_SIZE(a);
965 static PyObject *
966 string_concat(register PyStringObject *a, register PyObject *bb)
968 register Py_ssize_t size;
969 register PyStringObject *op;
970 if (!PyString_Check(bb)) {
971 #ifdef Py_USING_UNICODE
972 if (PyUnicode_Check(bb))
973 return PyUnicode_Concat((PyObject *)a, bb);
974 #endif
975 if (PyByteArray_Check(bb))
976 return PyByteArray_Concat((PyObject *)a, bb);
977 PyErr_Format(PyExc_TypeError,
978 "cannot concatenate 'str' and '%.200s' objects",
979 Py_TYPE(bb)->tp_name);
980 return NULL;
982 #define b ((PyStringObject *)bb)
983 /* Optimize cases with empty left or right operand */
984 if ((Py_SIZE(a) == 0 || Py_SIZE(b) == 0) &&
985 PyString_CheckExact(a) && PyString_CheckExact(b)) {
986 if (Py_SIZE(a) == 0) {
987 Py_INCREF(bb);
988 return bb;
990 Py_INCREF(a);
991 return (PyObject *)a;
993 size = Py_SIZE(a) + Py_SIZE(b);
994 /* Check that string sizes are not negative, to prevent an
995 overflow in cases where we are passed incorrectly-created
996 strings with negative lengths (due to a bug in other code).
998 if (Py_SIZE(a) < 0 || Py_SIZE(b) < 0 ||
999 Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b)) {
1000 PyErr_SetString(PyExc_OverflowError,
1001 "strings are too large to concat");
1002 return NULL;
1005 /* Inline PyObject_NewVar */
1006 if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
1007 PyErr_SetString(PyExc_OverflowError,
1008 "strings are too large to concat");
1009 return NULL;
1011 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
1012 if (op == NULL)
1013 return PyErr_NoMemory();
1014 PyObject_INIT_VAR(op, &PyString_Type, size);
1015 op->ob_shash = -1;
1016 op->ob_sstate = SSTATE_NOT_INTERNED;
1017 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1018 Py_MEMCPY(op->ob_sval + Py_SIZE(a), b->ob_sval, Py_SIZE(b));
1019 op->ob_sval[size] = '\0';
1020 return (PyObject *) op;
1021 #undef b
1024 static PyObject *
1025 string_repeat(register PyStringObject *a, register Py_ssize_t n)
1027 register Py_ssize_t i;
1028 register Py_ssize_t j;
1029 register Py_ssize_t size;
1030 register PyStringObject *op;
1031 size_t nbytes;
1032 if (n < 0)
1033 n = 0;
1034 /* watch out for overflows: the size can overflow int,
1035 * and the # of bytes needed can overflow size_t
1037 size = Py_SIZE(a) * n;
1038 if (n && size / n != Py_SIZE(a)) {
1039 PyErr_SetString(PyExc_OverflowError,
1040 "repeated string is too long");
1041 return NULL;
1043 if (size == Py_SIZE(a) && PyString_CheckExact(a)) {
1044 Py_INCREF(a);
1045 return (PyObject *)a;
1047 nbytes = (size_t)size;
1048 if (nbytes + PyStringObject_SIZE <= nbytes) {
1049 PyErr_SetString(PyExc_OverflowError,
1050 "repeated string is too long");
1051 return NULL;
1053 op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + nbytes);
1054 if (op == NULL)
1055 return PyErr_NoMemory();
1056 PyObject_INIT_VAR(op, &PyString_Type, size);
1057 op->ob_shash = -1;
1058 op->ob_sstate = SSTATE_NOT_INTERNED;
1059 op->ob_sval[size] = '\0';
1060 if (Py_SIZE(a) == 1 && n > 0) {
1061 memset(op->ob_sval, a->ob_sval[0] , n);
1062 return (PyObject *) op;
1064 i = 0;
1065 if (i < size) {
1066 Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1067 i = Py_SIZE(a);
1069 while (i < size) {
1070 j = (i <= size-i) ? i : size-i;
1071 Py_MEMCPY(op->ob_sval+i, op->ob_sval, j);
1072 i += j;
1074 return (PyObject *) op;
1077 /* String slice a[i:j] consists of characters a[i] ... a[j-1] */
1079 static PyObject *
1080 string_slice(register PyStringObject *a, register Py_ssize_t i,
1081 register Py_ssize_t j)
1082 /* j -- may be negative! */
1084 if (i < 0)
1085 i = 0;
1086 if (j < 0)
1087 j = 0; /* Avoid signed/unsigned bug in next line */
1088 if (j > Py_SIZE(a))
1089 j = Py_SIZE(a);
1090 if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) {
1091 /* It's the same as a */
1092 Py_INCREF(a);
1093 return (PyObject *)a;
1095 if (j < i)
1096 j = i;
1097 return PyString_FromStringAndSize(a->ob_sval + i, j-i);
1100 static int
1101 string_contains(PyObject *str_obj, PyObject *sub_obj)
1103 if (!PyString_CheckExact(sub_obj)) {
1104 #ifdef Py_USING_UNICODE
1105 if (PyUnicode_Check(sub_obj))
1106 return PyUnicode_Contains(str_obj, sub_obj);
1107 #endif
1108 if (!PyString_Check(sub_obj)) {
1109 PyErr_Format(PyExc_TypeError,
1110 "'in <string>' requires string as left operand, "
1111 "not %.200s", Py_TYPE(sub_obj)->tp_name);
1112 return -1;
1116 return stringlib_contains_obj(str_obj, sub_obj);
1119 static PyObject *
1120 string_item(PyStringObject *a, register Py_ssize_t i)
1122 char pchar;
1123 PyObject *v;
1124 if (i < 0 || i >= Py_SIZE(a)) {
1125 PyErr_SetString(PyExc_IndexError, "string index out of range");
1126 return NULL;
1128 pchar = a->ob_sval[i];
1129 v = (PyObject *)characters[pchar & UCHAR_MAX];
1130 if (v == NULL)
1131 v = PyString_FromStringAndSize(&pchar, 1);
1132 else {
1133 #ifdef COUNT_ALLOCS
1134 one_strings++;
1135 #endif
1136 Py_INCREF(v);
1138 return v;
1141 static PyObject*
1142 string_richcompare(PyStringObject *a, PyStringObject *b, int op)
1144 int c;
1145 Py_ssize_t len_a, len_b;
1146 Py_ssize_t min_len;
1147 PyObject *result;
1149 /* Make sure both arguments are strings. */
1150 if (!(PyString_Check(a) && PyString_Check(b))) {
1151 result = Py_NotImplemented;
1152 goto out;
1154 if (a == b) {
1155 switch (op) {
1156 case Py_EQ:case Py_LE:case Py_GE:
1157 result = Py_True;
1158 goto out;
1159 case Py_NE:case Py_LT:case Py_GT:
1160 result = Py_False;
1161 goto out;
1164 if (op == Py_EQ) {
1165 /* Supporting Py_NE here as well does not save
1166 much time, since Py_NE is rarely used. */
1167 if (Py_SIZE(a) == Py_SIZE(b)
1168 && (a->ob_sval[0] == b->ob_sval[0]
1169 && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0)) {
1170 result = Py_True;
1171 } else {
1172 result = Py_False;
1174 goto out;
1176 len_a = Py_SIZE(a); len_b = Py_SIZE(b);
1177 min_len = (len_a < len_b) ? len_a : len_b;
1178 if (min_len > 0) {
1179 c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval);
1180 if (c==0)
1181 c = memcmp(a->ob_sval, b->ob_sval, min_len);
1182 } else
1183 c = 0;
1184 if (c == 0)
1185 c = (len_a < len_b) ? -1 : (len_a > len_b) ? 1 : 0;
1186 switch (op) {
1187 case Py_LT: c = c < 0; break;
1188 case Py_LE: c = c <= 0; break;
1189 case Py_EQ: assert(0); break; /* unreachable */
1190 case Py_NE: c = c != 0; break;
1191 case Py_GT: c = c > 0; break;
1192 case Py_GE: c = c >= 0; break;
1193 default:
1194 result = Py_NotImplemented;
1195 goto out;
1197 result = c ? Py_True : Py_False;
1198 out:
1199 Py_INCREF(result);
1200 return result;
1204 _PyString_Eq(PyObject *o1, PyObject *o2)
1206 PyStringObject *a = (PyStringObject*) o1;
1207 PyStringObject *b = (PyStringObject*) o2;
1208 return Py_SIZE(a) == Py_SIZE(b)
1209 && *a->ob_sval == *b->ob_sval
1210 && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0;
1213 static long
1214 string_hash(PyStringObject *a)
1216 register Py_ssize_t len;
1217 register unsigned char *p;
1218 register long x;
1220 if (a->ob_shash != -1)
1221 return a->ob_shash;
1222 len = Py_SIZE(a);
1223 p = (unsigned char *) a->ob_sval;
1224 x = *p << 7;
1225 while (--len >= 0)
1226 x = (1000003*x) ^ *p++;
1227 x ^= Py_SIZE(a);
1228 if (x == -1)
1229 x = -2;
1230 a->ob_shash = x;
1231 return x;
1234 static PyObject*
1235 string_subscript(PyStringObject* self, PyObject* item)
1237 if (PyIndex_Check(item)) {
1238 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
1239 if (i == -1 && PyErr_Occurred())
1240 return NULL;
1241 if (i < 0)
1242 i += PyString_GET_SIZE(self);
1243 return string_item(self, i);
1245 else if (PySlice_Check(item)) {
1246 Py_ssize_t start, stop, step, slicelength, cur, i;
1247 char* source_buf;
1248 char* result_buf;
1249 PyObject* result;
1251 if (PySlice_GetIndicesEx((PySliceObject*)item,
1252 PyString_GET_SIZE(self),
1253 &start, &stop, &step, &slicelength) < 0) {
1254 return NULL;
1257 if (slicelength <= 0) {
1258 return PyString_FromStringAndSize("", 0);
1260 else if (start == 0 && step == 1 &&
1261 slicelength == PyString_GET_SIZE(self) &&
1262 PyString_CheckExact(self)) {
1263 Py_INCREF(self);
1264 return (PyObject *)self;
1266 else if (step == 1) {
1267 return PyString_FromStringAndSize(
1268 PyString_AS_STRING(self) + start,
1269 slicelength);
1271 else {
1272 source_buf = PyString_AsString((PyObject*)self);
1273 result_buf = (char *)PyMem_Malloc(slicelength);
1274 if (result_buf == NULL)
1275 return PyErr_NoMemory();
1277 for (cur = start, i = 0; i < slicelength;
1278 cur += step, i++) {
1279 result_buf[i] = source_buf[cur];
1282 result = PyString_FromStringAndSize(result_buf,
1283 slicelength);
1284 PyMem_Free(result_buf);
1285 return result;
1288 else {
1289 PyErr_Format(PyExc_TypeError,
1290 "string indices must be integers, not %.200s",
1291 Py_TYPE(item)->tp_name);
1292 return NULL;
1296 static Py_ssize_t
1297 string_buffer_getreadbuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1299 if ( index != 0 ) {
1300 PyErr_SetString(PyExc_SystemError,
1301 "accessing non-existent string segment");
1302 return -1;
1304 *ptr = (void *)self->ob_sval;
1305 return Py_SIZE(self);
1308 static Py_ssize_t
1309 string_buffer_getwritebuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1311 PyErr_SetString(PyExc_TypeError,
1312 "Cannot use string as modifiable buffer");
1313 return -1;
1316 static Py_ssize_t
1317 string_buffer_getsegcount(PyStringObject *self, Py_ssize_t *lenp)
1319 if ( lenp )
1320 *lenp = Py_SIZE(self);
1321 return 1;
1324 static Py_ssize_t
1325 string_buffer_getcharbuf(PyStringObject *self, Py_ssize_t index, const char **ptr)
1327 if ( index != 0 ) {
1328 PyErr_SetString(PyExc_SystemError,
1329 "accessing non-existent string segment");
1330 return -1;
1332 *ptr = self->ob_sval;
1333 return Py_SIZE(self);
1336 static int
1337 string_buffer_getbuffer(PyStringObject *self, Py_buffer *view, int flags)
1339 return PyBuffer_FillInfo(view, (PyObject*)self,
1340 (void *)self->ob_sval, Py_SIZE(self),
1341 1, flags);
1344 static PySequenceMethods string_as_sequence = {
1345 (lenfunc)string_length, /*sq_length*/
1346 (binaryfunc)string_concat, /*sq_concat*/
1347 (ssizeargfunc)string_repeat, /*sq_repeat*/
1348 (ssizeargfunc)string_item, /*sq_item*/
1349 (ssizessizeargfunc)string_slice, /*sq_slice*/
1350 0, /*sq_ass_item*/
1351 0, /*sq_ass_slice*/
1352 (objobjproc)string_contains /*sq_contains*/
1355 static PyMappingMethods string_as_mapping = {
1356 (lenfunc)string_length,
1357 (binaryfunc)string_subscript,
1361 static PyBufferProcs string_as_buffer = {
1362 (readbufferproc)string_buffer_getreadbuf,
1363 (writebufferproc)string_buffer_getwritebuf,
1364 (segcountproc)string_buffer_getsegcount,
1365 (charbufferproc)string_buffer_getcharbuf,
1366 (getbufferproc)string_buffer_getbuffer,
1367 0, /* XXX */
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
/* True when the `length` bytes of `target` starting at `offset` equal the
 * first `length` bytes of `pattern`.  The first and last bytes are
 * compared directly before memcmp() checks the rest.
 *
 * Don't call if length < 2: the memcmp() length would go negative.
 *
 * Every argument is parenthesised so that expression arguments (for
 * example `buf + 1`) expand correctly; arguments are still evaluated
 * more than once, so they must be free of side effects.
 */
#define Py_STRING_MATCH(target, offset, pattern, length)                  \
    ((target)[(offset)] == (pattern)[0] &&                                \
     (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&            \
     !memcmp((target)+(offset)+1, (pattern)+1, (length)-2) )
/* The result list of a split is created with some slots already in
   place, which avoids a series of small reallocations.  For example
   "A A A A A A A A A A".split() (10 elements) would otherwise resize
   three times, to 4, 8 and then 16.  Most observed splits are of human
   text (roughly 11 words per line) or field-delimited data (usually
   1-10 fields).  For large strings the split algorithms are bandwidth
   limited, so a larger preallocation likely would not help. */

#define MAX_PREALLOC 12

/* Number of slots to preallocate: one more than the number of splits
   (5 splits gives 6 elements), capped at MAX_PREALLOC. */
#define PREALLOC_SIZE(maxsplit) \
    (maxsplit < MAX_PREALLOC ? maxsplit+1 : MAX_PREALLOC)
/* Append the substring data[left:right] to `list`.
 *
 * Relies on names from the expanding function: a PyObject *str scratch
 * variable, the PyObject *list being built, and an onError label that is
 * jumped to when creating or appending the substring fails.  The
 * temporary reference held in str is dropped on both outcomes.
 *
 * NOTE: this expands to several statements ending in an if/else, not a
 * single statement, so it is not safe as the body of an unbraced if.
 */
#define SPLIT_APPEND(data, left, right)                         \
    str = PyString_FromStringAndSize((data) + (left),           \
                                     (right) - (left));         \
    if (str == NULL)                                            \
        goto onError;                                           \
    if (PyList_Append(list, str)) {                             \
        Py_DECREF(str);                                         \
        goto onError;                                           \
    }                                                           \
    else                                                        \
        Py_DECREF(str);
/* Add the substring data[left:right] as element number `count` of `list`
 * and increment count.
 *
 * While count is below MAX_PREALLOC the new string is stored straight
 * into the slot with PyList_SET_ITEM (the list keeps the reference);
 * beyond that it is appended and the temporary reference is released.
 *
 * Relies on names from the expanding function: PyObject *str,
 * PyObject *list, Py_ssize_t count and an onError label.
 */
#define SPLIT_ADD(data, left, right) {                          \
    str = PyString_FromStringAndSize((data) + (left),           \
                                     (right) - (left));         \
    if (str == NULL)                                            \
        goto onError;                                           \
    if (count < MAX_PREALLOC) {                                 \
        PyList_SET_ITEM(list, count, str);                      \
    } else {                                                    \
        if (PyList_Append(list, str)) {                         \
            Py_DECREF(str);                                     \
            goto onError;                                       \
        }                                                       \
        else                                                    \
            Py_DECREF(str);                                     \
    }                                                           \
    count++; }
/* Set the list's size field to the number of elements actually stored
   (`count` in the expanding function). */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count

/* Cursor helpers for the whitespace splitters.  Each advances (or, for
   the R variants, retreats) index i through s while the current byte is
   / is not whitespace, stopping at the end (i == len) or the start
   (i < 0) of the buffer. */
#define SKIP_SPACE(s, i, len)    { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; }
#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; }
#define RSKIP_SPACE(s, i)        { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; }
#define RSKIP_NONSPACE(s, i)     { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; }
1439 Py_LOCAL_INLINE(PyObject *)
1440 split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1442 const char *s = PyString_AS_STRING(self);
1443 Py_ssize_t i, j, count=0;
1444 PyObject *str;
1445 PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1447 if (list == NULL)
1448 return NULL;
1450 i = j = 0;
1452 while (maxsplit-- > 0) {
1453 SKIP_SPACE(s, i, len);
1454 if (i==len) break;
1455 j = i; i++;
1456 SKIP_NONSPACE(s, i, len);
1457 if (j == 0 && i == len && PyString_CheckExact(self)) {
1458 /* No whitespace in self, so just use it as list[0] */
1459 Py_INCREF(self);
1460 PyList_SET_ITEM(list, 0, (PyObject *)self);
1461 count++;
1462 break;
1464 SPLIT_ADD(s, j, i);
1467 if (i < len) {
1468 /* Only occurs when maxsplit was reached */
1469 /* Skip any remaining whitespace and copy to end of string */
1470 SKIP_SPACE(s, i, len);
1471 if (i != len)
1472 SPLIT_ADD(s, i, len);
1474 FIX_PREALLOC_SIZE(list);
1475 return list;
1476 onError:
1477 Py_DECREF(list);
1478 return NULL;
1481 Py_LOCAL_INLINE(PyObject *)
1482 split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1484 const char *s = PyString_AS_STRING(self);
1485 register Py_ssize_t i, j, count=0;
1486 PyObject *str;
1487 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1489 if (list == NULL)
1490 return NULL;
1492 i = j = 0;
1493 while ((j < len) && (maxcount-- > 0)) {
1494 for(; j<len; j++) {
1495 /* I found that using memchr makes no difference */
1496 if (s[j] == ch) {
1497 SPLIT_ADD(s, i, j);
1498 i = j = j + 1;
1499 break;
1503 if (i == 0 && count == 0 && PyString_CheckExact(self)) {
1504 /* ch not in self, so just use self as list[0] */
1505 Py_INCREF(self);
1506 PyList_SET_ITEM(list, 0, (PyObject *)self);
1507 count++;
1509 else if (i <= len) {
1510 SPLIT_ADD(s, i, len);
1512 FIX_PREALLOC_SIZE(list);
1513 return list;
1515 onError:
1516 Py_DECREF(list);
1517 return NULL;
1520 PyDoc_STRVAR(split__doc__,
1521 "S.split([sep [,maxsplit]]) -> list of strings\n\
1523 Return a list of the words in the string S, using sep as the\n\
1524 delimiter string. If maxsplit is given, at most maxsplit\n\
1525 splits are done. If sep is not specified or is None, any\n\
1526 whitespace string is a separator and empty strings are removed\n\
1527 from the result.");
1529 static PyObject *
1530 string_split(PyStringObject *self, PyObject *args)
1532 Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1533 Py_ssize_t maxsplit = -1, count=0;
1534 const char *s = PyString_AS_STRING(self), *sub;
1535 PyObject *list, *str, *subobj = Py_None;
1536 #ifdef USE_FAST
1537 Py_ssize_t pos;
1538 #endif
1540 if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
1541 return NULL;
1542 if (maxsplit < 0)
1543 maxsplit = PY_SSIZE_T_MAX;
1544 if (subobj == Py_None)
1545 return split_whitespace(self, len, maxsplit);
1546 if (PyString_Check(subobj)) {
1547 sub = PyString_AS_STRING(subobj);
1548 n = PyString_GET_SIZE(subobj);
1550 #ifdef Py_USING_UNICODE
1551 else if (PyUnicode_Check(subobj))
1552 return PyUnicode_Split((PyObject *)self, subobj, maxsplit);
1553 #endif
1554 else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1555 return NULL;
1557 if (n == 0) {
1558 PyErr_SetString(PyExc_ValueError, "empty separator");
1559 return NULL;
1561 else if (n == 1)
1562 return split_char(self, len, sub[0], maxsplit);
1564 list = PyList_New(PREALLOC_SIZE(maxsplit));
1565 if (list == NULL)
1566 return NULL;
1568 #ifdef USE_FAST
1569 i = j = 0;
1570 while (maxsplit-- > 0) {
1571 pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
1572 if (pos < 0)
1573 break;
1574 j = i+pos;
1575 SPLIT_ADD(s, i, j);
1576 i = j + n;
1578 #else
1579 i = j = 0;
1580 while ((j+n <= len) && (maxsplit-- > 0)) {
1581 for (; j+n <= len; j++) {
1582 if (Py_STRING_MATCH(s, j, sub, n)) {
1583 SPLIT_ADD(s, i, j);
1584 i = j = j + n;
1585 break;
1589 #endif
1590 SPLIT_ADD(s, i, len);
1591 FIX_PREALLOC_SIZE(list);
1592 return list;
1594 onError:
1595 Py_DECREF(list);
1596 return NULL;
1599 PyDoc_STRVAR(partition__doc__,
1600 "S.partition(sep) -> (head, sep, tail)\n\
1602 Search for the separator sep in S, and return the part before it,\n\
1603 the separator itself, and the part after it. If the separator is not\n\
1604 found, return S and two empty strings.");
1606 static PyObject *
1607 string_partition(PyStringObject *self, PyObject *sep_obj)
1609 const char *sep;
1610 Py_ssize_t sep_len;
1612 if (PyString_Check(sep_obj)) {
1613 sep = PyString_AS_STRING(sep_obj);
1614 sep_len = PyString_GET_SIZE(sep_obj);
1616 #ifdef Py_USING_UNICODE
1617 else if (PyUnicode_Check(sep_obj))
1618 return PyUnicode_Partition((PyObject *) self, sep_obj);
1619 #endif
1620 else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1621 return NULL;
1623 return stringlib_partition(
1624 (PyObject*) self,
1625 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1626 sep_obj, sep, sep_len
1630 PyDoc_STRVAR(rpartition__doc__,
1631 "S.rpartition(sep) -> (tail, sep, head)\n\
1633 Search for the separator sep in S, starting at the end of S, and return\n\
1634 the part before it, the separator itself, and the part after it. If the\n\
1635 separator is not found, return two empty strings and S.");
1637 static PyObject *
1638 string_rpartition(PyStringObject *self, PyObject *sep_obj)
1640 const char *sep;
1641 Py_ssize_t sep_len;
1643 if (PyString_Check(sep_obj)) {
1644 sep = PyString_AS_STRING(sep_obj);
1645 sep_len = PyString_GET_SIZE(sep_obj);
1647 #ifdef Py_USING_UNICODE
1648 else if (PyUnicode_Check(sep_obj))
1649 return PyUnicode_RPartition((PyObject *) self, sep_obj);
1650 #endif
1651 else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1652 return NULL;
1654 return stringlib_rpartition(
1655 (PyObject*) self,
1656 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1657 sep_obj, sep, sep_len
1661 Py_LOCAL_INLINE(PyObject *)
1662 rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1664 const char *s = PyString_AS_STRING(self);
1665 Py_ssize_t i, j, count=0;
1666 PyObject *str;
1667 PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1669 if (list == NULL)
1670 return NULL;
1672 i = j = len-1;
1674 while (maxsplit-- > 0) {
1675 RSKIP_SPACE(s, i);
1676 if (i<0) break;
1677 j = i; i--;
1678 RSKIP_NONSPACE(s, i);
1679 if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
1680 /* No whitespace in self, so just use it as list[0] */
1681 Py_INCREF(self);
1682 PyList_SET_ITEM(list, 0, (PyObject *)self);
1683 count++;
1684 break;
1686 SPLIT_ADD(s, i + 1, j + 1);
1688 if (i >= 0) {
1689 /* Only occurs when maxsplit was reached */
1690 /* Skip any remaining whitespace and copy to beginning of string */
1691 RSKIP_SPACE(s, i);
1692 if (i >= 0)
1693 SPLIT_ADD(s, 0, i + 1);
1696 FIX_PREALLOC_SIZE(list);
1697 if (PyList_Reverse(list) < 0)
1698 goto onError;
1699 return list;
1700 onError:
1701 Py_DECREF(list);
1702 return NULL;
1705 Py_LOCAL_INLINE(PyObject *)
1706 rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1708 const char *s = PyString_AS_STRING(self);
1709 register Py_ssize_t i, j, count=0;
1710 PyObject *str;
1711 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1713 if (list == NULL)
1714 return NULL;
1716 i = j = len - 1;
1717 while ((i >= 0) && (maxcount-- > 0)) {
1718 for (; i >= 0; i--) {
1719 if (s[i] == ch) {
1720 SPLIT_ADD(s, i + 1, j + 1);
1721 j = i = i - 1;
1722 break;
1726 if (i < 0 && count == 0 && PyString_CheckExact(self)) {
1727 /* ch not in self, so just use self as list[0] */
1728 Py_INCREF(self);
1729 PyList_SET_ITEM(list, 0, (PyObject *)self);
1730 count++;
1732 else if (j >= -1) {
1733 SPLIT_ADD(s, 0, j + 1);
1735 FIX_PREALLOC_SIZE(list);
1736 if (PyList_Reverse(list) < 0)
1737 goto onError;
1738 return list;
1740 onError:
1741 Py_DECREF(list);
1742 return NULL;
1745 PyDoc_STRVAR(rsplit__doc__,
1746 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
1748 Return a list of the words in the string S, using sep as the\n\
1749 delimiter string, starting at the end of the string and working\n\
1750 to the front. If maxsplit is given, at most maxsplit splits are\n\
1751 done. If sep is not specified or is None, any whitespace string\n\
1752 is a separator.");
1754 static PyObject *
1755 string_rsplit(PyStringObject *self, PyObject *args)
1757 Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1758 Py_ssize_t maxsplit = -1, count=0;
1759 const char *s, *sub;
1760 PyObject *list, *str, *subobj = Py_None;
1762 if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
1763 return NULL;
1764 if (maxsplit < 0)
1765 maxsplit = PY_SSIZE_T_MAX;
1766 if (subobj == Py_None)
1767 return rsplit_whitespace(self, len, maxsplit);
1768 if (PyString_Check(subobj)) {
1769 sub = PyString_AS_STRING(subobj);
1770 n = PyString_GET_SIZE(subobj);
1772 #ifdef Py_USING_UNICODE
1773 else if (PyUnicode_Check(subobj))
1774 return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
1775 #endif
1776 else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1777 return NULL;
1779 if (n == 0) {
1780 PyErr_SetString(PyExc_ValueError, "empty separator");
1781 return NULL;
1783 else if (n == 1)
1784 return rsplit_char(self, len, sub[0], maxsplit);
1786 list = PyList_New(PREALLOC_SIZE(maxsplit));
1787 if (list == NULL)
1788 return NULL;
1790 j = len;
1791 i = j - n;
1793 s = PyString_AS_STRING(self);
1794 while ( (i >= 0) && (maxsplit-- > 0) ) {
1795 for (; i>=0; i--) {
1796 if (Py_STRING_MATCH(s, i, sub, n)) {
1797 SPLIT_ADD(s, i + n, j);
1798 j = i;
1799 i -= n;
1800 break;
1804 SPLIT_ADD(s, 0, j);
1805 FIX_PREALLOC_SIZE(list);
1806 if (PyList_Reverse(list) < 0)
1807 goto onError;
1808 return list;
1810 onError:
1811 Py_DECREF(list);
1812 return NULL;
1816 PyDoc_STRVAR(join__doc__,
1817 "S.join(sequence) -> string\n\
1819 Return a string which is the concatenation of the strings in the\n\
1820 sequence. The separator between elements is S.");
1822 static PyObject *
1823 string_join(PyStringObject *self, PyObject *orig)
1825 char *sep = PyString_AS_STRING(self);
1826 const Py_ssize_t seplen = PyString_GET_SIZE(self);
1827 PyObject *res = NULL;
1828 char *p;
1829 Py_ssize_t seqlen = 0;
1830 size_t sz = 0;
1831 Py_ssize_t i;
1832 PyObject *seq, *item;
1834 seq = PySequence_Fast(orig, "");
1835 if (seq == NULL) {
1836 return NULL;
1839 seqlen = PySequence_Size(seq);
1840 if (seqlen == 0) {
1841 Py_DECREF(seq);
1842 return PyString_FromString("");
1844 if (seqlen == 1) {
1845 item = PySequence_Fast_GET_ITEM(seq, 0);
1846 if (PyString_CheckExact(item) || PyUnicode_CheckExact(item)) {
1847 Py_INCREF(item);
1848 Py_DECREF(seq);
1849 return item;
1853 /* There are at least two things to join, or else we have a subclass
1854 * of the builtin types in the sequence.
1855 * Do a pre-pass to figure out the total amount of space we'll
1856 * need (sz), see whether any argument is absurd, and defer to
1857 * the Unicode join if appropriate.
1859 for (i = 0; i < seqlen; i++) {
1860 const size_t old_sz = sz;
1861 item = PySequence_Fast_GET_ITEM(seq, i);
1862 if (!PyString_Check(item)){
1863 #ifdef Py_USING_UNICODE
1864 if (PyUnicode_Check(item)) {
1865 /* Defer to Unicode join.
1866 * CAUTION: There's no gurantee that the
1867 * original sequence can be iterated over
1868 * again, so we must pass seq here.
1870 PyObject *result;
1871 result = PyUnicode_Join((PyObject *)self, seq);
1872 Py_DECREF(seq);
1873 return result;
1875 #endif
1876 PyErr_Format(PyExc_TypeError,
1877 "sequence item %zd: expected string,"
1878 " %.80s found",
1879 i, Py_TYPE(item)->tp_name);
1880 Py_DECREF(seq);
1881 return NULL;
1883 sz += PyString_GET_SIZE(item);
1884 if (i != 0)
1885 sz += seplen;
1886 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
1887 PyErr_SetString(PyExc_OverflowError,
1888 "join() result is too long for a Python string");
1889 Py_DECREF(seq);
1890 return NULL;
1894 /* Allocate result space. */
1895 res = PyString_FromStringAndSize((char*)NULL, sz);
1896 if (res == NULL) {
1897 Py_DECREF(seq);
1898 return NULL;
1901 /* Catenate everything. */
1902 p = PyString_AS_STRING(res);
1903 for (i = 0; i < seqlen; ++i) {
1904 size_t n;
1905 item = PySequence_Fast_GET_ITEM(seq, i);
1906 n = PyString_GET_SIZE(item);
1907 Py_MEMCPY(p, PyString_AS_STRING(item), n);
1908 p += n;
1909 if (i < seqlen - 1) {
1910 Py_MEMCPY(p, sep, seplen);
1911 p += seplen;
1915 Py_DECREF(seq);
1916 return res;
1919 PyObject *
1920 _PyString_Join(PyObject *sep, PyObject *x)
1922 assert(sep != NULL && PyString_Check(sep));
1923 assert(x != NULL);
1924 return string_join((PyStringObject *)sep, x);
1927 Py_LOCAL_INLINE(void)
1928 string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
1930 if (*end > len)
1931 *end = len;
1932 else if (*end < 0)
1933 *end += len;
1934 if (*end < 0)
1935 *end = 0;
1936 if (*start < 0)
1937 *start += len;
1938 if (*start < 0)
1939 *start = 0;
1942 Py_LOCAL_INLINE(Py_ssize_t)
1943 string_find_internal(PyStringObject *self, PyObject *args, int dir)
1945 PyObject *subobj;
1946 const char *sub;
1947 Py_ssize_t sub_len;
1948 Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
1949 PyObject *obj_start=Py_None, *obj_end=Py_None;
1951 if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj,
1952 &obj_start, &obj_end))
1953 return -2;
1954 /* To support None in "start" and "end" arguments, meaning
1955 the same as if they were not passed.
1957 if (obj_start != Py_None)
1958 if (!_PyEval_SliceIndex(obj_start, &start))
1959 return -2;
1960 if (obj_end != Py_None)
1961 if (!_PyEval_SliceIndex(obj_end, &end))
1962 return -2;
1964 if (PyString_Check(subobj)) {
1965 sub = PyString_AS_STRING(subobj);
1966 sub_len = PyString_GET_SIZE(subobj);
1968 #ifdef Py_USING_UNICODE
1969 else if (PyUnicode_Check(subobj))
1970 return PyUnicode_Find(
1971 (PyObject *)self, subobj, start, end, dir);
1972 #endif
1973 else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
1974 /* XXX - the "expected a character buffer object" is pretty
1975 confusing for a non-expert. remap to something else ? */
1976 return -2;
1978 if (dir > 0)
1979 return stringlib_find_slice(
1980 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1981 sub, sub_len, start, end);
1982 else
1983 return stringlib_rfind_slice(
1984 PyString_AS_STRING(self), PyString_GET_SIZE(self),
1985 sub, sub_len, start, end);
1989 PyDoc_STRVAR(find__doc__,
1990 "S.find(sub [,start [,end]]) -> int\n\
1992 Return the lowest index in S where substring sub is found,\n\
1993 such that sub is contained within s[start:end]. Optional\n\
1994 arguments start and end are interpreted as in slice notation.\n\
1996 Return -1 on failure.");
1998 static PyObject *
1999 string_find(PyStringObject *self, PyObject *args)
2001 Py_ssize_t result = string_find_internal(self, args, +1);
2002 if (result == -2)
2003 return NULL;
2004 return PyInt_FromSsize_t(result);
2008 PyDoc_STRVAR(index__doc__,
2009 "S.index(sub [,start [,end]]) -> int\n\
2011 Like S.find() but raise ValueError when the substring is not found.");
2013 static PyObject *
2014 string_index(PyStringObject *self, PyObject *args)
2016 Py_ssize_t result = string_find_internal(self, args, +1);
2017 if (result == -2)
2018 return NULL;
2019 if (result == -1) {
2020 PyErr_SetString(PyExc_ValueError,
2021 "substring not found");
2022 return NULL;
2024 return PyInt_FromSsize_t(result);
2028 PyDoc_STRVAR(rfind__doc__,
2029 "S.rfind(sub [,start [,end]]) -> int\n\
2031 Return the highest index in S where substring sub is found,\n\
2032 such that sub is contained within s[start:end]. Optional\n\
2033 arguments start and end are interpreted as in slice notation.\n\
2035 Return -1 on failure.");
2037 static PyObject *
2038 string_rfind(PyStringObject *self, PyObject *args)
2040 Py_ssize_t result = string_find_internal(self, args, -1);
2041 if (result == -2)
2042 return NULL;
2043 return PyInt_FromSsize_t(result);
2047 PyDoc_STRVAR(rindex__doc__,
2048 "S.rindex(sub [,start [,end]]) -> int\n\
2050 Like S.rfind() but raise ValueError when the substring is not found.");
2052 static PyObject *
2053 string_rindex(PyStringObject *self, PyObject *args)
2055 Py_ssize_t result = string_find_internal(self, args, -1);
2056 if (result == -2)
2057 return NULL;
2058 if (result == -1) {
2059 PyErr_SetString(PyExc_ValueError,
2060 "substring not found");
2061 return NULL;
2063 return PyInt_FromSsize_t(result);
2067 Py_LOCAL_INLINE(PyObject *)
2068 do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
2070 char *s = PyString_AS_STRING(self);
2071 Py_ssize_t len = PyString_GET_SIZE(self);
2072 char *sep = PyString_AS_STRING(sepobj);
2073 Py_ssize_t seplen = PyString_GET_SIZE(sepobj);
2074 Py_ssize_t i, j;
2076 i = 0;
2077 if (striptype != RIGHTSTRIP) {
2078 while (i < len && memchr(sep, Py_CHARMASK(s[i]), seplen)) {
2079 i++;
2083 j = len;
2084 if (striptype != LEFTSTRIP) {
2085 do {
2086 j--;
2087 } while (j >= i && memchr(sep, Py_CHARMASK(s[j]), seplen));
2088 j++;
2091 if (i == 0 && j == len && PyString_CheckExact(self)) {
2092 Py_INCREF(self);
2093 return (PyObject*)self;
2095 else
2096 return PyString_FromStringAndSize(s+i, j-i);
2100 Py_LOCAL_INLINE(PyObject *)
2101 do_strip(PyStringObject *self, int striptype)
2103 char *s = PyString_AS_STRING(self);
2104 Py_ssize_t len = PyString_GET_SIZE(self), i, j;
2106 i = 0;
2107 if (striptype != RIGHTSTRIP) {
2108 while (i < len && isspace(Py_CHARMASK(s[i]))) {
2109 i++;
2113 j = len;
2114 if (striptype != LEFTSTRIP) {
2115 do {
2116 j--;
2117 } while (j >= i && isspace(Py_CHARMASK(s[j])));
2118 j++;
2121 if (i == 0 && j == len && PyString_CheckExact(self)) {
2122 Py_INCREF(self);
2123 return (PyObject*)self;
2125 else
2126 return PyString_FromStringAndSize(s+i, j-i);
2130 Py_LOCAL_INLINE(PyObject *)
2131 do_argstrip(PyStringObject *self, int striptype, PyObject *args)
2133 PyObject *sep = NULL;
2135 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
2136 return NULL;
2138 if (sep != NULL && sep != Py_None) {
2139 if (PyString_Check(sep))
2140 return do_xstrip(self, striptype, sep);
2141 #ifdef Py_USING_UNICODE
2142 else if (PyUnicode_Check(sep)) {
2143 PyObject *uniself = PyUnicode_FromObject((PyObject *)self);
2144 PyObject *res;
2145 if (uniself==NULL)
2146 return NULL;
2147 res = _PyUnicode_XStrip((PyUnicodeObject *)uniself,
2148 striptype, sep);
2149 Py_DECREF(uniself);
2150 return res;
2152 #endif
2153 PyErr_Format(PyExc_TypeError,
2154 #ifdef Py_USING_UNICODE
2155 "%s arg must be None, str or unicode",
2156 #else
2157 "%s arg must be None or str",
2158 #endif
2159 STRIPNAME(striptype));
2160 return NULL;
2163 return do_strip(self, striptype);
2167 PyDoc_STRVAR(strip__doc__,
2168 "S.strip([chars]) -> string or unicode\n\
2170 Return a copy of the string S with leading and trailing\n\
2171 whitespace removed.\n\
2172 If chars is given and not None, remove characters in chars instead.\n\
2173 If chars is unicode, S will be converted to unicode before stripping");
2175 static PyObject *
2176 string_strip(PyStringObject *self, PyObject *args)
2178 if (PyTuple_GET_SIZE(args) == 0)
2179 return do_strip(self, BOTHSTRIP); /* Common case */
2180 else
2181 return do_argstrip(self, BOTHSTRIP, args);
2185 PyDoc_STRVAR(lstrip__doc__,
2186 "S.lstrip([chars]) -> string or unicode\n\
2188 Return a copy of the string S with leading whitespace removed.\n\
2189 If chars is given and not None, remove characters in chars instead.\n\
2190 If chars is unicode, S will be converted to unicode before stripping");
2192 static PyObject *
2193 string_lstrip(PyStringObject *self, PyObject *args)
2195 if (PyTuple_GET_SIZE(args) == 0)
2196 return do_strip(self, LEFTSTRIP); /* Common case */
2197 else
2198 return do_argstrip(self, LEFTSTRIP, args);
2202 PyDoc_STRVAR(rstrip__doc__,
2203 "S.rstrip([chars]) -> string or unicode\n\
2205 Return a copy of the string S with trailing whitespace removed.\n\
2206 If chars is given and not None, remove characters in chars instead.\n\
2207 If chars is unicode, S will be converted to unicode before stripping");
2209 static PyObject *
2210 string_rstrip(PyStringObject *self, PyObject *args)
2212 if (PyTuple_GET_SIZE(args) == 0)
2213 return do_strip(self, RIGHTSTRIP); /* Common case */
2214 else
2215 return do_argstrip(self, RIGHTSTRIP, args);
2219 PyDoc_STRVAR(lower__doc__,
2220 "S.lower() -> string\n\
2222 Return a copy of the string S converted to lowercase.");
2224 /* _tolower and _toupper are defined by SUSv2, but they're not ISO C */
2225 #ifndef _tolower
2226 #define _tolower tolower
2227 #endif
2229 static PyObject *
2230 string_lower(PyStringObject *self)
2232 char *s;
2233 Py_ssize_t i, n = PyString_GET_SIZE(self);
2234 PyObject *newobj;
2236 newobj = PyString_FromStringAndSize(NULL, n);
2237 if (!newobj)
2238 return NULL;
2240 s = PyString_AS_STRING(newobj);
2242 Py_MEMCPY(s, PyString_AS_STRING(self), n);
2244 for (i = 0; i < n; i++) {
2245 int c = Py_CHARMASK(s[i]);
2246 if (isupper(c))
2247 s[i] = _tolower(c);
2250 return newobj;
2253 PyDoc_STRVAR(upper__doc__,
2254 "S.upper() -> string\n\
2256 Return a copy of the string S converted to uppercase.");
2258 #ifndef _toupper
2259 #define _toupper toupper
2260 #endif
2262 static PyObject *
2263 string_upper(PyStringObject *self)
2265 char *s;
2266 Py_ssize_t i, n = PyString_GET_SIZE(self);
2267 PyObject *newobj;
2269 newobj = PyString_FromStringAndSize(NULL, n);
2270 if (!newobj)
2271 return NULL;
2273 s = PyString_AS_STRING(newobj);
2275 Py_MEMCPY(s, PyString_AS_STRING(self), n);
2277 for (i = 0; i < n; i++) {
2278 int c = Py_CHARMASK(s[i]);
2279 if (islower(c))
2280 s[i] = _toupper(c);
2283 return newobj;
2286 PyDoc_STRVAR(title__doc__,
2287 "S.title() -> string\n\
2289 Return a titlecased version of S, i.e. words start with uppercase\n\
2290 characters, all remaining cased characters have lowercase.");
2292 static PyObject*
2293 string_title(PyStringObject *self)
2295 char *s = PyString_AS_STRING(self), *s_new;
2296 Py_ssize_t i, n = PyString_GET_SIZE(self);
2297 int previous_is_cased = 0;
2298 PyObject *newobj;
2300 newobj = PyString_FromStringAndSize(NULL, n);
2301 if (newobj == NULL)
2302 return NULL;
2303 s_new = PyString_AsString(newobj);
2304 for (i = 0; i < n; i++) {
2305 int c = Py_CHARMASK(*s++);
2306 if (islower(c)) {
2307 if (!previous_is_cased)
2308 c = toupper(c);
2309 previous_is_cased = 1;
2310 } else if (isupper(c)) {
2311 if (previous_is_cased)
2312 c = tolower(c);
2313 previous_is_cased = 1;
2314 } else
2315 previous_is_cased = 0;
2316 *s_new++ = c;
2318 return newobj;
2321 PyDoc_STRVAR(capitalize__doc__,
2322 "S.capitalize() -> string\n\
2324 Return a copy of the string S with only its first character\n\
2325 capitalized.");
2327 static PyObject *
2328 string_capitalize(PyStringObject *self)
2330 char *s = PyString_AS_STRING(self), *s_new;
2331 Py_ssize_t i, n = PyString_GET_SIZE(self);
2332 PyObject *newobj;
2334 newobj = PyString_FromStringAndSize(NULL, n);
2335 if (newobj == NULL)
2336 return NULL;
2337 s_new = PyString_AsString(newobj);
2338 if (0 < n) {
2339 int c = Py_CHARMASK(*s++);
2340 if (islower(c))
2341 *s_new = toupper(c);
2342 else
2343 *s_new = c;
2344 s_new++;
2346 for (i = 1; i < n; i++) {
2347 int c = Py_CHARMASK(*s++);
2348 if (isupper(c))
2349 *s_new = tolower(c);
2350 else
2351 *s_new = c;
2352 s_new++;
2354 return newobj;
2358 PyDoc_STRVAR(count__doc__,
2359 "S.count(sub[, start[, end]]) -> int\n\
2361 Return the number of non-overlapping occurrences of substring sub in\n\
2362 string S[start:end]. Optional arguments start and end are interpreted\n\
2363 as in slice notation.");
2365 static PyObject *
2366 string_count(PyStringObject *self, PyObject *args)
2368 PyObject *sub_obj;
2369 const char *str = PyString_AS_STRING(self), *sub;
2370 Py_ssize_t sub_len;
2371 Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
2373 if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj,
2374 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
2375 return NULL;
2377 if (PyString_Check(sub_obj)) {
2378 sub = PyString_AS_STRING(sub_obj);
2379 sub_len = PyString_GET_SIZE(sub_obj);
2381 #ifdef Py_USING_UNICODE
2382 else if (PyUnicode_Check(sub_obj)) {
2383 Py_ssize_t count;
2384 count = PyUnicode_Count((PyObject *)self, sub_obj, start, end);
2385 if (count == -1)
2386 return NULL;
2387 else
2388 return PyInt_FromSsize_t(count);
2390 #endif
2391 else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
2392 return NULL;
2394 string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
2396 return PyInt_FromSsize_t(
2397 stringlib_count(str + start, end - start, sub, sub_len)
2401 PyDoc_STRVAR(swapcase__doc__,
2402 "S.swapcase() -> string\n\
2404 Return a copy of the string S with uppercase characters\n\
2405 converted to lowercase and vice versa.");
2407 static PyObject *
2408 string_swapcase(PyStringObject *self)
2410 char *s = PyString_AS_STRING(self), *s_new;
2411 Py_ssize_t i, n = PyString_GET_SIZE(self);
2412 PyObject *newobj;
2414 newobj = PyString_FromStringAndSize(NULL, n);
2415 if (newobj == NULL)
2416 return NULL;
2417 s_new = PyString_AsString(newobj);
2418 for (i = 0; i < n; i++) {
2419 int c = Py_CHARMASK(*s++);
2420 if (islower(c)) {
2421 *s_new = toupper(c);
2423 else if (isupper(c)) {
2424 *s_new = tolower(c);
2426 else
2427 *s_new = c;
2428 s_new++;
2430 return newobj;
2434 PyDoc_STRVAR(translate__doc__,
2435 "S.translate(table [,deletechars]) -> string\n\
2437 Return a copy of the string S, where all characters occurring\n\
2438 in the optional argument deletechars are removed, and the\n\
2439 remaining characters have been mapped through the given\n\
2440 translation table, which must be a string of length 256.");
2442 static PyObject *
2443 string_translate(PyStringObject *self, PyObject *args)
2445 register char *input, *output;
2446 const char *table;
2447 register Py_ssize_t i, c, changed = 0;
2448 PyObject *input_obj = (PyObject*)self;
2449 const char *output_start, *del_table=NULL;
2450 Py_ssize_t inlen, tablen, dellen = 0;
2451 PyObject *result;
2452 int trans_table[256];
2453 PyObject *tableobj, *delobj = NULL;
2455 if (!PyArg_UnpackTuple(args, "translate", 1, 2,
2456 &tableobj, &delobj))
2457 return NULL;
2459 if (PyString_Check(tableobj)) {
2460 table = PyString_AS_STRING(tableobj);
2461 tablen = PyString_GET_SIZE(tableobj);
2463 else if (tableobj == Py_None) {
2464 table = NULL;
2465 tablen = 256;
2467 #ifdef Py_USING_UNICODE
2468 else if (PyUnicode_Check(tableobj)) {
2469 /* Unicode .translate() does not support the deletechars
2470 parameter; instead a mapping to None will cause characters
2471 to be deleted. */
2472 if (delobj != NULL) {
2473 PyErr_SetString(PyExc_TypeError,
2474 "deletions are implemented differently for unicode");
2475 return NULL;
2477 return PyUnicode_Translate((PyObject *)self, tableobj, NULL);
2479 #endif
2480 else if (PyObject_AsCharBuffer(tableobj, &table, &tablen))
2481 return NULL;
2483 if (tablen != 256) {
2484 PyErr_SetString(PyExc_ValueError,
2485 "translation table must be 256 characters long");
2486 return NULL;
2489 if (delobj != NULL) {
2490 if (PyString_Check(delobj)) {
2491 del_table = PyString_AS_STRING(delobj);
2492 dellen = PyString_GET_SIZE(delobj);
2494 #ifdef Py_USING_UNICODE
2495 else if (PyUnicode_Check(delobj)) {
2496 PyErr_SetString(PyExc_TypeError,
2497 "deletions are implemented differently for unicode");
2498 return NULL;
2500 #endif
2501 else if (PyObject_AsCharBuffer(delobj, &del_table, &dellen))
2502 return NULL;
2504 else {
2505 del_table = NULL;
2506 dellen = 0;
2509 inlen = PyString_GET_SIZE(input_obj);
2510 result = PyString_FromStringAndSize((char *)NULL, inlen);
2511 if (result == NULL)
2512 return NULL;
2513 output_start = output = PyString_AsString(result);
2514 input = PyString_AS_STRING(input_obj);
2516 if (dellen == 0 && table != NULL) {
2517 /* If no deletions are required, use faster code */
2518 for (i = inlen; --i >= 0; ) {
2519 c = Py_CHARMASK(*input++);
2520 if (Py_CHARMASK((*output++ = table[c])) != c)
2521 changed = 1;
2523 if (changed || !PyString_CheckExact(input_obj))
2524 return result;
2525 Py_DECREF(result);
2526 Py_INCREF(input_obj);
2527 return input_obj;
2530 if (table == NULL) {
2531 for (i = 0; i < 256; i++)
2532 trans_table[i] = Py_CHARMASK(i);
2533 } else {
2534 for (i = 0; i < 256; i++)
2535 trans_table[i] = Py_CHARMASK(table[i]);
2538 for (i = 0; i < dellen; i++)
2539 trans_table[(int) Py_CHARMASK(del_table[i])] = -1;
2541 for (i = inlen; --i >= 0; ) {
2542 c = Py_CHARMASK(*input++);
2543 if (trans_table[c] != -1)
2544 if (Py_CHARMASK(*output++ = (char)trans_table[c]) == c)
2545 continue;
2546 changed = 1;
2548 if (!changed && PyString_CheckExact(input_obj)) {
2549 Py_DECREF(result);
2550 Py_INCREF(input_obj);
2551 return input_obj;
2553 /* Fix the size of the resulting string */
2554 if (inlen > 0)
2555 _PyString_Resize(&result, output - output_start);
2556 return result;
/* Values for the `direction' argument of findstring() and countstring(). */
#define FORWARD 1
#define REVERSE (-1)

/* find and count characters and substrings */

/* Locate the first byte equal to c within the first target_len bytes of
   target.  Evaluates to a char * pointing at the match, or NULL when the
   byte does not occur.  Arguments are parenthesized so that arbitrary
   expressions can be passed safely. */
#define findchar(target, target_len, c)                         \
    ((char *)memchr((const void *)(target), (c), (target_len)))
2568 /* String ops must return a string. */
2569 /* If the object is subclass of string, create a copy */
2570 Py_LOCAL(PyStringObject *)
2571 return_self(PyStringObject *self)
2573 if (PyString_CheckExact(self)) {
2574 Py_INCREF(self);
2575 return self;
2577 return (PyStringObject *)PyString_FromStringAndSize(
2578 PyString_AS_STRING(self),
2579 PyString_GET_SIZE(self));
2582 Py_LOCAL_INLINE(Py_ssize_t)
2583 countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
2585 Py_ssize_t count=0;
2586 const char *start=target;
2587 const char *end=target+target_len;
2589 while ( (start=findchar(start, end-start, c)) != NULL ) {
2590 count++;
2591 if (count >= maxcount)
2592 break;
2593 start += 1;
2595 return count;
2598 Py_LOCAL(Py_ssize_t)
2599 findstring(const char *target, Py_ssize_t target_len,
2600 const char *pattern, Py_ssize_t pattern_len,
2601 Py_ssize_t start,
2602 Py_ssize_t end,
2603 int direction)
2605 if (start < 0) {
2606 start += target_len;
2607 if (start < 0)
2608 start = 0;
2610 if (end > target_len) {
2611 end = target_len;
2612 } else if (end < 0) {
2613 end += target_len;
2614 if (end < 0)
2615 end = 0;
2618 /* zero-length substrings always match at the first attempt */
2619 if (pattern_len == 0)
2620 return (direction > 0) ? start : end;
2622 end -= pattern_len;
2624 if (direction < 0) {
2625 for (; end >= start; end--)
2626 if (Py_STRING_MATCH(target, end, pattern, pattern_len))
2627 return end;
2628 } else {
2629 for (; start <= end; start++)
2630 if (Py_STRING_MATCH(target, start, pattern, pattern_len))
2631 return start;
2633 return -1;
2636 Py_LOCAL_INLINE(Py_ssize_t)
2637 countstring(const char *target, Py_ssize_t target_len,
2638 const char *pattern, Py_ssize_t pattern_len,
2639 Py_ssize_t start,
2640 Py_ssize_t end,
2641 int direction, Py_ssize_t maxcount)
2643 Py_ssize_t count=0;
2645 if (start < 0) {
2646 start += target_len;
2647 if (start < 0)
2648 start = 0;
2650 if (end > target_len) {
2651 end = target_len;
2652 } else if (end < 0) {
2653 end += target_len;
2654 if (end < 0)
2655 end = 0;
2658 /* zero-length substrings match everywhere */
2659 if (pattern_len == 0 || maxcount == 0) {
2660 if (target_len+1 < maxcount)
2661 return target_len+1;
2662 return maxcount;
2665 end -= pattern_len;
2666 if (direction < 0) {
2667 for (; (end >= start); end--)
2668 if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
2669 count++;
2670 if (--maxcount <= 0) break;
2671 end -= pattern_len-1;
2673 } else {
2674 for (; (start <= end); start++)
2675 if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
2676 count++;
2677 if (--maxcount <= 0)
2678 break;
2679 start += pattern_len-1;
2682 return count;
2686 /* Algorithms for different cases of string replacement */
2688 /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
2689 Py_LOCAL(PyStringObject *)
2690 replace_interleave(PyStringObject *self,
2691 const char *to_s, Py_ssize_t to_len,
2692 Py_ssize_t maxcount)
2694 char *self_s, *result_s;
2695 Py_ssize_t self_len, result_len;
2696 Py_ssize_t count, i, product;
2697 PyStringObject *result;
2699 self_len = PyString_GET_SIZE(self);
2701 /* 1 at the end plus 1 after every character */
2702 count = self_len+1;
2703 if (maxcount < count)
2704 count = maxcount;
2706 /* Check for overflow */
2707 /* result_len = count * to_len + self_len; */
2708 product = count * to_len;
2709 if (product / to_len != count) {
2710 PyErr_SetString(PyExc_OverflowError,
2711 "replace string is too long");
2712 return NULL;
2714 result_len = product + self_len;
2715 if (result_len < 0) {
2716 PyErr_SetString(PyExc_OverflowError,
2717 "replace string is too long");
2718 return NULL;
2721 if (! (result = (PyStringObject *)
2722 PyString_FromStringAndSize(NULL, result_len)) )
2723 return NULL;
2725 self_s = PyString_AS_STRING(self);
2726 result_s = PyString_AS_STRING(result);
2728 /* TODO: special case single character, which doesn't need memcpy */
2730 /* Lay the first one down (guaranteed this will occur) */
2731 Py_MEMCPY(result_s, to_s, to_len);
2732 result_s += to_len;
2733 count -= 1;
2735 for (i=0; i<count; i++) {
2736 *result_s++ = *self_s++;
2737 Py_MEMCPY(result_s, to_s, to_len);
2738 result_s += to_len;
2741 /* Copy the rest of the original string */
2742 Py_MEMCPY(result_s, self_s, self_len-i);
2744 return result;
2747 /* Special case for deleting a single character */
2748 /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
2749 Py_LOCAL(PyStringObject *)
2750 replace_delete_single_character(PyStringObject *self,
2751 char from_c, Py_ssize_t maxcount)
2753 char *self_s, *result_s;
2754 char *start, *next, *end;
2755 Py_ssize_t self_len, result_len;
2756 Py_ssize_t count;
2757 PyStringObject *result;
2759 self_len = PyString_GET_SIZE(self);
2760 self_s = PyString_AS_STRING(self);
2762 count = countchar(self_s, self_len, from_c, maxcount);
2763 if (count == 0) {
2764 return return_self(self);
2767 result_len = self_len - count; /* from_len == 1 */
2768 assert(result_len>=0);
2770 if ( (result = (PyStringObject *)
2771 PyString_FromStringAndSize(NULL, result_len)) == NULL)
2772 return NULL;
2773 result_s = PyString_AS_STRING(result);
2775 start = self_s;
2776 end = self_s + self_len;
2777 while (count-- > 0) {
2778 next = findchar(start, end-start, from_c);
2779 if (next == NULL)
2780 break;
2781 Py_MEMCPY(result_s, start, next-start);
2782 result_s += (next-start);
2783 start = next+1;
2785 Py_MEMCPY(result_s, start, end-start);
2787 return result;
2790 /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
2792 Py_LOCAL(PyStringObject *)
2793 replace_delete_substring(PyStringObject *self,
2794 const char *from_s, Py_ssize_t from_len,
2795 Py_ssize_t maxcount) {
2796 char *self_s, *result_s;
2797 char *start, *next, *end;
2798 Py_ssize_t self_len, result_len;
2799 Py_ssize_t count, offset;
2800 PyStringObject *result;
2802 self_len = PyString_GET_SIZE(self);
2803 self_s = PyString_AS_STRING(self);
2805 count = countstring(self_s, self_len,
2806 from_s, from_len,
2807 0, self_len, 1,
2808 maxcount);
2810 if (count == 0) {
2811 /* no matches */
2812 return return_self(self);
2815 result_len = self_len - (count * from_len);
2816 assert (result_len>=0);
2818 if ( (result = (PyStringObject *)
2819 PyString_FromStringAndSize(NULL, result_len)) == NULL )
2820 return NULL;
2822 result_s = PyString_AS_STRING(result);
2824 start = self_s;
2825 end = self_s + self_len;
2826 while (count-- > 0) {
2827 offset = findstring(start, end-start,
2828 from_s, from_len,
2829 0, end-start, FORWARD);
2830 if (offset == -1)
2831 break;
2832 next = start + offset;
2834 Py_MEMCPY(result_s, start, next-start);
2836 result_s += (next-start);
2837 start = next+from_len;
2839 Py_MEMCPY(result_s, start, end-start);
2840 return result;
2843 /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
2844 Py_LOCAL(PyStringObject *)
2845 replace_single_character_in_place(PyStringObject *self,
2846 char from_c, char to_c,
2847 Py_ssize_t maxcount)
2849 char *self_s, *result_s, *start, *end, *next;
2850 Py_ssize_t self_len;
2851 PyStringObject *result;
2853 /* The result string will be the same size */
2854 self_s = PyString_AS_STRING(self);
2855 self_len = PyString_GET_SIZE(self);
2857 next = findchar(self_s, self_len, from_c);
2859 if (next == NULL) {
2860 /* No matches; return the original string */
2861 return return_self(self);
2864 /* Need to make a new string */
2865 result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2866 if (result == NULL)
2867 return NULL;
2868 result_s = PyString_AS_STRING(result);
2869 Py_MEMCPY(result_s, self_s, self_len);
2871 /* change everything in-place, starting with this one */
2872 start = result_s + (next-self_s);
2873 *start = to_c;
2874 start++;
2875 end = result_s + self_len;
2877 while (--maxcount > 0) {
2878 next = findchar(start, end-start, from_c);
2879 if (next == NULL)
2880 break;
2881 *next = to_c;
2882 start = next+1;
2885 return result;
2888 /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
2889 Py_LOCAL(PyStringObject *)
2890 replace_substring_in_place(PyStringObject *self,
2891 const char *from_s, Py_ssize_t from_len,
2892 const char *to_s, Py_ssize_t to_len,
2893 Py_ssize_t maxcount)
2895 char *result_s, *start, *end;
2896 char *self_s;
2897 Py_ssize_t self_len, offset;
2898 PyStringObject *result;
2900 /* The result string will be the same size */
2902 self_s = PyString_AS_STRING(self);
2903 self_len = PyString_GET_SIZE(self);
2905 offset = findstring(self_s, self_len,
2906 from_s, from_len,
2907 0, self_len, FORWARD);
2908 if (offset == -1) {
2909 /* No matches; return the original string */
2910 return return_self(self);
2913 /* Need to make a new string */
2914 result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
2915 if (result == NULL)
2916 return NULL;
2917 result_s = PyString_AS_STRING(result);
2918 Py_MEMCPY(result_s, self_s, self_len);
2920 /* change everything in-place, starting with this one */
2921 start = result_s + offset;
2922 Py_MEMCPY(start, to_s, from_len);
2923 start += from_len;
2924 end = result_s + self_len;
2926 while ( --maxcount > 0) {
2927 offset = findstring(start, end-start,
2928 from_s, from_len,
2929 0, end-start, FORWARD);
2930 if (offset==-1)
2931 break;
2932 Py_MEMCPY(start+offset, to_s, from_len);
2933 start += offset+from_len;
2936 return result;
2939 /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
2940 Py_LOCAL(PyStringObject *)
2941 replace_single_character(PyStringObject *self,
2942 char from_c,
2943 const char *to_s, Py_ssize_t to_len,
2944 Py_ssize_t maxcount)
2946 char *self_s, *result_s;
2947 char *start, *next, *end;
2948 Py_ssize_t self_len, result_len;
2949 Py_ssize_t count, product;
2950 PyStringObject *result;
2952 self_s = PyString_AS_STRING(self);
2953 self_len = PyString_GET_SIZE(self);
2955 count = countchar(self_s, self_len, from_c, maxcount);
2956 if (count == 0) {
2957 /* no matches, return unchanged */
2958 return return_self(self);
2961 /* use the difference between current and new, hence the "-1" */
2962 /* result_len = self_len + count * (to_len-1) */
2963 product = count * (to_len-1);
2964 if (product / (to_len-1) != count) {
2965 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
2966 return NULL;
2968 result_len = self_len + product;
2969 if (result_len < 0) {
2970 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
2971 return NULL;
2974 if ( (result = (PyStringObject *)
2975 PyString_FromStringAndSize(NULL, result_len)) == NULL)
2976 return NULL;
2977 result_s = PyString_AS_STRING(result);
2979 start = self_s;
2980 end = self_s + self_len;
2981 while (count-- > 0) {
2982 next = findchar(start, end-start, from_c);
2983 if (next == NULL)
2984 break;
2986 if (next == start) {
2987 /* replace with the 'to' */
2988 Py_MEMCPY(result_s, to_s, to_len);
2989 result_s += to_len;
2990 start += 1;
2991 } else {
2992 /* copy the unchanged old then the 'to' */
2993 Py_MEMCPY(result_s, start, next-start);
2994 result_s += (next-start);
2995 Py_MEMCPY(result_s, to_s, to_len);
2996 result_s += to_len;
2997 start = next+1;
3000 /* Copy the remainder of the remaining string */
3001 Py_MEMCPY(result_s, start, end-start);
3003 return result;
3006 /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
3007 Py_LOCAL(PyStringObject *)
3008 replace_substring(PyStringObject *self,
3009 const char *from_s, Py_ssize_t from_len,
3010 const char *to_s, Py_ssize_t to_len,
3011 Py_ssize_t maxcount) {
3012 char *self_s, *result_s;
3013 char *start, *next, *end;
3014 Py_ssize_t self_len, result_len;
3015 Py_ssize_t count, offset, product;
3016 PyStringObject *result;
3018 self_s = PyString_AS_STRING(self);
3019 self_len = PyString_GET_SIZE(self);
3021 count = countstring(self_s, self_len,
3022 from_s, from_len,
3023 0, self_len, FORWARD, maxcount);
3024 if (count == 0) {
3025 /* no matches, return unchanged */
3026 return return_self(self);
3029 /* Check for overflow */
3030 /* result_len = self_len + count * (to_len-from_len) */
3031 product = count * (to_len-from_len);
3032 if (product / (to_len-from_len) != count) {
3033 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3034 return NULL;
3036 result_len = self_len + product;
3037 if (result_len < 0) {
3038 PyErr_SetString(PyExc_OverflowError, "replace string is too long");
3039 return NULL;
3042 if ( (result = (PyStringObject *)
3043 PyString_FromStringAndSize(NULL, result_len)) == NULL)
3044 return NULL;
3045 result_s = PyString_AS_STRING(result);
3047 start = self_s;
3048 end = self_s + self_len;
3049 while (count-- > 0) {
3050 offset = findstring(start, end-start,
3051 from_s, from_len,
3052 0, end-start, FORWARD);
3053 if (offset == -1)
3054 break;
3055 next = start+offset;
3056 if (next == start) {
3057 /* replace with the 'to' */
3058 Py_MEMCPY(result_s, to_s, to_len);
3059 result_s += to_len;
3060 start += from_len;
3061 } else {
3062 /* copy the unchanged old then the 'to' */
3063 Py_MEMCPY(result_s, start, next-start);
3064 result_s += (next-start);
3065 Py_MEMCPY(result_s, to_s, to_len);
3066 result_s += to_len;
3067 start = next+from_len;
3070 /* Copy the remainder of the remaining string */
3071 Py_MEMCPY(result_s, start, end-start);
3073 return result;
3077 Py_LOCAL(PyStringObject *)
3078 replace(PyStringObject *self,
3079 const char *from_s, Py_ssize_t from_len,
3080 const char *to_s, Py_ssize_t to_len,
3081 Py_ssize_t maxcount)
3083 if (maxcount < 0) {
3084 maxcount = PY_SSIZE_T_MAX;
3085 } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
3086 /* nothing to do; return the original string */
3087 return return_self(self);
3090 if (maxcount == 0 ||
3091 (from_len == 0 && to_len == 0)) {
3092 /* nothing to do; return the original string */
3093 return return_self(self);
3096 /* Handle zero-length special cases */
3098 if (from_len == 0) {
3099 /* insert the 'to' string everywhere. */
3100 /* >>> "Python".replace("", ".") */
3101 /* '.P.y.t.h.o.n.' */
3102 return replace_interleave(self, to_s, to_len, maxcount);
3105 /* Except for "".replace("", "A") == "A" there is no way beyond this */
3106 /* point for an empty self string to generate a non-empty string */
3107 /* Special case so the remaining code always gets a non-empty string */
3108 if (PyString_GET_SIZE(self) == 0) {
3109 return return_self(self);
3112 if (to_len == 0) {
3113 /* delete all occurances of 'from' string */
3114 if (from_len == 1) {
3115 return replace_delete_single_character(
3116 self, from_s[0], maxcount);
3117 } else {
3118 return replace_delete_substring(self, from_s, from_len, maxcount);
3122 /* Handle special case where both strings have the same length */
3124 if (from_len == to_len) {
3125 if (from_len == 1) {
3126 return replace_single_character_in_place(
3127 self,
3128 from_s[0],
3129 to_s[0],
3130 maxcount);
3131 } else {
3132 return replace_substring_in_place(
3133 self, from_s, from_len, to_s, to_len, maxcount);
3137 /* Otherwise use the more generic algorithms */
3138 if (from_len == 1) {
3139 return replace_single_character(self, from_s[0],
3140 to_s, to_len, maxcount);
3141 } else {
3142 /* len('from')>=2, len('to')>=1 */
3143 return replace_substring(self, from_s, from_len, to_s, to_len, maxcount);
3147 PyDoc_STRVAR(replace__doc__,
3148 "S.replace (old, new[, count]) -> string\n\
3150 Return a copy of string S with all occurrences of substring\n\
3151 old replaced by new. If the optional argument count is\n\
3152 given, only the first count occurrences are replaced.");
3154 static PyObject *
3155 string_replace(PyStringObject *self, PyObject *args)
3157 Py_ssize_t count = -1;
3158 PyObject *from, *to;
3159 const char *from_s, *to_s;
3160 Py_ssize_t from_len, to_len;
3162 if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
3163 return NULL;
3165 if (PyString_Check(from)) {
3166 from_s = PyString_AS_STRING(from);
3167 from_len = PyString_GET_SIZE(from);
3169 #ifdef Py_USING_UNICODE
3170 if (PyUnicode_Check(from))
3171 return PyUnicode_Replace((PyObject *)self,
3172 from, to, count);
3173 #endif
3174 else if (PyObject_AsCharBuffer(from, &from_s, &from_len))
3175 return NULL;
3177 if (PyString_Check(to)) {
3178 to_s = PyString_AS_STRING(to);
3179 to_len = PyString_GET_SIZE(to);
3181 #ifdef Py_USING_UNICODE
3182 else if (PyUnicode_Check(to))
3183 return PyUnicode_Replace((PyObject *)self,
3184 from, to, count);
3185 #endif
3186 else if (PyObject_AsCharBuffer(to, &to_s, &to_len))
3187 return NULL;
3189 return (PyObject *)replace((PyStringObject *) self,
3190 from_s, from_len,
3191 to_s, to_len, count);
3194 /** End DALKE **/
3196 /* Matches the end (direction >= 0) or start (direction < 0) of self
3197 * against substr, using the start and end arguments. Returns
3198 * -1 on error, 0 if not found and 1 if found.
3200 Py_LOCAL(int)
3201 _string_tailmatch(PyStringObject *self, PyObject *substr, Py_ssize_t start,
3202 Py_ssize_t end, int direction)
3204 Py_ssize_t len = PyString_GET_SIZE(self);
3205 Py_ssize_t slen;
3206 const char* sub;
3207 const char* str;
3209 if (PyString_Check(substr)) {
3210 sub = PyString_AS_STRING(substr);
3211 slen = PyString_GET_SIZE(substr);
3213 #ifdef Py_USING_UNICODE
3214 else if (PyUnicode_Check(substr))
3215 return PyUnicode_Tailmatch((PyObject *)self,
3216 substr, start, end, direction);
3217 #endif
3218 else if (PyObject_AsCharBuffer(substr, &sub, &slen))
3219 return -1;
3220 str = PyString_AS_STRING(self);
3222 string_adjust_indices(&start, &end, len);
3224 if (direction < 0) {
3225 /* startswith */
3226 if (start+slen > len)
3227 return 0;
3228 } else {
3229 /* endswith */
3230 if (end-start < slen || start > len)
3231 return 0;
3233 if (end-slen > start)
3234 start = end - slen;
3236 if (end-start >= slen)
3237 return ! memcmp(str+start, sub, slen);
3238 return 0;
3242 PyDoc_STRVAR(startswith__doc__,
3243 "S.startswith(prefix[, start[, end]]) -> bool\n\
3245 Return True if S starts with the specified prefix, False otherwise.\n\
3246 With optional start, test S beginning at that position.\n\
3247 With optional end, stop comparing S at that position.\n\
3248 prefix can also be a tuple of strings to try.");
3250 static PyObject *
3251 string_startswith(PyStringObject *self, PyObject *args)
3253 Py_ssize_t start = 0;
3254 Py_ssize_t end = PY_SSIZE_T_MAX;
3255 PyObject *subobj;
3256 int result;
3258 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
3259 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3260 return NULL;
3261 if (PyTuple_Check(subobj)) {
3262 Py_ssize_t i;
3263 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3264 result = _string_tailmatch(self,
3265 PyTuple_GET_ITEM(subobj, i),
3266 start, end, -1);
3267 if (result == -1)
3268 return NULL;
3269 else if (result) {
3270 Py_RETURN_TRUE;
3273 Py_RETURN_FALSE;
3275 result = _string_tailmatch(self, subobj, start, end, -1);
3276 if (result == -1)
3277 return NULL;
3278 else
3279 return PyBool_FromLong(result);
3283 PyDoc_STRVAR(endswith__doc__,
3284 "S.endswith(suffix[, start[, end]]) -> bool\n\
3286 Return True if S ends with the specified suffix, False otherwise.\n\
3287 With optional start, test S beginning at that position.\n\
3288 With optional end, stop comparing S at that position.\n\
3289 suffix can also be a tuple of strings to try.");
3291 static PyObject *
3292 string_endswith(PyStringObject *self, PyObject *args)
3294 Py_ssize_t start = 0;
3295 Py_ssize_t end = PY_SSIZE_T_MAX;
3296 PyObject *subobj;
3297 int result;
3299 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
3300 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3301 return NULL;
3302 if (PyTuple_Check(subobj)) {
3303 Py_ssize_t i;
3304 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
3305 result = _string_tailmatch(self,
3306 PyTuple_GET_ITEM(subobj, i),
3307 start, end, +1);
3308 if (result == -1)
3309 return NULL;
3310 else if (result) {
3311 Py_RETURN_TRUE;
3314 Py_RETURN_FALSE;
3316 result = _string_tailmatch(self, subobj, start, end, +1);
3317 if (result == -1)
3318 return NULL;
3319 else
3320 return PyBool_FromLong(result);
3324 PyDoc_STRVAR(encode__doc__,
3325 "S.encode([encoding[,errors]]) -> object\n\
3327 Encodes S using the codec registered for encoding. encoding defaults\n\
3328 to the default encoding. errors may be given to set a different error\n\
3329 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3330 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
3331 'xmlcharrefreplace' as well as any other name registered with\n\
3332 codecs.register_error that is able to handle UnicodeEncodeErrors.");
3334 static PyObject *
3335 string_encode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3337 static char *kwlist[] = {"encoding", "errors", 0};
3338 char *encoding = NULL;
3339 char *errors = NULL;
3340 PyObject *v;
3342 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
3343 kwlist, &encoding, &errors))
3344 return NULL;
3345 v = PyString_AsEncodedObject((PyObject *)self, encoding, errors);
3346 if (v == NULL)
3347 goto onError;
3348 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3349 PyErr_Format(PyExc_TypeError,
3350 "encoder did not return a string/unicode object "
3351 "(type=%.400s)",
3352 Py_TYPE(v)->tp_name);
3353 Py_DECREF(v);
3354 return NULL;
3356 return v;
3358 onError:
3359 return NULL;
3363 PyDoc_STRVAR(decode__doc__,
3364 "S.decode([encoding[,errors]]) -> object\n\
3366 Decodes S using the codec registered for encoding. encoding defaults\n\
3367 to the default encoding. errors may be given to set a different error\n\
3368 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3369 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
3370 as well as any other name registered with codecs.register_error that is\n\
3371 able to handle UnicodeDecodeErrors.");
3373 static PyObject *
3374 string_decode(PyStringObject *self, PyObject *args, PyObject *kwargs)
3376 static char *kwlist[] = {"encoding", "errors", 0};
3377 char *encoding = NULL;
3378 char *errors = NULL;
3379 PyObject *v;
3381 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
3382 kwlist, &encoding, &errors))
3383 return NULL;
3384 v = PyString_AsDecodedObject((PyObject *)self, encoding, errors);
3385 if (v == NULL)
3386 goto onError;
3387 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
3388 PyErr_Format(PyExc_TypeError,
3389 "decoder did not return a string/unicode object "
3390 "(type=%.400s)",
3391 Py_TYPE(v)->tp_name);
3392 Py_DECREF(v);
3393 return NULL;
3395 return v;
3397 onError:
3398 return NULL;
3402 PyDoc_STRVAR(expandtabs__doc__,
3403 "S.expandtabs([tabsize]) -> string\n\
3405 Return a copy of S where all tab characters are expanded using spaces.\n\
3406 If tabsize is not given, a tab size of 8 characters is assumed.");
3408 static PyObject*
3409 string_expandtabs(PyStringObject *self, PyObject *args)
3411 const char *e, *p, *qe;
3412 char *q;
3413 Py_ssize_t i, j, incr;
3414 PyObject *u;
3415 int tabsize = 8;
3417 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3418 return NULL;
3420 /* First pass: determine size of output string */
3421 i = 0; /* chars up to and including most recent \n or \r */
3422 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
3423 e = PyString_AS_STRING(self) + PyString_GET_SIZE(self); /* end of input */
3424 for (p = PyString_AS_STRING(self); p < e; p++)
3425 if (*p == '\t') {
3426 if (tabsize > 0) {
3427 incr = tabsize - (j % tabsize);
3428 if (j > PY_SSIZE_T_MAX - incr)
3429 goto overflow1;
3430 j += incr;
3433 else {
3434 if (j > PY_SSIZE_T_MAX - 1)
3435 goto overflow1;
3436 j++;
3437 if (*p == '\n' || *p == '\r') {
3438 if (i > PY_SSIZE_T_MAX - j)
3439 goto overflow1;
3440 i += j;
3441 j = 0;
3445 if (i > PY_SSIZE_T_MAX - j)
3446 goto overflow1;
3448 /* Second pass: create output string and fill it */
3449 u = PyString_FromStringAndSize(NULL, i + j);
3450 if (!u)
3451 return NULL;
3453 j = 0; /* same as in first pass */
3454 q = PyString_AS_STRING(u); /* next output char */
3455 qe = PyString_AS_STRING(u) + PyString_GET_SIZE(u); /* end of output */
3457 for (p = PyString_AS_STRING(self); p < e; p++)
3458 if (*p == '\t') {
3459 if (tabsize > 0) {
3460 i = tabsize - (j % tabsize);
3461 j += i;
3462 while (i--) {
3463 if (q >= qe)
3464 goto overflow2;
3465 *q++ = ' ';
3469 else {
3470 if (q >= qe)
3471 goto overflow2;
3472 *q++ = *p;
3473 j++;
3474 if (*p == '\n' || *p == '\r')
3475 j = 0;
3478 return u;
3480 overflow2:
3481 Py_DECREF(u);
3482 overflow1:
3483 PyErr_SetString(PyExc_OverflowError, "new string is too long");
3484 return NULL;
3487 Py_LOCAL_INLINE(PyObject *)
3488 pad(PyStringObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
3490 PyObject *u;
3492 if (left < 0)
3493 left = 0;
3494 if (right < 0)
3495 right = 0;
3497 if (left == 0 && right == 0 && PyString_CheckExact(self)) {
3498 Py_INCREF(self);
3499 return (PyObject *)self;
3502 u = PyString_FromStringAndSize(NULL,
3503 left + PyString_GET_SIZE(self) + right);
3504 if (u) {
3505 if (left)
3506 memset(PyString_AS_STRING(u), fill, left);
3507 Py_MEMCPY(PyString_AS_STRING(u) + left,
3508 PyString_AS_STRING(self),
3509 PyString_GET_SIZE(self));
3510 if (right)
3511 memset(PyString_AS_STRING(u) + left + PyString_GET_SIZE(self),
3512 fill, right);
3515 return u;
3518 PyDoc_STRVAR(ljust__doc__,
3519 "S.ljust(width[, fillchar]) -> string\n"
3520 "\n"
3521 "Return S left-justified in a string of length width. Padding is\n"
3522 "done using the specified fill character (default is a space).");
3524 static PyObject *
3525 string_ljust(PyStringObject *self, PyObject *args)
3527 Py_ssize_t width;
3528 char fillchar = ' ';
3530 if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))
3531 return NULL;
3533 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3534 Py_INCREF(self);
3535 return (PyObject*) self;
3538 return pad(self, 0, width - PyString_GET_SIZE(self), fillchar);
3542 PyDoc_STRVAR(rjust__doc__,
3543 "S.rjust(width[, fillchar]) -> string\n"
3544 "\n"
3545 "Return S right-justified in a string of length width. Padding is\n"
3546 "done using the specified fill character (default is a space)");
3548 static PyObject *
3549 string_rjust(PyStringObject *self, PyObject *args)
3551 Py_ssize_t width;
3552 char fillchar = ' ';
3554 if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))
3555 return NULL;
3557 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3558 Py_INCREF(self);
3559 return (PyObject*) self;
3562 return pad(self, width - PyString_GET_SIZE(self), 0, fillchar);
3566 PyDoc_STRVAR(center__doc__,
3567 "S.center(width[, fillchar]) -> string\n"
3568 "\n"
3569 "Return S centered in a string of length width. Padding is\n"
3570 "done using the specified fill character (default is a space)");
3572 static PyObject *
3573 string_center(PyStringObject *self, PyObject *args)
3575 Py_ssize_t marg, left;
3576 Py_ssize_t width;
3577 char fillchar = ' ';
3579 if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))
3580 return NULL;
3582 if (PyString_GET_SIZE(self) >= width && PyString_CheckExact(self)) {
3583 Py_INCREF(self);
3584 return (PyObject*) self;
3587 marg = width - PyString_GET_SIZE(self);
3588 left = marg / 2 + (marg & width & 1);
3590 return pad(self, left, marg - left, fillchar);
3593 PyDoc_STRVAR(zfill__doc__,
3594 "S.zfill(width) -> string\n"
3595 "\n"
3596 "Pad a numeric string S with zeros on the left, to fill a field\n"
3597 "of the specified width. The string S is never truncated.");
3599 static PyObject *
3600 string_zfill(PyStringObject *self, PyObject *args)
3602 Py_ssize_t fill;
3603 PyObject *s;
3604 char *p;
3605 Py_ssize_t width;
3607 if (!PyArg_ParseTuple(args, "n:zfill", &width))
3608 return NULL;
3610 if (PyString_GET_SIZE(self) >= width) {
3611 if (PyString_CheckExact(self)) {
3612 Py_INCREF(self);
3613 return (PyObject*) self;
3615 else
3616 return PyString_FromStringAndSize(
3617 PyString_AS_STRING(self),
3618 PyString_GET_SIZE(self)
3622 fill = width - PyString_GET_SIZE(self);
3624 s = pad(self, fill, 0, '0');
3626 if (s == NULL)
3627 return NULL;
3629 p = PyString_AS_STRING(s);
3630 if (p[fill] == '+' || p[fill] == '-') {
3631 /* move sign to beginning of string */
3632 p[0] = p[fill];
3633 p[fill] = '0';
3636 return (PyObject*) s;
3639 PyDoc_STRVAR(isspace__doc__,
3640 "S.isspace() -> bool\n\
3642 Return True if all characters in S are whitespace\n\
3643 and there is at least one character in S, False otherwise.");
3645 static PyObject*
3646 string_isspace(PyStringObject *self)
3648 register const unsigned char *p
3649 = (unsigned char *) PyString_AS_STRING(self);
3650 register const unsigned char *e;
3652 /* Shortcut for single character strings */
3653 if (PyString_GET_SIZE(self) == 1 &&
3654 isspace(*p))
3655 return PyBool_FromLong(1);
3657 /* Special case for empty strings */
3658 if (PyString_GET_SIZE(self) == 0)
3659 return PyBool_FromLong(0);
3661 e = p + PyString_GET_SIZE(self);
3662 for (; p < e; p++) {
3663 if (!isspace(*p))
3664 return PyBool_FromLong(0);
3666 return PyBool_FromLong(1);
3670 PyDoc_STRVAR(isalpha__doc__,
3671 "S.isalpha() -> bool\n\
3673 Return True if all characters in S are alphabetic\n\
3674 and there is at least one character in S, False otherwise.");
3676 static PyObject*
3677 string_isalpha(PyStringObject *self)
3679 register const unsigned char *p
3680 = (unsigned char *) PyString_AS_STRING(self);
3681 register const unsigned char *e;
3683 /* Shortcut for single character strings */
3684 if (PyString_GET_SIZE(self) == 1 &&
3685 isalpha(*p))
3686 return PyBool_FromLong(1);
3688 /* Special case for empty strings */
3689 if (PyString_GET_SIZE(self) == 0)
3690 return PyBool_FromLong(0);
3692 e = p + PyString_GET_SIZE(self);
3693 for (; p < e; p++) {
3694 if (!isalpha(*p))
3695 return PyBool_FromLong(0);
3697 return PyBool_FromLong(1);
3701 PyDoc_STRVAR(isalnum__doc__,
3702 "S.isalnum() -> bool\n\
3704 Return True if all characters in S are alphanumeric\n\
3705 and there is at least one character in S, False otherwise.");
3707 static PyObject*
3708 string_isalnum(PyStringObject *self)
3710 register const unsigned char *p
3711 = (unsigned char *) PyString_AS_STRING(self);
3712 register const unsigned char *e;
3714 /* Shortcut for single character strings */
3715 if (PyString_GET_SIZE(self) == 1 &&
3716 isalnum(*p))
3717 return PyBool_FromLong(1);
3719 /* Special case for empty strings */
3720 if (PyString_GET_SIZE(self) == 0)
3721 return PyBool_FromLong(0);
3723 e = p + PyString_GET_SIZE(self);
3724 for (; p < e; p++) {
3725 if (!isalnum(*p))
3726 return PyBool_FromLong(0);
3728 return PyBool_FromLong(1);
3732 PyDoc_STRVAR(isdigit__doc__,
3733 "S.isdigit() -> bool\n\
3735 Return True if all characters in S are digits\n\
3736 and there is at least one character in S, False otherwise.");
3738 static PyObject*
3739 string_isdigit(PyStringObject *self)
3741 register const unsigned char *p
3742 = (unsigned char *) PyString_AS_STRING(self);
3743 register const unsigned char *e;
3745 /* Shortcut for single character strings */
3746 if (PyString_GET_SIZE(self) == 1 &&
3747 isdigit(*p))
3748 return PyBool_FromLong(1);
3750 /* Special case for empty strings */
3751 if (PyString_GET_SIZE(self) == 0)
3752 return PyBool_FromLong(0);
3754 e = p + PyString_GET_SIZE(self);
3755 for (; p < e; p++) {
3756 if (!isdigit(*p))
3757 return PyBool_FromLong(0);
3759 return PyBool_FromLong(1);
3763 PyDoc_STRVAR(islower__doc__,
3764 "S.islower() -> bool\n\
3766 Return True if all cased characters in S are lowercase and there is\n\
3767 at least one cased character in S, False otherwise.");
3769 static PyObject*
3770 string_islower(PyStringObject *self)
3772 register const unsigned char *p
3773 = (unsigned char *) PyString_AS_STRING(self);
3774 register const unsigned char *e;
3775 int cased;
3777 /* Shortcut for single character strings */
3778 if (PyString_GET_SIZE(self) == 1)
3779 return PyBool_FromLong(islower(*p) != 0);
3781 /* Special case for empty strings */
3782 if (PyString_GET_SIZE(self) == 0)
3783 return PyBool_FromLong(0);
3785 e = p + PyString_GET_SIZE(self);
3786 cased = 0;
3787 for (; p < e; p++) {
3788 if (isupper(*p))
3789 return PyBool_FromLong(0);
3790 else if (!cased && islower(*p))
3791 cased = 1;
3793 return PyBool_FromLong(cased);
3797 PyDoc_STRVAR(isupper__doc__,
3798 "S.isupper() -> bool\n\
3800 Return True if all cased characters in S are uppercase and there is\n\
3801 at least one cased character in S, False otherwise.");
3803 static PyObject*
3804 string_isupper(PyStringObject *self)
3806 register const unsigned char *p
3807 = (unsigned char *) PyString_AS_STRING(self);
3808 register const unsigned char *e;
3809 int cased;
3811 /* Shortcut for single character strings */
3812 if (PyString_GET_SIZE(self) == 1)
3813 return PyBool_FromLong(isupper(*p) != 0);
3815 /* Special case for empty strings */
3816 if (PyString_GET_SIZE(self) == 0)
3817 return PyBool_FromLong(0);
3819 e = p + PyString_GET_SIZE(self);
3820 cased = 0;
3821 for (; p < e; p++) {
3822 if (islower(*p))
3823 return PyBool_FromLong(0);
3824 else if (!cased && isupper(*p))
3825 cased = 1;
3827 return PyBool_FromLong(cased);
3831 PyDoc_STRVAR(istitle__doc__,
3832 "S.istitle() -> bool\n\
3834 Return True if S is a titlecased string and there is at least one\n\
3835 character in S, i.e. uppercase characters may only follow uncased\n\
3836 characters and lowercase characters only cased ones. Return False\n\
3837 otherwise.");
3839 static PyObject*
3840 string_istitle(PyStringObject *self, PyObject *uncased)
3842 register const unsigned char *p
3843 = (unsigned char *) PyString_AS_STRING(self);
3844 register const unsigned char *e;
3845 int cased, previous_is_cased;
3847 /* Shortcut for single character strings */
3848 if (PyString_GET_SIZE(self) == 1)
3849 return PyBool_FromLong(isupper(*p) != 0);
3851 /* Special case for empty strings */
3852 if (PyString_GET_SIZE(self) == 0)
3853 return PyBool_FromLong(0);
3855 e = p + PyString_GET_SIZE(self);
3856 cased = 0;
3857 previous_is_cased = 0;
3858 for (; p < e; p++) {
3859 register const unsigned char ch = *p;
3861 if (isupper(ch)) {
3862 if (previous_is_cased)
3863 return PyBool_FromLong(0);
3864 previous_is_cased = 1;
3865 cased = 1;
3867 else if (islower(ch)) {
3868 if (!previous_is_cased)
3869 return PyBool_FromLong(0);
3870 previous_is_cased = 1;
3871 cased = 1;
3873 else
3874 previous_is_cased = 0;
3876 return PyBool_FromLong(cased);
3880 PyDoc_STRVAR(splitlines__doc__,
3881 "S.splitlines([keepends]) -> list of strings\n\
3883 Return a list of the lines in S, breaking at line boundaries.\n\
3884 Line breaks are not included in the resulting list unless keepends\n\
3885 is given and true.");
3887 static PyObject*
3888 string_splitlines(PyStringObject *self, PyObject *args)
3890 register Py_ssize_t i;
3891 register Py_ssize_t j;
3892 Py_ssize_t len;
3893 int keepends = 0;
3894 PyObject *list;
3895 PyObject *str;
3896 char *data;
3898 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
3899 return NULL;
3901 data = PyString_AS_STRING(self);
3902 len = PyString_GET_SIZE(self);
3904 /* This does not use the preallocated list because splitlines is
3905 usually run with hundreds of newlines. The overhead of
3906 switching between PyList_SET_ITEM and append causes about a
3907 2-3% slowdown for that common case. A smarter implementation
3908 could move the if check out, so the SET_ITEMs are done first
3909 and the appends only done when the prealloc buffer is full.
3910 That's too much work for little gain.*/
3912 list = PyList_New(0);
3913 if (!list)
3914 goto onError;
3916 for (i = j = 0; i < len; ) {
3917 Py_ssize_t eol;
3919 /* Find a line and append it */
3920 while (i < len && data[i] != '\n' && data[i] != '\r')
3921 i++;
3923 /* Skip the line break reading CRLF as one line break */
3924 eol = i;
3925 if (i < len) {
3926 if (data[i] == '\r' && i + 1 < len &&
3927 data[i+1] == '\n')
3928 i += 2;
3929 else
3930 i++;
3931 if (keepends)
3932 eol = i;
3934 SPLIT_APPEND(data, j, eol);
3935 j = i;
3937 if (j < len) {
3938 SPLIT_APPEND(data, j, len);
3941 return list;
3943 onError:
3944 Py_XDECREF(list);
3945 return NULL;
3948 PyDoc_STRVAR(sizeof__doc__,
3949 "S.__sizeof__() -> size of S in memory, in bytes");
3951 static PyObject *
3952 string_sizeof(PyStringObject *v)
3954 Py_ssize_t res;
3955 res = PyStringObject_SIZE + PyString_GET_SIZE(v) * Py_TYPE(v)->tp_itemsize;
3956 return PyInt_FromSsize_t(res);
3959 #undef SPLIT_APPEND
3960 #undef SPLIT_ADD
3961 #undef MAX_PREALLOC
3962 #undef PREALLOC_SIZE
3964 static PyObject *
3965 string_getnewargs(PyStringObject *v)
3967 return Py_BuildValue("(s#)", v->ob_sval, Py_SIZE(v));
3971 #include "stringlib/string_format.h"
3973 PyDoc_STRVAR(format__doc__,
3974 "S.format(*args, **kwargs) -> unicode\n\
3978 static PyObject *
3979 string__format__(PyObject* self, PyObject* args)
3981 PyObject *format_spec;
3982 PyObject *result = NULL;
3983 PyObject *tmp = NULL;
3985 /* If 2.x, convert format_spec to the same type as value */
3986 /* This is to allow things like u''.format('') */
3987 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
3988 goto done;
3989 if (!(PyString_Check(format_spec) || PyUnicode_Check(format_spec))) {
3990 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
3991 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
3992 goto done;
3994 tmp = PyObject_Str(format_spec);
3995 if (tmp == NULL)
3996 goto done;
3997 format_spec = tmp;
3999 result = _PyBytes_FormatAdvanced(self,
4000 PyString_AS_STRING(format_spec),
4001 PyString_GET_SIZE(format_spec));
4002 done:
4003 Py_XDECREF(tmp);
4004 return result;
4007 PyDoc_STRVAR(p_format__doc__,
4008 "S.__format__(format_spec) -> unicode\n\
4013 static PyMethodDef
4014 string_methods[] = {
4015 /* Counterparts of the obsolete stropmodule functions; except
4016 string.maketrans(). */
4017 {"join", (PyCFunction)string_join, METH_O, join__doc__},
4018 {"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
4019 {"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
4020 {"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
4021 {"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
4022 {"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
4023 {"isupper", (PyCFunction)string_isupper, METH_NOARGS, isupper__doc__},
4024 {"isspace", (PyCFunction)string_isspace, METH_NOARGS, isspace__doc__},
4025 {"isdigit", (PyCFunction)string_isdigit, METH_NOARGS, isdigit__doc__},
4026 {"istitle", (PyCFunction)string_istitle, METH_NOARGS, istitle__doc__},
4027 {"isalpha", (PyCFunction)string_isalpha, METH_NOARGS, isalpha__doc__},
4028 {"isalnum", (PyCFunction)string_isalnum, METH_NOARGS, isalnum__doc__},
4029 {"capitalize", (PyCFunction)string_capitalize, METH_NOARGS,
4030 capitalize__doc__},
4031 {"count", (PyCFunction)string_count, METH_VARARGS, count__doc__},
4032 {"endswith", (PyCFunction)string_endswith, METH_VARARGS,
4033 endswith__doc__},
4034 {"partition", (PyCFunction)string_partition, METH_O, partition__doc__},
4035 {"find", (PyCFunction)string_find, METH_VARARGS, find__doc__},
4036 {"index", (PyCFunction)string_index, METH_VARARGS, index__doc__},
4037 {"lstrip", (PyCFunction)string_lstrip, METH_VARARGS, lstrip__doc__},
4038 {"replace", (PyCFunction)string_replace, METH_VARARGS, replace__doc__},
4039 {"rfind", (PyCFunction)string_rfind, METH_VARARGS, rfind__doc__},
4040 {"rindex", (PyCFunction)string_rindex, METH_VARARGS, rindex__doc__},
4041 {"rstrip", (PyCFunction)string_rstrip, METH_VARARGS, rstrip__doc__},
4042 {"rpartition", (PyCFunction)string_rpartition, METH_O,
4043 rpartition__doc__},
4044 {"startswith", (PyCFunction)string_startswith, METH_VARARGS,
4045 startswith__doc__},
4046 {"strip", (PyCFunction)string_strip, METH_VARARGS, strip__doc__},
4047 {"swapcase", (PyCFunction)string_swapcase, METH_NOARGS,
4048 swapcase__doc__},
4049 {"translate", (PyCFunction)string_translate, METH_VARARGS,
4050 translate__doc__},
4051 {"title", (PyCFunction)string_title, METH_NOARGS, title__doc__},
4052 {"ljust", (PyCFunction)string_ljust, METH_VARARGS, ljust__doc__},
4053 {"rjust", (PyCFunction)string_rjust, METH_VARARGS, rjust__doc__},
4054 {"center", (PyCFunction)string_center, METH_VARARGS, center__doc__},
4055 {"zfill", (PyCFunction)string_zfill, METH_VARARGS, zfill__doc__},
4056 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
4057 {"__format__", (PyCFunction) string__format__, METH_VARARGS, p_format__doc__},
4058 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
4059 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
4060 {"encode", (PyCFunction)string_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
4061 {"decode", (PyCFunction)string_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
4062 {"expandtabs", (PyCFunction)string_expandtabs, METH_VARARGS,
4063 expandtabs__doc__},
4064 {"splitlines", (PyCFunction)string_splitlines, METH_VARARGS,
4065 splitlines__doc__},
4066 {"__sizeof__", (PyCFunction)string_sizeof, METH_NOARGS,
4067 sizeof__doc__},
4068 {"__getnewargs__", (PyCFunction)string_getnewargs, METH_NOARGS},
4069 {NULL, NULL} /* sentinel */
4072 static PyObject *
4073 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
4075 static PyObject *
4076 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4078 PyObject *x = NULL;
4079 static char *kwlist[] = {"object", 0};
4081 if (type != &PyString_Type)
4082 return str_subtype_new(type, args, kwds);
4083 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str", kwlist, &x))
4084 return NULL;
4085 if (x == NULL)
4086 return PyString_FromString("");
4087 return PyObject_Str(x);
4090 static PyObject *
4091 str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4093 PyObject *tmp, *pnew;
4094 Py_ssize_t n;
4096 assert(PyType_IsSubtype(type, &PyString_Type));
4097 tmp = string_new(&PyString_Type, args, kwds);
4098 if (tmp == NULL)
4099 return NULL;
4100 assert(PyString_CheckExact(tmp));
4101 n = PyString_GET_SIZE(tmp);
4102 pnew = type->tp_alloc(type, n);
4103 if (pnew != NULL) {
4104 Py_MEMCPY(PyString_AS_STRING(pnew), PyString_AS_STRING(tmp), n+1);
4105 ((PyStringObject *)pnew)->ob_shash =
4106 ((PyStringObject *)tmp)->ob_shash;
4107 ((PyStringObject *)pnew)->ob_sstate = SSTATE_NOT_INTERNED;
4109 Py_DECREF(tmp);
4110 return pnew;
4113 static PyObject *
4114 basestring_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
4116 PyErr_SetString(PyExc_TypeError,
4117 "The basestring type cannot be instantiated");
4118 return NULL;
4121 static PyObject *
4122 string_mod(PyObject *v, PyObject *w)
4124 if (!PyString_Check(v)) {
4125 Py_INCREF(Py_NotImplemented);
4126 return Py_NotImplemented;
4128 return PyString_Format(v, w);
4131 PyDoc_STRVAR(basestring_doc,
4132 "Type basestring cannot be instantiated; it is the base for str and unicode.");
4134 static PyNumberMethods string_as_number = {
4135 0, /*nb_add*/
4136 0, /*nb_subtract*/
4137 0, /*nb_multiply*/
4138 0, /*nb_divide*/
4139 string_mod, /*nb_remainder*/
4143 PyTypeObject PyBaseString_Type = {
4144 PyVarObject_HEAD_INIT(&PyType_Type, 0)
4145 "basestring",
4148 0, /* tp_dealloc */
4149 0, /* tp_print */
4150 0, /* tp_getattr */
4151 0, /* tp_setattr */
4152 0, /* tp_compare */
4153 0, /* tp_repr */
4154 0, /* tp_as_number */
4155 0, /* tp_as_sequence */
4156 0, /* tp_as_mapping */
4157 0, /* tp_hash */
4158 0, /* tp_call */
4159 0, /* tp_str */
4160 0, /* tp_getattro */
4161 0, /* tp_setattro */
4162 0, /* tp_as_buffer */
4163 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
4164 basestring_doc, /* tp_doc */
4165 0, /* tp_traverse */
4166 0, /* tp_clear */
4167 0, /* tp_richcompare */
4168 0, /* tp_weaklistoffset */
4169 0, /* tp_iter */
4170 0, /* tp_iternext */
4171 0, /* tp_methods */
4172 0, /* tp_members */
4173 0, /* tp_getset */
4174 &PyBaseObject_Type, /* tp_base */
4175 0, /* tp_dict */
4176 0, /* tp_descr_get */
4177 0, /* tp_descr_set */
4178 0, /* tp_dictoffset */
4179 0, /* tp_init */
4180 0, /* tp_alloc */
4181 basestring_new, /* tp_new */
4182 0, /* tp_free */
4185 PyDoc_STRVAR(string_doc,
4186 "str(object) -> string\n\
4188 Return a nice string representation of the object.\n\
4189 If the argument is a string, the return value is the same object.");
4191 PyTypeObject PyString_Type = {
4192 PyVarObject_HEAD_INIT(&PyType_Type, 0)
4193 "str",
4194 PyStringObject_SIZE,
4195 sizeof(char),
4196 string_dealloc, /* tp_dealloc */
4197 (printfunc)string_print, /* tp_print */
4198 0, /* tp_getattr */
4199 0, /* tp_setattr */
4200 0, /* tp_compare */
4201 string_repr, /* tp_repr */
4202 &string_as_number, /* tp_as_number */
4203 &string_as_sequence, /* tp_as_sequence */
4204 &string_as_mapping, /* tp_as_mapping */
4205 (hashfunc)string_hash, /* tp_hash */
4206 0, /* tp_call */
4207 string_str, /* tp_str */
4208 PyObject_GenericGetAttr, /* tp_getattro */
4209 0, /* tp_setattro */
4210 &string_as_buffer, /* tp_as_buffer */
4211 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
4212 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_STRING_SUBCLASS |
4213 Py_TPFLAGS_HAVE_NEWBUFFER, /* tp_flags */
4214 string_doc, /* tp_doc */
4215 0, /* tp_traverse */
4216 0, /* tp_clear */
4217 (richcmpfunc)string_richcompare, /* tp_richcompare */
4218 0, /* tp_weaklistoffset */
4219 0, /* tp_iter */
4220 0, /* tp_iternext */
4221 string_methods, /* tp_methods */
4222 0, /* tp_members */
4223 0, /* tp_getset */
4224 &PyBaseString_Type, /* tp_base */
4225 0, /* tp_dict */
4226 0, /* tp_descr_get */
4227 0, /* tp_descr_set */
4228 0, /* tp_dictoffset */
4229 0, /* tp_init */
4230 0, /* tp_alloc */
4231 string_new, /* tp_new */
4232 PyObject_Del, /* tp_free */
4235 void
4236 PyString_Concat(register PyObject **pv, register PyObject *w)
4238 register PyObject *v;
4239 if (*pv == NULL)
4240 return;
4241 if (w == NULL || !PyString_Check(*pv)) {
4242 Py_DECREF(*pv);
4243 *pv = NULL;
4244 return;
4246 v = string_concat((PyStringObject *) *pv, w);
4247 Py_DECREF(*pv);
4248 *pv = v;
4251 void
4252 PyString_ConcatAndDel(register PyObject **pv, register PyObject *w)
4254 PyString_Concat(pv, w);
4255 Py_XDECREF(w);
4259 /* The following function breaks the notion that strings are immutable:
4260 it changes the size of a string. We get away with this only if there
4261 is only one module referencing the object. You can also think of it
4262 as creating a new string object and destroying the old one, only
4263 more efficiently. In any case, don't use this if the string may
4264 already be known to some other part of the code...
4265 Note that if there's not enough memory to resize the string, the original
4266 string object at *pv is deallocated, *pv is set to NULL, an "out of
4267 memory" exception is set, and -1 is returned. Else (on success) 0 is
4268 returned, and the value in *pv may or may not be the same as on input.
4269 As always, an extra byte is allocated for a trailing \0 byte (newsize
4270 does *not* include that), and a trailing \0 byte is stored.
4274 _PyString_Resize(PyObject **pv, Py_ssize_t newsize)
4276 register PyObject *v;
4277 register PyStringObject *sv;
4278 v = *pv;
4279 if (!PyString_Check(v) || Py_REFCNT(v) != 1 || newsize < 0 ||
4280 PyString_CHECK_INTERNED(v)) {
4281 *pv = 0;
4282 Py_DECREF(v);
4283 PyErr_BadInternalCall();
4284 return -1;
4286 /* XXX UNREF/NEWREF interface should be more symmetrical */
4287 _Py_DEC_REFTOTAL;
4288 _Py_ForgetReference(v);
4289 *pv = (PyObject *)
4290 PyObject_REALLOC((char *)v, PyStringObject_SIZE + newsize);
4291 if (*pv == NULL) {
4292 PyObject_Del(v);
4293 PyErr_NoMemory();
4294 return -1;
4296 _Py_NewReference(*pv);
4297 sv = (PyStringObject *) *pv;
4298 Py_SIZE(sv) = newsize;
4299 sv->ob_sval[newsize] = '\0';
4300 sv->ob_shash = -1; /* invalidate cached hash value */
4301 return 0;
4304 /* Helpers for formatstring */
4306 Py_LOCAL_INLINE(PyObject *)
4307 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
4309 Py_ssize_t argidx = *p_argidx;
4310 if (argidx < arglen) {
4311 (*p_argidx)++;
4312 if (arglen < 0)
4313 return args;
4314 else
4315 return PyTuple_GetItem(args, argidx);
4317 PyErr_SetString(PyExc_TypeError,
4318 "not enough arguments for format string");
4319 return NULL;
/* Bit flags recording which '%' format flag characters were seen:
 *
 *   flag      character
 *   F_LJUST   '-'
 *   F_SIGN    '+'
 *   F_BLANK   ' '
 *   F_ALT     '#'
 *   F_ZERO    '0'
 */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
4335 Py_LOCAL_INLINE(int)
4336 formatfloat(char *buf, size_t buflen, int flags,
4337 int prec, int type, PyObject *v)
4339 double x;
4340 x = PyFloat_AsDouble(v);
4341 if (x == -1.0 && PyErr_Occurred()) {
4342 PyErr_Format(PyExc_TypeError, "float argument required, "
4343 "not %.200s", Py_TYPE(v)->tp_name);
4344 return -1;
4346 if (prec < 0)
4347 prec = 6;
4348 #if SIZEOF_INT > 4
4349 /* make sure that the decimal representation of precision really does
4350 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
4351 if (prec > 0x7fffffff) {
4352 PyErr_SetString(PyExc_OverflowError,
4353 "outrageously large precision "
4354 "for formatted float");
4355 return -1;
4357 #endif
4359 if (type == 'f' && fabs(x) >= 1e50)
4360 type = 'g';
4361 /* Worst case length calc to ensure no buffer overrun:
4363 'g' formats:
4364 fmt = %#.<prec>g
4365 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4366 for any double rep.)
4367 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4369 'f' formats:
4370 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
4371 len = 1 + 50 + 1 + prec = 52 + prec
4373 If prec=0 the effective precision is 1 (the leading digit is
4374 always given), therefore increase the length by one.
4377 if (((type == 'g' || type == 'G') &&
4378 buflen <= (size_t)10 + (size_t)prec) ||
4379 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
4380 PyErr_SetString(PyExc_OverflowError,
4381 "formatted float is too long (precision too large?)");
4382 return -1;
4384 _PyOS_double_to_string(buf, buflen, x, type, prec,
4385 (flags&F_ALT)?Py_DTSF_ALT:0, NULL);
4386 return (int)strlen(buf);
4389 /* _PyString_FormatLong emulates the format codes d, u, o, x and X, and
4390 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
4391 * Python's regular ints.
4392 * Return value: a new PyString*, or NULL if error.
4393 * . *pbuf is set to point into it,
4394 * *plen set to the # of chars following that.
4395 * Caller must decref it when done using pbuf.
4396 * The string starting at *pbuf is of the form
4397 * "-"? ("0x" | "0X")? digit+
4398 * "0x"/"0X" are present only for x and X conversions, with F_ALT
4399 * set in flags. The case of hex digits will be correct,
4400 * There will be at least prec digits, zero-filled on the left if
4401 * necessary to get that many.
4402 * val object to be converted
4403 * flags bitmask of format flags; only F_ALT is looked at
4404 * prec minimum number of digits; 0-fill on left if needed
4405 * type a character in [duoxX]; u acts the same as d
4407 * CAUTION: o, x and X conversions on regular ints can never
4408 * produce a '-' sign, but can for Python's unbounded ints.
4410 PyObject*
4411 _PyString_FormatLong(PyObject *val, int flags, int prec, int type,
4412 char **pbuf, int *plen)
4414 PyObject *result = NULL;
4415 char *buf;
4416 Py_ssize_t i;
4417 int sign; /* 1 if '-', else 0 */
4418 int len; /* number of characters */
4419 Py_ssize_t llen;
4420 int numdigits; /* len == numnondigits + numdigits */
4421 int numnondigits = 0;
4423 switch (type) {
4424 case 'd':
4425 case 'u':
4426 result = Py_TYPE(val)->tp_str(val);
4427 break;
4428 case 'o':
4429 result = Py_TYPE(val)->tp_as_number->nb_oct(val);
4430 break;
4431 case 'x':
4432 case 'X':
4433 numnondigits = 2;
4434 result = Py_TYPE(val)->tp_as_number->nb_hex(val);
4435 break;
4436 default:
4437 assert(!"'type' not in [duoxX]");
4439 if (!result)
4440 return NULL;
4442 buf = PyString_AsString(result);
4443 if (!buf) {
4444 Py_DECREF(result);
4445 return NULL;
4448 /* To modify the string in-place, there can only be one reference. */
4449 if (Py_REFCNT(result) != 1) {
4450 PyErr_BadInternalCall();
4451 return NULL;
4453 llen = PyString_Size(result);
4454 if (llen > INT_MAX) {
4455 PyErr_SetString(PyExc_ValueError, "string too large in _PyString_FormatLong");
4456 return NULL;
4458 len = (int)llen;
4459 if (buf[len-1] == 'L') {
4460 --len;
4461 buf[len] = '\0';
4463 sign = buf[0] == '-';
4464 numnondigits += sign;
4465 numdigits = len - numnondigits;
4466 assert(numdigits > 0);
4468 /* Get rid of base marker unless F_ALT */
4469 if ((flags & F_ALT) == 0) {
4470 /* Need to skip 0x, 0X or 0. */
4471 int skipped = 0;
4472 switch (type) {
4473 case 'o':
4474 assert(buf[sign] == '0');
4475 /* If 0 is only digit, leave it alone. */
4476 if (numdigits > 1) {
4477 skipped = 1;
4478 --numdigits;
4480 break;
4481 case 'x':
4482 case 'X':
4483 assert(buf[sign] == '0');
4484 assert(buf[sign + 1] == 'x');
4485 skipped = 2;
4486 numnondigits -= 2;
4487 break;
4489 if (skipped) {
4490 buf += skipped;
4491 len -= skipped;
4492 if (sign)
4493 buf[0] = '-';
4495 assert(len == numnondigits + numdigits);
4496 assert(numdigits > 0);
4499 /* Fill with leading zeroes to meet minimum width. */
4500 if (prec > numdigits) {
4501 PyObject *r1 = PyString_FromStringAndSize(NULL,
4502 numnondigits + prec);
4503 char *b1;
4504 if (!r1) {
4505 Py_DECREF(result);
4506 return NULL;
4508 b1 = PyString_AS_STRING(r1);
4509 for (i = 0; i < numnondigits; ++i)
4510 *b1++ = *buf++;
4511 for (i = 0; i < prec - numdigits; i++)
4512 *b1++ = '0';
4513 for (i = 0; i < numdigits; i++)
4514 *b1++ = *buf++;
4515 *b1 = '\0';
4516 Py_DECREF(result);
4517 result = r1;
4518 buf = PyString_AS_STRING(result);
4519 len = numnondigits + prec;
4522 /* Fix up case for hex conversions. */
4523 if (type == 'X') {
4524 /* Need to convert all lower case letters to upper case.
4525 and need to convert 0x to 0X (and -0x to -0X). */
4526 for (i = 0; i < len; i++)
4527 if (buf[i] >= 'a' && buf[i] <= 'x')
4528 buf[i] -= 'a'-'A';
4530 *pbuf = buf;
4531 *plen = len;
4532 return result;
4535 Py_LOCAL_INLINE(int)
4536 formatint(char *buf, size_t buflen, int flags,
4537 int prec, int type, PyObject *v)
4539 /* fmt = '%#.' + `prec` + 'l' + `type`
4540 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4541 + 1 + 1 = 24 */
4542 char fmt[64]; /* plenty big enough! */
4543 char *sign;
4544 long x;
4546 x = PyInt_AsLong(v);
4547 if (x == -1 && PyErr_Occurred()) {
4548 PyErr_Format(PyExc_TypeError, "int argument required, not %.200s",
4549 Py_TYPE(v)->tp_name);
4550 return -1;
4552 if (x < 0 && type == 'u') {
4553 type = 'd';
4555 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
4556 sign = "-";
4557 else
4558 sign = "";
4559 if (prec < 0)
4560 prec = 1;
4562 if ((flags & F_ALT) &&
4563 (type == 'x' || type == 'X')) {
4564 /* When converting under %#x or %#X, there are a number
4565 * of issues that cause pain:
4566 * - when 0 is being converted, the C standard leaves off
4567 * the '0x' or '0X', which is inconsistent with other
4568 * %#x/%#X conversions and inconsistent with Python's
4569 * hex() function
4570 * - there are platforms that violate the standard and
4571 * convert 0 with the '0x' or '0X'
4572 * (Metrowerks, Compaq Tru64)
4573 * - there are platforms that give '0x' when converting
4574 * under %#X, but convert 0 in accordance with the
4575 * standard (OS/2 EMX)
4577 * We can achieve the desired consistency by inserting our
4578 * own '0x' or '0X' prefix, and substituting %x/%X in place
4579 * of %#x/%#X.
4581 * Note that this is the same approach as used in
4582 * formatint() in unicodeobject.c
4584 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
4585 sign, type, prec, type);
4587 else {
4588 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
4589 sign, (flags&F_ALT) ? "#" : "",
4590 prec, type);
4593 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
4594 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
4596 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
4597 PyErr_SetString(PyExc_OverflowError,
4598 "formatted integer is too long (precision too large?)");
4599 return -1;
4601 if (sign[0])
4602 PyOS_snprintf(buf, buflen, fmt, -x);
4603 else
4604 PyOS_snprintf(buf, buflen, fmt, x);
4605 return (int)strlen(buf);
4608 Py_LOCAL_INLINE(int)
4609 formatchar(char *buf, size_t buflen, PyObject *v)
4611 /* presume that the buffer is at least 2 characters long */
4612 if (PyString_Check(v)) {
4613 if (!PyArg_Parse(v, "c;%c requires int or char", &buf[0]))
4614 return -1;
4616 else {
4617 if (!PyArg_Parse(v, "b;%c requires int or char", &buf[0]))
4618 return -1;
4620 buf[1] = '\0';
4621 return 1;
4624 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4626 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4627 chars are formatted. XXX This is a magic number. Each formatting
4628 routine does bounds checking to ensure no overflow, but a better
4629 solution may be to malloc a buffer of appropriate size for each
4630 format. For now, the current solution is sufficient.
4632 #define FORMATBUFLEN (size_t)120
4634 PyObject *
4635 PyString_Format(PyObject *format, PyObject *args)
4637 char *fmt, *res;
4638 Py_ssize_t arglen, argidx;
4639 Py_ssize_t reslen, rescnt, fmtcnt;
4640 int args_owned = 0;
4641 PyObject *result, *orig_args;
4642 #ifdef Py_USING_UNICODE
4643 PyObject *v, *w;
4644 #endif
4645 PyObject *dict = NULL;
4646 if (format == NULL || !PyString_Check(format) || args == NULL) {
4647 PyErr_BadInternalCall();
4648 return NULL;
4650 orig_args = args;
4651 fmt = PyString_AS_STRING(format);
4652 fmtcnt = PyString_GET_SIZE(format);
4653 reslen = rescnt = fmtcnt + 100;
4654 result = PyString_FromStringAndSize((char *)NULL, reslen);
4655 if (result == NULL)
4656 return NULL;
4657 res = PyString_AsString(result);
4658 if (PyTuple_Check(args)) {
4659 arglen = PyTuple_GET_SIZE(args);
4660 argidx = 0;
4662 else {
4663 arglen = -1;
4664 argidx = -2;
4666 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
4667 !PyObject_TypeCheck(args, &PyBaseString_Type))
4668 dict = args;
4669 while (--fmtcnt >= 0) {
4670 if (*fmt != '%') {
4671 if (--rescnt < 0) {
4672 rescnt = fmtcnt + 100;
4673 reslen += rescnt;
4674 if (_PyString_Resize(&result, reslen) < 0)
4675 return NULL;
4676 res = PyString_AS_STRING(result)
4677 + reslen - rescnt;
4678 --rescnt;
4680 *res++ = *fmt++;
4682 else {
4683 /* Got a format specifier */
4684 int flags = 0;
4685 Py_ssize_t width = -1;
4686 int prec = -1;
4687 int c = '\0';
4688 int fill;
4689 int isnumok;
4690 PyObject *v = NULL;
4691 PyObject *temp = NULL;
4692 char *pbuf;
4693 int sign;
4694 Py_ssize_t len;
4695 char formatbuf[FORMATBUFLEN];
4696 /* For format{float,int,char}() */
4697 #ifdef Py_USING_UNICODE
4698 char *fmt_start = fmt;
4699 Py_ssize_t argidx_start = argidx;
4700 #endif
4702 fmt++;
4703 if (*fmt == '(') {
4704 char *keystart;
4705 Py_ssize_t keylen;
4706 PyObject *key;
4707 int pcount = 1;
4709 if (dict == NULL) {
4710 PyErr_SetString(PyExc_TypeError,
4711 "format requires a mapping");
4712 goto error;
4714 ++fmt;
4715 --fmtcnt;
4716 keystart = fmt;
4717 /* Skip over balanced parentheses */
4718 while (pcount > 0 && --fmtcnt >= 0) {
4719 if (*fmt == ')')
4720 --pcount;
4721 else if (*fmt == '(')
4722 ++pcount;
4723 fmt++;
4725 keylen = fmt - keystart - 1;
4726 if (fmtcnt < 0 || pcount > 0) {
4727 PyErr_SetString(PyExc_ValueError,
4728 "incomplete format key");
4729 goto error;
4731 key = PyString_FromStringAndSize(keystart,
4732 keylen);
4733 if (key == NULL)
4734 goto error;
4735 if (args_owned) {
4736 Py_DECREF(args);
4737 args_owned = 0;
4739 args = PyObject_GetItem(dict, key);
4740 Py_DECREF(key);
4741 if (args == NULL) {
4742 goto error;
4744 args_owned = 1;
4745 arglen = -1;
4746 argidx = -2;
4748 while (--fmtcnt >= 0) {
4749 switch (c = *fmt++) {
4750 case '-': flags |= F_LJUST; continue;
4751 case '+': flags |= F_SIGN; continue;
4752 case ' ': flags |= F_BLANK; continue;
4753 case '#': flags |= F_ALT; continue;
4754 case '0': flags |= F_ZERO; continue;
4756 break;
4758 if (c == '*') {
4759 v = getnextarg(args, arglen, &argidx);
4760 if (v == NULL)
4761 goto error;
4762 if (!PyInt_Check(v)) {
4763 PyErr_SetString(PyExc_TypeError,
4764 "* wants int");
4765 goto error;
4767 width = PyInt_AsLong(v);
4768 if (width < 0) {
4769 flags |= F_LJUST;
4770 width = -width;
4772 if (--fmtcnt >= 0)
4773 c = *fmt++;
4775 else if (c >= 0 && isdigit(c)) {
4776 width = c - '0';
4777 while (--fmtcnt >= 0) {
4778 c = Py_CHARMASK(*fmt++);
4779 if (!isdigit(c))
4780 break;
4781 if ((width*10) / 10 != width) {
4782 PyErr_SetString(
4783 PyExc_ValueError,
4784 "width too big");
4785 goto error;
4787 width = width*10 + (c - '0');
4790 if (c == '.') {
4791 prec = 0;
4792 if (--fmtcnt >= 0)
4793 c = *fmt++;
4794 if (c == '*') {
4795 v = getnextarg(args, arglen, &argidx);
4796 if (v == NULL)
4797 goto error;
4798 if (!PyInt_Check(v)) {
4799 PyErr_SetString(
4800 PyExc_TypeError,
4801 "* wants int");
4802 goto error;
4804 prec = PyInt_AsLong(v);
4805 if (prec < 0)
4806 prec = 0;
4807 if (--fmtcnt >= 0)
4808 c = *fmt++;
4810 else if (c >= 0 && isdigit(c)) {
4811 prec = c - '0';
4812 while (--fmtcnt >= 0) {
4813 c = Py_CHARMASK(*fmt++);
4814 if (!isdigit(c))
4815 break;
4816 if ((prec*10) / 10 != prec) {
4817 PyErr_SetString(
4818 PyExc_ValueError,
4819 "prec too big");
4820 goto error;
4822 prec = prec*10 + (c - '0');
4825 } /* prec */
4826 if (fmtcnt >= 0) {
4827 if (c == 'h' || c == 'l' || c == 'L') {
4828 if (--fmtcnt >= 0)
4829 c = *fmt++;
4832 if (fmtcnt < 0) {
4833 PyErr_SetString(PyExc_ValueError,
4834 "incomplete format");
4835 goto error;
4837 if (c != '%') {
4838 v = getnextarg(args, arglen, &argidx);
4839 if (v == NULL)
4840 goto error;
4842 sign = 0;
4843 fill = ' ';
4844 switch (c) {
4845 case '%':
4846 pbuf = "%";
4847 len = 1;
4848 break;
4849 case 's':
4850 #ifdef Py_USING_UNICODE
4851 if (PyUnicode_Check(v)) {
4852 fmt = fmt_start;
4853 argidx = argidx_start;
4854 goto unicode;
4856 #endif
4857 temp = _PyObject_Str(v);
4858 #ifdef Py_USING_UNICODE
4859 if (temp != NULL && PyUnicode_Check(temp)) {
4860 Py_DECREF(temp);
4861 fmt = fmt_start;
4862 argidx = argidx_start;
4863 goto unicode;
4865 #endif
4866 /* Fall through */
4867 case 'r':
4868 if (c == 'r')
4869 temp = PyObject_Repr(v);
4870 if (temp == NULL)
4871 goto error;
4872 if (!PyString_Check(temp)) {
4873 PyErr_SetString(PyExc_TypeError,
4874 "%s argument has non-string str()");
4875 Py_DECREF(temp);
4876 goto error;
4878 pbuf = PyString_AS_STRING(temp);
4879 len = PyString_GET_SIZE(temp);
4880 if (prec >= 0 && len > prec)
4881 len = prec;
4882 break;
4883 case 'i':
4884 case 'd':
4885 case 'u':
4886 case 'o':
4887 case 'x':
4888 case 'X':
4889 if (c == 'i')
4890 c = 'd';
4891 isnumok = 0;
4892 if (PyNumber_Check(v)) {
4893 PyObject *iobj=NULL;
4895 if (PyInt_Check(v) || (PyLong_Check(v))) {
4896 iobj = v;
4897 Py_INCREF(iobj);
4899 else {
4900 iobj = PyNumber_Int(v);
4901 if (iobj==NULL) iobj = PyNumber_Long(v);
4903 if (iobj!=NULL) {
4904 if (PyInt_Check(iobj)) {
4905 isnumok = 1;
4906 pbuf = formatbuf;
4907 len = formatint(pbuf,
4908 sizeof(formatbuf),
4909 flags, prec, c, iobj);
4910 Py_DECREF(iobj);
4911 if (len < 0)
4912 goto error;
4913 sign = 1;
4915 else if (PyLong_Check(iobj)) {
4916 int ilen;
4918 isnumok = 1;
4919 temp = _PyString_FormatLong(iobj, flags,
4920 prec, c, &pbuf, &ilen);
4921 Py_DECREF(iobj);
4922 len = ilen;
4923 if (!temp)
4924 goto error;
4925 sign = 1;
4927 else {
4928 Py_DECREF(iobj);
4932 if (!isnumok) {
4933 PyErr_Format(PyExc_TypeError,
4934 "%%%c format: a number is required, "
4935 "not %.200s", c, Py_TYPE(v)->tp_name);
4936 goto error;
4938 if (flags & F_ZERO)
4939 fill = '0';
4940 break;
4941 case 'e':
4942 case 'E':
4943 case 'f':
4944 case 'F':
4945 case 'g':
4946 case 'G':
4947 if (c == 'F')
4948 c = 'f';
4949 pbuf = formatbuf;
4950 len = formatfloat(pbuf, sizeof(formatbuf),
4951 flags, prec, c, v);
4952 if (len < 0)
4953 goto error;
4954 sign = 1;
4955 if (flags & F_ZERO)
4956 fill = '0';
4957 break;
4958 case 'c':
4959 #ifdef Py_USING_UNICODE
4960 if (PyUnicode_Check(v)) {
4961 fmt = fmt_start;
4962 argidx = argidx_start;
4963 goto unicode;
4965 #endif
4966 pbuf = formatbuf;
4967 len = formatchar(pbuf, sizeof(formatbuf), v);
4968 if (len < 0)
4969 goto error;
4970 break;
4971 default:
4972 PyErr_Format(PyExc_ValueError,
4973 "unsupported format character '%c' (0x%x) "
4974 "at index %zd",
4975 c, c,
4976 (Py_ssize_t)(fmt - 1 -
4977 PyString_AsString(format)));
4978 goto error;
4980 if (sign) {
4981 if (*pbuf == '-' || *pbuf == '+') {
4982 sign = *pbuf++;
4983 len--;
4985 else if (flags & F_SIGN)
4986 sign = '+';
4987 else if (flags & F_BLANK)
4988 sign = ' ';
4989 else
4990 sign = 0;
4992 if (width < len)
4993 width = len;
4994 if (rescnt - (sign != 0) < width) {
4995 reslen -= rescnt;
4996 rescnt = width + fmtcnt + 100;
4997 reslen += rescnt;
4998 if (reslen < 0) {
4999 Py_DECREF(result);
5000 Py_XDECREF(temp);
5001 return PyErr_NoMemory();
5003 if (_PyString_Resize(&result, reslen) < 0) {
5004 Py_XDECREF(temp);
5005 return NULL;
5007 res = PyString_AS_STRING(result)
5008 + reslen - rescnt;
5010 if (sign) {
5011 if (fill != ' ')
5012 *res++ = sign;
5013 rescnt--;
5014 if (width > len)
5015 width--;
5017 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5018 assert(pbuf[0] == '0');
5019 assert(pbuf[1] == c);
5020 if (fill != ' ') {
5021 *res++ = *pbuf++;
5022 *res++ = *pbuf++;
5024 rescnt -= 2;
5025 width -= 2;
5026 if (width < 0)
5027 width = 0;
5028 len -= 2;
5030 if (width > len && !(flags & F_LJUST)) {
5031 do {
5032 --rescnt;
5033 *res++ = fill;
5034 } while (--width > len);
5036 if (fill == ' ') {
5037 if (sign)
5038 *res++ = sign;
5039 if ((flags & F_ALT) &&
5040 (c == 'x' || c == 'X')) {
5041 assert(pbuf[0] == '0');
5042 assert(pbuf[1] == c);
5043 *res++ = *pbuf++;
5044 *res++ = *pbuf++;
5047 Py_MEMCPY(res, pbuf, len);
5048 res += len;
5049 rescnt -= len;
5050 while (--width >= len) {
5051 --rescnt;
5052 *res++ = ' ';
5054 if (dict && (argidx < arglen) && c != '%') {
5055 PyErr_SetString(PyExc_TypeError,
5056 "not all arguments converted during string formatting");
5057 Py_XDECREF(temp);
5058 goto error;
5060 Py_XDECREF(temp);
5061 } /* '%' */
5062 } /* until end */
5063 if (argidx < arglen && !dict) {
5064 PyErr_SetString(PyExc_TypeError,
5065 "not all arguments converted during string formatting");
5066 goto error;
5068 if (args_owned) {
5069 Py_DECREF(args);
5071 _PyString_Resize(&result, reslen - rescnt);
5072 return result;
5074 #ifdef Py_USING_UNICODE
5075 unicode:
5076 if (args_owned) {
5077 Py_DECREF(args);
5078 args_owned = 0;
5080 /* Fiddle args right (remove the first argidx arguments) */
5081 if (PyTuple_Check(orig_args) && argidx > 0) {
5082 PyObject *v;
5083 Py_ssize_t n = PyTuple_GET_SIZE(orig_args) - argidx;
5084 v = PyTuple_New(n);
5085 if (v == NULL)
5086 goto error;
5087 while (--n >= 0) {
5088 PyObject *w = PyTuple_GET_ITEM(orig_args, n + argidx);
5089 Py_INCREF(w);
5090 PyTuple_SET_ITEM(v, n, w);
5092 args = v;
5093 } else {
5094 Py_INCREF(orig_args);
5095 args = orig_args;
5097 args_owned = 1;
5098 /* Take what we have of the result and let the Unicode formatting
5099 function format the rest of the input. */
5100 rescnt = res - PyString_AS_STRING(result);
5101 if (_PyString_Resize(&result, rescnt))
5102 goto error;
5103 fmtcnt = PyString_GET_SIZE(format) - \
5104 (fmt - PyString_AS_STRING(format));
5105 format = PyUnicode_Decode(fmt, fmtcnt, NULL, NULL);
5106 if (format == NULL)
5107 goto error;
5108 v = PyUnicode_Format(format, args);
5109 Py_DECREF(format);
5110 if (v == NULL)
5111 goto error;
5112 /* Paste what we have (result) to what the Unicode formatting
5113 function returned (v) and return the result (or error) */
5114 w = PyUnicode_Concat(result, v);
5115 Py_DECREF(result);
5116 Py_DECREF(v);
5117 Py_DECREF(args);
5118 return w;
5119 #endif /* Py_USING_UNICODE */
5121 error:
5122 Py_DECREF(result);
5123 if (args_owned) {
5124 Py_DECREF(args);
5126 return NULL;
5129 void
5130 PyString_InternInPlace(PyObject **p)
5132 register PyStringObject *s = (PyStringObject *)(*p);
5133 PyObject *t;
5134 if (s == NULL || !PyString_Check(s))
5135 Py_FatalError("PyString_InternInPlace: strings only please!");
5136 /* If it's a string subclass, we don't really know what putting
5137 it in the interned dict might do. */
5138 if (!PyString_CheckExact(s))
5139 return;
5140 if (PyString_CHECK_INTERNED(s))
5141 return;
5142 if (interned == NULL) {
5143 interned = PyDict_New();
5144 if (interned == NULL) {
5145 PyErr_Clear(); /* Don't leave an exception */
5146 return;
5149 t = PyDict_GetItem(interned, (PyObject *)s);
5150 if (t) {
5151 Py_INCREF(t);
5152 Py_DECREF(*p);
5153 *p = t;
5154 return;
5157 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
5158 PyErr_Clear();
5159 return;
5161 /* The two references in interned are not counted by refcnt.
5162 The string deallocator will take care of this */
5163 Py_REFCNT(s) -= 2;
5164 PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
5167 void
5168 PyString_InternImmortal(PyObject **p)
5170 PyString_InternInPlace(p);
5171 if (PyString_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
5172 PyString_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
5173 Py_INCREF(*p);
5178 PyObject *
5179 PyString_InternFromString(const char *cp)
5181 PyObject *s = PyString_FromString(cp);
5182 if (s == NULL)
5183 return NULL;
5184 PyString_InternInPlace(&s);
5185 return s;
5188 void
5189 PyString_Fini(void)
5191 int i;
5192 for (i = 0; i < UCHAR_MAX + 1; i++) {
5193 Py_XDECREF(characters[i]);
5194 characters[i] = NULL;
5196 Py_XDECREF(nullstring);
5197 nullstring = NULL;
5200 void _Py_ReleaseInternedStrings(void)
5202 PyObject *keys;
5203 PyStringObject *s;
5204 Py_ssize_t i, n;
5205 Py_ssize_t immortal_size = 0, mortal_size = 0;
5207 if (interned == NULL || !PyDict_Check(interned))
5208 return;
5209 keys = PyDict_Keys(interned);
5210 if (keys == NULL || !PyList_Check(keys)) {
5211 PyErr_Clear();
5212 return;
5215 /* Since _Py_ReleaseInternedStrings() is intended to help a leak
5216 detector, interned strings are not forcibly deallocated; rather, we
5217 give them their stolen references back, and then clear and DECREF
5218 the interned dict. */
5220 n = PyList_GET_SIZE(keys);
5221 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
5223 for (i = 0; i < n; i++) {
5224 s = (PyStringObject *) PyList_GET_ITEM(keys, i);
5225 switch (s->ob_sstate) {
5226 case SSTATE_NOT_INTERNED:
5227 /* XXX Shouldn't happen */
5228 break;
5229 case SSTATE_INTERNED_IMMORTAL:
5230 Py_REFCNT(s) += 1;
5231 immortal_size += Py_SIZE(s);
5232 break;
5233 case SSTATE_INTERNED_MORTAL:
5234 Py_REFCNT(s) += 2;
5235 mortal_size += Py_SIZE(s);
5236 break;
5237 default:
5238 Py_FatalError("Inconsistent interned string state.");
5240 s->ob_sstate = SSTATE_NOT_INTERNED;
5242 fprintf(stderr, "total size of all interned strings: "
5243 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
5244 "mortal/immortal\n", mortal_size, immortal_size);
5245 Py_DECREF(keys);
5246 PyDict_Clear(interned);
5247 Py_DECREF(interned);
5248 interned = NULL;