Automatic conversion of floats to integers for struct.pack integer codes
[python.git] / Objects / unicodeobject.c
blob6edc2f8347c2a7f6e8dc39d671b4902246fb5453
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
113 static char unicode_default_encoding[100];
115 /* Fast detection of the most frequent whitespace characters */
/* Lookup table covering the ASCII range (indices 0..127): an entry is 1
   when the character counts as whitespace and 0 otherwise.  Rows hold 16
   entries each. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: HORIZONTAL TABULATION, LINE FEED, VERTICAL TABULATION,
       FORM FEED and CARRIAGE RETURN (0x09 - 0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP, RECORD and UNIT SEPARATOR (0x1C - 0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
146 /* Same for linebreaks */
/* Lookup table covering the ASCII range (indices 0..127): an entry is 1
   when the character is a line break and 0 otherwise.  Rows hold 16
   entries each. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: LINE FEED (0x0A) and CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP and RECORD SEPARATOR (0x1C - 0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
185 /* --- Bloom Filters ----------------------------------------------------- */
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the least
   significant 5 bits of each Unicode character serving as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init(). */
/* Type holding a bloom mask.  Only bits 0..31 are ever used, because the
   bit index is (ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant is unsigned long: shifting a plain int 1
   into bit 31 overflows a 32-bit signed int, and the resulting negative
   value would sign-extend when combined with a wider unsigned long. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when `ch` is a line break: a table lookup for ASCII, otherwise
   the bloom mask acts as a cheap pre-filter before the full check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
205 /* calculate simple bloom-style bitmask for a given unicode string */
207 long mask;
208 Py_ssize_t i;
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
214 return mask;
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
219 Py_ssize_t i;
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
225 return 0;
/* Non-zero when `chr` is a member of `set`: the bloom mask rejects most
   non-members cheaply before the linear search runs.  The expansion is
   fully parenthesised so it stays a single operand when negated or used
   inside a larger expression. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
233 static
234 int unicode_resize(register PyUnicodeObject *unicode,
235 Py_ssize_t length)
237 void *oldstr;
239 /* Shortcut if there's nothing much to do. */
240 if (unicode->length == length)
241 goto reset;
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
251 PyErr_SetString(PyExc_SystemError,
252 "can't resize shared unicode objects");
253 return -1;
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
259 it contains). */
261 oldstr = unicode->str;
262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
264 if (!unicode->str) {
265 unicode->str = (Py_UNICODE *)oldstr;
266 PyErr_NoMemory();
267 return -1;
269 unicode->str[length] = 0;
270 unicode->length = length;
272 reset:
273 /* Reset the object caches */
274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
278 unicode->hash = -1;
280 return 0;
283 /* We allocate one more byte to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
291 static
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
294 register PyUnicodeObject *unicode;
296 /* Optimization for empty strings */
297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
316 unicode_resize(unicode, length) < 0) {
317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
321 else {
322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
325 PyObject_INIT(unicode, &PyUnicode_Type);
327 else {
328 size_t new_size;
329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
330 if (unicode == NULL)
331 return NULL;
332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 if (!unicode->str) {
337 PyErr_NoMemory();
338 goto onError;
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
347 unicode->str[0] = 0;
348 unicode->str[length] = 0;
349 unicode->length = length;
350 unicode->hash = -1;
351 unicode->defenc = NULL;
352 return unicode;
354 onError:
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
357 _Py_ForgetReference((PyObject *)unicode);
358 PyObject_Del(unicode);
359 return NULL;
362 static
363 void unicode_dealloc(register PyUnicodeObject *unicode)
365 if (PyUnicode_CheckExact(unicode) &&
366 numfree < PyUnicode_MAXFREELIST) {
367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
377 /* Add to free list */
378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
382 else {
383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
389 static
390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
392 register PyUnicodeObject *v;
394 /* Argument checks */
395 if (unicode == NULL) {
396 PyErr_BadInternalCall();
397 return -1;
399 v = *unicode;
400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
401 PyErr_BadInternalCall();
402 return -1;
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
408 if (v->length != length &&
409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
431 Py_ssize_t size)
433 PyUnicodeObject *unicode;
435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
467 Py_UNICODE_COPY(unicode->str, u, size);
469 return (PyObject *)unicode;
472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
474 PyUnicodeObject *unicode;
476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
478 "Negative size passed to PyUnicode_FromStringAndSize");
479 return NULL;
482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
509 return PyUnicode_DecodeUTF8(u, size, NULL);
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
516 return (PyObject *)unicode;
519 PyObject *PyUnicode_FromString(const char *u)
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
527 return PyUnicode_FromStringAndSize(u, size);
530 #ifdef HAVE_WCHAR_H
532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533 # define CONVERT_WCHAR_TO_SURROGATES
534 #endif
536 #ifdef CONVERT_WCHAR_TO_SURROGATES
538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
542 Py_ssize_t size)
544 PyUnicodeObject *unicode;
545 register Py_ssize_t i;
546 Py_ssize_t alloc;
547 const wchar_t *orig_w;
549 if (w == NULL) {
550 PyErr_BadInternalCall();
551 return NULL;
554 alloc = size;
555 orig_w = w;
556 for (i = size; i > 0; i--) {
557 if (*w > 0xFFFF)
558 alloc++;
559 w++;
561 w = orig_w;
562 unicode = _PyUnicode_New(alloc);
563 if (!unicode)
564 return NULL;
566 /* Copy the wchar_t data into the new object */
568 register Py_UNICODE *u;
569 u = PyUnicode_AS_UNICODE(unicode);
570 for (i = size; i > 0; i--) {
571 if (*w > 0xFFFF) {
572 wchar_t ordinal = *w++;
573 ordinal -= 0x10000;
574 *u++ = 0xD800 | (ordinal >> 10);
575 *u++ = 0xDC00 | (ordinal & 0x3FF);
577 else
578 *u++ = *w++;
581 return (PyObject *)unicode;
584 #else
586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
589 PyUnicodeObject *unicode;
591 if (w == NULL) {
592 PyErr_BadInternalCall();
593 return NULL;
596 unicode = _PyUnicode_New(size);
597 if (!unicode)
598 return NULL;
600 /* Copy the wchar_t data into the new object */
601 #ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode->str, w, size * sizeof(wchar_t));
603 #else
605 register Py_UNICODE *u;
606 register Py_ssize_t i;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--)
609 *u++ = *w++;
611 #endif
613 return (PyObject *)unicode;
616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
618 #undef CONVERT_WCHAR_TO_SURROGATES
620 static void
621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
623 *fmt++ = '%';
624 if (width) {
625 if (zeropad)
626 *fmt++ = '0';
627 fmt += sprintf(fmt, "%d", width);
629 if (precision)
630 fmt += sprintf(fmt, ".%d", precision);
631 if (longflag)
632 *fmt++ = 'l';
633 else if (size_tflag) {
634 char *f = PY_FORMAT_SIZE_T;
635 while (*f)
636 *fmt++ = *f++;
638 *fmt++ = c;
639 *fmt = '\0';
642 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
644 PyObject *
645 PyUnicode_FromFormatV(const char *format, va_list vargs)
647 va_list count;
648 Py_ssize_t callcount = 0;
649 PyObject **callresults = NULL;
650 PyObject **callresult = NULL;
651 Py_ssize_t n = 0;
652 int width = 0;
653 int precision = 0;
654 int zeropad;
655 const char* f;
656 Py_UNICODE *s;
657 PyObject *string;
658 /* used by sprintf */
659 char buffer[21];
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer = NULL;
663 char *realbuffer;
664 Py_ssize_t abuffersize = 0;
665 char fmt[60]; /* should be enough for %0width.precisionld */
666 const char *copy;
668 #ifdef VA_LIST_IS_ARRAY
669 Py_MEMCPY(count, vargs, sizeof(va_list));
670 #else
671 #ifdef __va_copy
672 __va_copy(count, vargs);
673 #else
674 count = vargs;
675 #endif
676 #endif
677 /* step 1: count the number of %S/%R format specifications
678 * (we call PyObject_Str()/PyObject_Repr() for these objects
679 * once during step 3 and put the result in an array) */
680 for (f = format; *f; f++) {
681 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
682 ++callcount;
684 /* step 2: allocate memory for the results of
685 * PyObject_Str()/PyObject_Repr() calls */
686 if (callcount) {
687 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
688 if (!callresults) {
689 PyErr_NoMemory();
690 return NULL;
692 callresult = callresults;
694 /* step 3: figure out how large a buffer we need */
695 for (f = format; *f; f++) {
696 if (*f == '%') {
697 const char* p = f;
698 width = 0;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
704 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
705 * they don't affect the amount of space we reserve.
707 if ((*f == 'l' || *f == 'z') &&
708 (f[1] == 'd' || f[1] == 'u'))
709 ++f;
711 switch (*f) {
712 case 'c':
713 (void)va_arg(count, int);
714 /* fall through... */
715 case '%':
716 n++;
717 break;
718 case 'd': case 'u': case 'i': case 'x':
719 (void) va_arg(count, int);
720 /* 20 bytes is enough to hold a 64-bit
721 integer. Decimal takes the most space.
722 This isn't enough for octal.
723 If a width is specified we need more
724 (which we allocate later). */
725 if (width < 20)
726 width = 20;
727 n += width;
728 if (abuffersize < width)
729 abuffersize = width;
730 break;
731 case 's':
733 /* UTF-8 */
734 unsigned char*s;
735 s = va_arg(count, unsigned char*);
736 while (*s) {
737 if (*s < 128) {
738 n++; s++;
739 } else if (*s < 0xc0) {
740 /* invalid UTF-8 */
741 n++; s++;
742 } else if (*s < 0xc0) {
743 n++;
744 s++; if(!*s)break;
745 s++;
746 } else if (*s < 0xe0) {
747 n++;
748 s++; if(!*s)break;
749 s++; if(!*s)break;
750 s++;
751 } else {
752 #ifdef Py_UNICODE_WIDE
753 n++;
754 #else
755 n+=2;
756 #endif
757 s++; if(!*s)break;
758 s++; if(!*s)break;
759 s++; if(!*s)break;
760 s++;
763 break;
765 case 'U':
767 PyObject *obj = va_arg(count, PyObject *);
768 assert(obj && PyUnicode_Check(obj));
769 n += PyUnicode_GET_SIZE(obj);
770 break;
772 case 'V':
774 PyObject *obj = va_arg(count, PyObject *);
775 const char *str = va_arg(count, const char *);
776 assert(obj || str);
777 assert(!obj || PyUnicode_Check(obj));
778 if (obj)
779 n += PyUnicode_GET_SIZE(obj);
780 else
781 n += strlen(str);
782 break;
784 case 'S':
786 PyObject *obj = va_arg(count, PyObject *);
787 PyObject *str;
788 assert(obj);
789 str = PyObject_Str(obj);
790 if (!str)
791 goto fail;
792 n += PyUnicode_GET_SIZE(str);
793 /* Remember the str and switch to the next slot */
794 *callresult++ = str;
795 break;
797 case 'R':
799 PyObject *obj = va_arg(count, PyObject *);
800 PyObject *repr;
801 assert(obj);
802 repr = PyObject_Repr(obj);
803 if (!repr)
804 goto fail;
805 n += PyUnicode_GET_SIZE(repr);
806 /* Remember the repr and switch to the next slot */
807 *callresult++ = repr;
808 break;
810 case 'p':
811 (void) va_arg(count, int);
812 /* maximum 64-bit pointer representation:
813 * 0xffffffffffffffff
814 * so 19 characters is enough.
815 * XXX I count 18 -- what's the extra for?
817 n += 19;
818 break;
819 default:
820 /* if we stumble upon an unknown
821 formatting code, copy the rest of
822 the format string to the output
823 string. (we cannot just skip the
824 code, since there's no way to know
825 what's in the argument list) */
826 n += strlen(p);
827 goto expand;
829 } else
830 n++;
832 expand:
833 if (abuffersize > 20) {
834 abuffer = PyObject_Malloc(abuffersize);
835 if (!abuffer) {
836 PyErr_NoMemory();
837 goto fail;
839 realbuffer = abuffer;
841 else
842 realbuffer = buffer;
843 /* step 4: fill the buffer */
844 /* Since we've analyzed how much space we need for the worst case,
845 we don't have to resize the string.
846 There can be no errors beyond this point. */
847 string = PyUnicode_FromUnicode(NULL, n);
848 if (!string)
849 goto fail;
851 s = PyUnicode_AS_UNICODE(string);
852 callresult = callresults;
854 for (f = format; *f; f++) {
855 if (*f == '%') {
856 const char* p = f++;
857 int longflag = 0;
858 int size_tflag = 0;
859 zeropad = (*f == '0');
860 /* parse the width.precision part */
861 width = 0;
862 while (isdigit((unsigned)*f))
863 width = (width*10) + *f++ - '0';
864 precision = 0;
865 if (*f == '.') {
866 f++;
867 while (isdigit((unsigned)*f))
868 precision = (precision*10) + *f++ - '0';
870 /* handle the long flag, but only for %ld and %lu.
871 others can be added when necessary. */
872 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
873 longflag = 1;
874 ++f;
876 /* handle the size_t flag. */
877 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
878 size_tflag = 1;
879 ++f;
882 switch (*f) {
883 case 'c':
884 *s++ = va_arg(vargs, int);
885 break;
886 case 'd':
887 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
888 if (longflag)
889 sprintf(realbuffer, fmt, va_arg(vargs, long));
890 else if (size_tflag)
891 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
892 else
893 sprintf(realbuffer, fmt, va_arg(vargs, int));
894 appendstring(realbuffer);
895 break;
896 case 'u':
897 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
898 if (longflag)
899 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
900 else if (size_tflag)
901 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
902 else
903 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
904 appendstring(realbuffer);
905 break;
906 case 'i':
907 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
908 sprintf(realbuffer, fmt, va_arg(vargs, int));
909 appendstring(realbuffer);
910 break;
911 case 'x':
912 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
913 sprintf(realbuffer, fmt, va_arg(vargs, int));
914 appendstring(realbuffer);
915 break;
916 case 's':
918 /* Parameter must be UTF-8 encoded.
919 In case of encoding errors, use
920 the replacement character. */
921 PyObject *u;
922 p = va_arg(vargs, char*);
923 u = PyUnicode_DecodeUTF8(p, strlen(p),
924 "replace");
925 if (!u)
926 goto fail;
927 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
928 PyUnicode_GET_SIZE(u));
929 s += PyUnicode_GET_SIZE(u);
930 Py_DECREF(u);
931 break;
933 case 'U':
935 PyObject *obj = va_arg(vargs, PyObject *);
936 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
937 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
938 s += size;
939 break;
941 case 'V':
943 PyObject *obj = va_arg(vargs, PyObject *);
944 const char *str = va_arg(vargs, const char *);
945 if (obj) {
946 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
947 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
948 s += size;
949 } else {
950 appendstring(str);
952 break;
954 case 'S':
955 case 'R':
957 Py_UNICODE *ucopy;
958 Py_ssize_t usize;
959 Py_ssize_t upos;
960 /* unused, since we already have the result */
961 (void) va_arg(vargs, PyObject *);
962 ucopy = PyUnicode_AS_UNICODE(*callresult);
963 usize = PyUnicode_GET_SIZE(*callresult);
964 for (upos = 0; upos<usize;)
965 *s++ = ucopy[upos++];
966 /* We're done with the unicode()/repr() => forget it */
967 Py_DECREF(*callresult);
968 /* switch to next unicode()/repr() result */
969 ++callresult;
970 break;
972 case 'p':
973 sprintf(buffer, "%p", va_arg(vargs, void*));
974 /* %p is ill-defined: ensure leading 0x. */
975 if (buffer[1] == 'X')
976 buffer[1] = 'x';
977 else if (buffer[1] != 'x') {
978 memmove(buffer+2, buffer, strlen(buffer)+1);
979 buffer[0] = '0';
980 buffer[1] = 'x';
982 appendstring(buffer);
983 break;
984 case '%':
985 *s++ = '%';
986 break;
987 default:
988 appendstring(p);
989 goto end;
991 } else
992 *s++ = *f;
995 end:
996 if (callresults)
997 PyObject_Free(callresults);
998 if (abuffer)
999 PyObject_Free(abuffer);
1000 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1001 return string;
1002 fail:
1003 if (callresults) {
1004 PyObject **callresult2 = callresults;
1005 while (callresult2 < callresult) {
1006 Py_DECREF(*callresult2);
1007 ++callresult2;
1009 PyObject_Free(callresults);
1011 if (abuffer)
1012 PyObject_Free(abuffer);
1013 return NULL;
1016 #undef appendstring
1018 PyObject *
1019 PyUnicode_FromFormat(const char *format, ...)
1021 PyObject* ret;
1022 va_list vargs;
1024 #ifdef HAVE_STDARG_PROTOTYPES
1025 va_start(vargs, format);
1026 #else
1027 va_start(vargs);
1028 #endif
1029 ret = PyUnicode_FromFormatV(format, vargs);
1030 va_end(vargs);
1031 return ret;
1034 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1035 wchar_t *w,
1036 Py_ssize_t size)
1038 if (unicode == NULL) {
1039 PyErr_BadInternalCall();
1040 return -1;
1043 /* If possible, try to copy the 0-termination as well */
1044 if (size > PyUnicode_GET_SIZE(unicode))
1045 size = PyUnicode_GET_SIZE(unicode) + 1;
1047 #ifdef HAVE_USABLE_WCHAR_T
1048 memcpy(w, unicode->str, size * sizeof(wchar_t));
1049 #else
1051 register Py_UNICODE *u;
1052 register Py_ssize_t i;
1053 u = PyUnicode_AS_UNICODE(unicode);
1054 for (i = size; i > 0; i--)
1055 *w++ = *u++;
1057 #endif
1059 if (size > PyUnicode_GET_SIZE(unicode))
1060 return PyUnicode_GET_SIZE(unicode);
1061 else
1062 return size;
1065 #endif
1067 PyObject *PyUnicode_FromOrdinal(int ordinal)
1069 Py_UNICODE s[1];
1071 #ifdef Py_UNICODE_WIDE
1072 if (ordinal < 0 || ordinal > 0x10ffff) {
1073 PyErr_SetString(PyExc_ValueError,
1074 "unichr() arg not in range(0x110000) "
1075 "(wide Python build)");
1076 return NULL;
1078 #else
1079 if (ordinal < 0 || ordinal > 0xffff) {
1080 PyErr_SetString(PyExc_ValueError,
1081 "unichr() arg not in range(0x10000) "
1082 "(narrow Python build)");
1083 return NULL;
1085 #endif
1087 s[0] = (Py_UNICODE)ordinal;
1088 return PyUnicode_FromUnicode(s, 1);
1091 PyObject *PyUnicode_FromObject(register PyObject *obj)
1093 /* XXX Perhaps we should make this API an alias of
1094 PyObject_Unicode() instead ?! */
1095 if (PyUnicode_CheckExact(obj)) {
1096 Py_INCREF(obj);
1097 return obj;
1099 if (PyUnicode_Check(obj)) {
1100 /* For a Unicode subtype that's not a Unicode object,
1101 return a true Unicode object with the same data. */
1102 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1103 PyUnicode_GET_SIZE(obj));
1105 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1108 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1109 const char *encoding,
1110 const char *errors)
1112 const char *s = NULL;
1113 Py_ssize_t len;
1114 PyObject *v;
1116 if (obj == NULL) {
1117 PyErr_BadInternalCall();
1118 return NULL;
1121 #if 0
1122 /* For b/w compatibility we also accept Unicode objects provided
1123 that no encodings is given and then redirect to
1124 PyObject_Unicode() which then applies the additional logic for
1125 Unicode subclasses.
1127 NOTE: This API should really only be used for object which
1128 represent *encoded* Unicode !
1131 if (PyUnicode_Check(obj)) {
1132 if (encoding) {
1133 PyErr_SetString(PyExc_TypeError,
1134 "decoding Unicode is not supported");
1135 return NULL;
1137 return PyObject_Unicode(obj);
1139 #else
1140 if (PyUnicode_Check(obj)) {
1141 PyErr_SetString(PyExc_TypeError,
1142 "decoding Unicode is not supported");
1143 return NULL;
1145 #endif
1147 /* Coerce object */
1148 if (PyString_Check(obj)) {
1149 s = PyString_AS_STRING(obj);
1150 len = PyString_GET_SIZE(obj);
1152 else if (PyByteArray_Check(obj)) {
1153 /* Python 2.x specific */
1154 PyErr_Format(PyExc_TypeError,
1155 "decoding bytearray is not supported");
1156 return NULL;
1158 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1159 /* Overwrite the error message with something more useful in
1160 case of a TypeError. */
1161 if (PyErr_ExceptionMatches(PyExc_TypeError))
1162 PyErr_Format(PyExc_TypeError,
1163 "coercing to Unicode: need string or buffer, "
1164 "%.80s found",
1165 Py_TYPE(obj)->tp_name);
1166 goto onError;
1169 /* Convert to Unicode */
1170 if (len == 0) {
1171 Py_INCREF(unicode_empty);
1172 v = (PyObject *)unicode_empty;
1174 else
1175 v = PyUnicode_Decode(s, len, encoding, errors);
1177 return v;
1179 onError:
1180 return NULL;
1183 PyObject *PyUnicode_Decode(const char *s,
1184 Py_ssize_t size,
1185 const char *encoding,
1186 const char *errors)
1188 PyObject *buffer = NULL, *unicode;
1190 if (encoding == NULL)
1191 encoding = PyUnicode_GetDefaultEncoding();
1193 /* Shortcuts for common default encodings */
1194 if (strcmp(encoding, "utf-8") == 0)
1195 return PyUnicode_DecodeUTF8(s, size, errors);
1196 else if (strcmp(encoding, "latin-1") == 0)
1197 return PyUnicode_DecodeLatin1(s, size, errors);
1198 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1199 else if (strcmp(encoding, "mbcs") == 0)
1200 return PyUnicode_DecodeMBCS(s, size, errors);
1201 #endif
1202 else if (strcmp(encoding, "ascii") == 0)
1203 return PyUnicode_DecodeASCII(s, size, errors);
1205 /* Decode via the codec registry */
1206 buffer = PyBuffer_FromMemory((void *)s, size);
1207 if (buffer == NULL)
1208 goto onError;
1209 unicode = PyCodec_Decode(buffer, encoding, errors);
1210 if (unicode == NULL)
1211 goto onError;
1212 if (!PyUnicode_Check(unicode)) {
1213 PyErr_Format(PyExc_TypeError,
1214 "decoder did not return an unicode object (type=%.400s)",
1215 Py_TYPE(unicode)->tp_name);
1216 Py_DECREF(unicode);
1217 goto onError;
1219 Py_DECREF(buffer);
1220 return unicode;
1222 onError:
1223 Py_XDECREF(buffer);
1224 return NULL;
1227 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1228 const char *encoding,
1229 const char *errors)
1231 PyObject *v;
1233 if (!PyUnicode_Check(unicode)) {
1234 PyErr_BadArgument();
1235 goto onError;
1238 if (encoding == NULL)
1239 encoding = PyUnicode_GetDefaultEncoding();
1241 /* Decode via the codec registry */
1242 v = PyCodec_Decode(unicode, encoding, errors);
1243 if (v == NULL)
1244 goto onError;
1245 return v;
1247 onError:
1248 return NULL;
1251 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1252 Py_ssize_t size,
1253 const char *encoding,
1254 const char *errors)
1256 PyObject *v, *unicode;
1258 unicode = PyUnicode_FromUnicode(s, size);
1259 if (unicode == NULL)
1260 return NULL;
1261 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1262 Py_DECREF(unicode);
1263 return v;
1266 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1267 const char *encoding,
1268 const char *errors)
1270 PyObject *v;
1272 if (!PyUnicode_Check(unicode)) {
1273 PyErr_BadArgument();
1274 goto onError;
1277 if (encoding == NULL)
1278 encoding = PyUnicode_GetDefaultEncoding();
1280 /* Encode via the codec registry */
1281 v = PyCodec_Encode(unicode, encoding, errors);
1282 if (v == NULL)
1283 goto onError;
1284 return v;
1286 onError:
1287 return NULL;
1290 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1291 const char *encoding,
1292 const char *errors)
1294 PyObject *v;
1296 if (!PyUnicode_Check(unicode)) {
1297 PyErr_BadArgument();
1298 goto onError;
1301 if (encoding == NULL)
1302 encoding = PyUnicode_GetDefaultEncoding();
1304 /* Shortcuts for common default encodings */
1305 if (errors == NULL) {
1306 if (strcmp(encoding, "utf-8") == 0)
1307 return PyUnicode_AsUTF8String(unicode);
1308 else if (strcmp(encoding, "latin-1") == 0)
1309 return PyUnicode_AsLatin1String(unicode);
1310 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1311 else if (strcmp(encoding, "mbcs") == 0)
1312 return PyUnicode_AsMBCSString(unicode);
1313 #endif
1314 else if (strcmp(encoding, "ascii") == 0)
1315 return PyUnicode_AsASCIIString(unicode);
1318 /* Encode via the codec registry */
1319 v = PyCodec_Encode(unicode, encoding, errors);
1320 if (v == NULL)
1321 goto onError;
1322 if (!PyString_Check(v)) {
1323 PyErr_Format(PyExc_TypeError,
1324 "encoder did not return a string object (type=%.400s)",
1325 Py_TYPE(v)->tp_name);
1326 Py_DECREF(v);
1327 goto onError;
1329 return v;
1331 onError:
1332 return NULL;
1335 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1336 const char *errors)
1338 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1340 if (v)
1341 return v;
1342 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1343 if (v && errors == NULL)
1344 ((PyUnicodeObject *)unicode)->defenc = v;
1345 return v;
1348 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1350 if (!PyUnicode_Check(unicode)) {
1351 PyErr_BadArgument();
1352 goto onError;
1354 return PyUnicode_AS_UNICODE(unicode);
1356 onError:
1357 return NULL;
1360 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1362 if (!PyUnicode_Check(unicode)) {
1363 PyErr_BadArgument();
1364 goto onError;
1366 return PyUnicode_GET_SIZE(unicode);
1368 onError:
1369 return -1;
1372 const char *PyUnicode_GetDefaultEncoding(void)
1374 return unicode_default_encoding;
1377 int PyUnicode_SetDefaultEncoding(const char *encoding)
1379 PyObject *v;
1381 /* Make sure the encoding is valid. As side effect, this also
1382 loads the encoding into the codec registry cache. */
1383 v = _PyCodec_Lookup(encoding);
1384 if (v == NULL)
1385 goto onError;
1386 Py_DECREF(v);
1387 strncpy(unicode_default_encoding,
1388 encoding,
1389 sizeof(unicode_default_encoding));
1390 return 0;
1392 onError:
1393 return -1;
1396 /* error handling callback helper:
1397 build arguments, call the callback and check the arguments,
1398 if no exception occurred, copy the replacement to the output
1399 and adjust various state variables.
1400 return 0 on success, -1 on error
1403 static
1404 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1405 const char *encoding, const char *reason,
1406 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1407 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1408 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1410 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1412 PyObject *restuple = NULL;
1413 PyObject *repunicode = NULL;
1414 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1415 Py_ssize_t requiredsize;
1416 Py_ssize_t newpos;
1417 Py_UNICODE *repptr;
1418 Py_ssize_t repsize;
1419 int res = -1;
1421 if (*errorHandler == NULL) {
1422 *errorHandler = PyCodec_LookupError(errors);
1423 if (*errorHandler == NULL)
1424 goto onError;
1427 if (*exceptionObject == NULL) {
1428 *exceptionObject = PyUnicodeDecodeError_Create(
1429 encoding, input, insize, *startinpos, *endinpos, reason);
1430 if (*exceptionObject == NULL)
1431 goto onError;
1433 else {
1434 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1435 goto onError;
1436 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1437 goto onError;
1438 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1439 goto onError;
1442 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1443 if (restuple == NULL)
1444 goto onError;
1445 if (!PyTuple_Check(restuple)) {
1446 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1447 goto onError;
1449 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1450 goto onError;
1451 if (newpos<0)
1452 newpos = insize+newpos;
1453 if (newpos<0 || newpos>insize) {
1454 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1455 goto onError;
1458 /* need more space? (at least enough for what we
1459 have+the replacement+the rest of the string (starting
1460 at the new input position), so we won't have to check space
1461 when there are no errors in the rest of the string) */
1462 repptr = PyUnicode_AS_UNICODE(repunicode);
1463 repsize = PyUnicode_GET_SIZE(repunicode);
1464 requiredsize = *outpos + repsize + insize-newpos;
1465 if (requiredsize > outsize) {
1466 if (requiredsize<2*outsize)
1467 requiredsize = 2*outsize;
1468 if (_PyUnicode_Resize(output, requiredsize) < 0)
1469 goto onError;
1470 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1472 *endinpos = newpos;
1473 *inptr = input + newpos;
1474 Py_UNICODE_COPY(*outptr, repptr, repsize);
1475 *outptr += repsize;
1476 *outpos += repsize;
1477 /* we made it! */
1478 res = 0;
1480 onError:
1481 Py_XDECREF(restuple);
1482 return res;
1485 /* --- UTF-7 Codec -------------------------------------------------------- */
1487 /* see RFC2152 for details */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(c, encodeO, encodeWS): true if character c must be base64
   encoded.  Characters above 127 and table class 1 are always special;
   class 2 (whitespace) and class 3 (Set O) only when the corresponding
   flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||        \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.  Written with
   explicit ASCII ranges instead of isalnum(): isalnum() depends on the
   current locale and may accept bytes in 0x80..0xFF, which UB64() would
   then turn into garbage values.  The argument is evaluated more than
   once, so pass a plain variable. */
#define B64CHAR(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ||                      \
     ((c) >= 'a' && (c) <= 'z') ||                      \
     ((c) >= '0' && (c) <= '9') ||                      \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in the
   accumulator ch, write the top six as a base64 digit to *out++ and
   reduce bits accordingly.  All three arguments are modified. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in the accumulator ch, extract one 16-bit unit.  If the surrogate flag
   is set the unit is dropped and the flag cleared (an error was already
   reported for the preceding unit, so the follower is not checked).  A
   unit in 0xDC00..0xDFFF sets the flag, stores "code pairs are not
   supported" in the caller's `errmsg` and jumps to the caller's
   `utf7Error` label, because such pairs cannot be represented in a
   16-bit character.  Any other unit is written to *out++. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1551 PyObject *PyUnicode_DecodeUTF7(const char *s,
1552 Py_ssize_t size,
1553 const char *errors)
1555 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1558 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1559 Py_ssize_t size,
1560 const char *errors,
1561 Py_ssize_t *consumed)
1563 const char *starts = s;
1564 Py_ssize_t startinpos;
1565 Py_ssize_t endinpos;
1566 Py_ssize_t outpos;
1567 const char *e;
1568 PyUnicodeObject *unicode;
1569 Py_UNICODE *p;
1570 const char *errmsg = "";
1571 int inShift = 0;
1572 unsigned int bitsleft = 0;
1573 unsigned long charsleft = 0;
1574 int surrogate = 0;
1575 PyObject *errorHandler = NULL;
1576 PyObject *exc = NULL;
1578 unicode = _PyUnicode_New(size);
1579 if (!unicode)
1580 return NULL;
1581 if (size == 0) {
1582 if (consumed)
1583 *consumed = 0;
1584 return (PyObject *)unicode;
1587 p = unicode->str;
1588 e = s + size;
1590 while (s < e) {
1591 Py_UNICODE ch;
1592 restart:
1593 ch = (unsigned char) *s;
1595 if (inShift) {
1596 if ((ch == '-') || !B64CHAR(ch)) {
1597 inShift = 0;
1598 s++;
1600 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1601 if (bitsleft >= 6) {
1602 /* The shift sequence has a partial character in it. If
1603 bitsleft < 6 then we could just classify it as padding
1604 but that is not the case here */
1606 errmsg = "partial character in shift sequence";
1607 goto utf7Error;
1609 /* According to RFC2152 the remaining bits should be zero. We
1610 choose to signal an error/insert a replacement character
1611 here so indicate the potential of a misencoded character. */
1613 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1614 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1615 errmsg = "non-zero padding bits in shift sequence";
1616 goto utf7Error;
1619 if (ch == '-') {
1620 if ((s < e) && (*(s) == '-')) {
1621 *p++ = '-';
1622 inShift = 1;
1624 } else if (SPECIAL(ch,0,0)) {
1625 errmsg = "unexpected special character";
1626 goto utf7Error;
1627 } else {
1628 *p++ = ch;
1630 } else {
1631 charsleft = (charsleft << 6) | UB64(ch);
1632 bitsleft += 6;
1633 s++;
1634 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1637 else if ( ch == '+' ) {
1638 startinpos = s-starts;
1639 s++;
1640 if (s < e && *s == '-') {
1641 s++;
1642 *p++ = '+';
1643 } else
1645 inShift = 1;
1646 bitsleft = 0;
1649 else if (SPECIAL(ch,0,0)) {
1650 startinpos = s-starts;
1651 errmsg = "unexpected special character";
1652 s++;
1653 goto utf7Error;
1655 else {
1656 *p++ = ch;
1657 s++;
1659 continue;
1660 utf7Error:
1661 outpos = p-PyUnicode_AS_UNICODE(unicode);
1662 endinpos = s-starts;
1663 if (unicode_decode_call_errorhandler(
1664 errors, &errorHandler,
1665 "utf7", errmsg,
1666 starts, size, &startinpos, &endinpos, &exc, &s,
1667 &unicode, &outpos, &p))
1668 goto onError;
1671 if (inShift && !consumed) {
1672 outpos = p-PyUnicode_AS_UNICODE(unicode);
1673 endinpos = size;
1674 if (unicode_decode_call_errorhandler(
1675 errors, &errorHandler,
1676 "utf7", "unterminated shift sequence",
1677 starts, size, &startinpos, &endinpos, &exc, &s,
1678 &unicode, &outpos, &p))
1679 goto onError;
1680 if (s < e)
1681 goto restart;
1683 if (consumed) {
1684 if(inShift)
1685 *consumed = startinpos;
1686 else
1687 *consumed = s-starts;
1690 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1691 goto onError;
1693 Py_XDECREF(errorHandler);
1694 Py_XDECREF(exc);
1695 return (PyObject *)unicode;
1697 onError:
1698 Py_XDECREF(errorHandler);
1699 Py_XDECREF(exc);
1700 Py_DECREF(unicode);
1701 return NULL;
1705 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1706 Py_ssize_t size,
1707 int encodeSetO,
1708 int encodeWhiteSpace,
1709 const char *errors)
1711 PyObject *v;
1712 /* It might be possible to tighten this worst case */
1713 Py_ssize_t cbAllocated = 5 * size;
1714 int inShift = 0;
1715 Py_ssize_t i = 0;
1716 unsigned int bitsleft = 0;
1717 unsigned long charsleft = 0;
1718 char * out;
1719 char * start;
1721 if (cbAllocated / 5 != size)
1722 return PyErr_NoMemory();
1724 if (size == 0)
1725 return PyString_FromStringAndSize(NULL, 0);
1727 v = PyString_FromStringAndSize(NULL, cbAllocated);
1728 if (v == NULL)
1729 return NULL;
1731 start = out = PyString_AS_STRING(v);
1732 for (;i < size; ++i) {
1733 Py_UNICODE ch = s[i];
1735 if (!inShift) {
1736 if (ch == '+') {
1737 *out++ = '+';
1738 *out++ = '-';
1739 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1740 charsleft = ch;
1741 bitsleft = 16;
1742 *out++ = '+';
1743 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1744 inShift = bitsleft > 0;
1745 } else {
1746 *out++ = (char) ch;
1748 } else {
1749 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1750 *out++ = B64(charsleft << (6-bitsleft));
1751 charsleft = 0;
1752 bitsleft = 0;
1753 /* Characters not in the BASE64 set implicitly unshift the sequence
1754 so no '-' is required, except if the character is itself a '-' */
1755 if (B64CHAR(ch) || ch == '-') {
1756 *out++ = '-';
1758 inShift = 0;
1759 *out++ = (char) ch;
1760 } else {
1761 bitsleft += 16;
1762 charsleft = (charsleft << 16) | ch;
1763 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1765 /* If the next character is special then we don't need to terminate
1766 the shift sequence. If the next character is not a BASE64 character
1767 or '-' then the shift sequence will be terminated implicitly and we
1768 don't have to insert a '-'. */
1770 if (bitsleft == 0) {
1771 if (i + 1 < size) {
1772 Py_UNICODE ch2 = s[i+1];
1774 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1776 } else if (B64CHAR(ch2) || ch2 == '-') {
1777 *out++ = '-';
1778 inShift = 0;
1779 } else {
1780 inShift = 0;
1784 else {
1785 *out++ = '-';
1786 inShift = 0;
1792 if (bitsleft) {
1793 *out++= B64(charsleft << (6-bitsleft) );
1794 *out++ = '-';
1797 _PyString_Resize(&v, out - start);
1798 return v;
1801 #undef SPECIAL
1802 #undef B64
1803 #undef B64CHAR
1804 #undef UB64
1805 #undef ENCODE
1806 #undef DECODE
1808 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero marks a byte that is not a legal lead byte.  See RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four, five and six bytes; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1832 PyObject *PyUnicode_DecodeUTF8(const char *s,
1833 Py_ssize_t size,
1834 const char *errors)
1836 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1839 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1840 Py_ssize_t size,
1841 const char *errors,
1842 Py_ssize_t *consumed)
1844 const char *starts = s;
1845 int n;
1846 Py_ssize_t startinpos;
1847 Py_ssize_t endinpos;
1848 Py_ssize_t outpos;
1849 const char *e;
1850 PyUnicodeObject *unicode;
1851 Py_UNICODE *p;
1852 const char *errmsg = "";
1853 PyObject *errorHandler = NULL;
1854 PyObject *exc = NULL;
1856 /* Note: size will always be longer than the resulting Unicode
1857 character count */
1858 unicode = _PyUnicode_New(size);
1859 if (!unicode)
1860 return NULL;
1861 if (size == 0) {
1862 if (consumed)
1863 *consumed = 0;
1864 return (PyObject *)unicode;
1867 /* Unpack UTF-8 encoded data */
1868 p = unicode->str;
1869 e = s + size;
1871 while (s < e) {
1872 Py_UCS4 ch = (unsigned char)*s;
1874 if (ch < 0x80) {
1875 *p++ = (Py_UNICODE)ch;
1876 s++;
1877 continue;
1880 n = utf8_code_length[ch];
1882 if (s + n > e) {
1883 if (consumed)
1884 break;
1885 else {
1886 errmsg = "unexpected end of data";
1887 startinpos = s-starts;
1888 endinpos = size;
1889 goto utf8Error;
1893 switch (n) {
1895 case 0:
1896 errmsg = "unexpected code byte";
1897 startinpos = s-starts;
1898 endinpos = startinpos+1;
1899 goto utf8Error;
1901 case 1:
1902 errmsg = "internal error";
1903 startinpos = s-starts;
1904 endinpos = startinpos+1;
1905 goto utf8Error;
1907 case 2:
1908 if ((s[1] & 0xc0) != 0x80) {
1909 errmsg = "invalid data";
1910 startinpos = s-starts;
1911 endinpos = startinpos+2;
1912 goto utf8Error;
1914 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1915 if (ch < 0x80) {
1916 startinpos = s-starts;
1917 endinpos = startinpos+2;
1918 errmsg = "illegal encoding";
1919 goto utf8Error;
1921 else
1922 *p++ = (Py_UNICODE)ch;
1923 break;
1925 case 3:
1926 if ((s[1] & 0xc0) != 0x80 ||
1927 (s[2] & 0xc0) != 0x80) {
1928 errmsg = "invalid data";
1929 startinpos = s-starts;
1930 endinpos = startinpos+3;
1931 goto utf8Error;
1933 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1934 if (ch < 0x0800) {
1935 /* Note: UTF-8 encodings of surrogates are considered
1936 legal UTF-8 sequences;
1938 XXX For wide builds (UCS-4) we should probably try
1939 to recombine the surrogates into a single code
1940 unit.
1942 errmsg = "illegal encoding";
1943 startinpos = s-starts;
1944 endinpos = startinpos+3;
1945 goto utf8Error;
1947 else
1948 *p++ = (Py_UNICODE)ch;
1949 break;
1951 case 4:
1952 if ((s[1] & 0xc0) != 0x80 ||
1953 (s[2] & 0xc0) != 0x80 ||
1954 (s[3] & 0xc0) != 0x80) {
1955 errmsg = "invalid data";
1956 startinpos = s-starts;
1957 endinpos = startinpos+4;
1958 goto utf8Error;
1960 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1961 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1962 /* validate and convert to UTF-16 */
1963 if ((ch < 0x10000) /* minimum value allowed for 4
1964 byte encoding */
1965 || (ch > 0x10ffff)) /* maximum value allowed for
1966 UTF-16 */
1968 errmsg = "illegal encoding";
1969 startinpos = s-starts;
1970 endinpos = startinpos+4;
1971 goto utf8Error;
1973 #ifdef Py_UNICODE_WIDE
1974 *p++ = (Py_UNICODE)ch;
1975 #else
1976 /* compute and append the two surrogates: */
1978 /* translate from 10000..10FFFF to 0..FFFF */
1979 ch -= 0x10000;
1981 /* high surrogate = top 10 bits added to D800 */
1982 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1984 /* low surrogate = bottom 10 bits added to DC00 */
1985 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1986 #endif
1987 break;
1989 default:
1990 /* Other sizes are only needed for UCS-4 */
1991 errmsg = "unsupported Unicode code range";
1992 startinpos = s-starts;
1993 endinpos = startinpos+n;
1994 goto utf8Error;
1996 s += n;
1997 continue;
1999 utf8Error:
2000 outpos = p-PyUnicode_AS_UNICODE(unicode);
2001 if (unicode_decode_call_errorhandler(
2002 errors, &errorHandler,
2003 "utf8", errmsg,
2004 starts, size, &startinpos, &endinpos, &exc, &s,
2005 &unicode, &outpos, &p))
2006 goto onError;
2008 if (consumed)
2009 *consumed = s-starts;
2011 /* Adjust length */
2012 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2013 goto onError;
2015 Py_XDECREF(errorHandler);
2016 Py_XDECREF(exc);
2017 return (PyObject *)unicode;
2019 onError:
2020 Py_XDECREF(errorHandler);
2021 Py_XDECREF(exc);
2022 Py_DECREF(unicode);
2023 return NULL;
2026 /* Allocation strategy: if the string is short, convert into a stack buffer
2027 and allocate exactly as much space needed at the end. Else allocate the
2028 maximum possible needed (4 result bytes per Unicode character), and return
2029 the excess memory at the end.
2031 PyObject *
2032 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2033 Py_ssize_t size,
2034 const char *errors)
2036 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2038 Py_ssize_t i; /* index into s of next input byte */
2039 PyObject *v; /* result string object */
2040 char *p; /* next free byte in output buffer */
2041 Py_ssize_t nallocated; /* number of result bytes allocated */
2042 Py_ssize_t nneeded; /* number of result bytes needed */
2043 char stackbuf[MAX_SHORT_UNICHARS * 4];
2045 assert(s != NULL);
2046 assert(size >= 0);
2048 if (size <= MAX_SHORT_UNICHARS) {
2049 /* Write into the stack buffer; nallocated can't overflow.
2050 * At the end, we'll allocate exactly as much heap space as it
2051 * turns out we need.
2053 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2054 v = NULL; /* will allocate after we're done */
2055 p = stackbuf;
2057 else {
2058 /* Overallocate on the heap, and give the excess back at the end. */
2059 nallocated = size * 4;
2060 if (nallocated / 4 != size) /* overflow! */
2061 return PyErr_NoMemory();
2062 v = PyString_FromStringAndSize(NULL, nallocated);
2063 if (v == NULL)
2064 return NULL;
2065 p = PyString_AS_STRING(v);
2068 for (i = 0; i < size;) {
2069 Py_UCS4 ch = s[i++];
2071 if (ch < 0x80)
2072 /* Encode ASCII */
2073 *p++ = (char) ch;
2075 else if (ch < 0x0800) {
2076 /* Encode Latin-1 */
2077 *p++ = (char)(0xc0 | (ch >> 6));
2078 *p++ = (char)(0x80 | (ch & 0x3f));
2080 else {
2081 /* Encode UCS2 Unicode ordinals */
2082 if (ch < 0x10000) {
2083 /* Special case: check for high surrogate */
2084 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2085 Py_UCS4 ch2 = s[i];
2086 /* Check for low surrogate and combine the two to
2087 form a UCS4 value */
2088 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2089 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2090 i++;
2091 goto encodeUCS4;
2093 /* Fall through: handles isolated high surrogates */
2095 *p++ = (char)(0xe0 | (ch >> 12));
2096 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2097 *p++ = (char)(0x80 | (ch & 0x3f));
2098 continue;
2100 encodeUCS4:
2101 /* Encode UCS4 Unicode ordinals */
2102 *p++ = (char)(0xf0 | (ch >> 18));
2103 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2104 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2105 *p++ = (char)(0x80 | (ch & 0x3f));
2109 if (v == NULL) {
2110 /* This was stack allocated. */
2111 nneeded = p - stackbuf;
2112 assert(nneeded <= nallocated);
2113 v = PyString_FromStringAndSize(stackbuf, nneeded);
2115 else {
2116 /* Cut back to size actually needed. */
2117 nneeded = p - PyString_AS_STRING(v);
2118 assert(nneeded <= nallocated);
2119 _PyString_Resize(&v, nneeded);
2121 return v;
2123 #undef MAX_SHORT_UNICHARS
2126 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2128 if (!PyUnicode_Check(unicode)) {
2129 PyErr_BadArgument();
2130 return NULL;
2132 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2133 PyUnicode_GET_SIZE(unicode),
2134 NULL);
2137 /* --- UTF-32 Codec ------------------------------------------------------- */
2139 PyObject *
2140 PyUnicode_DecodeUTF32(const char *s,
2141 Py_ssize_t size,
2142 const char *errors,
2143 int *byteorder)
2145 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2148 PyObject *
2149 PyUnicode_DecodeUTF32Stateful(const char *s,
2150 Py_ssize_t size,
2151 const char *errors,
2152 int *byteorder,
2153 Py_ssize_t *consumed)
2155 const char *starts = s;
2156 Py_ssize_t startinpos;
2157 Py_ssize_t endinpos;
2158 Py_ssize_t outpos;
2159 PyUnicodeObject *unicode;
2160 Py_UNICODE *p;
2161 #ifndef Py_UNICODE_WIDE
2162 int i, pairs;
2163 #else
2164 const int pairs = 0;
2165 #endif
2166 const unsigned char *q, *e;
2167 int bo = 0; /* assume native ordering by default */
2168 const char *errmsg = "";
2169 /* Offsets from q for retrieving bytes in the right order. */
2170 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2171 int iorder[] = {0, 1, 2, 3};
2172 #else
2173 int iorder[] = {3, 2, 1, 0};
2174 #endif
2175 PyObject *errorHandler = NULL;
2176 PyObject *exc = NULL;
2177 /* On narrow builds we split characters outside the BMP into two
2178 codepoints => count how much extra space we need. */
2179 #ifndef Py_UNICODE_WIDE
2180 for (i = pairs = 0; i < size/4; i++)
2181 if (((Py_UCS4 *)s)[i] >= 0x10000)
2182 pairs++;
2183 #endif
2185 /* This might be one to much, because of a BOM */
2186 unicode = _PyUnicode_New((size+3)/4+pairs);
2187 if (!unicode)
2188 return NULL;
2189 if (size == 0)
2190 return (PyObject *)unicode;
2192 /* Unpack UTF-32 encoded data */
2193 p = unicode->str;
2194 q = (unsigned char *)s;
2195 e = q + size;
2197 if (byteorder)
2198 bo = *byteorder;
2200 /* Check for BOM marks (U+FEFF) in the input and adjust current
2201 byte order setting accordingly. In native mode, the leading BOM
2202 mark is skipped, in all other modes, it is copied to the output
2203 stream as-is (giving a ZWNBSP character). */
2204 if (bo == 0) {
2205 if (size >= 4) {
2206 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2207 (q[iorder[1]] << 8) | q[iorder[0]];
2208 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2209 if (bom == 0x0000FEFF) {
2210 q += 4;
2211 bo = -1;
2213 else if (bom == 0xFFFE0000) {
2214 q += 4;
2215 bo = 1;
2217 #else
2218 if (bom == 0x0000FEFF) {
2219 q += 4;
2220 bo = 1;
2222 else if (bom == 0xFFFE0000) {
2223 q += 4;
2224 bo = -1;
2226 #endif
2230 if (bo == -1) {
2231 /* force LE */
2232 iorder[0] = 0;
2233 iorder[1] = 1;
2234 iorder[2] = 2;
2235 iorder[3] = 3;
2237 else if (bo == 1) {
2238 /* force BE */
2239 iorder[0] = 3;
2240 iorder[1] = 2;
2241 iorder[2] = 1;
2242 iorder[3] = 0;
2245 while (q < e) {
2246 Py_UCS4 ch;
2247 /* remaining bytes at the end? (size should be divisible by 4) */
2248 if (e-q<4) {
2249 if (consumed)
2250 break;
2251 errmsg = "truncated data";
2252 startinpos = ((const char *)q)-starts;
2253 endinpos = ((const char *)e)-starts;
2254 goto utf32Error;
2255 /* The remaining input chars are ignored if the callback
2256 chooses to skip the input */
2258 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2259 (q[iorder[1]] << 8) | q[iorder[0]];
2261 if (ch >= 0x110000)
2263 errmsg = "codepoint not in range(0x110000)";
2264 startinpos = ((const char *)q)-starts;
2265 endinpos = startinpos+4;
2266 goto utf32Error;
2268 #ifndef Py_UNICODE_WIDE
2269 if (ch >= 0x10000)
2271 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2272 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2274 else
2275 #endif
2276 *p++ = ch;
2277 q += 4;
2278 continue;
2279 utf32Error:
2280 outpos = p-PyUnicode_AS_UNICODE(unicode);
2281 if (unicode_decode_call_errorhandler(
2282 errors, &errorHandler,
2283 "utf32", errmsg,
2284 starts, size, &startinpos, &endinpos, &exc, &s,
2285 &unicode, &outpos, &p))
2286 goto onError;
2289 if (byteorder)
2290 *byteorder = bo;
2292 if (consumed)
2293 *consumed = (const char *)q-starts;
2295 /* Adjust length */
2296 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2297 goto onError;
2299 Py_XDECREF(errorHandler);
2300 Py_XDECREF(exc);
2301 return (PyObject *)unicode;
2303 onError:
2304 Py_DECREF(unicode);
2305 Py_XDECREF(errorHandler);
2306 Py_XDECREF(exc);
2307 return NULL;
2310 PyObject *
2311 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2312 Py_ssize_t size,
2313 const char *errors,
2314 int byteorder)
2316 PyObject *v;
2317 unsigned char *p;
2318 Py_ssize_t nsize, bytesize;
2319 #ifndef Py_UNICODE_WIDE
2320 Py_ssize_t i, pairs;
2321 #else
2322 const int pairs = 0;
2323 #endif
2324 /* Offsets from p for storing byte pairs in the right order. */
2325 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2326 int iorder[] = {0, 1, 2, 3};
2327 #else
2328 int iorder[] = {3, 2, 1, 0};
2329 #endif
2331 #define STORECHAR(CH) \
2332 do { \
2333 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2334 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2335 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2336 p[iorder[0]] = (CH) & 0xff; \
2337 p += 4; \
2338 } while(0)
2340 /* In narrow builds we can output surrogate pairs as one codepoint,
2341 so we need less space. */
2342 #ifndef Py_UNICODE_WIDE
2343 for (i = pairs = 0; i < size-1; i++)
2344 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2345 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2346 pairs++;
2347 #endif
2348 nsize = (size - pairs + (byteorder == 0));
2349 bytesize = nsize * 4;
2350 if (bytesize / 4 != nsize)
2351 return PyErr_NoMemory();
2352 v = PyString_FromStringAndSize(NULL, bytesize);
2353 if (v == NULL)
2354 return NULL;
2356 p = (unsigned char *)PyString_AS_STRING(v);
2357 if (byteorder == 0)
2358 STORECHAR(0xFEFF);
2359 if (size == 0)
2360 return v;
2362 if (byteorder == -1) {
2363 /* force LE */
2364 iorder[0] = 0;
2365 iorder[1] = 1;
2366 iorder[2] = 2;
2367 iorder[3] = 3;
2369 else if (byteorder == 1) {
2370 /* force BE */
2371 iorder[0] = 3;
2372 iorder[1] = 2;
2373 iorder[2] = 1;
2374 iorder[3] = 0;
2377 while (size-- > 0) {
2378 Py_UCS4 ch = *s++;
2379 #ifndef Py_UNICODE_WIDE
2380 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2381 Py_UCS4 ch2 = *s;
2382 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2383 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2384 s++;
2385 size--;
2388 #endif
2389 STORECHAR(ch);
2391 return v;
2392 #undef STORECHAR
2395 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2397 if (!PyUnicode_Check(unicode)) {
2398 PyErr_BadArgument();
2399 return NULL;
2401 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2402 PyUnicode_GET_SIZE(unicode),
2403 NULL,
2407 /* --- UTF-16 Codec ------------------------------------------------------- */
2409 PyObject *
2410 PyUnicode_DecodeUTF16(const char *s,
2411 Py_ssize_t size,
2412 const char *errors,
2413 int *byteorder)
2415 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2418 PyObject *
2419 PyUnicode_DecodeUTF16Stateful(const char *s,
2420 Py_ssize_t size,
2421 const char *errors,
2422 int *byteorder,
2423 Py_ssize_t *consumed)
2425 const char *starts = s;
2426 Py_ssize_t startinpos;
2427 Py_ssize_t endinpos;
2428 Py_ssize_t outpos;
2429 PyUnicodeObject *unicode;
2430 Py_UNICODE *p;
2431 const unsigned char *q, *e;
2432 int bo = 0; /* assume native ordering by default */
2433 const char *errmsg = "";
2434 /* Offsets from q for retrieving byte pairs in the right order. */
2435 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2436 int ihi = 1, ilo = 0;
2437 #else
2438 int ihi = 0, ilo = 1;
2439 #endif
2440 PyObject *errorHandler = NULL;
2441 PyObject *exc = NULL;
2443 /* Note: size will always be longer than the resulting Unicode
2444 character count */
2445 unicode = _PyUnicode_New(size);
2446 if (!unicode)
2447 return NULL;
2448 if (size == 0)
2449 return (PyObject *)unicode;
2451 /* Unpack UTF-16 encoded data */
2452 p = unicode->str;
2453 q = (unsigned char *)s;
2454 e = q + size;
2456 if (byteorder)
2457 bo = *byteorder;
2459 /* Check for BOM marks (U+FEFF) in the input and adjust current
2460 byte order setting accordingly. In native mode, the leading BOM
2461 mark is skipped, in all other modes, it is copied to the output
2462 stream as-is (giving a ZWNBSP character). */
2463 if (bo == 0) {
2464 if (size >= 2) {
2465 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2466 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2467 if (bom == 0xFEFF) {
2468 q += 2;
2469 bo = -1;
2471 else if (bom == 0xFFFE) {
2472 q += 2;
2473 bo = 1;
2475 #else
2476 if (bom == 0xFEFF) {
2477 q += 2;
2478 bo = 1;
2480 else if (bom == 0xFFFE) {
2481 q += 2;
2482 bo = -1;
2484 #endif
2488 if (bo == -1) {
2489 /* force LE */
2490 ihi = 1;
2491 ilo = 0;
2493 else if (bo == 1) {
2494 /* force BE */
2495 ihi = 0;
2496 ilo = 1;
2499 while (q < e) {
2500 Py_UNICODE ch;
2501 /* remaining bytes at the end? (size should be even) */
2502 if (e-q<2) {
2503 if (consumed)
2504 break;
2505 errmsg = "truncated data";
2506 startinpos = ((const char *)q)-starts;
2507 endinpos = ((const char *)e)-starts;
2508 goto utf16Error;
2509 /* The remaining input chars are ignored if the callback
2510 chooses to skip the input */
2512 ch = (q[ihi] << 8) | q[ilo];
2514 q += 2;
2516 if (ch < 0xD800 || ch > 0xDFFF) {
2517 *p++ = ch;
2518 continue;
2521 /* UTF-16 code pair: */
2522 if (q >= e) {
2523 errmsg = "unexpected end of data";
2524 startinpos = (((const char *)q)-2)-starts;
2525 endinpos = ((const char *)e)-starts;
2526 goto utf16Error;
2528 if (0xD800 <= ch && ch <= 0xDBFF) {
2529 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2530 q += 2;
2531 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2532 #ifndef Py_UNICODE_WIDE
2533 *p++ = ch;
2534 *p++ = ch2;
2535 #else
2536 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2537 #endif
2538 continue;
2540 else {
2541 errmsg = "illegal UTF-16 surrogate";
2542 startinpos = (((const char *)q)-4)-starts;
2543 endinpos = startinpos+2;
2544 goto utf16Error;
2548 errmsg = "illegal encoding";
2549 startinpos = (((const char *)q)-2)-starts;
2550 endinpos = startinpos+2;
2551 /* Fall through to report the error */
2553 utf16Error:
2554 outpos = p-PyUnicode_AS_UNICODE(unicode);
2555 if (unicode_decode_call_errorhandler(
2556 errors, &errorHandler,
2557 "utf16", errmsg,
2558 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2559 &unicode, &outpos, &p))
2560 goto onError;
2563 if (byteorder)
2564 *byteorder = bo;
2566 if (consumed)
2567 *consumed = (const char *)q-starts;
2569 /* Adjust length */
2570 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2571 goto onError;
2573 Py_XDECREF(errorHandler);
2574 Py_XDECREF(exc);
2575 return (PyObject *)unicode;
2577 onError:
2578 Py_DECREF(unicode);
2579 Py_XDECREF(errorHandler);
2580 Py_XDECREF(exc);
2581 return NULL;
2584 PyObject *
2585 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2586 Py_ssize_t size,
2587 const char *errors,
2588 int byteorder)
2590 PyObject *v;
2591 unsigned char *p;
2592 Py_ssize_t nsize, bytesize;
2593 #ifdef Py_UNICODE_WIDE
2594 Py_ssize_t i, pairs;
2595 #else
2596 const int pairs = 0;
2597 #endif
2598 /* Offsets from p for storing byte pairs in the right order. */
2599 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2600 int ihi = 1, ilo = 0;
2601 #else
2602 int ihi = 0, ilo = 1;
2603 #endif
2605 #define STORECHAR(CH) \
2606 do { \
2607 p[ihi] = ((CH) >> 8) & 0xff; \
2608 p[ilo] = (CH) & 0xff; \
2609 p += 2; \
2610 } while(0)
2612 #ifdef Py_UNICODE_WIDE
2613 for (i = pairs = 0; i < size; i++)
2614 if (s[i] >= 0x10000)
2615 pairs++;
2616 #endif
2617 /* 2 * (size + pairs + (byteorder == 0)) */
2618 if (size > PY_SSIZE_T_MAX ||
2619 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2620 return PyErr_NoMemory();
2621 nsize = size + pairs + (byteorder == 0);
2622 bytesize = nsize * 2;
2623 if (bytesize / 2 != nsize)
2624 return PyErr_NoMemory();
2625 v = PyString_FromStringAndSize(NULL, bytesize);
2626 if (v == NULL)
2627 return NULL;
2629 p = (unsigned char *)PyString_AS_STRING(v);
2630 if (byteorder == 0)
2631 STORECHAR(0xFEFF);
2632 if (size == 0)
2633 return v;
2635 if (byteorder == -1) {
2636 /* force LE */
2637 ihi = 1;
2638 ilo = 0;
2640 else if (byteorder == 1) {
2641 /* force BE */
2642 ihi = 0;
2643 ilo = 1;
2646 while (size-- > 0) {
2647 Py_UNICODE ch = *s++;
2648 Py_UNICODE ch2 = 0;
2649 #ifdef Py_UNICODE_WIDE
2650 if (ch >= 0x10000) {
2651 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2652 ch = 0xD800 | ((ch-0x10000) >> 10);
2654 #endif
2655 STORECHAR(ch);
2656 if (ch2)
2657 STORECHAR(ch2);
2659 return v;
2660 #undef STORECHAR
2663 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2665 if (!PyUnicode_Check(unicode)) {
2666 PyErr_BadArgument();
2667 return NULL;
2669 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2670 PyUnicode_GET_SIZE(unicode),
2671 NULL,
2675 /* --- Unicode Escape Codec ----------------------------------------------- */
2677 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2679 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2680 Py_ssize_t size,
2681 const char *errors)
2683 const char *starts = s;
2684 Py_ssize_t startinpos;
2685 Py_ssize_t endinpos;
2686 Py_ssize_t outpos;
2687 int i;
2688 PyUnicodeObject *v;
2689 Py_UNICODE *p;
2690 const char *end;
2691 char* message;
2692 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2693 PyObject *errorHandler = NULL;
2694 PyObject *exc = NULL;
2696 /* Escaped strings will always be longer than the resulting
2697 Unicode string, so we start with size here and then reduce the
2698 length after conversion to the true value.
2699 (but if the error callback returns a long replacement string
2700 we'll have to allocate more space) */
2701 v = _PyUnicode_New(size);
2702 if (v == NULL)
2703 goto onError;
2704 if (size == 0)
2705 return (PyObject *)v;
2707 p = PyUnicode_AS_UNICODE(v);
2708 end = s + size;
2710 while (s < end) {
2711 unsigned char c;
2712 Py_UNICODE x;
2713 int digits;
2715 /* Non-escape characters are interpreted as Unicode ordinals */
2716 if (*s != '\\') {
2717 *p++ = (unsigned char) *s++;
2718 continue;
2721 startinpos = s-starts;
2722 /* \ - Escapes */
2723 s++;
2724 c = *s++;
2725 if (s > end)
2726 c = '\0'; /* Invalid after \ */
2727 switch (c) {
2729 /* \x escapes */
2730 case '\n': break;
2731 case '\\': *p++ = '\\'; break;
2732 case '\'': *p++ = '\''; break;
2733 case '\"': *p++ = '\"'; break;
2734 case 'b': *p++ = '\b'; break;
2735 case 'f': *p++ = '\014'; break; /* FF */
2736 case 't': *p++ = '\t'; break;
2737 case 'n': *p++ = '\n'; break;
2738 case 'r': *p++ = '\r'; break;
2739 case 'v': *p++ = '\013'; break; /* VT */
2740 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2742 /* \OOO (octal) escapes */
2743 case '0': case '1': case '2': case '3':
2744 case '4': case '5': case '6': case '7':
2745 x = s[-1] - '0';
2746 if (s < end && '0' <= *s && *s <= '7') {
2747 x = (x<<3) + *s++ - '0';
2748 if (s < end && '0' <= *s && *s <= '7')
2749 x = (x<<3) + *s++ - '0';
2751 *p++ = x;
2752 break;
2754 /* hex escapes */
2755 /* \xXX */
2756 case 'x':
2757 digits = 2;
2758 message = "truncated \\xXX escape";
2759 goto hexescape;
2761 /* \uXXXX */
2762 case 'u':
2763 digits = 4;
2764 message = "truncated \\uXXXX escape";
2765 goto hexescape;
2767 /* \UXXXXXXXX */
2768 case 'U':
2769 digits = 8;
2770 message = "truncated \\UXXXXXXXX escape";
2771 hexescape:
2772 chr = 0;
2773 outpos = p-PyUnicode_AS_UNICODE(v);
2774 if (s+digits>end) {
2775 endinpos = size;
2776 if (unicode_decode_call_errorhandler(
2777 errors, &errorHandler,
2778 "unicodeescape", "end of string in escape sequence",
2779 starts, size, &startinpos, &endinpos, &exc, &s,
2780 &v, &outpos, &p))
2781 goto onError;
2782 goto nextByte;
2784 for (i = 0; i < digits; ++i) {
2785 c = (unsigned char) s[i];
2786 if (!isxdigit(c)) {
2787 endinpos = (s+i+1)-starts;
2788 if (unicode_decode_call_errorhandler(
2789 errors, &errorHandler,
2790 "unicodeescape", message,
2791 starts, size, &startinpos, &endinpos, &exc, &s,
2792 &v, &outpos, &p))
2793 goto onError;
2794 goto nextByte;
2796 chr = (chr<<4) & ~0xF;
2797 if (c >= '0' && c <= '9')
2798 chr += c - '0';
2799 else if (c >= 'a' && c <= 'f')
2800 chr += 10 + c - 'a';
2801 else
2802 chr += 10 + c - 'A';
2804 s += i;
2805 if (chr == 0xffffffff && PyErr_Occurred())
2806 /* _decoding_error will have already written into the
2807 target buffer. */
2808 break;
2809 store:
2810 /* when we get here, chr is a 32-bit unicode character */
2811 if (chr <= 0xffff)
2812 /* UCS-2 character */
2813 *p++ = (Py_UNICODE) chr;
2814 else if (chr <= 0x10ffff) {
2815 /* UCS-4 character. Either store directly, or as
2816 surrogate pair. */
2817 #ifdef Py_UNICODE_WIDE
2818 *p++ = chr;
2819 #else
2820 chr -= 0x10000L;
2821 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2822 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2823 #endif
2824 } else {
2825 endinpos = s-starts;
2826 outpos = p-PyUnicode_AS_UNICODE(v);
2827 if (unicode_decode_call_errorhandler(
2828 errors, &errorHandler,
2829 "unicodeescape", "illegal Unicode character",
2830 starts, size, &startinpos, &endinpos, &exc, &s,
2831 &v, &outpos, &p))
2832 goto onError;
2834 break;
2836 /* \N{name} */
2837 case 'N':
2838 message = "malformed \\N character escape";
2839 if (ucnhash_CAPI == NULL) {
2840 /* load the unicode data module */
2841 PyObject *m, *api;
2842 m = PyImport_ImportModuleNoBlock("unicodedata");
2843 if (m == NULL)
2844 goto ucnhashError;
2845 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2846 Py_DECREF(m);
2847 if (api == NULL)
2848 goto ucnhashError;
2849 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2850 Py_DECREF(api);
2851 if (ucnhash_CAPI == NULL)
2852 goto ucnhashError;
2854 if (*s == '{') {
2855 const char *start = s+1;
2856 /* look for the closing brace */
2857 while (*s != '}' && s < end)
2858 s++;
2859 if (s > start && s < end && *s == '}') {
2860 /* found a name. look it up in the unicode database */
2861 message = "unknown Unicode character name";
2862 s++;
2863 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2864 goto store;
2867 endinpos = s-starts;
2868 outpos = p-PyUnicode_AS_UNICODE(v);
2869 if (unicode_decode_call_errorhandler(
2870 errors, &errorHandler,
2871 "unicodeescape", message,
2872 starts, size, &startinpos, &endinpos, &exc, &s,
2873 &v, &outpos, &p))
2874 goto onError;
2875 break;
2877 default:
2878 if (s > end) {
2879 message = "\\ at end of string";
2880 s--;
2881 endinpos = s-starts;
2882 outpos = p-PyUnicode_AS_UNICODE(v);
2883 if (unicode_decode_call_errorhandler(
2884 errors, &errorHandler,
2885 "unicodeescape", message,
2886 starts, size, &startinpos, &endinpos, &exc, &s,
2887 &v, &outpos, &p))
2888 goto onError;
2890 else {
2891 *p++ = '\\';
2892 *p++ = (unsigned char)s[-1];
2894 break;
2896 nextByte:
2899 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2900 goto onError;
2901 Py_XDECREF(errorHandler);
2902 Py_XDECREF(exc);
2903 return (PyObject *)v;
2905 ucnhashError:
2906 PyErr_SetString(
2907 PyExc_UnicodeError,
2908 "\\N escapes not supported (can't load unicodedata module)"
2910 Py_XDECREF(v);
2911 Py_XDECREF(errorHandler);
2912 Py_XDECREF(exc);
2913 return NULL;
2915 onError:
2916 Py_XDECREF(v);
2917 Py_XDECREF(errorHandler);
2918 Py_XDECREF(exc);
2919 return NULL;
/* unicodeescape_string() below returns a Unicode-Escape string version
   of a Py_UNICODE buffer.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2929 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2930 Py_ssize_t size,
2931 Py_UNICODE ch)
2933 /* like wcschr, but doesn't stop at NULL characters */
2935 while (size-- > 0) {
2936 if (*s == ch)
2937 return s;
2938 s++;
2941 return NULL;
2944 static
2945 PyObject *unicodeescape_string(const Py_UNICODE *s,
2946 Py_ssize_t size,
2947 int quotes)
2949 PyObject *repr;
2950 char *p;
2952 static const char *hexdigit = "0123456789abcdef";
2953 #ifdef Py_UNICODE_WIDE
2954 const Py_ssize_t expandsize = 10;
2955 #else
2956 const Py_ssize_t expandsize = 6;
2957 #endif
2959 /* XXX(nnorwitz): rather than over-allocating, it would be
2960 better to choose a different scheme. Perhaps scan the
2961 first N-chars of the string and allocate based on that size.
2963 /* Initial allocation is based on the longest-possible unichr
2964 escape.
2966 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2967 unichr, so in this case it's the longest unichr escape. In
2968 narrow (UTF-16) builds this is five chars per source unichr
2969 since there are two unichrs in the surrogate pair, so in narrow
2970 (UTF-16) builds it's not the longest unichr escape.
2972 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2973 so in the narrow (UTF-16) build case it's the longest unichr
2974 escape.
2977 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2978 return PyErr_NoMemory();
2980 repr = PyString_FromStringAndSize(NULL,
2982 + expandsize*size
2983 + 1);
2984 if (repr == NULL)
2985 return NULL;
2987 p = PyString_AS_STRING(repr);
2989 if (quotes) {
2990 *p++ = 'u';
2991 *p++ = (findchar(s, size, '\'') &&
2992 !findchar(s, size, '"')) ? '"' : '\'';
2994 while (size-- > 0) {
2995 Py_UNICODE ch = *s++;
2997 /* Escape quotes and backslashes */
2998 if ((quotes &&
2999 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3000 *p++ = '\\';
3001 *p++ = (char) ch;
3002 continue;
3005 #ifdef Py_UNICODE_WIDE
3006 /* Map 21-bit characters to '\U00xxxxxx' */
3007 else if (ch >= 0x10000) {
3008 *p++ = '\\';
3009 *p++ = 'U';
3010 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3011 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3012 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3013 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3014 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3015 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3016 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3017 *p++ = hexdigit[ch & 0x0000000F];
3018 continue;
3020 #else
3021 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3022 else if (ch >= 0xD800 && ch < 0xDC00) {
3023 Py_UNICODE ch2;
3024 Py_UCS4 ucs;
3026 ch2 = *s++;
3027 size--;
3028 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3029 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3030 *p++ = '\\';
3031 *p++ = 'U';
3032 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3033 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3034 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3035 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3036 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3037 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3038 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3039 *p++ = hexdigit[ucs & 0x0000000F];
3040 continue;
3042 /* Fall through: isolated surrogates are copied as-is */
3043 s--;
3044 size++;
3046 #endif
3048 /* Map 16-bit characters to '\uxxxx' */
3049 if (ch >= 256) {
3050 *p++ = '\\';
3051 *p++ = 'u';
3052 *p++ = hexdigit[(ch >> 12) & 0x000F];
3053 *p++ = hexdigit[(ch >> 8) & 0x000F];
3054 *p++ = hexdigit[(ch >> 4) & 0x000F];
3055 *p++ = hexdigit[ch & 0x000F];
3058 /* Map special whitespace to '\t', \n', '\r' */
3059 else if (ch == '\t') {
3060 *p++ = '\\';
3061 *p++ = 't';
3063 else if (ch == '\n') {
3064 *p++ = '\\';
3065 *p++ = 'n';
3067 else if (ch == '\r') {
3068 *p++ = '\\';
3069 *p++ = 'r';
3072 /* Map non-printable US ASCII to '\xhh' */
3073 else if (ch < ' ' || ch >= 0x7F) {
3074 *p++ = '\\';
3075 *p++ = 'x';
3076 *p++ = hexdigit[(ch >> 4) & 0x000F];
3077 *p++ = hexdigit[ch & 0x000F];
3080 /* Copy everything else as-is */
3081 else
3082 *p++ = (char) ch;
3084 if (quotes)
3085 *p++ = PyString_AS_STRING(repr)[1];
3087 *p = '\0';
3088 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3089 return repr;
3092 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3093 Py_ssize_t size)
3095 return unicodeescape_string(s, size, 0);
3098 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
3102 return NULL;
3104 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3105 PyUnicode_GET_SIZE(unicode));
3108 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3110 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3111 Py_ssize_t size,
3112 const char *errors)
3114 const char *starts = s;
3115 Py_ssize_t startinpos;
3116 Py_ssize_t endinpos;
3117 Py_ssize_t outpos;
3118 PyUnicodeObject *v;
3119 Py_UNICODE *p;
3120 const char *end;
3121 const char *bs;
3122 PyObject *errorHandler = NULL;
3123 PyObject *exc = NULL;
3125 /* Escaped strings will always be longer than the resulting
3126 Unicode string, so we start with size here and then reduce the
3127 length after conversion to the true value. (But decoding error
3128 handler might have to resize the string) */
3129 v = _PyUnicode_New(size);
3130 if (v == NULL)
3131 goto onError;
3132 if (size == 0)
3133 return (PyObject *)v;
3134 p = PyUnicode_AS_UNICODE(v);
3135 end = s + size;
3136 while (s < end) {
3137 unsigned char c;
3138 Py_UCS4 x;
3139 int i;
3140 int count;
3142 /* Non-escape characters are interpreted as Unicode ordinals */
3143 if (*s != '\\') {
3144 *p++ = (unsigned char)*s++;
3145 continue;
3147 startinpos = s-starts;
3149 /* \u-escapes are only interpreted iff the number of leading
3150 backslashes if odd */
3151 bs = s;
3152 for (;s < end;) {
3153 if (*s != '\\')
3154 break;
3155 *p++ = (unsigned char)*s++;
3157 if (((s - bs) & 1) == 0 ||
3158 s >= end ||
3159 (*s != 'u' && *s != 'U')) {
3160 continue;
3162 p--;
3163 count = *s=='u' ? 4 : 8;
3164 s++;
3166 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3167 outpos = p-PyUnicode_AS_UNICODE(v);
3168 for (x = 0, i = 0; i < count; ++i, ++s) {
3169 c = (unsigned char)*s;
3170 if (!isxdigit(c)) {
3171 endinpos = s-starts;
3172 if (unicode_decode_call_errorhandler(
3173 errors, &errorHandler,
3174 "rawunicodeescape", "truncated \\uXXXX",
3175 starts, size, &startinpos, &endinpos, &exc, &s,
3176 &v, &outpos, &p))
3177 goto onError;
3178 goto nextByte;
3180 x = (x<<4) & ~0xF;
3181 if (c >= '0' && c <= '9')
3182 x += c - '0';
3183 else if (c >= 'a' && c <= 'f')
3184 x += 10 + c - 'a';
3185 else
3186 x += 10 + c - 'A';
3188 if (x <= 0xffff)
3189 /* UCS-2 character */
3190 *p++ = (Py_UNICODE) x;
3191 else if (x <= 0x10ffff) {
3192 /* UCS-4 character. Either store directly, or as
3193 surrogate pair. */
3194 #ifdef Py_UNICODE_WIDE
3195 *p++ = (Py_UNICODE) x;
3196 #else
3197 x -= 0x10000L;
3198 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3199 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3200 #endif
3201 } else {
3202 endinpos = s-starts;
3203 outpos = p-PyUnicode_AS_UNICODE(v);
3204 if (unicode_decode_call_errorhandler(
3205 errors, &errorHandler,
3206 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3207 starts, size, &startinpos, &endinpos, &exc, &s,
3208 &v, &outpos, &p))
3209 goto onError;
3211 nextByte:
3214 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3215 goto onError;
3216 Py_XDECREF(errorHandler);
3217 Py_XDECREF(exc);
3218 return (PyObject *)v;
3220 onError:
3221 Py_XDECREF(v);
3222 Py_XDECREF(errorHandler);
3223 Py_XDECREF(exc);
3224 return NULL;
3227 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3228 Py_ssize_t size)
3230 PyObject *repr;
3231 char *p;
3232 char *q;
3234 static const char *hexdigit = "0123456789abcdef";
3235 #ifdef Py_UNICODE_WIDE
3236 const Py_ssize_t expandsize = 10;
3237 #else
3238 const Py_ssize_t expandsize = 6;
3239 #endif
3241 if (size > PY_SSIZE_T_MAX / expandsize)
3242 return PyErr_NoMemory();
3244 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3245 if (repr == NULL)
3246 return NULL;
3247 if (size == 0)
3248 return repr;
3250 p = q = PyString_AS_STRING(repr);
3251 while (size-- > 0) {
3252 Py_UNICODE ch = *s++;
3253 #ifdef Py_UNICODE_WIDE
3254 /* Map 32-bit characters to '\Uxxxxxxxx' */
3255 if (ch >= 0x10000) {
3256 *p++ = '\\';
3257 *p++ = 'U';
3258 *p++ = hexdigit[(ch >> 28) & 0xf];
3259 *p++ = hexdigit[(ch >> 24) & 0xf];
3260 *p++ = hexdigit[(ch >> 20) & 0xf];
3261 *p++ = hexdigit[(ch >> 16) & 0xf];
3262 *p++ = hexdigit[(ch >> 12) & 0xf];
3263 *p++ = hexdigit[(ch >> 8) & 0xf];
3264 *p++ = hexdigit[(ch >> 4) & 0xf];
3265 *p++ = hexdigit[ch & 15];
3267 else
3268 #else
3269 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3270 if (ch >= 0xD800 && ch < 0xDC00) {
3271 Py_UNICODE ch2;
3272 Py_UCS4 ucs;
3274 ch2 = *s++;
3275 size--;
3276 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3277 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3278 *p++ = '\\';
3279 *p++ = 'U';
3280 *p++ = hexdigit[(ucs >> 28) & 0xf];
3281 *p++ = hexdigit[(ucs >> 24) & 0xf];
3282 *p++ = hexdigit[(ucs >> 20) & 0xf];
3283 *p++ = hexdigit[(ucs >> 16) & 0xf];
3284 *p++ = hexdigit[(ucs >> 12) & 0xf];
3285 *p++ = hexdigit[(ucs >> 8) & 0xf];
3286 *p++ = hexdigit[(ucs >> 4) & 0xf];
3287 *p++ = hexdigit[ucs & 0xf];
3288 continue;
3290 /* Fall through: isolated surrogates are copied as-is */
3291 s--;
3292 size++;
3294 #endif
3295 /* Map 16-bit characters to '\uxxxx' */
3296 if (ch >= 256) {
3297 *p++ = '\\';
3298 *p++ = 'u';
3299 *p++ = hexdigit[(ch >> 12) & 0xf];
3300 *p++ = hexdigit[(ch >> 8) & 0xf];
3301 *p++ = hexdigit[(ch >> 4) & 0xf];
3302 *p++ = hexdigit[ch & 15];
3304 /* Copy everything else as-is */
3305 else
3306 *p++ = (char) ch;
3308 *p = '\0';
3309 _PyString_Resize(&repr, p - q);
3310 return repr;
3313 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3315 if (!PyUnicode_Check(unicode)) {
3316 PyErr_BadArgument();
3317 return NULL;
3319 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3320 PyUnicode_GET_SIZE(unicode));
3323 /* --- Unicode Internal Codec ------------------------------------------- */
3325 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3326 Py_ssize_t size,
3327 const char *errors)
3329 const char *starts = s;
3330 Py_ssize_t startinpos;
3331 Py_ssize_t endinpos;
3332 Py_ssize_t outpos;
3333 PyUnicodeObject *v;
3334 Py_UNICODE *p;
3335 const char *end;
3336 const char *reason;
3337 PyObject *errorHandler = NULL;
3338 PyObject *exc = NULL;
3340 #ifdef Py_UNICODE_WIDE
3341 Py_UNICODE unimax = PyUnicode_GetMax();
3342 #endif
3344 /* XXX overflow detection missing */
3345 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3346 if (v == NULL)
3347 goto onError;
3348 if (PyUnicode_GetSize((PyObject *)v) == 0)
3349 return (PyObject *)v;
3350 p = PyUnicode_AS_UNICODE(v);
3351 end = s + size;
3353 while (s < end) {
3354 memcpy(p, s, sizeof(Py_UNICODE));
3355 /* We have to sanity check the raw data, otherwise doom looms for
3356 some malformed UCS-4 data. */
3357 if (
3358 #ifdef Py_UNICODE_WIDE
3359 *p > unimax || *p < 0 ||
3360 #endif
3361 end-s < Py_UNICODE_SIZE
3364 startinpos = s - starts;
3365 if (end-s < Py_UNICODE_SIZE) {
3366 endinpos = end-starts;
3367 reason = "truncated input";
3369 else {
3370 endinpos = s - starts + Py_UNICODE_SIZE;
3371 reason = "illegal code point (> 0x10FFFF)";
3373 outpos = p - PyUnicode_AS_UNICODE(v);
3374 if (unicode_decode_call_errorhandler(
3375 errors, &errorHandler,
3376 "unicode_internal", reason,
3377 starts, size, &startinpos, &endinpos, &exc, &s,
3378 &v, &outpos, &p)) {
3379 goto onError;
3382 else {
3383 p++;
3384 s += Py_UNICODE_SIZE;
3388 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3389 goto onError;
3390 Py_XDECREF(errorHandler);
3391 Py_XDECREF(exc);
3392 return (PyObject *)v;
3394 onError:
3395 Py_XDECREF(v);
3396 Py_XDECREF(errorHandler);
3397 Py_XDECREF(exc);
3398 return NULL;
3401 /* --- Latin-1 Codec ------------------------------------------------------ */
3403 PyObject *PyUnicode_DecodeLatin1(const char *s,
3404 Py_ssize_t size,
3405 const char *errors)
3407 PyUnicodeObject *v;
3408 Py_UNICODE *p;
3410 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3411 if (size == 1) {
3412 Py_UNICODE r = *(unsigned char*)s;
3413 return PyUnicode_FromUnicode(&r, 1);
3416 v = _PyUnicode_New(size);
3417 if (v == NULL)
3418 goto onError;
3419 if (size == 0)
3420 return (PyObject *)v;
3421 p = PyUnicode_AS_UNICODE(v);
3422 while (size-- > 0)
3423 *p++ = (unsigned char)*s++;
3424 return (PyObject *)v;
3426 onError:
3427 Py_XDECREF(v);
3428 return NULL;
3431 /* create or adjust a UnicodeEncodeError */
3432 static void make_encode_exception(PyObject **exceptionObject,
3433 const char *encoding,
3434 const Py_UNICODE *unicode, Py_ssize_t size,
3435 Py_ssize_t startpos, Py_ssize_t endpos,
3436 const char *reason)
3438 if (*exceptionObject == NULL) {
3439 *exceptionObject = PyUnicodeEncodeError_Create(
3440 encoding, unicode, size, startpos, endpos, reason);
3442 else {
3443 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3444 goto onError;
3445 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3446 goto onError;
3447 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3448 goto onError;
3449 return;
3450 onError:
3451 Py_DECREF(*exceptionObject);
3452 *exceptionObject = NULL;
3456 /* raises a UnicodeEncodeError */
3457 static void raise_encode_exception(PyObject **exceptionObject,
3458 const char *encoding,
3459 const Py_UNICODE *unicode, Py_ssize_t size,
3460 Py_ssize_t startpos, Py_ssize_t endpos,
3461 const char *reason)
3463 make_encode_exception(exceptionObject,
3464 encoding, unicode, size, startpos, endpos, reason);
3465 if (*exceptionObject != NULL)
3466 PyCodec_StrictErrors(*exceptionObject);
3469 /* error handling callback helper:
3470 build arguments, call the callback and check the arguments,
3471 put the result into newpos and return the replacement string, which
3472 has to be freed by the caller */
3473 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3474 PyObject **errorHandler,
3475 const char *encoding, const char *reason,
3476 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3477 Py_ssize_t startpos, Py_ssize_t endpos,
3478 Py_ssize_t *newpos)
3480 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3482 PyObject *restuple;
3483 PyObject *resunicode;
3485 if (*errorHandler == NULL) {
3486 *errorHandler = PyCodec_LookupError(errors);
3487 if (*errorHandler == NULL)
3488 return NULL;
3491 make_encode_exception(exceptionObject,
3492 encoding, unicode, size, startpos, endpos, reason);
3493 if (*exceptionObject == NULL)
3494 return NULL;
3496 restuple = PyObject_CallFunctionObjArgs(
3497 *errorHandler, *exceptionObject, NULL);
3498 if (restuple == NULL)
3499 return NULL;
3500 if (!PyTuple_Check(restuple)) {
3501 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3502 Py_DECREF(restuple);
3503 return NULL;
3505 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3506 &resunicode, newpos)) {
3507 Py_DECREF(restuple);
3508 return NULL;
3510 if (*newpos<0)
3511 *newpos = size+*newpos;
3512 if (*newpos<0 || *newpos>size) {
3513 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3514 Py_DECREF(restuple);
3515 return NULL;
3517 Py_INCREF(resunicode);
3518 Py_DECREF(restuple);
3519 return resunicode;
3522 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3523 Py_ssize_t size,
3524 const char *errors,
3525 int limit)
3527 /* output object */
3528 PyObject *res;
3529 /* pointers to the beginning and end+1 of input */
3530 const Py_UNICODE *startp = p;
3531 const Py_UNICODE *endp = p + size;
3532 /* pointer to the beginning of the unencodable characters */
3533 /* const Py_UNICODE *badp = NULL; */
3534 /* pointer into the output */
3535 char *str;
3536 /* current output position */
3537 Py_ssize_t respos = 0;
3538 Py_ssize_t ressize;
3539 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3540 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3541 PyObject *errorHandler = NULL;
3542 PyObject *exc = NULL;
3543 /* the following variable is used for caching string comparisons
3544 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3545 int known_errorHandler = -1;
3547 /* allocate enough for a simple encoding without
3548 replacements, if we need more, we'll resize */
3549 res = PyString_FromStringAndSize(NULL, size);
3550 if (res == NULL)
3551 goto onError;
3552 if (size == 0)
3553 return res;
3554 str = PyString_AS_STRING(res);
3555 ressize = size;
3557 while (p<endp) {
3558 Py_UNICODE c = *p;
3560 /* can we encode this? */
3561 if (c<limit) {
3562 /* no overflow check, because we know that the space is enough */
3563 *str++ = (char)c;
3564 ++p;
3566 else {
3567 Py_ssize_t unicodepos = p-startp;
3568 Py_ssize_t requiredsize;
3569 PyObject *repunicode;
3570 Py_ssize_t repsize;
3571 Py_ssize_t newpos;
3572 Py_ssize_t respos;
3573 Py_UNICODE *uni2;
3574 /* startpos for collecting unencodable chars */
3575 const Py_UNICODE *collstart = p;
3576 const Py_UNICODE *collend = p;
3577 /* find all unecodable characters */
3578 while ((collend < endp) && ((*collend)>=limit))
3579 ++collend;
3580 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3581 if (known_errorHandler==-1) {
3582 if ((errors==NULL) || (!strcmp(errors, "strict")))
3583 known_errorHandler = 1;
3584 else if (!strcmp(errors, "replace"))
3585 known_errorHandler = 2;
3586 else if (!strcmp(errors, "ignore"))
3587 known_errorHandler = 3;
3588 else if (!strcmp(errors, "xmlcharrefreplace"))
3589 known_errorHandler = 4;
3590 else
3591 known_errorHandler = 0;
3593 switch (known_errorHandler) {
3594 case 1: /* strict */
3595 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3596 goto onError;
3597 case 2: /* replace */
3598 while (collstart++<collend)
3599 *str++ = '?'; /* fall through */
3600 case 3: /* ignore */
3601 p = collend;
3602 break;
3603 case 4: /* xmlcharrefreplace */
3604 respos = str-PyString_AS_STRING(res);
3605 /* determine replacement size (temporarily (mis)uses p) */
3606 for (p = collstart, repsize = 0; p < collend; ++p) {
3607 if (*p<10)
3608 repsize += 2+1+1;
3609 else if (*p<100)
3610 repsize += 2+2+1;
3611 else if (*p<1000)
3612 repsize += 2+3+1;
3613 else if (*p<10000)
3614 repsize += 2+4+1;
3615 #ifndef Py_UNICODE_WIDE
3616 else
3617 repsize += 2+5+1;
3618 #else
3619 else if (*p<100000)
3620 repsize += 2+5+1;
3621 else if (*p<1000000)
3622 repsize += 2+6+1;
3623 else
3624 repsize += 2+7+1;
3625 #endif
3627 requiredsize = respos+repsize+(endp-collend);
3628 if (requiredsize > ressize) {
3629 if (requiredsize<2*ressize)
3630 requiredsize = 2*ressize;
3631 if (_PyString_Resize(&res, requiredsize))
3632 goto onError;
3633 str = PyString_AS_STRING(res) + respos;
3634 ressize = requiredsize;
3636 /* generate replacement (temporarily (mis)uses p) */
3637 for (p = collstart; p < collend; ++p) {
3638 str += sprintf(str, "&#%d;", (int)*p);
3640 p = collend;
3641 break;
3642 default:
3643 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3644 encoding, reason, startp, size, &exc,
3645 collstart-startp, collend-startp, &newpos);
3646 if (repunicode == NULL)
3647 goto onError;
3648 /* need more space? (at least enough for what we have+the
3649 replacement+the rest of the string, so we won't have to
3650 check space for encodable characters) */
3651 respos = str-PyString_AS_STRING(res);
3652 repsize = PyUnicode_GET_SIZE(repunicode);
3653 requiredsize = respos+repsize+(endp-collend);
3654 if (requiredsize > ressize) {
3655 if (requiredsize<2*ressize)
3656 requiredsize = 2*ressize;
3657 if (_PyString_Resize(&res, requiredsize)) {
3658 Py_DECREF(repunicode);
3659 goto onError;
3661 str = PyString_AS_STRING(res) + respos;
3662 ressize = requiredsize;
3664 /* check if there is anything unencodable in the replacement
3665 and copy it to the output */
3666 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3667 c = *uni2;
3668 if (c >= limit) {
3669 raise_encode_exception(&exc, encoding, startp, size,
3670 unicodepos, unicodepos+1, reason);
3671 Py_DECREF(repunicode);
3672 goto onError;
3674 *str = (char)c;
3676 p = startp + newpos;
3677 Py_DECREF(repunicode);
3681 /* Resize if we allocated to much */
3682 respos = str-PyString_AS_STRING(res);
3683 if (respos<ressize)
3684 /* If this falls res will be NULL */
3685 _PyString_Resize(&res, respos);
3686 Py_XDECREF(errorHandler);
3687 Py_XDECREF(exc);
3688 return res;
3690 onError:
3691 Py_XDECREF(res);
3692 Py_XDECREF(errorHandler);
3693 Py_XDECREF(exc);
3694 return NULL;
3697 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3698 Py_ssize_t size,
3699 const char *errors)
3701 return unicode_encode_ucs1(p, size, errors, 256);
3704 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3706 if (!PyUnicode_Check(unicode)) {
3707 PyErr_BadArgument();
3708 return NULL;
3710 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3711 PyUnicode_GET_SIZE(unicode),
3712 NULL);
3715 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3717 PyObject *PyUnicode_DecodeASCII(const char *s,
3718 Py_ssize_t size,
3719 const char *errors)
3721 const char *starts = s;
3722 PyUnicodeObject *v;
3723 Py_UNICODE *p;
3724 Py_ssize_t startinpos;
3725 Py_ssize_t endinpos;
3726 Py_ssize_t outpos;
3727 const char *e;
3728 PyObject *errorHandler = NULL;
3729 PyObject *exc = NULL;
3731 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3732 if (size == 1 && *(unsigned char*)s < 128) {
3733 Py_UNICODE r = *(unsigned char*)s;
3734 return PyUnicode_FromUnicode(&r, 1);
3737 v = _PyUnicode_New(size);
3738 if (v == NULL)
3739 goto onError;
3740 if (size == 0)
3741 return (PyObject *)v;
3742 p = PyUnicode_AS_UNICODE(v);
3743 e = s + size;
3744 while (s < e) {
3745 register unsigned char c = (unsigned char)*s;
3746 if (c < 128) {
3747 *p++ = c;
3748 ++s;
3750 else {
3751 startinpos = s-starts;
3752 endinpos = startinpos + 1;
3753 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3754 if (unicode_decode_call_errorhandler(
3755 errors, &errorHandler,
3756 "ascii", "ordinal not in range(128)",
3757 starts, size, &startinpos, &endinpos, &exc, &s,
3758 &v, &outpos, &p))
3759 goto onError;
3762 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3763 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3764 goto onError;
3765 Py_XDECREF(errorHandler);
3766 Py_XDECREF(exc);
3767 return (PyObject *)v;
3769 onError:
3770 Py_XDECREF(v);
3771 Py_XDECREF(errorHandler);
3772 Py_XDECREF(exc);
3773 return NULL;
3776 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3777 Py_ssize_t size,
3778 const char *errors)
3780 return unicode_encode_ucs1(p, size, errors, 128);
3783 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3785 if (!PyUnicode_Check(unicode)) {
3786 PyErr_BadArgument();
3787 return NULL;
3789 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3790 PyUnicode_GET_SIZE(unicode),
3791 NULL);
3794 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3796 /* --- MBCS codecs for Windows -------------------------------------------- */
3798 #if SIZEOF_INT < SIZEOF_SIZE_T
3799 #define NEED_RETRY
3800 #endif
3802 /* XXX This code is limited to "true" double-byte encodings, as
3803 a) it assumes an incomplete character consists of a single byte, and
3804 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3805 encodings, see IsDBCSLeadByteEx documentation. */
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must satisfy IsDBCSLeadByte(); in addition, the character
   preceding it (as located by CharPrev) must either not exist, not start
   with a lead byte, or be exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3819 * Decode MBCS string into unicode object. If 'final' is set, converts
3820 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3822 static int decode_mbcs(PyUnicodeObject **v,
3823 const char *s, /* MBCS string */
3824 int size, /* sizeof MBCS string */
3825 int final)
3827 Py_UNICODE *p;
3828 Py_ssize_t n = 0;
3829 int usize = 0;
3831 assert(size >= 0);
3833 /* Skip trailing lead-byte unless 'final' is set */
3834 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3835 --size;
3837 /* First get the size of the result */
3838 if (size > 0) {
3839 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3840 if (usize == 0) {
3841 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3842 return -1;
3846 if (*v == NULL) {
3847 /* Create unicode object */
3848 *v = _PyUnicode_New(usize);
3849 if (*v == NULL)
3850 return -1;
3852 else {
3853 /* Extend unicode object */
3854 n = PyUnicode_GET_SIZE(*v);
3855 if (_PyUnicode_Resize(v, n + usize) < 0)
3856 return -1;
3859 /* Do the conversion */
3860 if (size > 0) {
3861 p = PyUnicode_AS_UNICODE(*v) + n;
3862 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3863 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3864 return -1;
3868 return size;
3871 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3872 Py_ssize_t size,
3873 const char *errors,
3874 Py_ssize_t *consumed)
3876 PyUnicodeObject *v = NULL;
3877 int done;
3879 if (consumed)
3880 *consumed = 0;
3882 #ifdef NEED_RETRY
3883 retry:
3884 if (size > INT_MAX)
3885 done = decode_mbcs(&v, s, INT_MAX, 0);
3886 else
3887 #endif
3888 done = decode_mbcs(&v, s, (int)size, !consumed);
3890 if (done < 0) {
3891 Py_XDECREF(v);
3892 return NULL;
3895 if (consumed)
3896 *consumed += done;
3898 #ifdef NEED_RETRY
3899 if (size > INT_MAX) {
3900 s += done;
3901 size -= done;
3902 goto retry;
3904 #endif
3906 return (PyObject *)v;
3909 PyObject *PyUnicode_DecodeMBCS(const char *s,
3910 Py_ssize_t size,
3911 const char *errors)
3913 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3917 * Convert unicode into string object (MBCS).
3918 * Returns 0 if succeed, -1 otherwise.
3920 static int encode_mbcs(PyObject **repr,
3921 const Py_UNICODE *p, /* unicode */
3922 int size) /* size of unicode */
3924 int mbcssize = 0;
3925 Py_ssize_t n = 0;
3927 assert(size >= 0);
3929 /* First get the size of the result */
3930 if (size > 0) {
3931 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3932 if (mbcssize == 0) {
3933 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3934 return -1;
3938 if (*repr == NULL) {
3939 /* Create string object */
3940 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3941 if (*repr == NULL)
3942 return -1;
3944 else {
3945 /* Extend string object */
3946 n = PyString_Size(*repr);
3947 if (_PyString_Resize(repr, n + mbcssize) < 0)
3948 return -1;
3951 /* Do the conversion */
3952 if (size > 0) {
3953 char *s = PyString_AS_STRING(*repr) + n;
3954 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3955 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3956 return -1;
3960 return 0;
3963 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3964 Py_ssize_t size,
3965 const char *errors)
3967 PyObject *repr = NULL;
3968 int ret;
3970 #ifdef NEED_RETRY
3971 retry:
3972 if (size > INT_MAX)
3973 ret = encode_mbcs(&repr, p, INT_MAX);
3974 else
3975 #endif
3976 ret = encode_mbcs(&repr, p, (int)size);
3978 if (ret < 0) {
3979 Py_XDECREF(repr);
3980 return NULL;
3983 #ifdef NEED_RETRY
3984 if (size > INT_MAX) {
3985 p += INT_MAX;
3986 size -= INT_MAX;
3987 goto retry;
3989 #endif
3991 return repr;
3994 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3996 if (!PyUnicode_Check(unicode)) {
3997 PyErr_BadArgument();
3998 return NULL;
4000 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4001 PyUnicode_GET_SIZE(unicode),
4002 NULL);
4005 #undef NEED_RETRY
4007 #endif /* MS_WINDOWS */
4009 /* --- Character Mapping Codec -------------------------------------------- */
4011 PyObject *PyUnicode_DecodeCharmap(const char *s,
4012 Py_ssize_t size,
4013 PyObject *mapping,
4014 const char *errors)
4016 const char *starts = s;
4017 Py_ssize_t startinpos;
4018 Py_ssize_t endinpos;
4019 Py_ssize_t outpos;
4020 const char *e;
4021 PyUnicodeObject *v;
4022 Py_UNICODE *p;
4023 Py_ssize_t extrachars = 0;
4024 PyObject *errorHandler = NULL;
4025 PyObject *exc = NULL;
4026 Py_UNICODE *mapstring = NULL;
4027 Py_ssize_t maplen = 0;
4029 /* Default to Latin-1 */
4030 if (mapping == NULL)
4031 return PyUnicode_DecodeLatin1(s, size, errors);
4033 v = _PyUnicode_New(size);
4034 if (v == NULL)
4035 goto onError;
4036 if (size == 0)
4037 return (PyObject *)v;
4038 p = PyUnicode_AS_UNICODE(v);
4039 e = s + size;
4040 if (PyUnicode_CheckExact(mapping)) {
4041 mapstring = PyUnicode_AS_UNICODE(mapping);
4042 maplen = PyUnicode_GET_SIZE(mapping);
4043 while (s < e) {
4044 unsigned char ch = *s;
4045 Py_UNICODE x = 0xfffe; /* illegal value */
4047 if (ch < maplen)
4048 x = mapstring[ch];
4050 if (x == 0xfffe) {
4051 /* undefined mapping */
4052 outpos = p-PyUnicode_AS_UNICODE(v);
4053 startinpos = s-starts;
4054 endinpos = startinpos+1;
4055 if (unicode_decode_call_errorhandler(
4056 errors, &errorHandler,
4057 "charmap", "character maps to <undefined>",
4058 starts, size, &startinpos, &endinpos, &exc, &s,
4059 &v, &outpos, &p)) {
4060 goto onError;
4062 continue;
4064 *p++ = x;
4065 ++s;
4068 else {
4069 while (s < e) {
4070 unsigned char ch = *s;
4071 PyObject *w, *x;
4073 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4074 w = PyInt_FromLong((long)ch);
4075 if (w == NULL)
4076 goto onError;
4077 x = PyObject_GetItem(mapping, w);
4078 Py_DECREF(w);
4079 if (x == NULL) {
4080 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4081 /* No mapping found means: mapping is undefined. */
4082 PyErr_Clear();
4083 x = Py_None;
4084 Py_INCREF(x);
4085 } else
4086 goto onError;
4089 /* Apply mapping */
4090 if (PyInt_Check(x)) {
4091 long value = PyInt_AS_LONG(x);
4092 if (value < 0 || value > 65535) {
4093 PyErr_SetString(PyExc_TypeError,
4094 "character mapping must be in range(65536)");
4095 Py_DECREF(x);
4096 goto onError;
4098 *p++ = (Py_UNICODE)value;
4100 else if (x == Py_None) {
4101 /* undefined mapping */
4102 outpos = p-PyUnicode_AS_UNICODE(v);
4103 startinpos = s-starts;
4104 endinpos = startinpos+1;
4105 if (unicode_decode_call_errorhandler(
4106 errors, &errorHandler,
4107 "charmap", "character maps to <undefined>",
4108 starts, size, &startinpos, &endinpos, &exc, &s,
4109 &v, &outpos, &p)) {
4110 Py_DECREF(x);
4111 goto onError;
4113 Py_DECREF(x);
4114 continue;
4116 else if (PyUnicode_Check(x)) {
4117 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4119 if (targetsize == 1)
4120 /* 1-1 mapping */
4121 *p++ = *PyUnicode_AS_UNICODE(x);
4123 else if (targetsize > 1) {
4124 /* 1-n mapping */
4125 if (targetsize > extrachars) {
4126 /* resize first */
4127 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4128 Py_ssize_t needed = (targetsize - extrachars) + \
4129 (targetsize << 2);
4130 extrachars += needed;
4131 /* XXX overflow detection missing */
4132 if (_PyUnicode_Resize(&v,
4133 PyUnicode_GET_SIZE(v) + needed) < 0) {
4134 Py_DECREF(x);
4135 goto onError;
4137 p = PyUnicode_AS_UNICODE(v) + oldpos;
4139 Py_UNICODE_COPY(p,
4140 PyUnicode_AS_UNICODE(x),
4141 targetsize);
4142 p += targetsize;
4143 extrachars -= targetsize;
4145 /* 1-0 mapping: skip the character */
4147 else {
4148 /* wrong return value */
4149 PyErr_SetString(PyExc_TypeError,
4150 "character mapping must return integer, None or unicode");
4151 Py_DECREF(x);
4152 goto onError;
4154 Py_DECREF(x);
4155 ++s;
4158 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4159 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4160 goto onError;
4161 Py_XDECREF(errorHandler);
4162 Py_XDECREF(exc);
4163 return (PyObject *)v;
4165 onError:
4166 Py_XDECREF(errorHandler);
4167 Py_XDECREF(exc);
4168 Py_XDECREF(v);
4169 return NULL;
4172 /* Charmap encoding: the lookup table */
4174 struct encoding_map{
4175 PyObject_HEAD
4176 unsigned char level1[32];
4177 int count2, count3;
4178 unsigned char level23[1];
4181 static PyObject*
4182 encoding_map_size(PyObject *obj, PyObject* args)
4184 struct encoding_map *map = (struct encoding_map*)obj;
4185 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4186 128*map->count3);
4189 static PyMethodDef encoding_map_methods[] = {
4190 {"size", encoding_map_size, METH_NOARGS,
4191 PyDoc_STR("Return the size (in bytes) of this object") },
4192 { 0 }
4195 static void
4196 encoding_map_dealloc(PyObject* o)
4198 PyObject_FREE(o);
4201 static PyTypeObject EncodingMapType = {
4202 PyVarObject_HEAD_INIT(NULL, 0)
4203 "EncodingMap", /*tp_name*/
4204 sizeof(struct encoding_map), /*tp_basicsize*/
4205 0, /*tp_itemsize*/
4206 /* methods */
4207 encoding_map_dealloc, /*tp_dealloc*/
4208 0, /*tp_print*/
4209 0, /*tp_getattr*/
4210 0, /*tp_setattr*/
4211 0, /*tp_compare*/
4212 0, /*tp_repr*/
4213 0, /*tp_as_number*/
4214 0, /*tp_as_sequence*/
4215 0, /*tp_as_mapping*/
4216 0, /*tp_hash*/
4217 0, /*tp_call*/
4218 0, /*tp_str*/
4219 0, /*tp_getattro*/
4220 0, /*tp_setattro*/
4221 0, /*tp_as_buffer*/
4222 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4223 0, /*tp_doc*/
4224 0, /*tp_traverse*/
4225 0, /*tp_clear*/
4226 0, /*tp_richcompare*/
4227 0, /*tp_weaklistoffset*/
4228 0, /*tp_iter*/
4229 0, /*tp_iternext*/
4230 encoding_map_methods, /*tp_methods*/
4231 0, /*tp_members*/
4232 0, /*tp_getset*/
4233 0, /*tp_base*/
4234 0, /*tp_dict*/
4235 0, /*tp_descr_get*/
4236 0, /*tp_descr_set*/
4237 0, /*tp_dictoffset*/
4238 0, /*tp_init*/
4239 0, /*tp_alloc*/
4240 0, /*tp_new*/
4241 0, /*tp_free*/
4242 0, /*tp_is_gc*/
4245 PyObject*
4246 PyUnicode_BuildEncodingMap(PyObject* string)
4248 Py_UNICODE *decode;
4249 PyObject *result;
4250 struct encoding_map *mresult;
4251 int i;
4252 int need_dict = 0;
4253 unsigned char level1[32];
4254 unsigned char level2[512];
4255 unsigned char *mlevel1, *mlevel2, *mlevel3;
4256 int count2 = 0, count3 = 0;
4258 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4259 PyErr_BadArgument();
4260 return NULL;
4262 decode = PyUnicode_AS_UNICODE(string);
4263 memset(level1, 0xFF, sizeof level1);
4264 memset(level2, 0xFF, sizeof level2);
4266 /* If there isn't a one-to-one mapping of NULL to \0,
4267 or if there are non-BMP characters, we need to use
4268 a mapping dictionary. */
4269 if (decode[0] != 0)
4270 need_dict = 1;
4271 for (i = 1; i < 256; i++) {
4272 int l1, l2;
4273 if (decode[i] == 0
4274 #ifdef Py_UNICODE_WIDE
4275 || decode[i] > 0xFFFF
4276 #endif
4278 need_dict = 1;
4279 break;
4281 if (decode[i] == 0xFFFE)
4282 /* unmapped character */
4283 continue;
4284 l1 = decode[i] >> 11;
4285 l2 = decode[i] >> 7;
4286 if (level1[l1] == 0xFF)
4287 level1[l1] = count2++;
4288 if (level2[l2] == 0xFF)
4289 level2[l2] = count3++;
4292 if (count2 >= 0xFF || count3 >= 0xFF)
4293 need_dict = 1;
4295 if (need_dict) {
4296 PyObject *result = PyDict_New();
4297 PyObject *key, *value;
4298 if (!result)
4299 return NULL;
4300 for (i = 0; i < 256; i++) {
4301 key = value = NULL;
4302 key = PyInt_FromLong(decode[i]);
4303 value = PyInt_FromLong(i);
4304 if (!key || !value)
4305 goto failed1;
4306 if (PyDict_SetItem(result, key, value) == -1)
4307 goto failed1;
4308 Py_DECREF(key);
4309 Py_DECREF(value);
4311 return result;
4312 failed1:
4313 Py_XDECREF(key);
4314 Py_XDECREF(value);
4315 Py_DECREF(result);
4316 return NULL;
4319 /* Create a three-level trie */
4320 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4321 16*count2 + 128*count3 - 1);
4322 if (!result)
4323 return PyErr_NoMemory();
4324 PyObject_Init(result, &EncodingMapType);
4325 mresult = (struct encoding_map*)result;
4326 mresult->count2 = count2;
4327 mresult->count3 = count3;
4328 mlevel1 = mresult->level1;
4329 mlevel2 = mresult->level23;
4330 mlevel3 = mresult->level23 + 16*count2;
4331 memcpy(mlevel1, level1, 32);
4332 memset(mlevel2, 0xFF, 16*count2);
4333 memset(mlevel3, 0, 128*count3);
4334 count3 = 0;
4335 for (i = 1; i < 256; i++) {
4336 int o1, o2, o3, i2, i3;
4337 if (decode[i] == 0xFFFE)
4338 /* unmapped character */
4339 continue;
4340 o1 = decode[i]>>11;
4341 o2 = (decode[i]>>7) & 0xF;
4342 i2 = 16*mlevel1[o1] + o2;
4343 if (mlevel2[i2] == 0xFF)
4344 mlevel2[i2] = count3++;
4345 o3 = decode[i] & 0x7F;
4346 i3 = 128*mlevel2[i2] + o3;
4347 mlevel3[i3] = i;
4349 return result;
4352 static int
4353 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4355 struct encoding_map *map = (struct encoding_map*)mapping;
4356 int l1 = c>>11;
4357 int l2 = (c>>7) & 0xF;
4358 int l3 = c & 0x7F;
4359 int i;
4361 #ifdef Py_UNICODE_WIDE
4362 if (c > 0xFFFF) {
4363 return -1;
4365 #endif
4366 if (c == 0)
4367 return 0;
4368 /* level 1*/
4369 i = map->level1[l1];
4370 if (i == 0xFF) {
4371 return -1;
4373 /* level 2*/
4374 i = map->level23[16*i+l2];
4375 if (i == 0xFF) {
4376 return -1;
4378 /* level 3 */
4379 i = map->level23[16*map->count2 + 128*i + l3];
4380 if (i == 0) {
4381 return -1;
4383 return i;
4386 /* Lookup the character ch in the mapping. If the character
4387 can't be found, Py_None is returned (or NULL, if another
4388 error occurred). */
4389 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4391 PyObject *w = PyInt_FromLong((long)c);
4392 PyObject *x;
4394 if (w == NULL)
4395 return NULL;
4396 x = PyObject_GetItem(mapping, w);
4397 Py_DECREF(w);
4398 if (x == NULL) {
4399 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4400 /* No mapping found means: mapping is undefined. */
4401 PyErr_Clear();
4402 x = Py_None;
4403 Py_INCREF(x);
4404 return x;
4405 } else
4406 return NULL;
4408 else if (x == Py_None)
4409 return x;
4410 else if (PyInt_Check(x)) {
4411 long value = PyInt_AS_LONG(x);
4412 if (value < 0 || value > 255) {
4413 PyErr_SetString(PyExc_TypeError,
4414 "character mapping must be in range(256)");
4415 Py_DECREF(x);
4416 return NULL;
4418 return x;
4420 else if (PyString_Check(x))
4421 return x;
4422 else {
4423 /* wrong return value */
4424 PyErr_SetString(PyExc_TypeError,
4425 "character mapping must return integer, None or str");
4426 Py_DECREF(x);
4427 return NULL;
4431 static int
4432 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4434 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4435 /* exponentially overallocate to minimize reallocations */
4436 if (requiredsize < 2*outsize)
4437 requiredsize = 2*outsize;
4438 if (_PyString_Resize(outobj, requiredsize)) {
4439 return 0;
4441 return 1;
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
4447 /* lookup the character, put the result in the output string and adjust
4448 various state variables. Reallocate the output string if not enough
4449 space is available. Return a new reference to the object that
4450 was put in the output buffer, or Py_None, if the mapping was undefined
4451 (in which case no character was written) or NULL, if a
4452 reallocation error occurred. The caller must decref the result */
4453 static
4454 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4455 PyObject **outobj, Py_ssize_t *outpos)
4457 PyObject *rep;
4458 char *outstart;
4459 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4461 if (Py_TYPE(mapping) == &EncodingMapType) {
4462 int res = encoding_map_lookup(c, mapping);
4463 Py_ssize_t requiredsize = *outpos+1;
4464 if (res == -1)
4465 return enc_FAILED;
4466 if (outsize<requiredsize)
4467 if (!charmapencode_resize(outobj, outpos, requiredsize))
4468 return enc_EXCEPTION;
4469 outstart = PyString_AS_STRING(*outobj);
4470 outstart[(*outpos)++] = (char)res;
4471 return enc_SUCCESS;
4474 rep = charmapencode_lookup(c, mapping);
4475 if (rep==NULL)
4476 return enc_EXCEPTION;
4477 else if (rep==Py_None) {
4478 Py_DECREF(rep);
4479 return enc_FAILED;
4480 } else {
4481 if (PyInt_Check(rep)) {
4482 Py_ssize_t requiredsize = *outpos+1;
4483 if (outsize<requiredsize)
4484 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4485 Py_DECREF(rep);
4486 return enc_EXCEPTION;
4488 outstart = PyString_AS_STRING(*outobj);
4489 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4491 else {
4492 const char *repchars = PyString_AS_STRING(rep);
4493 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4494 Py_ssize_t requiredsize = *outpos+repsize;
4495 if (outsize<requiredsize)
4496 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4497 Py_DECREF(rep);
4498 return enc_EXCEPTION;
4500 outstart = PyString_AS_STRING(*outobj);
4501 memcpy(outstart + *outpos, repchars, repsize);
4502 *outpos += repsize;
4505 Py_DECREF(rep);
4506 return enc_SUCCESS;
4509 /* handle an error in PyUnicode_EncodeCharmap
4510 Return 0 on success, -1 on error */
4511 static
4512 int charmap_encoding_error(
4513 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4514 PyObject **exceptionObject,
4515 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4516 PyObject **res, Py_ssize_t *respos)
4518 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4519 Py_ssize_t repsize;
4520 Py_ssize_t newpos;
4521 Py_UNICODE *uni2;
4522 /* startpos for collecting unencodable chars */
4523 Py_ssize_t collstartpos = *inpos;
4524 Py_ssize_t collendpos = *inpos+1;
4525 Py_ssize_t collpos;
4526 char *encoding = "charmap";
4527 char *reason = "character maps to <undefined>";
4528 charmapencode_result x;
4530 /* find all unencodable characters */
4531 while (collendpos < size) {
4532 PyObject *rep;
4533 if (Py_TYPE(mapping) == &EncodingMapType) {
4534 int res = encoding_map_lookup(p[collendpos], mapping);
4535 if (res != -1)
4536 break;
4537 ++collendpos;
4538 continue;
4541 rep = charmapencode_lookup(p[collendpos], mapping);
4542 if (rep==NULL)
4543 return -1;
4544 else if (rep!=Py_None) {
4545 Py_DECREF(rep);
4546 break;
4548 Py_DECREF(rep);
4549 ++collendpos;
4551 /* cache callback name lookup
4552 * (if not done yet, i.e. it's the first error) */
4553 if (*known_errorHandler==-1) {
4554 if ((errors==NULL) || (!strcmp(errors, "strict")))
4555 *known_errorHandler = 1;
4556 else if (!strcmp(errors, "replace"))
4557 *known_errorHandler = 2;
4558 else if (!strcmp(errors, "ignore"))
4559 *known_errorHandler = 3;
4560 else if (!strcmp(errors, "xmlcharrefreplace"))
4561 *known_errorHandler = 4;
4562 else
4563 *known_errorHandler = 0;
4565 switch (*known_errorHandler) {
4566 case 1: /* strict */
4567 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4568 return -1;
4569 case 2: /* replace */
4570 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4571 x = charmapencode_output('?', mapping, res, respos);
4572 if (x==enc_EXCEPTION) {
4573 return -1;
4575 else if (x==enc_FAILED) {
4576 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4577 return -1;
4580 /* fall through */
4581 case 3: /* ignore */
4582 *inpos = collendpos;
4583 break;
4584 case 4: /* xmlcharrefreplace */
4585 /* generate replacement (temporarily (mis)uses p) */
4586 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4587 char buffer[2+29+1+1];
4588 char *cp;
4589 sprintf(buffer, "&#%d;", (int)p[collpos]);
4590 for (cp = buffer; *cp; ++cp) {
4591 x = charmapencode_output(*cp, mapping, res, respos);
4592 if (x==enc_EXCEPTION)
4593 return -1;
4594 else if (x==enc_FAILED) {
4595 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4596 return -1;
4600 *inpos = collendpos;
4601 break;
4602 default:
4603 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4604 encoding, reason, p, size, exceptionObject,
4605 collstartpos, collendpos, &newpos);
4606 if (repunicode == NULL)
4607 return -1;
4608 /* generate replacement */
4609 repsize = PyUnicode_GET_SIZE(repunicode);
4610 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4611 x = charmapencode_output(*uni2, mapping, res, respos);
4612 if (x==enc_EXCEPTION) {
4613 return -1;
4615 else if (x==enc_FAILED) {
4616 Py_DECREF(repunicode);
4617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4618 return -1;
4621 *inpos = newpos;
4622 Py_DECREF(repunicode);
4624 return 0;
4627 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4628 Py_ssize_t size,
4629 PyObject *mapping,
4630 const char *errors)
4632 /* output object */
4633 PyObject *res = NULL;
4634 /* current input position */
4635 Py_ssize_t inpos = 0;
4636 /* current output position */
4637 Py_ssize_t respos = 0;
4638 PyObject *errorHandler = NULL;
4639 PyObject *exc = NULL;
4640 /* the following variable is used for caching string comparisons
4641 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4642 * 3=ignore, 4=xmlcharrefreplace */
4643 int known_errorHandler = -1;
4645 /* Default to Latin-1 */
4646 if (mapping == NULL)
4647 return PyUnicode_EncodeLatin1(p, size, errors);
4649 /* allocate enough for a simple encoding without
4650 replacements, if we need more, we'll resize */
4651 res = PyString_FromStringAndSize(NULL, size);
4652 if (res == NULL)
4653 goto onError;
4654 if (size == 0)
4655 return res;
4657 while (inpos<size) {
4658 /* try to encode it */
4659 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4660 if (x==enc_EXCEPTION) /* error */
4661 goto onError;
4662 if (x==enc_FAILED) { /* unencodable character */
4663 if (charmap_encoding_error(p, size, &inpos, mapping,
4664 &exc,
4665 &known_errorHandler, &errorHandler, errors,
4666 &res, &respos)) {
4667 goto onError;
4670 else
4671 /* done with this character => adjust input position */
4672 ++inpos;
4675 /* Resize if we allocated to much */
4676 if (respos<PyString_GET_SIZE(res)) {
4677 if (_PyString_Resize(&res, respos))
4678 goto onError;
4680 Py_XDECREF(exc);
4681 Py_XDECREF(errorHandler);
4682 return res;
4684 onError:
4685 Py_XDECREF(res);
4686 Py_XDECREF(exc);
4687 Py_XDECREF(errorHandler);
4688 return NULL;
4691 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4692 PyObject *mapping)
4694 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4695 PyErr_BadArgument();
4696 return NULL;
4698 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4699 PyUnicode_GET_SIZE(unicode),
4700 mapping,
4701 NULL);
4704 /* create or adjust a UnicodeTranslateError */
4705 static void make_translate_exception(PyObject **exceptionObject,
4706 const Py_UNICODE *unicode, Py_ssize_t size,
4707 Py_ssize_t startpos, Py_ssize_t endpos,
4708 const char *reason)
4710 if (*exceptionObject == NULL) {
4711 *exceptionObject = PyUnicodeTranslateError_Create(
4712 unicode, size, startpos, endpos, reason);
4714 else {
4715 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4716 goto onError;
4717 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4718 goto onError;
4719 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4720 goto onError;
4721 return;
4722 onError:
4723 Py_DECREF(*exceptionObject);
4724 *exceptionObject = NULL;
4728 /* raises a UnicodeTranslateError */
4729 static void raise_translate_exception(PyObject **exceptionObject,
4730 const Py_UNICODE *unicode, Py_ssize_t size,
4731 Py_ssize_t startpos, Py_ssize_t endpos,
4732 const char *reason)
4734 make_translate_exception(exceptionObject,
4735 unicode, size, startpos, endpos, reason);
4736 if (*exceptionObject != NULL)
4737 PyCodec_StrictErrors(*exceptionObject);
4740 /* error handling callback helper:
4741 build arguments, call the callback and check the arguments,
4742 put the result into newpos and return the replacement string, which
4743 has to be freed by the caller */
4744 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4745 PyObject **errorHandler,
4746 const char *reason,
4747 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4748 Py_ssize_t startpos, Py_ssize_t endpos,
4749 Py_ssize_t *newpos)
4751 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4753 Py_ssize_t i_newpos;
4754 PyObject *restuple;
4755 PyObject *resunicode;
4757 if (*errorHandler == NULL) {
4758 *errorHandler = PyCodec_LookupError(errors);
4759 if (*errorHandler == NULL)
4760 return NULL;
4763 make_translate_exception(exceptionObject,
4764 unicode, size, startpos, endpos, reason);
4765 if (*exceptionObject == NULL)
4766 return NULL;
4768 restuple = PyObject_CallFunctionObjArgs(
4769 *errorHandler, *exceptionObject, NULL);
4770 if (restuple == NULL)
4771 return NULL;
4772 if (!PyTuple_Check(restuple)) {
4773 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4774 Py_DECREF(restuple);
4775 return NULL;
4777 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4778 &resunicode, &i_newpos)) {
4779 Py_DECREF(restuple);
4780 return NULL;
4782 if (i_newpos<0)
4783 *newpos = size+i_newpos;
4784 else
4785 *newpos = i_newpos;
4786 if (*newpos<0 || *newpos>size) {
4787 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4788 Py_DECREF(restuple);
4789 return NULL;
4791 Py_INCREF(resunicode);
4792 Py_DECREF(restuple);
4793 return resunicode;
4796 /* Lookup the character ch in the mapping and put the result in result,
4797 which must be decrefed by the caller.
4798 Return 0 on success, -1 on error */
4799 static
4800 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4802 PyObject *w = PyInt_FromLong((long)c);
4803 PyObject *x;
4805 if (w == NULL)
4806 return -1;
4807 x = PyObject_GetItem(mapping, w);
4808 Py_DECREF(w);
4809 if (x == NULL) {
4810 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4811 /* No mapping found means: use 1:1 mapping. */
4812 PyErr_Clear();
4813 *result = NULL;
4814 return 0;
4815 } else
4816 return -1;
4818 else if (x == Py_None) {
4819 *result = x;
4820 return 0;
4822 else if (PyInt_Check(x)) {
4823 long value = PyInt_AS_LONG(x);
4824 long max = PyUnicode_GetMax();
4825 if (value < 0 || value > max) {
4826 PyErr_Format(PyExc_TypeError,
4827 "character mapping must be in range(0x%lx)", max+1);
4828 Py_DECREF(x);
4829 return -1;
4831 *result = x;
4832 return 0;
4834 else if (PyUnicode_Check(x)) {
4835 *result = x;
4836 return 0;
4838 else {
4839 /* wrong return value */
4840 PyErr_SetString(PyExc_TypeError,
4841 "character mapping must return integer, None or unicode");
4842 Py_DECREF(x);
4843 return -1;
4846 /* ensure that *outobj is at least requiredsize characters long,
4847 if not reallocate and adjust various state variables.
4848 Return 0 on success, -1 on error */
4849 static
4850 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4851 Py_ssize_t requiredsize)
4853 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4854 if (requiredsize > oldsize) {
4855 /* remember old output position */
4856 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4857 /* exponentially overallocate to minimize reallocations */
4858 if (requiredsize < 2 * oldsize)
4859 requiredsize = 2 * oldsize;
4860 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4861 return -1;
4862 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4864 return 0;
4866 /* lookup the character, put the result in the output string and adjust
4867 various state variables. Return a new reference to the object that
4868 was put in the output buffer in *result, or Py_None, if the mapping was
4869 undefined (in which case no character was written).
4870 The called must decref result.
4871 Return 0 on success, -1 on error. */
4872 static
4873 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4874 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4875 PyObject **res)
4877 if (charmaptranslate_lookup(*curinp, mapping, res))
4878 return -1;
4879 if (*res==NULL) {
4880 /* not found => default to 1:1 mapping */
4881 *(*outp)++ = *curinp;
4883 else if (*res==Py_None)
4885 else if (PyInt_Check(*res)) {
4886 /* no overflow check, because we know that the space is enough */
4887 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4889 else if (PyUnicode_Check(*res)) {
4890 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4891 if (repsize==1) {
4892 /* no overflow check, because we know that the space is enough */
4893 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4895 else if (repsize!=0) {
4896 /* more than one character */
4897 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4898 (insize - (curinp-startinp)) +
4899 repsize - 1;
4900 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4901 return -1;
4902 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4903 *outp += repsize;
4906 else
4907 return -1;
4908 return 0;
/* Translate the `size` Py_UNICODE units starting at `p` through `mapping`
   and return the result as a new Unicode object (NULL on error).
   Characters whose mapping entry is Py_None are treated as untranslatable
   and dispatched according to `errors` (NULL is handled like "strict"). */
4911 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4912 Py_ssize_t size,
4913 PyObject *mapping,
4914 const char *errors)
4916 /* output object */
4917 PyObject *res = NULL;
4918 /* pointers to the beginning and end+1 of input */
4919 const Py_UNICODE *startp = p;
4920 const Py_UNICODE *endp = p + size;
4921 /* pointer into the output */
4922 Py_UNICODE *str;
4923 /* current output position */
4924 Py_ssize_t respos = 0;
4925 char *reason = "character maps to <undefined>";
4926 PyObject *errorHandler = NULL;
4927 PyObject *exc = NULL;
4928 /* the following variable is used for caching string comparisons
4929 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4930 * 3=ignore, 4=xmlcharrefreplace */
4931 int known_errorHandler = -1;
4933 if (mapping == NULL) {
4934 PyErr_BadArgument();
4935 return NULL;
4938 /* allocate enough for a simple 1:1 translation without
4939 replacements, if we need more, we'll resize */
4940 res = PyUnicode_FromUnicode(NULL, size);
4941 if (res == NULL)
4942 goto onError;
/* empty input: the freshly created empty object is the result */
4943 if (size == 0)
4944 return res;
4945 str = PyUnicode_AS_UNICODE(res);
4947 while (p<endp) {
4948 /* try to encode it */
4949 PyObject *x = NULL;
4950 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4951 Py_XDECREF(x);
4952 goto onError;
/* x is only compared by identity against Py_None after the decref below;
   it is not dereferenced again */
4954 Py_XDECREF(x);
4955 if (x!=Py_None) /* it worked => adjust input pointer */
4956 ++p;
4957 else { /* untranslatable character */
4958 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4959 Py_ssize_t repsize;
4960 Py_ssize_t newpos;
4961 Py_UNICODE *uni2;
4962 /* startpos for collecting untranslatable chars */
4963 const Py_UNICODE *collstart = p;
4964 const Py_UNICODE *collend = p+1;
4965 const Py_UNICODE *coll;
4967 /* find all untranslatable characters */
4968 while (collend < endp) {
4969 if (charmaptranslate_lookup(*collend, mapping, &x))
4970 goto onError;
4971 Py_XDECREF(x);
4972 if (x!=Py_None)
4973 break;
4974 ++collend;
4976 /* cache callback name lookup
4977 * (if not done yet, i.e. it's the first error) */
4978 if (known_errorHandler==-1) {
4979 if ((errors==NULL) || (!strcmp(errors, "strict")))
4980 known_errorHandler = 1;
4981 else if (!strcmp(errors, "replace"))
4982 known_errorHandler = 2;
4983 else if (!strcmp(errors, "ignore"))
4984 known_errorHandler = 3;
4985 else if (!strcmp(errors, "xmlcharrefreplace"))
4986 known_errorHandler = 4;
4987 else
4988 known_errorHandler = 0;
/* handle the run [collstart, collend) according to the cached handler */
4990 switch (known_errorHandler) {
4991 case 1: /* strict */
4992 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4993 goto onError;
4994 case 2: /* replace */
4995 /* No need to check for space, this is a 1:1 replacement */
4996 for (coll = collstart; coll<collend; ++coll)
4997 *str++ = '?';
4998 /* fall through */
4999 case 3: /* ignore */
5000 p = collend;
5001 break;
5002 case 4: /* xmlcharrefreplace */
5003 /* generate replacement (temporarily (mis)uses p) */
5004 for (p = collstart; p < collend; ++p) {
5005 char buffer[2+29+1+1];
5006 char *cp;
5007 sprintf(buffer, "&#%d;", (int)*p);
/* make room for what is already written, this reference, and the input
   that follows the run */
5008 if (charmaptranslate_makespace(&res, &str,
5009 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5010 goto onError;
5011 for (cp = buffer; *cp; ++cp)
5012 *str++ = *cp;
5014 p = collend;
5015 break;
5016 default:
/* any other handler name: call the registered error callback */
5017 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5018 reason, startp, size, &exc,
5019 collstart-startp, collend-startp, &newpos);
5020 if (repunicode == NULL)
5021 goto onError;
5022 /* generate replacement */
5023 repsize = PyUnicode_GET_SIZE(repunicode);
5024 if (charmaptranslate_makespace(&res, &str,
5025 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5026 Py_DECREF(repunicode);
5027 goto onError;
5029 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5030 *str++ = *uni2;
/* resume at the position reported by the callback */
5031 p = startp + newpos;
5032 Py_DECREF(repunicode);
5036 /* Resize if we allocated too much */
5037 respos = str-PyUnicode_AS_UNICODE(res);
5038 if (respos<PyUnicode_GET_SIZE(res)) {
5039 if (PyUnicode_Resize(&res, respos) < 0)
5040 goto onError;
5042 Py_XDECREF(exc);
5043 Py_XDECREF(errorHandler);
5044 return res;
/* common failure exit: drop the partial result and cached handler state */
5046 onError:
5047 Py_XDECREF(res);
5048 Py_XDECREF(exc);
5049 Py_XDECREF(errorHandler);
5050 return NULL;
5053 PyObject *PyUnicode_Translate(PyObject *str,
5054 PyObject *mapping,
5055 const char *errors)
5057 PyObject *result;
5059 str = PyUnicode_FromObject(str);
5060 if (str == NULL)
5061 goto onError;
5062 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5063 PyUnicode_GET_SIZE(str),
5064 mapping,
5065 errors);
5066 Py_DECREF(str);
5067 return result;
5069 onError:
5070 Py_XDECREF(str);
5071 return NULL;
5074 /* --- Decimal Encoder ---------------------------------------------------- */
5076 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5077 Py_ssize_t length,
5078 char *output,
5079 const char *errors)
5081 Py_UNICODE *p, *end;
5082 PyObject *errorHandler = NULL;
5083 PyObject *exc = NULL;
5084 const char *encoding = "decimal";
5085 const char *reason = "invalid decimal Unicode string";
5086 /* the following variable is used for caching string comparisons
5087 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5088 int known_errorHandler = -1;
5090 if (output == NULL) {
5091 PyErr_BadArgument();
5092 return -1;
5095 p = s;
5096 end = s + length;
5097 while (p < end) {
5098 register Py_UNICODE ch = *p;
5099 int decimal;
5100 PyObject *repunicode;
5101 Py_ssize_t repsize;
5102 Py_ssize_t newpos;
5103 Py_UNICODE *uni2;
5104 Py_UNICODE *collstart;
5105 Py_UNICODE *collend;
5107 if (Py_UNICODE_ISSPACE(ch)) {
5108 *output++ = ' ';
5109 ++p;
5110 continue;
5112 decimal = Py_UNICODE_TODECIMAL(ch);
5113 if (decimal >= 0) {
5114 *output++ = '0' + decimal;
5115 ++p;
5116 continue;
5118 if (0 < ch && ch < 256) {
5119 *output++ = (char)ch;
5120 ++p;
5121 continue;
5123 /* All other characters are considered unencodable */
5124 collstart = p;
5125 collend = p+1;
5126 while (collend < end) {
5127 if ((0 < *collend && *collend < 256) ||
5128 !Py_UNICODE_ISSPACE(*collend) ||
5129 Py_UNICODE_TODECIMAL(*collend))
5130 break;
5132 /* cache callback name lookup
5133 * (if not done yet, i.e. it's the first error) */
5134 if (known_errorHandler==-1) {
5135 if ((errors==NULL) || (!strcmp(errors, "strict")))
5136 known_errorHandler = 1;
5137 else if (!strcmp(errors, "replace"))
5138 known_errorHandler = 2;
5139 else if (!strcmp(errors, "ignore"))
5140 known_errorHandler = 3;
5141 else if (!strcmp(errors, "xmlcharrefreplace"))
5142 known_errorHandler = 4;
5143 else
5144 known_errorHandler = 0;
5146 switch (known_errorHandler) {
5147 case 1: /* strict */
5148 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5149 goto onError;
5150 case 2: /* replace */
5151 for (p = collstart; p < collend; ++p)
5152 *output++ = '?';
5153 /* fall through */
5154 case 3: /* ignore */
5155 p = collend;
5156 break;
5157 case 4: /* xmlcharrefreplace */
5158 /* generate replacement (temporarily (mis)uses p) */
5159 for (p = collstart; p < collend; ++p)
5160 output += sprintf(output, "&#%d;", (int)*p);
5161 p = collend;
5162 break;
5163 default:
5164 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5165 encoding, reason, s, length, &exc,
5166 collstart-s, collend-s, &newpos);
5167 if (repunicode == NULL)
5168 goto onError;
5169 /* generate replacement */
5170 repsize = PyUnicode_GET_SIZE(repunicode);
5171 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5172 Py_UNICODE ch = *uni2;
5173 if (Py_UNICODE_ISSPACE(ch))
5174 *output++ = ' ';
5175 else {
5176 decimal = Py_UNICODE_TODECIMAL(ch);
5177 if (decimal >= 0)
5178 *output++ = '0' + decimal;
5179 else if (0 < ch && ch < 256)
5180 *output++ = (char)ch;
5181 else {
5182 Py_DECREF(repunicode);
5183 raise_encode_exception(&exc, encoding,
5184 s, length, collstart-s, collend-s, reason);
5185 goto onError;
5189 p = s + newpos;
5190 Py_DECREF(repunicode);
5193 /* 0-terminate the output string */
5194 *output++ = '\0';
5195 Py_XDECREF(exc);
5196 Py_XDECREF(errorHandler);
5197 return 0;
5199 onError:
5200 Py_XDECREF(exc);
5201 Py_XDECREF(errorHandler);
5202 return -1;
5205 /* --- Helpers ------------------------------------------------------------ */
5207 #include "stringlib/unicodedefs.h"
5209 #define FROM_UNICODE
5211 #include "stringlib/fastsearch.h"
5213 #include "stringlib/count.h"
5214 #include "stringlib/find.h"
5215 #include "stringlib/partition.h"
/* helper macro to fixup start/end slice values.
   Operates on variables named `start` and `end` in the enclosing scope and
   reads (obj)->length.  Wrapped in do/while(0) so that it expands to exactly
   one statement and stays correct under an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5230 Py_ssize_t PyUnicode_Count(PyObject *str,
5231 PyObject *substr,
5232 Py_ssize_t start,
5233 Py_ssize_t end)
5235 Py_ssize_t result;
5236 PyUnicodeObject* str_obj;
5237 PyUnicodeObject* sub_obj;
5239 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5240 if (!str_obj)
5241 return -1;
5242 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5243 if (!sub_obj) {
5244 Py_DECREF(str_obj);
5245 return -1;
5248 FIX_START_END(str_obj);
5250 result = stringlib_count(
5251 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5254 Py_DECREF(sub_obj);
5255 Py_DECREF(str_obj);
5257 return result;
5260 Py_ssize_t PyUnicode_Find(PyObject *str,
5261 PyObject *sub,
5262 Py_ssize_t start,
5263 Py_ssize_t end,
5264 int direction)
5266 Py_ssize_t result;
5268 str = PyUnicode_FromObject(str);
5269 if (!str)
5270 return -2;
5271 sub = PyUnicode_FromObject(sub);
5272 if (!sub) {
5273 Py_DECREF(str);
5274 return -2;
5277 if (direction > 0)
5278 result = stringlib_find_slice(
5279 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5280 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5281 start, end
5283 else
5284 result = stringlib_rfind_slice(
5285 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5286 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5287 start, end
5290 Py_DECREF(str);
5291 Py_DECREF(sub);
5293 return result;
5296 static
5297 int tailmatch(PyUnicodeObject *self,
5298 PyUnicodeObject *substring,
5299 Py_ssize_t start,
5300 Py_ssize_t end,
5301 int direction)
5303 if (substring->length == 0)
5304 return 1;
5306 FIX_START_END(self);
5308 end -= substring->length;
5309 if (end < start)
5310 return 0;
5312 if (direction > 0) {
5313 if (Py_UNICODE_MATCH(self, end, substring))
5314 return 1;
5315 } else {
5316 if (Py_UNICODE_MATCH(self, start, substring))
5317 return 1;
5320 return 0;
5323 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5324 PyObject *substr,
5325 Py_ssize_t start,
5326 Py_ssize_t end,
5327 int direction)
5329 Py_ssize_t result;
5331 str = PyUnicode_FromObject(str);
5332 if (str == NULL)
5333 return -1;
5334 substr = PyUnicode_FromObject(substr);
5335 if (substr == NULL) {
5336 Py_DECREF(str);
5337 return -1;
5340 result = tailmatch((PyUnicodeObject *)str,
5341 (PyUnicodeObject *)substr,
5342 start, end, direction);
5343 Py_DECREF(str);
5344 Py_DECREF(substr);
5345 return result;
5348 /* Apply fixfct filter to the Unicode object self and return a
5349 reference to the modified object */
5351 static
5352 PyObject *fixup(PyUnicodeObject *self,
5353 int (*fixfct)(PyUnicodeObject *s))
5356 PyUnicodeObject *u;
5358 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5359 if (u == NULL)
5360 return NULL;
5362 Py_UNICODE_COPY(u->str, self->str, self->length);
5364 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5365 /* fixfct should return TRUE if it modified the buffer. If
5366 FALSE, return a reference to the original buffer instead
5367 (to save space, not time) */
5368 Py_INCREF(self);
5369 Py_DECREF(u);
5370 return (PyObject*) self;
5372 return (PyObject*) u;
5375 static
5376 int fixupper(PyUnicodeObject *self)
5378 Py_ssize_t len = self->length;
5379 Py_UNICODE *s = self->str;
5380 int status = 0;
5382 while (len-- > 0) {
5383 register Py_UNICODE ch;
5385 ch = Py_UNICODE_TOUPPER(*s);
5386 if (ch != *s) {
5387 status = 1;
5388 *s = ch;
5390 s++;
5393 return status;
5396 static
5397 int fixlower(PyUnicodeObject *self)
5399 Py_ssize_t len = self->length;
5400 Py_UNICODE *s = self->str;
5401 int status = 0;
5403 while (len-- > 0) {
5404 register Py_UNICODE ch;
5406 ch = Py_UNICODE_TOLOWER(*s);
5407 if (ch != *s) {
5408 status = 1;
5409 *s = ch;
5411 s++;
5414 return status;
5417 static
5418 int fixswapcase(PyUnicodeObject *self)
5420 Py_ssize_t len = self->length;
5421 Py_UNICODE *s = self->str;
5422 int status = 0;
5424 while (len-- > 0) {
5425 if (Py_UNICODE_ISUPPER(*s)) {
5426 *s = Py_UNICODE_TOLOWER(*s);
5427 status = 1;
5428 } else if (Py_UNICODE_ISLOWER(*s)) {
5429 *s = Py_UNICODE_TOUPPER(*s);
5430 status = 1;
5432 s++;
5435 return status;
5438 static
5439 int fixcapitalize(PyUnicodeObject *self)
5441 Py_ssize_t len = self->length;
5442 Py_UNICODE *s = self->str;
5443 int status = 0;
5445 if (len == 0)
5446 return 0;
5447 if (Py_UNICODE_ISLOWER(*s)) {
5448 *s = Py_UNICODE_TOUPPER(*s);
5449 status = 1;
5451 s++;
5452 while (--len > 0) {
5453 if (Py_UNICODE_ISUPPER(*s)) {
5454 *s = Py_UNICODE_TOLOWER(*s);
5455 status = 1;
5457 s++;
5459 return status;
5462 static
5463 int fixtitle(PyUnicodeObject *self)
5465 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5466 register Py_UNICODE *e;
5467 int previous_is_cased;
5469 /* Shortcut for single character strings */
5470 if (PyUnicode_GET_SIZE(self) == 1) {
5471 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5472 if (*p != ch) {
5473 *p = ch;
5474 return 1;
5476 else
5477 return 0;
5480 e = p + PyUnicode_GET_SIZE(self);
5481 previous_is_cased = 0;
5482 for (; p < e; p++) {
5483 register const Py_UNICODE ch = *p;
5485 if (previous_is_cased)
5486 *p = Py_UNICODE_TOLOWER(ch);
5487 else
5488 *p = Py_UNICODE_TOTITLE(ch);
5490 if (Py_UNICODE_ISLOWER(ch) ||
5491 Py_UNICODE_ISUPPER(ch) ||
5492 Py_UNICODE_ISTITLE(ch))
5493 previous_is_cased = 1;
5494 else
5495 previous_is_cased = 0;
5497 return 1;
5500 PyObject *
5501 PyUnicode_Join(PyObject *separator, PyObject *seq)
5503 PyObject *internal_separator = NULL;
5504 const Py_UNICODE blank = ' ';
5505 const Py_UNICODE *sep = &blank;
5506 Py_ssize_t seplen = 1;
5507 PyUnicodeObject *res = NULL; /* the result */
5508 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5509 Py_ssize_t res_used; /* # used bytes */
5510 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5511 PyObject *fseq; /* PySequence_Fast(seq) */
5512 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5513 PyObject *item;
5514 Py_ssize_t i;
5516 fseq = PySequence_Fast(seq, "");
5517 if (fseq == NULL) {
5518 return NULL;
5521 /* Grrrr. A codec may be invoked to convert str objects to
5522 * Unicode, and so it's possible to call back into Python code
5523 * during PyUnicode_FromObject(), and so it's possible for a sick
5524 * codec to change the size of fseq (if seq is a list). Therefore
5525 * we have to keep refetching the size -- can't assume seqlen
5526 * is invariant.
5528 seqlen = PySequence_Fast_GET_SIZE(fseq);
5529 /* If empty sequence, return u"". */
5530 if (seqlen == 0) {
5531 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5532 goto Done;
5534 /* If singleton sequence with an exact Unicode, return that. */
5535 if (seqlen == 1) {
5536 item = PySequence_Fast_GET_ITEM(fseq, 0);
5537 if (PyUnicode_CheckExact(item)) {
5538 Py_INCREF(item);
5539 res = (PyUnicodeObject *)item;
5540 goto Done;
5544 /* At least two items to join, or one that isn't exact Unicode. */
5545 if (seqlen > 1) {
5546 /* Set up sep and seplen -- they're needed. */
5547 if (separator == NULL) {
5548 sep = &blank;
5549 seplen = 1;
5551 else {
5552 internal_separator = PyUnicode_FromObject(separator);
5553 if (internal_separator == NULL)
5554 goto onError;
5555 sep = PyUnicode_AS_UNICODE(internal_separator);
5556 seplen = PyUnicode_GET_SIZE(internal_separator);
5557 /* In case PyUnicode_FromObject() mutated seq. */
5558 seqlen = PySequence_Fast_GET_SIZE(fseq);
5562 /* Get space. */
5563 res = _PyUnicode_New(res_alloc);
5564 if (res == NULL)
5565 goto onError;
5566 res_p = PyUnicode_AS_UNICODE(res);
5567 res_used = 0;
5569 for (i = 0; i < seqlen; ++i) {
5570 Py_ssize_t itemlen;
5571 Py_ssize_t new_res_used;
5573 item = PySequence_Fast_GET_ITEM(fseq, i);
5574 /* Convert item to Unicode. */
5575 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5576 PyErr_Format(PyExc_TypeError,
5577 "sequence item %zd: expected string or Unicode,"
5578 " %.80s found",
5579 i, Py_TYPE(item)->tp_name);
5580 goto onError;
5582 item = PyUnicode_FromObject(item);
5583 if (item == NULL)
5584 goto onError;
5585 /* We own a reference to item from here on. */
5587 /* In case PyUnicode_FromObject() mutated seq. */
5588 seqlen = PySequence_Fast_GET_SIZE(fseq);
5590 /* Make sure we have enough space for the separator and the item. */
5591 itemlen = PyUnicode_GET_SIZE(item);
5592 new_res_used = res_used + itemlen;
5593 if (new_res_used < 0)
5594 goto Overflow;
5595 if (i < seqlen - 1) {
5596 new_res_used += seplen;
5597 if (new_res_used < 0)
5598 goto Overflow;
5600 if (new_res_used > res_alloc) {
5601 /* double allocated size until it's big enough */
5602 do {
5603 res_alloc += res_alloc;
5604 if (res_alloc <= 0)
5605 goto Overflow;
5606 } while (new_res_used > res_alloc);
5607 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5608 Py_DECREF(item);
5609 goto onError;
5611 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5614 /* Copy item, and maybe the separator. */
5615 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5616 res_p += itemlen;
5617 if (i < seqlen - 1) {
5618 Py_UNICODE_COPY(res_p, sep, seplen);
5619 res_p += seplen;
5621 Py_DECREF(item);
5622 res_used = new_res_used;
5625 /* Shrink res to match the used area; this probably can't fail,
5626 * but it's cheap to check.
5628 if (_PyUnicode_Resize(&res, res_used) < 0)
5629 goto onError;
5631 Done:
5632 Py_XDECREF(internal_separator);
5633 Py_DECREF(fseq);
5634 return (PyObject *)res;
5636 Overflow:
5637 PyErr_SetString(PyExc_OverflowError,
5638 "join() result is too long for a Python string");
5639 Py_DECREF(item);
5640 /* fall through */
5642 onError:
5643 Py_XDECREF(internal_separator);
5644 Py_DECREF(fseq);
5645 Py_XDECREF(res);
5646 return NULL;
5649 static
5650 PyUnicodeObject *pad(PyUnicodeObject *self,
5651 Py_ssize_t left,
5652 Py_ssize_t right,
5653 Py_UNICODE fill)
5655 PyUnicodeObject *u;
5657 if (left < 0)
5658 left = 0;
5659 if (right < 0)
5660 right = 0;
5662 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5663 Py_INCREF(self);
5664 return self;
5667 if (left > PY_SSIZE_T_MAX - self->length ||
5668 right > PY_SSIZE_T_MAX - (left + self->length)) {
5669 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5670 return NULL;
5672 u = _PyUnicode_New(left + self->length + right);
5673 if (u) {
5674 if (left)
5675 Py_UNICODE_FILL(u->str, fill, left);
5676 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5677 if (right)
5678 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5681 return u;
/* Append the slice data[left:right] to `list` as a new unicode object.
   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label, and jumps to that
   label on failure.  Wrapped in do/while(0) so that it expands to a single
   statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
5695 static
5696 PyObject *split_whitespace(PyUnicodeObject *self,
5697 PyObject *list,
5698 Py_ssize_t maxcount)
5700 register Py_ssize_t i;
5701 register Py_ssize_t j;
5702 Py_ssize_t len = self->length;
5703 PyObject *str;
5704 register const Py_UNICODE *buf = self->str;
5706 for (i = j = 0; i < len; ) {
5707 /* find a token */
5708 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5709 i++;
5710 j = i;
5711 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5712 i++;
5713 if (j < i) {
5714 if (maxcount-- <= 0)
5715 break;
5716 SPLIT_APPEND(buf, j, i);
5717 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5718 i++;
5719 j = i;
5722 if (j < len) {
5723 SPLIT_APPEND(buf, j, len);
5725 return list;
5727 onError:
5728 Py_DECREF(list);
5729 return NULL;
5732 PyObject *PyUnicode_Splitlines(PyObject *string,
5733 int keepends)
5735 register Py_ssize_t i;
5736 register Py_ssize_t j;
5737 Py_ssize_t len;
5738 PyObject *list;
5739 PyObject *str;
5740 Py_UNICODE *data;
5742 string = PyUnicode_FromObject(string);
5743 if (string == NULL)
5744 return NULL;
5745 data = PyUnicode_AS_UNICODE(string);
5746 len = PyUnicode_GET_SIZE(string);
5748 list = PyList_New(0);
5749 if (!list)
5750 goto onError;
5752 for (i = j = 0; i < len; ) {
5753 Py_ssize_t eol;
5755 /* Find a line and append it */
5756 while (i < len && !BLOOM_LINEBREAK(data[i]))
5757 i++;
5759 /* Skip the line break reading CRLF as one line break */
5760 eol = i;
5761 if (i < len) {
5762 if (data[i] == '\r' && i + 1 < len &&
5763 data[i+1] == '\n')
5764 i += 2;
5765 else
5766 i++;
5767 if (keepends)
5768 eol = i;
5770 SPLIT_APPEND(data, j, eol);
5771 j = i;
5773 if (j < len) {
5774 SPLIT_APPEND(data, j, len);
5777 Py_DECREF(string);
5778 return list;
5780 onError:
5781 Py_XDECREF(list);
5782 Py_DECREF(string);
5783 return NULL;
5786 static
5787 PyObject *split_char(PyUnicodeObject *self,
5788 PyObject *list,
5789 Py_UNICODE ch,
5790 Py_ssize_t maxcount)
5792 register Py_ssize_t i;
5793 register Py_ssize_t j;
5794 Py_ssize_t len = self->length;
5795 PyObject *str;
5796 register const Py_UNICODE *buf = self->str;
5798 for (i = j = 0; i < len; ) {
5799 if (buf[i] == ch) {
5800 if (maxcount-- <= 0)
5801 break;
5802 SPLIT_APPEND(buf, j, i);
5803 i = j = i + 1;
5804 } else
5805 i++;
5807 if (j <= len) {
5808 SPLIT_APPEND(buf, j, len);
5810 return list;
5812 onError:
5813 Py_DECREF(list);
5814 return NULL;
5817 static
5818 PyObject *split_substring(PyUnicodeObject *self,
5819 PyObject *list,
5820 PyUnicodeObject *substring,
5821 Py_ssize_t maxcount)
5823 register Py_ssize_t i;
5824 register Py_ssize_t j;
5825 Py_ssize_t len = self->length;
5826 Py_ssize_t sublen = substring->length;
5827 PyObject *str;
5829 for (i = j = 0; i <= len - sublen; ) {
5830 if (Py_UNICODE_MATCH(self, i, substring)) {
5831 if (maxcount-- <= 0)
5832 break;
5833 SPLIT_APPEND(self->str, j, i);
5834 i = j = i + sublen;
5835 } else
5836 i++;
5838 if (j <= len) {
5839 SPLIT_APPEND(self->str, j, len);
5841 return list;
5843 onError:
5844 Py_DECREF(list);
5845 return NULL;
5848 static
5849 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5850 PyObject *list,
5851 Py_ssize_t maxcount)
5853 register Py_ssize_t i;
5854 register Py_ssize_t j;
5855 Py_ssize_t len = self->length;
5856 PyObject *str;
5857 register const Py_UNICODE *buf = self->str;
5859 for (i = j = len - 1; i >= 0; ) {
5860 /* find a token */
5861 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5862 i--;
5863 j = i;
5864 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5865 i--;
5866 if (j > i) {
5867 if (maxcount-- <= 0)
5868 break;
5869 SPLIT_APPEND(buf, i + 1, j + 1);
5870 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5871 i--;
5872 j = i;
5875 if (j >= 0) {
5876 SPLIT_APPEND(buf, 0, j + 1);
5878 if (PyList_Reverse(list) < 0)
5879 goto onError;
5880 return list;
5882 onError:
5883 Py_DECREF(list);
5884 return NULL;
5887 static
5888 PyObject *rsplit_char(PyUnicodeObject *self,
5889 PyObject *list,
5890 Py_UNICODE ch,
5891 Py_ssize_t maxcount)
5893 register Py_ssize_t i;
5894 register Py_ssize_t j;
5895 Py_ssize_t len = self->length;
5896 PyObject *str;
5897 register const Py_UNICODE *buf = self->str;
5899 for (i = j = len - 1; i >= 0; ) {
5900 if (buf[i] == ch) {
5901 if (maxcount-- <= 0)
5902 break;
5903 SPLIT_APPEND(buf, i + 1, j + 1);
5904 j = i = i - 1;
5905 } else
5906 i--;
5908 if (j >= -1) {
5909 SPLIT_APPEND(buf, 0, j + 1);
5911 if (PyList_Reverse(list) < 0)
5912 goto onError;
5913 return list;
5915 onError:
5916 Py_DECREF(list);
5917 return NULL;
5920 static
5921 PyObject *rsplit_substring(PyUnicodeObject *self,
5922 PyObject *list,
5923 PyUnicodeObject *substring,
5924 Py_ssize_t maxcount)
5926 register Py_ssize_t i;
5927 register Py_ssize_t j;
5928 Py_ssize_t len = self->length;
5929 Py_ssize_t sublen = substring->length;
5930 PyObject *str;
5932 for (i = len - sublen, j = len; i >= 0; ) {
5933 if (Py_UNICODE_MATCH(self, i, substring)) {
5934 if (maxcount-- <= 0)
5935 break;
5936 SPLIT_APPEND(self->str, i + sublen, j);
5937 j = i;
5938 i -= sublen;
5939 } else
5940 i--;
5942 if (j >= 0) {
5943 SPLIT_APPEND(self->str, 0, j);
5945 if (PyList_Reverse(list) < 0)
5946 goto onError;
5947 return list;
5949 onError:
5950 Py_DECREF(list);
5951 return NULL;
5954 #undef SPLIT_APPEND
5956 static
5957 PyObject *split(PyUnicodeObject *self,
5958 PyUnicodeObject *substring,
5959 Py_ssize_t maxcount)
5961 PyObject *list;
5963 if (maxcount < 0)
5964 maxcount = PY_SSIZE_T_MAX;
5966 list = PyList_New(0);
5967 if (!list)
5968 return NULL;
5970 if (substring == NULL)
5971 return split_whitespace(self,list,maxcount);
5973 else if (substring->length == 1)
5974 return split_char(self,list,substring->str[0],maxcount);
5976 else if (substring->length == 0) {
5977 Py_DECREF(list);
5978 PyErr_SetString(PyExc_ValueError, "empty separator");
5979 return NULL;
5981 else
5982 return split_substring(self,list,substring,maxcount);
5985 static
5986 PyObject *rsplit(PyUnicodeObject *self,
5987 PyUnicodeObject *substring,
5988 Py_ssize_t maxcount)
5990 PyObject *list;
5992 if (maxcount < 0)
5993 maxcount = PY_SSIZE_T_MAX;
5995 list = PyList_New(0);
5996 if (!list)
5997 return NULL;
5999 if (substring == NULL)
6000 return rsplit_whitespace(self,list,maxcount);
6002 else if (substring->length == 1)
6003 return rsplit_char(self,list,substring->str[0],maxcount);
6005 else if (substring->length == 0) {
6006 Py_DECREF(list);
6007 PyErr_SetString(PyExc_ValueError, "empty separator");
6008 return NULL;
6010 else
6011 return rsplit_substring(self,list,substring,maxcount);
/* Return a new reference to `self` with up to `maxcount` occurrences of
   `str1` replaced by `str2` (a negative `maxcount` means no limit).
   When there is nothing to replace, `self` itself is returned if it is an
   exact unicode object, otherwise a copy of it.  Returns NULL on error. */
6014 static
6015 PyObject *replace(PyUnicodeObject *self,
6016 PyUnicodeObject *str1,
6017 PyUnicodeObject *str2,
6018 Py_ssize_t maxcount)
6020 PyUnicodeObject *u;
6022 if (maxcount < 0)
6023 maxcount = PY_SSIZE_T_MAX;
/* equal lengths: the result has the same size as self, so a copy of self
   can be patched in place */
6025 if (str1->length == str2->length) {
6026 /* same length */
6027 Py_ssize_t i;
6028 if (str1->length == 1) {
6029 /* replace characters */
6030 Py_UNICODE u1, u2;
/* bail out before copying when the character does not occur at all */
6031 if (!findchar(self->str, self->length, str1->str[0]))
6032 goto nothing;
6033 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6034 if (!u)
6035 return NULL;
6036 Py_UNICODE_COPY(u->str, self->str, self->length);
6037 u1 = str1->str[0];
6038 u2 = str2->str[0];
6039 for (i = 0; i < u->length; i++)
6040 if (u->str[i] == u1) {
6041 if (--maxcount < 0)
6042 break;
6043 u->str[i] = u2;
6045 } else {
/* i receives the fastsearch() result; a negative value means no match */
6046 i = fastsearch(
6047 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6049 if (i < 0)
6050 goto nothing;
6051 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6052 if (!u)
6053 return NULL;
6054 Py_UNICODE_COPY(u->str, self->str, self->length);
/* matching is done against self while the patches go into the copy u */
6055 while (i <= self->length - str1->length)
6056 if (Py_UNICODE_MATCH(self, i, str1)) {
6057 if (--maxcount < 0)
6058 break;
6059 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6060 i += str1->length;
6061 } else
6062 i++;
6064 } else {
/* different lengths: count the matches first so that the result can be
   allocated with its final size */
6066 Py_ssize_t n, i, j, e;
6067 Py_ssize_t product, new_size, delta;
6068 Py_UNICODE *p;
6070 /* replace strings */
6071 n = stringlib_count(self->str, self->length, str1->str, str1->length);
6072 if (n > maxcount)
6073 n = maxcount;
6074 if (n == 0)
6075 goto nothing;
6076 /* new_size = self->length + n * (str2->length - str1->length)); */
6077 delta = (str2->length - str1->length);
6078 if (delta == 0) {
6079 new_size = self->length;
6080 } else {
/* overflow is detected by dividing the product back and by the sign of
   the resulting size */
6081 product = n * (str2->length - str1->length);
6082 if ((product / (str2->length - str1->length)) != n) {
6083 PyErr_SetString(PyExc_OverflowError,
6084 "replace string is too long");
6085 return NULL;
6087 new_size = self->length + product;
6088 if (new_size < 0) {
6089 PyErr_SetString(PyExc_OverflowError,
6090 "replace string is too long");
6091 return NULL;
6094 u = _PyUnicode_New(new_size);
6095 if (!u)
6096 return NULL;
6097 i = 0;
6098 p = u->str;
/* e is the last offset in self at which str1 could still match */
6099 e = self->length - str1->length;
6100 if (str1->length > 0) {
6101 while (n-- > 0) {
6102 /* look for next match */
6103 j = i;
6104 while (j <= e) {
6105 if (Py_UNICODE_MATCH(self, j, str1))
6106 break;
6107 j++;
6109 if (j > i) {
6110 if (j > e)
6111 break;
6112 /* copy unchanged part [i:j] */
6113 Py_UNICODE_COPY(p, self->str+i, j-i);
6114 p += j - i;
6116 /* copy substitution string */
6117 if (str2->length > 0) {
6118 Py_UNICODE_COPY(p, str2->str, str2->length);
6119 p += str2->length;
6121 i = j + str1->length;
6123 if (i < self->length)
6124 /* copy tail [i:] */
6125 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6126 } else {
/* empty str1: str2 is emitted, then alternately one character of self and
   str2 again, n times in total; the rest of self follows */
6127 /* interleave */
6128 while (n > 0) {
6129 Py_UNICODE_COPY(p, str2->str, str2->length);
6130 p += str2->length;
6131 if (--n <= 0)
6132 break;
6133 *p++ = self->str[i++];
6135 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6138 return (PyObject *) u;
6140 nothing:
6141 /* nothing to replace; return original string (when possible) */
6142 if (PyUnicode_CheckExact(self)) {
6143 Py_INCREF(self);
6144 return (PyObject *) self;
6146 return PyUnicode_FromUnicode(self->str, self->length);
6149 /* --- Unicode Object Methods --------------------------------------------- */
6151 PyDoc_STRVAR(title__doc__,
6152 "S.title() -> unicode\n\
6154 Return a titlecased version of S, i.e. words start with title case\n\
6155 characters, all remaining cased characters have lower case.");
6157 static PyObject*
6158 unicode_title(PyUnicodeObject *self)
6160 return fixup(self, fixtitle);
6163 PyDoc_STRVAR(capitalize__doc__,
6164 "S.capitalize() -> unicode\n\
6166 Return a capitalized version of S, i.e. make the first character\n\
6167 have upper case.");
6169 static PyObject*
6170 unicode_capitalize(PyUnicodeObject *self)
6172 return fixup(self, fixcapitalize);
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method S.capwords(): split on whitespace, capitalize every
   word, and join the words again with single blanks. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

 onError:
    /* reached on success as well; item is NULL when fixup() failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6213 /* Argument converter. Coerces to a single unicode character */
6215 static int
6216 convert_uc(PyObject *obj, void *addr)
6218 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6219 PyObject *uniobj;
6220 Py_UNICODE *unistr;
6222 uniobj = PyUnicode_FromObject(obj);
6223 if (uniobj == NULL) {
6224 PyErr_SetString(PyExc_TypeError,
6225 "The fill character cannot be converted to Unicode");
6226 return 0;
6228 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6229 PyErr_SetString(PyExc_TypeError,
6230 "The fill character must be exactly one character long");
6231 Py_DECREF(uniobj);
6232 return 0;
6234 unistr = PyUnicode_AS_UNICODE(uniobj);
6235 *fillcharloc = unistr[0];
6236 Py_DECREF(uniobj);
6237 return 1;
6240 PyDoc_STRVAR(center__doc__,
6241 "S.center(width[, fillchar]) -> unicode\n\
6243 Return S centered in a Unicode string of length width. Padding is\n\
6244 done using the specified fill character (default is a space)");
6246 static PyObject *
6247 unicode_center(PyUnicodeObject *self, PyObject *args)
6249 Py_ssize_t marg, left;
6250 Py_ssize_t width;
6251 Py_UNICODE fillchar = ' ';
6253 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6254 return NULL;
6256 if (self->length >= width && PyUnicode_CheckExact(self)) {
6257 Py_INCREF(self);
6258 return (PyObject*) self;
6261 marg = width - self->length;
6262 left = marg / 2 + (marg & width & 1);
6264 return (PyObject*) pad(self, left, marg - left, fillchar);
6267 #if 0
6269 /* This code should go into some future Unicode collation support
6270 module. The basic comparison should compare ordinals on a naive
6271 basis (this is what Java does and thus JPython too). */
6273 /* speedy UTF-16 code point order comparison */
6274 /* gleaned from: */
6275 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6277 static short utf16Fixup[32] =
6279 0, 0, 0, 0, 0, 0, 0, 0,
6280 0, 0, 0, 0, 0, 0, 0, 0,
6281 0, 0, 0, 0, 0, 0, 0, 0,
6282 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6285 static int
6286 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6288 Py_ssize_t len1, len2;
6290 Py_UNICODE *s1 = str1->str;
6291 Py_UNICODE *s2 = str2->str;
6293 len1 = str1->length;
6294 len2 = str2->length;
6296 while (len1 > 0 && len2 > 0) {
6297 Py_UNICODE c1, c2;
6299 c1 = *s1++;
6300 c2 = *s2++;
6302 if (c1 > (1<<11) * 26)
6303 c1 += utf16Fixup[c1>>11];
6304 if (c2 > (1<<11) * 26)
6305 c2 += utf16Fixup[c2>>11];
6306 /* now c1 and c2 are in UTF-32-compatible order */
6308 if (c1 != c2)
6309 return (c1 < c2) ? -1 : 1;
6311 len1--; len2--;
6314 return (len1 < len2) ? -1 : (len1 != len2);
6317 #else
6319 static int
6320 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6322 register Py_ssize_t len1, len2;
6324 Py_UNICODE *s1 = str1->str;
6325 Py_UNICODE *s2 = str2->str;
6327 len1 = str1->length;
6328 len2 = str2->length;
6330 while (len1 > 0 && len2 > 0) {
6331 Py_UNICODE c1, c2;
6333 c1 = *s1++;
6334 c2 = *s2++;
6336 if (c1 != c2)
6337 return (c1 < c2) ? -1 : 1;
6339 len1--; len2--;
6342 return (len1 < len2) ? -1 : (len1 != len2);
6345 #endif
6347 int PyUnicode_Compare(PyObject *left,
6348 PyObject *right)
6350 PyUnicodeObject *u = NULL, *v = NULL;
6351 int result;
6353 /* Coerce the two arguments */
6354 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6355 if (u == NULL)
6356 goto onError;
6357 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6358 if (v == NULL)
6359 goto onError;
6361 /* Shortcut for empty or interned objects */
6362 if (v == u) {
6363 Py_DECREF(u);
6364 Py_DECREF(v);
6365 return 0;
6368 result = unicode_compare(u, v);
6370 Py_DECREF(u);
6371 Py_DECREF(v);
6372 return result;
6374 onError:
6375 Py_XDECREF(u);
6376 Py_XDECREF(v);
6377 return -1;
6380 PyObject *PyUnicode_RichCompare(PyObject *left,
6381 PyObject *right,
6382 int op)
6384 int result;
6386 result = PyUnicode_Compare(left, right);
6387 if (result == -1 && PyErr_Occurred())
6388 goto onError;
6390 /* Convert the return value to a Boolean */
6391 switch (op) {
6392 case Py_EQ:
6393 result = (result == 0);
6394 break;
6395 case Py_NE:
6396 result = (result != 0);
6397 break;
6398 case Py_LE:
6399 result = (result <= 0);
6400 break;
6401 case Py_GE:
6402 result = (result >= 0);
6403 break;
6404 case Py_LT:
6405 result = (result == -1);
6406 break;
6407 case Py_GT:
6408 result = (result == 1);
6409 break;
6411 return PyBool_FromLong(result);
6413 onError:
6415 /* Standard case
6417 Type errors mean that PyUnicode_FromObject() could not convert
6418 one of the arguments (usually the right hand side) to Unicode,
6419 ie. we can't handle the comparison request. However, it is
6420 possible that the other object knows a comparison method, which
6421 is why we return Py_NotImplemented to give the other object a
6422 chance.
6425 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6426 PyErr_Clear();
6427 Py_INCREF(Py_NotImplemented);
6428 return Py_NotImplemented;
6430 if (op != Py_EQ && op != Py_NE)
6431 return NULL;
6433 /* Equality comparison.
6435 This is a special case: we silence any PyExc_UnicodeDecodeError
6436 and instead turn it into a PyErr_UnicodeWarning.
6439 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6440 return NULL;
6441 PyErr_Clear();
6442 if (PyErr_Warn(PyExc_UnicodeWarning,
6443 (op == Py_EQ) ?
6444 "Unicode equal comparison "
6445 "failed to convert both arguments to Unicode - "
6446 "interpreting them as being unequal" :
6447 "Unicode unequal comparison "
6448 "failed to convert both arguments to Unicode - "
6449 "interpreting them as being unequal"
6450 ) < 0)
6451 return NULL;
6452 result = (op == Py_NE);
6453 return PyBool_FromLong(result);
6456 int PyUnicode_Contains(PyObject *container,
6457 PyObject *element)
6459 PyObject *str, *sub;
6460 int result;
6462 /* Coerce the two arguments */
6463 sub = PyUnicode_FromObject(element);
6464 if (!sub) {
6465 PyErr_SetString(PyExc_TypeError,
6466 "'in <string>' requires string as left operand");
6467 return -1;
6470 str = PyUnicode_FromObject(container);
6471 if (!str) {
6472 Py_DECREF(sub);
6473 return -1;
6476 result = stringlib_contains_obj(str, sub);
6478 Py_DECREF(str);
6479 Py_DECREF(sub);
6481 return result;
6484 /* Concat to string or Unicode object giving a new Unicode object. */
6486 PyObject *PyUnicode_Concat(PyObject *left,
6487 PyObject *right)
6489 PyUnicodeObject *u = NULL, *v = NULL, *w;
6491 /* Coerce the two arguments */
6492 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6493 if (u == NULL)
6494 goto onError;
6495 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6496 if (v == NULL)
6497 goto onError;
6499 /* Shortcuts */
6500 if (v == unicode_empty) {
6501 Py_DECREF(v);
6502 return (PyObject *)u;
6504 if (u == unicode_empty) {
6505 Py_DECREF(u);
6506 return (PyObject *)v;
6509 /* Concat the two Unicode strings */
6510 w = _PyUnicode_New(u->length + v->length);
6511 if (w == NULL)
6512 goto onError;
6513 Py_UNICODE_COPY(w->str, u->str, u->length);
6514 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6516 Py_DECREF(u);
6517 Py_DECREF(v);
6518 return (PyObject *)w;
6520 onError:
6521 Py_XDECREF(u);
6522 Py_XDECREF(v);
6523 return NULL;
6526 PyDoc_STRVAR(count__doc__,
6527 "S.count(sub[, start[, end]]) -> int\n\
6529 Return the number of non-overlapping occurrences of substring sub in\n\
6530 Unicode string S[start:end]. Optional arguments start and end are\n\
6531 interpreted as in slice notation.");
6533 static PyObject *
6534 unicode_count(PyUnicodeObject *self, PyObject *args)
6536 PyUnicodeObject *substring;
6537 Py_ssize_t start = 0;
6538 Py_ssize_t end = PY_SSIZE_T_MAX;
6539 PyObject *result;
6541 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6542 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6543 return NULL;
6545 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6546 (PyObject *)substring);
6547 if (substring == NULL)
6548 return NULL;
6550 FIX_START_END(self);
6552 result = PyInt_FromSsize_t(
6553 stringlib_count(self->str + start, end - start,
6554 substring->str, substring->length)
6557 Py_DECREF(substring);
6559 return result;
6562 PyDoc_STRVAR(encode__doc__,
6563 "S.encode([encoding[,errors]]) -> string or unicode\n\
6565 Encodes S using the codec registered for encoding. encoding defaults\n\
6566 to the default encoding. errors may be given to set a different error\n\
6567 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6568 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6569 'xmlcharrefreplace' as well as any other name registered with\n\
6570 codecs.register_error that can handle UnicodeEncodeErrors.");
6572 static PyObject *
6573 unicode_encode(PyUnicodeObject *self, PyObject *args)
6575 char *encoding = NULL;
6576 char *errors = NULL;
6577 PyObject *v;
6579 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6580 return NULL;
6581 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6582 if (v == NULL)
6583 goto onError;
6584 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6585 PyErr_Format(PyExc_TypeError,
6586 "encoder did not return a string/unicode object "
6587 "(type=%.400s)",
6588 Py_TYPE(v)->tp_name);
6589 Py_DECREF(v);
6590 return NULL;
6592 return v;
6594 onError:
6595 return NULL;
6598 PyDoc_STRVAR(decode__doc__,
6599 "S.decode([encoding[,errors]]) -> string or unicode\n\
6601 Decodes S using the codec registered for encoding. encoding defaults\n\
6602 to the default encoding. errors may be given to set a different error\n\
6603 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6604 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6605 as well as any other name registerd with codecs.register_error that is\n\
6606 able to handle UnicodeDecodeErrors.");
6608 static PyObject *
6609 unicode_decode(PyUnicodeObject *self, PyObject *args)
6611 char *encoding = NULL;
6612 char *errors = NULL;
6613 PyObject *v;
6615 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6616 return NULL;
6617 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6618 if (v == NULL)
6619 goto onError;
6620 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6621 PyErr_Format(PyExc_TypeError,
6622 "decoder did not return a string/unicode object "
6623 "(type=%.400s)",
6624 Py_TYPE(v)->tp_name);
6625 Py_DECREF(v);
6626 return NULL;
6628 return v;
6630 onError:
6631 return NULL;
6634 PyDoc_STRVAR(expandtabs__doc__,
6635 "S.expandtabs([tabsize]) -> unicode\n\
6637 Return a copy of S where all tab characters are expanded using spaces.\n\
6638 If tabsize is not given, a tab size of 8 characters is assumed.");
6640 static PyObject*
6641 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6643 Py_UNICODE *e;
6644 Py_UNICODE *p;
6645 Py_UNICODE *q;
6646 Py_UNICODE *qe;
6647 Py_ssize_t i, j, incr;
6648 PyUnicodeObject *u;
6649 int tabsize = 8;
6651 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6652 return NULL;
6654 /* First pass: determine size of output string */
6655 i = 0; /* chars up to and including most recent \n or \r */
6656 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6657 e = self->str + self->length; /* end of input */
6658 for (p = self->str; p < e; p++)
6659 if (*p == '\t') {
6660 if (tabsize > 0) {
6661 incr = tabsize - (j % tabsize); /* cannot overflow */
6662 if (j > PY_SSIZE_T_MAX - incr)
6663 goto overflow1;
6664 j += incr;
6667 else {
6668 if (j > PY_SSIZE_T_MAX - 1)
6669 goto overflow1;
6670 j++;
6671 if (*p == '\n' || *p == '\r') {
6672 if (i > PY_SSIZE_T_MAX - j)
6673 goto overflow1;
6674 i += j;
6675 j = 0;
6679 if (i > PY_SSIZE_T_MAX - j)
6680 goto overflow1;
6682 /* Second pass: create output string and fill it */
6683 u = _PyUnicode_New(i + j);
6684 if (!u)
6685 return NULL;
6687 j = 0; /* same as in first pass */
6688 q = u->str; /* next output char */
6689 qe = u->str + u->length; /* end of output */
6691 for (p = self->str; p < e; p++)
6692 if (*p == '\t') {
6693 if (tabsize > 0) {
6694 i = tabsize - (j % tabsize);
6695 j += i;
6696 while (i--) {
6697 if (q >= qe)
6698 goto overflow2;
6699 *q++ = ' ';
6703 else {
6704 if (q >= qe)
6705 goto overflow2;
6706 *q++ = *p;
6707 j++;
6708 if (*p == '\n' || *p == '\r')
6709 j = 0;
6712 return (PyObject*) u;
6714 overflow2:
6715 Py_DECREF(u);
6716 overflow1:
6717 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6718 return NULL;
6721 PyDoc_STRVAR(find__doc__,
6722 "S.find(sub [,start [,end]]) -> int\n\
6724 Return the lowest index in S where substring sub is found,\n\
6725 such that sub is contained within s[start:end]. Optional\n\
6726 arguments start and end are interpreted as in slice notation.\n\
6728 Return -1 on failure.");
6730 static PyObject *
6731 unicode_find(PyUnicodeObject *self, PyObject *args)
6733 PyObject *substring;
6734 Py_ssize_t start;
6735 Py_ssize_t end;
6736 Py_ssize_t result;
6738 if (!_ParseTupleFinds(args, &substring, &start, &end))
6739 return NULL;
6741 result = stringlib_find_slice(
6742 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6743 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6744 start, end
6747 Py_DECREF(substring);
6749 return PyInt_FromSsize_t(result);
6752 static PyObject *
6753 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6755 if (index < 0 || index >= self->length) {
6756 PyErr_SetString(PyExc_IndexError, "string index out of range");
6757 return NULL;
6760 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6763 static long
6764 unicode_hash(PyUnicodeObject *self)
6766 /* Since Unicode objects compare equal to their ASCII string
6767 counterparts, they should use the individual character values
6768 as basis for their hash value. This is needed to assure that
6769 strings and Unicode objects behave in the same way as
6770 dictionary keys. */
6772 register Py_ssize_t len;
6773 register Py_UNICODE *p;
6774 register long x;
6776 if (self->hash != -1)
6777 return self->hash;
6778 len = PyUnicode_GET_SIZE(self);
6779 p = PyUnicode_AS_UNICODE(self);
6780 x = *p << 7;
6781 while (--len >= 0)
6782 x = (1000003*x) ^ *p++;
6783 x ^= PyUnicode_GET_SIZE(self);
6784 if (x == -1)
6785 x = -2;
6786 self->hash = x;
6787 return x;
6790 PyDoc_STRVAR(index__doc__,
6791 "S.index(sub [,start [,end]]) -> int\n\
6793 Like S.find() but raise ValueError when the substring is not found.");
6795 static PyObject *
6796 unicode_index(PyUnicodeObject *self, PyObject *args)
6798 Py_ssize_t result;
6799 PyObject *substring;
6800 Py_ssize_t start;
6801 Py_ssize_t end;
6803 if (!_ParseTupleFinds(args, &substring, &start, &end))
6804 return NULL;
6806 result = stringlib_find_slice(
6807 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6808 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6809 start, end
6812 Py_DECREF(substring);
6814 if (result < 0) {
6815 PyErr_SetString(PyExc_ValueError, "substring not found");
6816 return NULL;
6819 return PyInt_FromSsize_t(result);
6822 PyDoc_STRVAR(islower__doc__,
6823 "S.islower() -> bool\n\
6825 Return True if all cased characters in S are lowercase and there is\n\
6826 at least one cased character in S, False otherwise.");
6828 static PyObject*
6829 unicode_islower(PyUnicodeObject *self)
6831 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6832 register const Py_UNICODE *e;
6833 int cased;
6835 /* Shortcut for single character strings */
6836 if (PyUnicode_GET_SIZE(self) == 1)
6837 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6839 /* Special case for empty strings */
6840 if (PyUnicode_GET_SIZE(self) == 0)
6841 return PyBool_FromLong(0);
6843 e = p + PyUnicode_GET_SIZE(self);
6844 cased = 0;
6845 for (; p < e; p++) {
6846 register const Py_UNICODE ch = *p;
6848 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6849 return PyBool_FromLong(0);
6850 else if (!cased && Py_UNICODE_ISLOWER(ch))
6851 cased = 1;
6853 return PyBool_FromLong(cased);
6856 PyDoc_STRVAR(isupper__doc__,
6857 "S.isupper() -> bool\n\
6859 Return True if all cased characters in S are uppercase and there is\n\
6860 at least one cased character in S, False otherwise.");
6862 static PyObject*
6863 unicode_isupper(PyUnicodeObject *self)
6865 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6866 register const Py_UNICODE *e;
6867 int cased;
6869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1)
6871 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6873 /* Special case for empty strings */
6874 if (PyUnicode_GET_SIZE(self) == 0)
6875 return PyBool_FromLong(0);
6877 e = p + PyUnicode_GET_SIZE(self);
6878 cased = 0;
6879 for (; p < e; p++) {
6880 register const Py_UNICODE ch = *p;
6882 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6883 return PyBool_FromLong(0);
6884 else if (!cased && Py_UNICODE_ISUPPER(ch))
6885 cased = 1;
6887 return PyBool_FromLong(cased);
6890 PyDoc_STRVAR(istitle__doc__,
6891 "S.istitle() -> bool\n\
6893 Return True if S is a titlecased string and there is at least one\n\
6894 character in S, i.e. upper- and titlecase characters may only\n\
6895 follow uncased characters and lowercase characters only cased ones.\n\
6896 Return False otherwise.");
6898 static PyObject*
6899 unicode_istitle(PyUnicodeObject *self)
6901 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6902 register const Py_UNICODE *e;
6903 int cased, previous_is_cased;
6905 /* Shortcut for single character strings */
6906 if (PyUnicode_GET_SIZE(self) == 1)
6907 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6908 (Py_UNICODE_ISUPPER(*p) != 0));
6910 /* Special case for empty strings */
6911 if (PyUnicode_GET_SIZE(self) == 0)
6912 return PyBool_FromLong(0);
6914 e = p + PyUnicode_GET_SIZE(self);
6915 cased = 0;
6916 previous_is_cased = 0;
6917 for (; p < e; p++) {
6918 register const Py_UNICODE ch = *p;
6920 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6921 if (previous_is_cased)
6922 return PyBool_FromLong(0);
6923 previous_is_cased = 1;
6924 cased = 1;
6926 else if (Py_UNICODE_ISLOWER(ch)) {
6927 if (!previous_is_cased)
6928 return PyBool_FromLong(0);
6929 previous_is_cased = 1;
6930 cased = 1;
6932 else
6933 previous_is_cased = 0;
6935 return PyBool_FromLong(cased);
6938 PyDoc_STRVAR(isspace__doc__,
6939 "S.isspace() -> bool\n\
6941 Return True if all characters in S are whitespace\n\
6942 and there is at least one character in S, False otherwise.");
6944 static PyObject*
6945 unicode_isspace(PyUnicodeObject *self)
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
6952 Py_UNICODE_ISSPACE(*p))
6953 return PyBool_FromLong(1);
6955 /* Special case for empty strings */
6956 if (PyUnicode_GET_SIZE(self) == 0)
6957 return PyBool_FromLong(0);
6959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
6961 if (!Py_UNICODE_ISSPACE(*p))
6962 return PyBool_FromLong(0);
6964 return PyBool_FromLong(1);
6967 PyDoc_STRVAR(isalpha__doc__,
6968 "S.isalpha() -> bool\n\
6970 Return True if all characters in S are alphabetic\n\
6971 and there is at least one character in S, False otherwise.");
6973 static PyObject*
6974 unicode_isalpha(PyUnicodeObject *self)
6976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977 register const Py_UNICODE *e;
6979 /* Shortcut for single character strings */
6980 if (PyUnicode_GET_SIZE(self) == 1 &&
6981 Py_UNICODE_ISALPHA(*p))
6982 return PyBool_FromLong(1);
6984 /* Special case for empty strings */
6985 if (PyUnicode_GET_SIZE(self) == 0)
6986 return PyBool_FromLong(0);
6988 e = p + PyUnicode_GET_SIZE(self);
6989 for (; p < e; p++) {
6990 if (!Py_UNICODE_ISALPHA(*p))
6991 return PyBool_FromLong(0);
6993 return PyBool_FromLong(1);
6996 PyDoc_STRVAR(isalnum__doc__,
6997 "S.isalnum() -> bool\n\
6999 Return True if all characters in S are alphanumeric\n\
7000 and there is at least one character in S, False otherwise.");
7002 static PyObject*
7003 unicode_isalnum(PyUnicodeObject *self)
7005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7006 register const Py_UNICODE *e;
7008 /* Shortcut for single character strings */
7009 if (PyUnicode_GET_SIZE(self) == 1 &&
7010 Py_UNICODE_ISALNUM(*p))
7011 return PyBool_FromLong(1);
7013 /* Special case for empty strings */
7014 if (PyUnicode_GET_SIZE(self) == 0)
7015 return PyBool_FromLong(0);
7017 e = p + PyUnicode_GET_SIZE(self);
7018 for (; p < e; p++) {
7019 if (!Py_UNICODE_ISALNUM(*p))
7020 return PyBool_FromLong(0);
7022 return PyBool_FromLong(1);
7025 PyDoc_STRVAR(isdecimal__doc__,
7026 "S.isdecimal() -> bool\n\
7028 Return True if there are only decimal characters in S,\n\
7029 False otherwise.");
7031 static PyObject*
7032 unicode_isdecimal(PyUnicodeObject *self)
7034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7035 register const Py_UNICODE *e;
7037 /* Shortcut for single character strings */
7038 if (PyUnicode_GET_SIZE(self) == 1 &&
7039 Py_UNICODE_ISDECIMAL(*p))
7040 return PyBool_FromLong(1);
7042 /* Special case for empty strings */
7043 if (PyUnicode_GET_SIZE(self) == 0)
7044 return PyBool_FromLong(0);
7046 e = p + PyUnicode_GET_SIZE(self);
7047 for (; p < e; p++) {
7048 if (!Py_UNICODE_ISDECIMAL(*p))
7049 return PyBool_FromLong(0);
7051 return PyBool_FromLong(1);
7054 PyDoc_STRVAR(isdigit__doc__,
7055 "S.isdigit() -> bool\n\
7057 Return True if all characters in S are digits\n\
7058 and there is at least one character in S, False otherwise.");
7060 static PyObject*
7061 unicode_isdigit(PyUnicodeObject *self)
7063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7064 register const Py_UNICODE *e;
7066 /* Shortcut for single character strings */
7067 if (PyUnicode_GET_SIZE(self) == 1 &&
7068 Py_UNICODE_ISDIGIT(*p))
7069 return PyBool_FromLong(1);
7071 /* Special case for empty strings */
7072 if (PyUnicode_GET_SIZE(self) == 0)
7073 return PyBool_FromLong(0);
7075 e = p + PyUnicode_GET_SIZE(self);
7076 for (; p < e; p++) {
7077 if (!Py_UNICODE_ISDIGIT(*p))
7078 return PyBool_FromLong(0);
7080 return PyBool_FromLong(1);
7083 PyDoc_STRVAR(isnumeric__doc__,
7084 "S.isnumeric() -> bool\n\
7086 Return True if there are only numeric characters in S,\n\
7087 False otherwise.");
7089 static PyObject*
7090 unicode_isnumeric(PyUnicodeObject *self)
7092 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7093 register const Py_UNICODE *e;
7095 /* Shortcut for single character strings */
7096 if (PyUnicode_GET_SIZE(self) == 1 &&
7097 Py_UNICODE_ISNUMERIC(*p))
7098 return PyBool_FromLong(1);
7100 /* Special case for empty strings */
7101 if (PyUnicode_GET_SIZE(self) == 0)
7102 return PyBool_FromLong(0);
7104 e = p + PyUnicode_GET_SIZE(self);
7105 for (; p < e; p++) {
7106 if (!Py_UNICODE_ISNUMERIC(*p))
7107 return PyBool_FromLong(0);
7109 return PyBool_FromLong(1);
7112 PyDoc_STRVAR(join__doc__,
7113 "S.join(sequence) -> unicode\n\
7115 Return a string which is the concatenation of the strings in the\n\
7116 sequence. The separator between elements is S.");
7118 static PyObject*
7119 unicode_join(PyObject *self, PyObject *data)
7121 return PyUnicode_Join(self, data);
7124 static Py_ssize_t
7125 unicode_length(PyUnicodeObject *self)
7127 return self->length;
7130 PyDoc_STRVAR(ljust__doc__,
7131 "S.ljust(width[, fillchar]) -> int\n\
7133 Return S left-justified in a Unicode string of length width. Padding is\n\
7134 done using the specified fill character (default is a space).");
7136 static PyObject *
7137 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7139 Py_ssize_t width;
7140 Py_UNICODE fillchar = ' ';
7142 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7143 return NULL;
7145 if (self->length >= width && PyUnicode_CheckExact(self)) {
7146 Py_INCREF(self);
7147 return (PyObject*) self;
7150 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7153 PyDoc_STRVAR(lower__doc__,
7154 "S.lower() -> unicode\n\
7156 Return a copy of the string S converted to lowercase.");
7158 static PyObject*
7159 unicode_lower(PyUnicodeObject *self)
7161 return fixup(self, fixlower);
/* Strip modes shared by the strip/lstrip/rstrip helpers.  The values are
   also the indices into stripformat[] below. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7173 /* externally visible for str.strip(unicode) */
7174 PyObject *
7175 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7177 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7178 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7179 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7180 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7181 Py_ssize_t i, j;
7183 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7185 i = 0;
7186 if (striptype != RIGHTSTRIP) {
7187 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7188 i++;
7192 j = len;
7193 if (striptype != LEFTSTRIP) {
7194 do {
7195 j--;
7196 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7197 j++;
7200 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7201 Py_INCREF(self);
7202 return (PyObject*)self;
7204 else
7205 return PyUnicode_FromUnicode(s+i, j-i);
7209 static PyObject *
7210 do_strip(PyUnicodeObject *self, int striptype)
7212 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7213 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7215 i = 0;
7216 if (striptype != RIGHTSTRIP) {
7217 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7218 i++;
7222 j = len;
7223 if (striptype != LEFTSTRIP) {
7224 do {
7225 j--;
7226 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7227 j++;
7230 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7231 Py_INCREF(self);
7232 return (PyObject*)self;
7234 else
7235 return PyUnicode_FromUnicode(s+i, j-i);
7239 static PyObject *
7240 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7242 PyObject *sep = NULL;
7244 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7245 return NULL;
7247 if (sep != NULL && sep != Py_None) {
7248 if (PyUnicode_Check(sep))
7249 return _PyUnicode_XStrip(self, striptype, sep);
7250 else if (PyString_Check(sep)) {
7251 PyObject *res;
7252 sep = PyUnicode_FromObject(sep);
7253 if (sep==NULL)
7254 return NULL;
7255 res = _PyUnicode_XStrip(self, striptype, sep);
7256 Py_DECREF(sep);
7257 return res;
7259 else {
7260 PyErr_Format(PyExc_TypeError,
7261 "%s arg must be None, unicode or str",
7262 STRIPNAME(striptype));
7263 return NULL;
7267 return do_strip(self, striptype);
7271 PyDoc_STRVAR(strip__doc__,
7272 "S.strip([chars]) -> unicode\n\
7274 Return a copy of the string S with leading and trailing\n\
7275 whitespace removed.\n\
7276 If chars is given and not None, remove characters in chars instead.\n\
7277 If chars is a str, it will be converted to unicode before stripping");
7279 static PyObject *
7280 unicode_strip(PyUnicodeObject *self, PyObject *args)
7282 if (PyTuple_GET_SIZE(args) == 0)
7283 return do_strip(self, BOTHSTRIP); /* Common case */
7284 else
7285 return do_argstrip(self, BOTHSTRIP, args);
7289 PyDoc_STRVAR(lstrip__doc__,
7290 "S.lstrip([chars]) -> unicode\n\
7292 Return a copy of the string S with leading whitespace removed.\n\
7293 If chars is given and not None, remove characters in chars instead.\n\
7294 If chars is a str, it will be converted to unicode before stripping");
7296 static PyObject *
7297 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7299 if (PyTuple_GET_SIZE(args) == 0)
7300 return do_strip(self, LEFTSTRIP); /* Common case */
7301 else
7302 return do_argstrip(self, LEFTSTRIP, args);
7306 PyDoc_STRVAR(rstrip__doc__,
7307 "S.rstrip([chars]) -> unicode\n\
7309 Return a copy of the string S with trailing whitespace removed.\n\
7310 If chars is given and not None, remove characters in chars instead.\n\
7311 If chars is a str, it will be converted to unicode before stripping");
7313 static PyObject *
7314 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7316 if (PyTuple_GET_SIZE(args) == 0)
7317 return do_strip(self, RIGHTSTRIP); /* Common case */
7318 else
7319 return do_argstrip(self, RIGHTSTRIP, args);
7323 static PyObject*
7324 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7326 PyUnicodeObject *u;
7327 Py_UNICODE *p;
7328 Py_ssize_t nchars;
7329 size_t nbytes;
7331 if (len < 0)
7332 len = 0;
7334 if (len == 1 && PyUnicode_CheckExact(str)) {
7335 /* no repeat, return original string */
7336 Py_INCREF(str);
7337 return (PyObject*) str;
7340 /* ensure # of chars needed doesn't overflow int and # of bytes
7341 * needed doesn't overflow size_t
7343 nchars = len * str->length;
7344 if (len && nchars / len != str->length) {
7345 PyErr_SetString(PyExc_OverflowError,
7346 "repeated string is too long");
7347 return NULL;
7349 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7350 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7351 PyErr_SetString(PyExc_OverflowError,
7352 "repeated string is too long");
7353 return NULL;
7355 u = _PyUnicode_New(nchars);
7356 if (!u)
7357 return NULL;
7359 p = u->str;
7361 if (str->length == 1 && len > 0) {
7362 Py_UNICODE_FILL(p, str->str[0], len);
7363 } else {
7364 Py_ssize_t done = 0; /* number of characters copied this far */
7365 if (done < nchars) {
7366 Py_UNICODE_COPY(p, str->str, str->length);
7367 done = str->length;
7369 while (done < nchars) {
7370 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7371 Py_UNICODE_COPY(p+done, p, n);
7372 done += n;
7376 return (PyObject*) u;
7379 PyObject *PyUnicode_Replace(PyObject *obj,
7380 PyObject *subobj,
7381 PyObject *replobj,
7382 Py_ssize_t maxcount)
7384 PyObject *self;
7385 PyObject *str1;
7386 PyObject *str2;
7387 PyObject *result;
7389 self = PyUnicode_FromObject(obj);
7390 if (self == NULL)
7391 return NULL;
7392 str1 = PyUnicode_FromObject(subobj);
7393 if (str1 == NULL) {
7394 Py_DECREF(self);
7395 return NULL;
7397 str2 = PyUnicode_FromObject(replobj);
7398 if (str2 == NULL) {
7399 Py_DECREF(self);
7400 Py_DECREF(str1);
7401 return NULL;
7403 result = replace((PyUnicodeObject *)self,
7404 (PyUnicodeObject *)str1,
7405 (PyUnicodeObject *)str2,
7406 maxcount);
7407 Py_DECREF(self);
7408 Py_DECREF(str1);
7409 Py_DECREF(str2);
7410 return result;
7413 PyDoc_STRVAR(replace__doc__,
7414 "S.replace (old, new[, count]) -> unicode\n\
7416 Return a copy of S with all occurrences of substring\n\
7417 old replaced by new. If the optional argument count is\n\
7418 given, only the first count occurrences are replaced.");
7420 static PyObject*
7421 unicode_replace(PyUnicodeObject *self, PyObject *args)
7423 PyUnicodeObject *str1;
7424 PyUnicodeObject *str2;
7425 Py_ssize_t maxcount = -1;
7426 PyObject *result;
7428 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7429 return NULL;
7430 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7431 if (str1 == NULL)
7432 return NULL;
7433 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7434 if (str2 == NULL) {
7435 Py_DECREF(str1);
7436 return NULL;
7439 result = replace(self, str1, str2, maxcount);
7441 Py_DECREF(str1);
7442 Py_DECREF(str2);
7443 return result;
/* Build the repr of a Unicode object by handing its character buffer and
   its length to unicodeescape_string() and returning that result.
   NOTE(review): the final argument and the closing lines of this call
   are not present in this extract -- confirm them against the upstream
   file before editing this function. */
7446 static
7447 PyObject *unicode_repr(PyObject *unicode)
7449 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7450 PyUnicode_GET_SIZE(unicode),
7454 PyDoc_STRVAR(rfind__doc__,
7455 "S.rfind(sub [,start [,end]]) -> int\n\
7457 Return the highest index in S where substring sub is found,\n\
7458 such that sub is contained within s[start:end]. Optional\n\
7459 arguments start and end are interpreted as in slice notation.\n\
7461 Return -1 on failure.");
7463 static PyObject *
7464 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7466 PyObject *substring;
7467 Py_ssize_t start;
7468 Py_ssize_t end;
7469 Py_ssize_t result;
7471 if (!_ParseTupleFinds(args, &substring, &start, &end))
7472 return NULL;
7474 result = stringlib_rfind_slice(
7475 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7476 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7477 start, end
7480 Py_DECREF(substring);
7482 return PyInt_FromSsize_t(result);
7485 PyDoc_STRVAR(rindex__doc__,
7486 "S.rindex(sub [,start [,end]]) -> int\n\
7488 Like S.rfind() but raise ValueError when the substring is not found.");
7490 static PyObject *
7491 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7493 PyObject *substring;
7494 Py_ssize_t start;
7495 Py_ssize_t end;
7496 Py_ssize_t result;
7498 if (!_ParseTupleFinds(args, &substring, &start, &end))
7499 return NULL;
7501 result = stringlib_rfind_slice(
7502 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7503 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7504 start, end
7507 Py_DECREF(substring);
7509 if (result < 0) {
7510 PyErr_SetString(PyExc_ValueError, "substring not found");
7511 return NULL;
7513 return PyInt_FromSsize_t(result);
7516 PyDoc_STRVAR(rjust__doc__,
7517 "S.rjust(width[, fillchar]) -> unicode\n\
7519 Return S right-justified in a Unicode string of length width. Padding is\n\
7520 done using the specified fill character (default is a space).");
7522 static PyObject *
7523 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7525 Py_ssize_t width;
7526 Py_UNICODE fillchar = ' ';
7528 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7529 return NULL;
7531 if (self->length >= width && PyUnicode_CheckExact(self)) {
7532 Py_INCREF(self);
7533 return (PyObject*) self;
7536 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7539 static PyObject*
7540 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7542 /* standard clamping */
7543 if (start < 0)
7544 start = 0;
7545 if (end < 0)
7546 end = 0;
7547 if (end > self->length)
7548 end = self->length;
7549 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7550 /* full slice, return original string */
7551 Py_INCREF(self);
7552 return (PyObject*) self;
7554 if (start > end)
7555 start = end;
7556 /* copy slice */
7557 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7558 end - start);
7561 PyObject *PyUnicode_Split(PyObject *s,
7562 PyObject *sep,
7563 Py_ssize_t maxsplit)
7565 PyObject *result;
7567 s = PyUnicode_FromObject(s);
7568 if (s == NULL)
7569 return NULL;
7570 if (sep != NULL) {
7571 sep = PyUnicode_FromObject(sep);
7572 if (sep == NULL) {
7573 Py_DECREF(s);
7574 return NULL;
7578 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7580 Py_DECREF(s);
7581 Py_XDECREF(sep);
7582 return result;
7585 PyDoc_STRVAR(split__doc__,
7586 "S.split([sep [,maxsplit]]) -> list of strings\n\
7588 Return a list of the words in S, using sep as the\n\
7589 delimiter string. If maxsplit is given, at most maxsplit\n\
7590 splits are done. If sep is not specified or is None, any\n\
7591 whitespace string is a separator and empty strings are\n\
7592 removed from the result.");
7594 static PyObject*
7595 unicode_split(PyUnicodeObject *self, PyObject *args)
7597 PyObject *substring = Py_None;
7598 Py_ssize_t maxcount = -1;
7600 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7601 return NULL;
7603 if (substring == Py_None)
7604 return split(self, NULL, maxcount);
7605 else if (PyUnicode_Check(substring))
7606 return split(self, (PyUnicodeObject *)substring, maxcount);
7607 else
7608 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7611 PyObject *
7612 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7614 PyObject* str_obj;
7615 PyObject* sep_obj;
7616 PyObject* out;
7618 str_obj = PyUnicode_FromObject(str_in);
7619 if (!str_obj)
7620 return NULL;
7621 sep_obj = PyUnicode_FromObject(sep_in);
7622 if (!sep_obj) {
7623 Py_DECREF(str_obj);
7624 return NULL;
7627 out = stringlib_partition(
7628 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7629 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7632 Py_DECREF(sep_obj);
7633 Py_DECREF(str_obj);
7635 return out;
7639 PyObject *
7640 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7642 PyObject* str_obj;
7643 PyObject* sep_obj;
7644 PyObject* out;
7646 str_obj = PyUnicode_FromObject(str_in);
7647 if (!str_obj)
7648 return NULL;
7649 sep_obj = PyUnicode_FromObject(sep_in);
7650 if (!sep_obj) {
7651 Py_DECREF(str_obj);
7652 return NULL;
7655 out = stringlib_rpartition(
7656 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7657 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7660 Py_DECREF(sep_obj);
7661 Py_DECREF(str_obj);
7663 return out;
7666 PyDoc_STRVAR(partition__doc__,
7667 "S.partition(sep) -> (head, sep, tail)\n\
7669 Search for the separator sep in S, and return the part before it,\n\
7670 the separator itself, and the part after it. If the separator is not\n\
7671 found, return S and two empty strings.");
7673 static PyObject*
7674 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7676 return PyUnicode_Partition((PyObject *)self, separator);
7679 PyDoc_STRVAR(rpartition__doc__,
7680 "S.rpartition(sep) -> (tail, sep, head)\n\
7682 Search for the separator sep in S, starting at the end of S, and return\n\
7683 the part before it, the separator itself, and the part after it. If the\n\
7684 separator is not found, return two empty strings and S.");
7686 static PyObject*
7687 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7689 return PyUnicode_RPartition((PyObject *)self, separator);
7692 PyObject *PyUnicode_RSplit(PyObject *s,
7693 PyObject *sep,
7694 Py_ssize_t maxsplit)
7696 PyObject *result;
7698 s = PyUnicode_FromObject(s);
7699 if (s == NULL)
7700 return NULL;
7701 if (sep != NULL) {
7702 sep = PyUnicode_FromObject(sep);
7703 if (sep == NULL) {
7704 Py_DECREF(s);
7705 return NULL;
7709 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7711 Py_DECREF(s);
7712 Py_XDECREF(sep);
7713 return result;
7716 PyDoc_STRVAR(rsplit__doc__,
7717 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7719 Return a list of the words in S, using sep as the\n\
7720 delimiter string, starting at the end of the string and\n\
7721 working to the front. If maxsplit is given, at most maxsplit\n\
7722 splits are done. If sep is not specified, any whitespace string\n\
7723 is a separator.");
7725 static PyObject*
7726 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7728 PyObject *substring = Py_None;
7729 Py_ssize_t maxcount = -1;
7731 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7732 return NULL;
7734 if (substring == Py_None)
7735 return rsplit(self, NULL, maxcount);
7736 else if (PyUnicode_Check(substring))
7737 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7738 else
7739 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7742 PyDoc_STRVAR(splitlines__doc__,
7743 "S.splitlines([keepends]) -> list of strings\n\
7745 Return a list of the lines in S, breaking at line boundaries.\n\
7746 Line breaks are not included in the resulting list unless keepends\n\
7747 is given and true.");
7749 static PyObject*
7750 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7752 int keepends = 0;
7754 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7755 return NULL;
7757 return PyUnicode_Splitlines((PyObject *)self, keepends);
7760 static
7761 PyObject *unicode_str(PyUnicodeObject *self)
7763 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7766 PyDoc_STRVAR(swapcase__doc__,
7767 "S.swapcase() -> unicode\n\
7769 Return a copy of S with uppercase characters converted to lowercase\n\
7770 and vice versa.");
7772 static PyObject*
7773 unicode_swapcase(PyUnicodeObject *self)
7775 return fixup(self, fixswapcase);
7778 PyDoc_STRVAR(translate__doc__,
7779 "S.translate(table) -> unicode\n\
7781 Return a copy of the string S, where all characters have been mapped\n\
7782 through the given translation table, which must be a mapping of\n\
7783 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7784 Unmapped characters are left untouched. Characters mapped to None\n\
7785 are deleted.");
7787 static PyObject*
7788 unicode_translate(PyUnicodeObject *self, PyObject *table)
7790 return PyUnicode_TranslateCharmap(self->str,
7791 self->length,
7792 table,
7793 "ignore");
7796 PyDoc_STRVAR(upper__doc__,
7797 "S.upper() -> unicode\n\
7799 Return a copy of S converted to uppercase.");
7801 static PyObject*
7802 unicode_upper(PyUnicodeObject *self)
7804 return fixup(self, fixupper);
7807 PyDoc_STRVAR(zfill__doc__,
7808 "S.zfill(width) -> unicode\n\
7810 Pad a numeric string S with zeros on the left, to fill a field\n\
7811 of the specified width. The string S is never truncated.");
7813 static PyObject *
7814 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7816 Py_ssize_t fill;
7817 PyUnicodeObject *u;
7819 Py_ssize_t width;
7820 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7821 return NULL;
7823 if (self->length >= width) {
7824 if (PyUnicode_CheckExact(self)) {
7825 Py_INCREF(self);
7826 return (PyObject*) self;
7828 else
7829 return PyUnicode_FromUnicode(
7830 PyUnicode_AS_UNICODE(self),
7831 PyUnicode_GET_SIZE(self)
7835 fill = width - self->length;
7837 u = pad(self, fill, 0, '0');
7839 if (u == NULL)
7840 return NULL;
7842 if (u->str[fill] == '+' || u->str[fill] == '-') {
7843 /* move sign to beginning of string */
7844 u->str[0] = u->str[fill];
7845 u->str[fill] = '0';
7848 return (PyObject*) u;
#if 0
/* Disabled debugging helper: reports the value of numfree as a Python int.
   Compiled out by the surrounding #if 0. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7859 PyDoc_STRVAR(startswith__doc__,
7860 "S.startswith(prefix[, start[, end]]) -> bool\n\
7862 Return True if S starts with the specified prefix, False otherwise.\n\
7863 With optional start, test S beginning at that position.\n\
7864 With optional end, stop comparing S at that position.\n\
7865 prefix can also be a tuple of strings to try.");
7867 static PyObject *
7868 unicode_startswith(PyUnicodeObject *self,
7869 PyObject *args)
7871 PyObject *subobj;
7872 PyUnicodeObject *substring;
7873 Py_ssize_t start = 0;
7874 Py_ssize_t end = PY_SSIZE_T_MAX;
7875 int result;
7877 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7878 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7879 return NULL;
7880 if (PyTuple_Check(subobj)) {
7881 Py_ssize_t i;
7882 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7883 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7884 PyTuple_GET_ITEM(subobj, i));
7885 if (substring == NULL)
7886 return NULL;
7887 result = tailmatch(self, substring, start, end, -1);
7888 Py_DECREF(substring);
7889 if (result) {
7890 Py_RETURN_TRUE;
7893 /* nothing matched */
7894 Py_RETURN_FALSE;
7896 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7897 if (substring == NULL)
7898 return NULL;
7899 result = tailmatch(self, substring, start, end, -1);
7900 Py_DECREF(substring);
7901 return PyBool_FromLong(result);
7905 PyDoc_STRVAR(endswith__doc__,
7906 "S.endswith(suffix[, start[, end]]) -> bool\n\
7908 Return True if S ends with the specified suffix, False otherwise.\n\
7909 With optional start, test S beginning at that position.\n\
7910 With optional end, stop comparing S at that position.\n\
7911 suffix can also be a tuple of strings to try.");
7913 static PyObject *
7914 unicode_endswith(PyUnicodeObject *self,
7915 PyObject *args)
7917 PyObject *subobj;
7918 PyUnicodeObject *substring;
7919 Py_ssize_t start = 0;
7920 Py_ssize_t end = PY_SSIZE_T_MAX;
7921 int result;
7923 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7924 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7925 return NULL;
7926 if (PyTuple_Check(subobj)) {
7927 Py_ssize_t i;
7928 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7929 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7930 PyTuple_GET_ITEM(subobj, i));
7931 if (substring == NULL)
7932 return NULL;
7933 result = tailmatch(self, substring, start, end, +1);
7934 Py_DECREF(substring);
7935 if (result) {
7936 Py_RETURN_TRUE;
7939 Py_RETURN_FALSE;
7941 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7942 if (substring == NULL)
7943 return NULL;
7945 result = tailmatch(self, substring, start, end, +1);
7946 Py_DECREF(substring);
7947 return PyBool_FromLong(result);
7951 /* Implements do_string_format, which is unicode because of stringlib */
7952 #include "stringlib/string_format.h"
7954 PyDoc_STRVAR(format__doc__,
7955 "S.format(*args, **kwargs) -> unicode\n\
7959 static PyObject *
7960 unicode__format__(PyObject *self, PyObject *args)
7962 PyObject *format_spec;
7963 PyObject *result = NULL;
7964 PyObject *tmp = NULL;
7966 /* If 2.x, convert format_spec to the same type as value */
7967 /* This is to allow things like u''.format('') */
7968 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7969 goto done;
7970 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7971 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7972 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7973 goto done;
7975 tmp = PyObject_Unicode(format_spec);
7976 if (tmp == NULL)
7977 goto done;
7978 format_spec = tmp;
7980 result = _PyUnicode_FormatAdvanced(self,
7981 PyUnicode_AS_UNICODE(format_spec),
7982 PyUnicode_GET_SIZE(format_spec));
7983 done:
7984 Py_XDECREF(tmp);
7985 return result;
7988 PyDoc_STRVAR(p_format__doc__,
7989 "S.__format__(format_spec) -> unicode\n\
7993 static PyObject *
7994 unicode__sizeof__(PyUnicodeObject *v)
7996 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7997 sizeof(Py_UNICODE) * (v->length + 1));
8000 PyDoc_STRVAR(sizeof__doc__,
8001 "S.__sizeof__() -> size of S in memory, in bytes\n\
8005 static PyObject *
8006 unicode_getnewargs(PyUnicodeObject *v)
8008 return Py_BuildValue("(u#)", v->str, v->length);
8012 static PyMethodDef unicode_methods[] = {
8014 /* Order is according to common usage: often used methods should
8015 appear first, since lookup is done sequentially. */
8017 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8018 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8019 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8020 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8021 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8022 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8023 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8024 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8025 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8026 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8027 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8028 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8029 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8030 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8031 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8032 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8033 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
8034 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8035 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8036 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8037 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8038 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8039 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8040 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8041 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8042 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8043 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8044 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8045 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8046 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8047 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8048 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8049 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8050 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8051 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8052 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8053 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8054 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8055 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8056 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8057 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8058 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8059 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8060 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8061 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8062 #if 0
8063 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8064 #endif
8066 #if 0
8067 /* This one is just used for debugging the implementation. */
8068 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8069 #endif
8071 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
8072 {NULL, NULL}
8075 static PyObject *
8076 unicode_mod(PyObject *v, PyObject *w)
8078 if (!PyUnicode_Check(v)) {
8079 Py_INCREF(Py_NotImplemented);
8080 return Py_NotImplemented;
8082 return PyUnicode_Format(v, w);
8085 static PyNumberMethods unicode_as_number = {
8086 0, /*nb_add*/
8087 0, /*nb_subtract*/
8088 0, /*nb_multiply*/
8089 0, /*nb_divide*/
8090 unicode_mod, /*nb_remainder*/
8093 static PySequenceMethods unicode_as_sequence = {
8094 (lenfunc) unicode_length, /* sq_length */
8095 PyUnicode_Concat, /* sq_concat */
8096 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8097 (ssizeargfunc) unicode_getitem, /* sq_item */
8098 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8099 0, /* sq_ass_item */
8100 0, /* sq_ass_slice */
8101 PyUnicode_Contains, /* sq_contains */
8104 static PyObject*
8105 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8107 if (PyIndex_Check(item)) {
8108 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8109 if (i == -1 && PyErr_Occurred())
8110 return NULL;
8111 if (i < 0)
8112 i += PyUnicode_GET_SIZE(self);
8113 return unicode_getitem(self, i);
8114 } else if (PySlice_Check(item)) {
8115 Py_ssize_t start, stop, step, slicelength, cur, i;
8116 Py_UNICODE* source_buf;
8117 Py_UNICODE* result_buf;
8118 PyObject* result;
8120 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8121 &start, &stop, &step, &slicelength) < 0) {
8122 return NULL;
8125 if (slicelength <= 0) {
8126 return PyUnicode_FromUnicode(NULL, 0);
8127 } else if (start == 0 && step == 1 && slicelength == self->length &&
8128 PyUnicode_CheckExact(self)) {
8129 Py_INCREF(self);
8130 return (PyObject *)self;
8131 } else if (step == 1) {
8132 return PyUnicode_FromUnicode(self->str + start, slicelength);
8133 } else {
8134 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8135 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8136 sizeof(Py_UNICODE));
8138 if (result_buf == NULL)
8139 return PyErr_NoMemory();
8141 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8142 result_buf[i] = source_buf[cur];
8145 result = PyUnicode_FromUnicode(result_buf, slicelength);
8146 PyObject_FREE(result_buf);
8147 return result;
8149 } else {
8150 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8151 return NULL;
8155 static PyMappingMethods unicode_as_mapping = {
8156 (lenfunc)unicode_length, /* mp_length */
8157 (binaryfunc)unicode_subscript, /* mp_subscript */
8158 (objobjargproc)0, /* mp_ass_subscript */
8161 static Py_ssize_t
8162 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8163 Py_ssize_t index,
8164 const void **ptr)
8166 if (index != 0) {
8167 PyErr_SetString(PyExc_SystemError,
8168 "accessing non-existent unicode segment");
8169 return -1;
8171 *ptr = (void *) self->str;
8172 return PyUnicode_GET_DATA_SIZE(self);
8175 static Py_ssize_t
8176 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8177 const void **ptr)
8179 PyErr_SetString(PyExc_TypeError,
8180 "cannot use unicode as modifiable buffer");
8181 return -1;
8184 static int
8185 unicode_buffer_getsegcount(PyUnicodeObject *self,
8186 Py_ssize_t *lenp)
8188 if (lenp)
8189 *lenp = PyUnicode_GET_DATA_SIZE(self);
8190 return 1;
8193 static Py_ssize_t
8194 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8195 Py_ssize_t index,
8196 const void **ptr)
8198 PyObject *str;
8200 if (index != 0) {
8201 PyErr_SetString(PyExc_SystemError,
8202 "accessing non-existent unicode segment");
8203 return -1;
8205 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8206 if (str == NULL)
8207 return -1;
8208 *ptr = (void *) PyString_AS_STRING(str);
8209 return PyString_GET_SIZE(str);
8212 /* Helpers for PyUnicode_Format() */
8214 static PyObject *
8215 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8217 Py_ssize_t argidx = *p_argidx;
8218 if (argidx < arglen) {
8219 (*p_argidx)++;
8220 if (arglen < 0)
8221 return args;
8222 else
8223 return PyTuple_GetItem(args, argidx);
8225 PyErr_SetString(PyExc_TypeError,
8226 "not enough arguments for format string");
8227 return NULL;
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST (1<<0)  /* '-' flag */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8236 static Py_ssize_t
8237 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8239 register Py_ssize_t i;
8240 Py_ssize_t len = strlen(charbuffer);
8241 for (i = len - 1; i >= 0; i--)
8242 buffer[i] = (Py_UNICODE) charbuffer[i];
8244 return len;
8247 static int
8248 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8250 Py_ssize_t result;
8252 PyOS_ascii_formatd((char *)buffer, len, format, x);
8253 result = strtounicode(buffer, (char *)buffer);
8254 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8257 static int
8258 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8260 Py_ssize_t result;
8262 PyOS_snprintf((char *)buffer, len, format, x);
8263 result = strtounicode(buffer, (char *)buffer);
8264 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8267 /* XXX To save some code duplication, formatfloat/long/int could have been
8268 shared with stringobject.c, converting from 8-bit to Unicode after the
8269 formatting is done. */
8271 static int
8272 formatfloat(Py_UNICODE *buf,
8273 size_t buflen,
8274 int flags,
8275 int prec,
8276 int type,
8277 PyObject *v)
8279 /* fmt = '%#.' + `prec` + `type`
8280 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8281 char fmt[20];
8282 double x;
8284 x = PyFloat_AsDouble(v);
8285 if (x == -1.0 && PyErr_Occurred())
8286 return -1;
8287 if (prec < 0)
8288 prec = 6;
8289 /* make sure that the decimal representation of precision really does
8290 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8291 if (prec > 0x7fffffffL) {
8292 PyErr_SetString(PyExc_OverflowError,
8293 "outrageously large precision "
8294 "for formatted float");
8295 return -1;
8298 if (type == 'f' && fabs(x) >= 1e50)
8299 type = 'g';
8300 /* Worst case length calc to ensure no buffer overrun:
8302 'g' formats:
8303 fmt = %#.<prec>g
8304 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8305 for any double rep.)
8306 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8308 'f' formats:
8309 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8310 len = 1 + 50 + 1 + prec = 52 + prec
8312 If prec=0 the effective precision is 1 (the leading digit is
8313 always given), therefore increase the length by one.
8316 if (((type == 'g' || type == 'G') &&
8317 buflen <= (size_t)10 + (size_t)prec) ||
8318 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8319 PyErr_SetString(PyExc_OverflowError,
8320 "formatted float is too long (precision too large?)");
8321 return -1;
8323 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8324 (flags&F_ALT) ? "#" : "",
8325 prec, type);
8326 return doubletounicode(buf, buflen, fmt, x);
8329 static PyObject*
8330 formatlong(PyObject *val, int flags, int prec, int type)
8332 char *buf;
8333 int i, len;
8334 PyObject *str; /* temporary string object. */
8335 PyUnicodeObject *result;
8337 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8338 if (!str)
8339 return NULL;
8340 result = _PyUnicode_New(len);
8341 if (!result) {
8342 Py_DECREF(str);
8343 return NULL;
8345 for (i = 0; i < len; i++)
8346 result->str[i] = buf[i];
8347 result->str[len] = 0;
8348 Py_DECREF(str);
8349 return (PyObject*)result;
8352 static int
8353 formatint(Py_UNICODE *buf,
8354 size_t buflen,
8355 int flags,
8356 int prec,
8357 int type,
8358 PyObject *v)
8360 /* fmt = '%#.' + `prec` + 'l' + `type`
8361 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8362 * + 1 + 1
8363 * = 24
8365 char fmt[64]; /* plenty big enough! */
8366 char *sign;
8367 long x;
8369 x = PyInt_AsLong(v);
8370 if (x == -1 && PyErr_Occurred())
8371 return -1;
8372 if (x < 0 && type == 'u') {
8373 type = 'd';
8375 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8376 sign = "-";
8377 else
8378 sign = "";
8379 if (prec < 0)
8380 prec = 1;
8382 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8383 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8385 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8386 PyErr_SetString(PyExc_OverflowError,
8387 "formatted integer is too long (precision too large?)");
8388 return -1;
8391 if ((flags & F_ALT) &&
8392 (type == 'x' || type == 'X')) {
8393 /* When converting under %#x or %#X, there are a number
8394 * of issues that cause pain:
8395 * - when 0 is being converted, the C standard leaves off
8396 * the '0x' or '0X', which is inconsistent with other
8397 * %#x/%#X conversions and inconsistent with Python's
8398 * hex() function
8399 * - there are platforms that violate the standard and
8400 * convert 0 with the '0x' or '0X'
8401 * (Metrowerks, Compaq Tru64)
8402 * - there are platforms that give '0x' when converting
8403 * under %#X, but convert 0 in accordance with the
8404 * standard (OS/2 EMX)
8406 * We can achieve the desired consistency by inserting our
8407 * own '0x' or '0X' prefix, and substituting %x/%X in place
8408 * of %#x/%#X.
8410 * Note that this is the same approach as used in
8411 * formatint() in stringobject.c
8413 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8414 sign, type, prec, type);
8416 else {
8417 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8418 sign, (flags&F_ALT) ? "#" : "",
8419 prec, type);
8421 if (sign[0])
8422 return longtounicode(buf, buflen, fmt, -x);
8423 else
8424 return longtounicode(buf, buflen, fmt, x);
8427 static int
8428 formatchar(Py_UNICODE *buf,
8429 size_t buflen,
8430 PyObject *v)
8432 /* presume that the buffer is at least 2 characters long */
8433 if (PyUnicode_Check(v)) {
8434 if (PyUnicode_GET_SIZE(v) != 1)
8435 goto onError;
8436 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8439 else if (PyString_Check(v)) {
8440 if (PyString_GET_SIZE(v) != 1)
8441 goto onError;
8442 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8445 else {
8446 /* Integer input truncated to a character */
8447 long x;
8448 x = PyInt_AsLong(v);
8449 if (x == -1 && PyErr_Occurred())
8450 goto onError;
8451 #ifdef Py_UNICODE_WIDE
8452 if (x < 0 || x > 0x10ffff) {
8453 PyErr_SetString(PyExc_OverflowError,
8454 "%c arg not in range(0x110000) "
8455 "(wide Python build)");
8456 return -1;
8458 #else
8459 if (x < 0 || x > 0xffff) {
8460 PyErr_SetString(PyExc_OverflowError,
8461 "%c arg not in range(0x10000) "
8462 "(narrow Python build)");
8463 return -1;
8465 #endif
8466 buf[0] = (Py_UNICODE) x;
8468 buf[1] = '\0';
8469 return 1;
8471 onError:
8472 PyErr_SetString(PyExc_TypeError,
8473 "%c requires int or char");
8474 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
8487 PyObject *PyUnicode_Format(PyObject *format,
8488 PyObject *args)
8490 Py_UNICODE *fmt, *res;
8491 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8492 int args_owned = 0;
8493 PyUnicodeObject *result = NULL;
8494 PyObject *dict = NULL;
8495 PyObject *uformat;
8497 if (format == NULL || args == NULL) {
8498 PyErr_BadInternalCall();
8499 return NULL;
8501 uformat = PyUnicode_FromObject(format);
8502 if (uformat == NULL)
8503 return NULL;
8504 fmt = PyUnicode_AS_UNICODE(uformat);
8505 fmtcnt = PyUnicode_GET_SIZE(uformat);
8507 reslen = rescnt = fmtcnt + 100;
8508 result = _PyUnicode_New(reslen);
8509 if (result == NULL)
8510 goto onError;
8511 res = PyUnicode_AS_UNICODE(result);
8513 if (PyTuple_Check(args)) {
8514 arglen = PyTuple_Size(args);
8515 argidx = 0;
8517 else {
8518 arglen = -1;
8519 argidx = -2;
8521 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8522 !PyObject_TypeCheck(args, &PyBaseString_Type))
8523 dict = args;
8525 while (--fmtcnt >= 0) {
8526 if (*fmt != '%') {
8527 if (--rescnt < 0) {
8528 rescnt = fmtcnt + 100;
8529 reslen += rescnt;
8530 if (_PyUnicode_Resize(&result, reslen) < 0)
8531 goto onError;
8532 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8533 --rescnt;
8535 *res++ = *fmt++;
8537 else {
8538 /* Got a format specifier */
8539 int flags = 0;
8540 Py_ssize_t width = -1;
8541 int prec = -1;
8542 Py_UNICODE c = '\0';
8543 Py_UNICODE fill;
8544 int isnumok;
8545 PyObject *v = NULL;
8546 PyObject *temp = NULL;
8547 Py_UNICODE *pbuf;
8548 Py_UNICODE sign;
8549 Py_ssize_t len;
8550 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8552 fmt++;
8553 if (*fmt == '(') {
8554 Py_UNICODE *keystart;
8555 Py_ssize_t keylen;
8556 PyObject *key;
8557 int pcount = 1;
8559 if (dict == NULL) {
8560 PyErr_SetString(PyExc_TypeError,
8561 "format requires a mapping");
8562 goto onError;
8564 ++fmt;
8565 --fmtcnt;
8566 keystart = fmt;
8567 /* Skip over balanced parentheses */
8568 while (pcount > 0 && --fmtcnt >= 0) {
8569 if (*fmt == ')')
8570 --pcount;
8571 else if (*fmt == '(')
8572 ++pcount;
8573 fmt++;
8575 keylen = fmt - keystart - 1;
8576 if (fmtcnt < 0 || pcount > 0) {
8577 PyErr_SetString(PyExc_ValueError,
8578 "incomplete format key");
8579 goto onError;
8581 #if 0
8582 /* keys are converted to strings using UTF-8 and
8583 then looked up since Python uses strings to hold
8584 variables names etc. in its namespaces and we
8585 wouldn't want to break common idioms. */
8586 key = PyUnicode_EncodeUTF8(keystart,
8587 keylen,
8588 NULL);
8589 #else
8590 key = PyUnicode_FromUnicode(keystart, keylen);
8591 #endif
8592 if (key == NULL)
8593 goto onError;
8594 if (args_owned) {
8595 Py_DECREF(args);
8596 args_owned = 0;
8598 args = PyObject_GetItem(dict, key);
8599 Py_DECREF(key);
8600 if (args == NULL) {
8601 goto onError;
8603 args_owned = 1;
8604 arglen = -1;
8605 argidx = -2;
8607 while (--fmtcnt >= 0) {
8608 switch (c = *fmt++) {
8609 case '-': flags |= F_LJUST; continue;
8610 case '+': flags |= F_SIGN; continue;
8611 case ' ': flags |= F_BLANK; continue;
8612 case '#': flags |= F_ALT; continue;
8613 case '0': flags |= F_ZERO; continue;
8615 break;
8617 if (c == '*') {
8618 v = getnextarg(args, arglen, &argidx);
8619 if (v == NULL)
8620 goto onError;
8621 if (!PyInt_Check(v)) {
8622 PyErr_SetString(PyExc_TypeError,
8623 "* wants int");
8624 goto onError;
8626 width = PyInt_AsLong(v);
8627 if (width < 0) {
8628 flags |= F_LJUST;
8629 width = -width;
8631 if (--fmtcnt >= 0)
8632 c = *fmt++;
8634 else if (c >= '0' && c <= '9') {
8635 width = c - '0';
8636 while (--fmtcnt >= 0) {
8637 c = *fmt++;
8638 if (c < '0' || c > '9')
8639 break;
8640 if ((width*10) / 10 != width) {
8641 PyErr_SetString(PyExc_ValueError,
8642 "width too big");
8643 goto onError;
8645 width = width*10 + (c - '0');
8648 if (c == '.') {
8649 prec = 0;
8650 if (--fmtcnt >= 0)
8651 c = *fmt++;
8652 if (c == '*') {
8653 v = getnextarg(args, arglen, &argidx);
8654 if (v == NULL)
8655 goto onError;
8656 if (!PyInt_Check(v)) {
8657 PyErr_SetString(PyExc_TypeError,
8658 "* wants int");
8659 goto onError;
8661 prec = PyInt_AsLong(v);
8662 if (prec < 0)
8663 prec = 0;
8664 if (--fmtcnt >= 0)
8665 c = *fmt++;
8667 else if (c >= '0' && c <= '9') {
8668 prec = c - '0';
8669 while (--fmtcnt >= 0) {
8670 c = Py_CHARMASK(*fmt++);
8671 if (c < '0' || c > '9')
8672 break;
8673 if ((prec*10) / 10 != prec) {
8674 PyErr_SetString(PyExc_ValueError,
8675 "prec too big");
8676 goto onError;
8678 prec = prec*10 + (c - '0');
8681 } /* prec */
8682 if (fmtcnt >= 0) {
8683 if (c == 'h' || c == 'l' || c == 'L') {
8684 if (--fmtcnt >= 0)
8685 c = *fmt++;
8688 if (fmtcnt < 0) {
8689 PyErr_SetString(PyExc_ValueError,
8690 "incomplete format");
8691 goto onError;
8693 if (c != '%') {
8694 v = getnextarg(args, arglen, &argidx);
8695 if (v == NULL)
8696 goto onError;
8698 sign = 0;
8699 fill = ' ';
8700 switch (c) {
8702 case '%':
8703 pbuf = formatbuf;
8704 /* presume that buffer length is at least 1 */
8705 pbuf[0] = '%';
8706 len = 1;
8707 break;
8709 case 's':
8710 case 'r':
8711 if (PyUnicode_Check(v) && c == 's') {
8712 temp = v;
8713 Py_INCREF(temp);
8715 else {
8716 PyObject *unicode;
8717 if (c == 's')
8718 temp = PyObject_Unicode(v);
8719 else
8720 temp = PyObject_Repr(v);
8721 if (temp == NULL)
8722 goto onError;
8723 if (PyUnicode_Check(temp))
8724 /* nothing to do */;
8725 else if (PyString_Check(temp)) {
8726 /* convert to string to Unicode */
8727 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8728 PyString_GET_SIZE(temp),
8729 NULL,
8730 "strict");
8731 Py_DECREF(temp);
8732 temp = unicode;
8733 if (temp == NULL)
8734 goto onError;
8736 else {
8737 Py_DECREF(temp);
8738 PyErr_SetString(PyExc_TypeError,
8739 "%s argument has non-string str()");
8740 goto onError;
8743 pbuf = PyUnicode_AS_UNICODE(temp);
8744 len = PyUnicode_GET_SIZE(temp);
8745 if (prec >= 0 && len > prec)
8746 len = prec;
8747 break;
8749 case 'i':
8750 case 'd':
8751 case 'u':
8752 case 'o':
8753 case 'x':
8754 case 'X':
8755 if (c == 'i')
8756 c = 'd';
8757 isnumok = 0;
8758 if (PyNumber_Check(v)) {
8759 PyObject *iobj=NULL;
8761 if (PyInt_Check(v) || (PyLong_Check(v))) {
8762 iobj = v;
8763 Py_INCREF(iobj);
8765 else {
8766 iobj = PyNumber_Int(v);
8767 if (iobj==NULL) iobj = PyNumber_Long(v);
8769 if (iobj!=NULL) {
8770 if (PyInt_Check(iobj)) {
8771 isnumok = 1;
8772 pbuf = formatbuf;
8773 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8774 flags, prec, c, iobj);
8775 Py_DECREF(iobj);
8776 if (len < 0)
8777 goto onError;
8778 sign = 1;
8780 else if (PyLong_Check(iobj)) {
8781 isnumok = 1;
8782 temp = formatlong(iobj, flags, prec, c);
8783 Py_DECREF(iobj);
8784 if (!temp)
8785 goto onError;
8786 pbuf = PyUnicode_AS_UNICODE(temp);
8787 len = PyUnicode_GET_SIZE(temp);
8788 sign = 1;
8790 else {
8791 Py_DECREF(iobj);
8795 if (!isnumok) {
8796 PyErr_Format(PyExc_TypeError,
8797 "%%%c format: a number is required, "
8798 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8799 goto onError;
8801 if (flags & F_ZERO)
8802 fill = '0';
8803 break;
8805 case 'e':
8806 case 'E':
8807 case 'f':
8808 case 'F':
8809 case 'g':
8810 case 'G':
8811 if (c == 'F')
8812 c = 'f';
8813 pbuf = formatbuf;
8814 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8815 flags, prec, c, v);
8816 if (len < 0)
8817 goto onError;
8818 sign = 1;
8819 if (flags & F_ZERO)
8820 fill = '0';
8821 break;
8823 case 'c':
8824 pbuf = formatbuf;
8825 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8826 if (len < 0)
8827 goto onError;
8828 break;
8830 default:
8831 PyErr_Format(PyExc_ValueError,
8832 "unsupported format character '%c' (0x%x) "
8833 "at index %zd",
8834 (31<=c && c<=126) ? (char)c : '?',
8835 (int)c,
8836 (Py_ssize_t)(fmt - 1 -
8837 PyUnicode_AS_UNICODE(uformat)));
8838 goto onError;
8840 if (sign) {
8841 if (*pbuf == '-' || *pbuf == '+') {
8842 sign = *pbuf++;
8843 len--;
8845 else if (flags & F_SIGN)
8846 sign = '+';
8847 else if (flags & F_BLANK)
8848 sign = ' ';
8849 else
8850 sign = 0;
8852 if (width < len)
8853 width = len;
8854 if (rescnt - (sign != 0) < width) {
8855 reslen -= rescnt;
8856 rescnt = width + fmtcnt + 100;
8857 reslen += rescnt;
8858 if (reslen < 0) {
8859 Py_XDECREF(temp);
8860 PyErr_NoMemory();
8861 goto onError;
8863 if (_PyUnicode_Resize(&result, reslen) < 0) {
8864 Py_XDECREF(temp);
8865 goto onError;
8867 res = PyUnicode_AS_UNICODE(result)
8868 + reslen - rescnt;
8870 if (sign) {
8871 if (fill != ' ')
8872 *res++ = sign;
8873 rescnt--;
8874 if (width > len)
8875 width--;
8877 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8878 assert(pbuf[0] == '0');
8879 assert(pbuf[1] == c);
8880 if (fill != ' ') {
8881 *res++ = *pbuf++;
8882 *res++ = *pbuf++;
8884 rescnt -= 2;
8885 width -= 2;
8886 if (width < 0)
8887 width = 0;
8888 len -= 2;
8890 if (width > len && !(flags & F_LJUST)) {
8891 do {
8892 --rescnt;
8893 *res++ = fill;
8894 } while (--width > len);
8896 if (fill == ' ') {
8897 if (sign)
8898 *res++ = sign;
8899 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8900 assert(pbuf[0] == '0');
8901 assert(pbuf[1] == c);
8902 *res++ = *pbuf++;
8903 *res++ = *pbuf++;
8906 Py_UNICODE_COPY(res, pbuf, len);
8907 res += len;
8908 rescnt -= len;
8909 while (--width >= len) {
8910 --rescnt;
8911 *res++ = ' ';
8913 if (dict && (argidx < arglen) && c != '%') {
8914 PyErr_SetString(PyExc_TypeError,
8915 "not all arguments converted during string formatting");
8916 Py_XDECREF(temp);
8917 goto onError;
8919 Py_XDECREF(temp);
8920 } /* '%' */
8921 } /* until end */
8922 if (argidx < arglen && !dict) {
8923 PyErr_SetString(PyExc_TypeError,
8924 "not all arguments converted during string formatting");
8925 goto onError;
8928 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8929 goto onError;
8930 if (args_owned) {
8931 Py_DECREF(args);
8933 Py_DECREF(uformat);
8934 return (PyObject *)result;
8936 onError:
8937 Py_XDECREF(result);
8938 Py_DECREF(uformat);
8939 if (args_owned) {
8940 Py_DECREF(args);
8942 return NULL;
/* Buffer-interface slot table for unicode objects.  PyUnicode_Type points
   at this table through its tp_as_buffer slot.  Each entry is one of the
   unicode_buffer_* functions, cast to the slot's function-pointer type. */
8945 static PyBufferProcs unicode_as_buffer = {
8946 (readbufferproc) unicode_buffer_getreadbuf, /* bf_getreadbuffer */
8947 (writebufferproc) unicode_buffer_getwritebuf, /* bf_getwritebuffer */
8948 (segcountproc) unicode_buffer_getsegcount, /* bf_getsegcount */
8949 (charbufferproc) unicode_buffer_getcharbuf, /* bf_getcharbuffer */
8952 static PyObject *
8953 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8955 static PyObject *
8956 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8958 PyObject *x = NULL;
8959 static char *kwlist[] = {"string", "encoding", "errors", 0};
8960 char *encoding = NULL;
8961 char *errors = NULL;
8963 if (type != &PyUnicode_Type)
8964 return unicode_subtype_new(type, args, kwds);
8965 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8966 kwlist, &x, &encoding, &errors))
8967 return NULL;
8968 if (x == NULL)
8969 return (PyObject *)_PyUnicode_New(0);
8970 if (encoding == NULL && errors == NULL)
8971 return PyObject_Unicode(x);
8972 else
8973 return PyUnicode_FromEncodedObject(x, encoding, errors);
8976 static PyObject *
8977 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8979 PyUnicodeObject *tmp, *pnew;
8980 Py_ssize_t n;
8982 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8983 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8984 if (tmp == NULL)
8985 return NULL;
8986 assert(PyUnicode_Check(tmp));
8987 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8988 if (pnew == NULL) {
8989 Py_DECREF(tmp);
8990 return NULL;
8992 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8993 if (pnew->str == NULL) {
8994 _Py_ForgetReference((PyObject *)pnew);
8995 PyObject_Del(pnew);
8996 Py_DECREF(tmp);
8997 return PyErr_NoMemory();
8999 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9000 pnew->length = n;
9001 pnew->hash = tmp->hash;
9002 Py_DECREF(tmp);
9003 return (PyObject *)pnew;
/* Docstring for the unicode type; PyUnicode_Type uses it as tp_doc. */
9006 PyDoc_STRVAR(unicode_doc,
9007 "unicode(string [, encoding[, errors]]) -> object\n\
9009 Create a new Unicode object from the given encoded string.\n\
9010 encoding defaults to the current default string encoding.\n\
9011 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for the "unicode" type.

   It derives from PyBaseString_Type (tp_base), allows subclassing
   (Py_TPFLAGS_BASETYPE), creates instances through unicode_new (tp_new)
   and frees them with PyObject_Del (tp_free).  Slots initialized to 0 are
   not set by this table. */
9013 PyTypeObject PyUnicode_Type = {
9014 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9015 "unicode", /* tp_name */
9016 sizeof(PyUnicodeObject), /* tp_basicsize */
9017 0, /* tp_itemsize */
9018 /* Slots */
9019 (destructor)unicode_dealloc, /* tp_dealloc */
9020 0, /* tp_print */
9021 0, /* tp_getattr */
9022 0, /* tp_setattr */
9023 0, /* tp_compare */
9024 unicode_repr, /* tp_repr */
9025 &unicode_as_number, /* tp_as_number */
9026 &unicode_as_sequence, /* tp_as_sequence */
9027 &unicode_as_mapping, /* tp_as_mapping */
9028 (hashfunc) unicode_hash, /* tp_hash */
9029 0, /* tp_call */
9030 (reprfunc) unicode_str, /* tp_str */
9031 PyObject_GenericGetAttr, /* tp_getattro */
9032 0, /* tp_setattro */
9033 &unicode_as_buffer, /* tp_as_buffer */
9034 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
9035 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
9036 unicode_doc, /* tp_doc */
9037 0, /* tp_traverse */
9038 0, /* tp_clear */
9039 PyUnicode_RichCompare, /* tp_richcompare */
9040 0, /* tp_weaklistoffset */
9041 0, /* tp_iter */
9042 0, /* tp_iternext */
9043 unicode_methods, /* tp_methods */
9044 0, /* tp_members */
9045 0, /* tp_getset */
9046 &PyBaseString_Type, /* tp_base */
9047 0, /* tp_dict */
9048 0, /* tp_descr_get */
9049 0, /* tp_descr_set */
9050 0, /* tp_dictoffset */
9051 0, /* tp_init */
9052 0, /* tp_alloc */
9053 unicode_new, /* tp_new */
9054 PyObject_Del, /* tp_free */
9057 /* Initialize the Unicode implementation */
9059 void _PyUnicode_Init(void)
9061 int i;
9063 /* XXX - move this array to unicodectype.c ? */
9064 Py_UNICODE linebreak[] = {
9065 0x000A, /* LINE FEED */
9066 0x000D, /* CARRIAGE RETURN */
9067 0x001C, /* FILE SEPARATOR */
9068 0x001D, /* GROUP SEPARATOR */
9069 0x001E, /* RECORD SEPARATOR */
9070 0x0085, /* NEXT LINE */
9071 0x2028, /* LINE SEPARATOR */
9072 0x2029, /* PARAGRAPH SEPARATOR */
9075 /* Init the implementation */
9076 free_list = NULL;
9077 numfree = 0;
9078 unicode_empty = _PyUnicode_New(0);
9079 if (!unicode_empty)
9080 return;
9082 strcpy(unicode_default_encoding, "ascii");
9083 for (i = 0; i < 256; i++)
9084 unicode_latin1[i] = NULL;
9085 if (PyType_Ready(&PyUnicode_Type) < 0)
9086 Py_FatalError("Can't initialize 'unicode'");
9088 /* initialize the linebreak bloom filter */
9089 bloom_linebreak = make_bloom_mask(
9090 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9093 PyType_Ready(&EncodingMapType);
9096 /* Finalize the Unicode implementation */
9099 PyUnicode_ClearFreeList(void)
9101 int freelist_size = numfree;
9102 PyUnicodeObject *u;
9104 for (u = free_list; u != NULL;) {
9105 PyUnicodeObject *v = u;
9106 u = *(PyUnicodeObject **)u;
9107 if (v->str)
9108 PyObject_DEL(v->str);
9109 Py_XDECREF(v->defenc);
9110 PyObject_Del(v);
9111 numfree--;
9113 free_list = NULL;
9114 assert(numfree == 0);
9115 return freelist_size;
9118 void
9119 _PyUnicode_Fini(void)
9121 int i;
9123 Py_XDECREF(unicode_empty);
9124 unicode_empty = NULL;
9126 for (i = 0; i < 256; i++) {
9127 if (unicode_latin1[i]) {
9128 Py_DECREF(unicode_latin1[i]);
9129 unicode_latin1[i] = NULL;
9132 (void)PyUnicode_ClearFreeList();
9135 #ifdef __cplusplus
9137 #endif
9141 Local variables:
9142 c-basic-offset: 4
9143 indent-tabs-mode: nil
9144 End: