Move function back to its section.
[python.git] / Objects / unicodeobject.c
blobc4b490295f32e14e7a0f8b5bba0888d9d1996be1
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
113 static char unicode_default_encoding[100];
115 /* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by a 7-bit character code (0x00..0x7F): an entry is 1
   when the character is treated as whitespace, 0 otherwise.  Each row below
   covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*     case 0x0009: * HORIZONTAL TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * VERTICAL TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
146 /* Same for linebreaks */
/* Lookup table indexed by a 7-bit character code (0x00..0x7F): an entry is 1
   when the character is a line break, 0 otherwise.  The table is only ever
   read (see BLOOM_LINEBREAK), hence const.  Each row covers eight code
   points. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
185 /* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long: shifting a signed int 1 left by 31
   is undefined, and a negative int would sign-extend when widened to a
   64-bit mask.  `mask` is parenthesized so expression arguments bind
   correctly. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* ASCII characters are answered from the ascii_linebreak table; everything
   else is pre-filtered through bloom_linebreak before the full
   Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
205 /* calculate simple bloom-style bitmask for a given unicode string */
207 long mask;
208 Py_ssize_t i;
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
214 return mask;
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
219 Py_ssize_t i;
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
225 return 0;
/* Membership test: cheap bloom-mask check first, exact scan of `set` only
   when the mask says the character might be present.  The expansion is
   parenthesized so it can be negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
233 static
234 int unicode_resize(register PyUnicodeObject *unicode,
235 Py_ssize_t length)
237 void *oldstr;
239 /* Shortcut if there's nothing much to do. */
240 if (unicode->length == length)
241 goto reset;
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
251 PyErr_SetString(PyExc_SystemError,
252 "can't resize shared unicode objects");
253 return -1;
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
259 it contains). */
261 oldstr = unicode->str;
262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
264 if (!unicode->str) {
265 unicode->str = (Py_UNICODE *)oldstr;
266 PyErr_NoMemory();
267 return -1;
269 unicode->str[length] = 0;
270 unicode->length = length;
272 reset:
273 /* Reset the object caches */
274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
278 unicode->hash = -1;
280 return 0;
283 /* We allocate one more byte to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
291 static
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
294 register PyUnicodeObject *unicode;
296 /* Optimization for empty strings */
297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
316 unicode_resize(unicode, length) < 0) {
317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
321 else {
322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
325 PyObject_INIT(unicode, &PyUnicode_Type);
327 else {
328 size_t new_size;
329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
330 if (unicode == NULL)
331 return NULL;
332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 if (!unicode->str) {
337 PyErr_NoMemory();
338 goto onError;
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
347 unicode->str[0] = 0;
348 unicode->str[length] = 0;
349 unicode->length = length;
350 unicode->hash = -1;
351 unicode->defenc = NULL;
352 return unicode;
354 onError:
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
357 _Py_ForgetReference((PyObject *)unicode);
358 PyObject_Del(unicode);
359 return NULL;
362 static
363 void unicode_dealloc(register PyUnicodeObject *unicode)
365 if (PyUnicode_CheckExact(unicode) &&
366 numfree < PyUnicode_MAXFREELIST) {
367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
377 /* Add to free list */
378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
382 else {
383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
389 static
390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
392 register PyUnicodeObject *v;
394 /* Argument checks */
395 if (unicode == NULL) {
396 PyErr_BadInternalCall();
397 return -1;
399 v = *unicode;
400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
401 PyErr_BadInternalCall();
402 return -1;
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
408 if (v->length != length &&
409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
431 Py_ssize_t size)
433 PyUnicodeObject *unicode;
435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
467 Py_UNICODE_COPY(unicode->str, u, size);
469 return (PyObject *)unicode;
472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
474 PyUnicodeObject *unicode;
476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
478 "Negative size passed to PyUnicode_FromStringAndSize");
479 return NULL;
482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
509 return PyUnicode_DecodeUTF8(u, size, NULL);
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
516 return (PyObject *)unicode;
519 PyObject *PyUnicode_FromString(const char *u)
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
527 return PyUnicode_FromStringAndSize(u, size);
530 #ifdef HAVE_WCHAR_H
532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533 # define CONVERT_WCHAR_TO_SURROGATES
534 #endif
536 #ifdef CONVERT_WCHAR_TO_SURROGATES
538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
542 Py_ssize_t size)
544 PyUnicodeObject *unicode;
545 register Py_ssize_t i;
546 Py_ssize_t alloc;
547 const wchar_t *orig_w;
549 if (w == NULL) {
550 PyErr_BadInternalCall();
551 return NULL;
554 alloc = size;
555 orig_w = w;
556 for (i = size; i > 0; i--) {
557 if (*w > 0xFFFF)
558 alloc++;
559 w++;
561 w = orig_w;
562 unicode = _PyUnicode_New(alloc);
563 if (!unicode)
564 return NULL;
566 /* Copy the wchar_t data into the new object */
568 register Py_UNICODE *u;
569 u = PyUnicode_AS_UNICODE(unicode);
570 for (i = size; i > 0; i--) {
571 if (*w > 0xFFFF) {
572 wchar_t ordinal = *w++;
573 ordinal -= 0x10000;
574 *u++ = 0xD800 | (ordinal >> 10);
575 *u++ = 0xDC00 | (ordinal & 0x3FF);
577 else
578 *u++ = *w++;
581 return (PyObject *)unicode;
584 #else
586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
589 PyUnicodeObject *unicode;
591 if (w == NULL) {
592 PyErr_BadInternalCall();
593 return NULL;
596 unicode = _PyUnicode_New(size);
597 if (!unicode)
598 return NULL;
600 /* Copy the wchar_t data into the new object */
601 #ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode->str, w, size * sizeof(wchar_t));
603 #else
605 register Py_UNICODE *u;
606 register Py_ssize_t i;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--)
609 *u++ = *w++;
611 #endif
613 return (PyObject *)unicode;
616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
618 #undef CONVERT_WCHAR_TO_SURROGATES
620 static void
621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
623 *fmt++ = '%';
624 if (width) {
625 if (zeropad)
626 *fmt++ = '0';
627 fmt += sprintf(fmt, "%d", width);
629 if (precision)
630 fmt += sprintf(fmt, ".%d", precision);
631 if (longflag)
632 *fmt++ = 'l';
633 else if (size_tflag) {
634 char *f = PY_FORMAT_SIZE_T;
635 while (*f)
636 *fmt++ = *f++;
638 *fmt++ = c;
639 *fmt = '\0';
/* Append the NUL-terminated char string `string` to the output cursor `s`,
   one character per output element.  Uses the caller's `copy` and `s`
   locals.  Wrapped in do/while(0) so it behaves as a single statement. */
#define appendstring(string)                            \
    do {                                                \
        for (copy = (string); *copy;)                   \
            *s++ = *copy++;                             \
    } while (0)
644 PyObject *
645 PyUnicode_FromFormatV(const char *format, va_list vargs)
647 va_list count;
648 Py_ssize_t callcount = 0;
649 PyObject **callresults = NULL;
650 PyObject **callresult = NULL;
651 Py_ssize_t n = 0;
652 int width = 0;
653 int precision = 0;
654 int zeropad;
655 const char* f;
656 Py_UNICODE *s;
657 PyObject *string;
658 /* used by sprintf */
659 char buffer[21];
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer = NULL;
663 char *realbuffer;
664 Py_ssize_t abuffersize = 0;
665 char fmt[60]; /* should be enough for %0width.precisionld */
666 const char *copy;
668 #ifdef VA_LIST_IS_ARRAY
669 Py_MEMCPY(count, vargs, sizeof(va_list));
670 #else
671 #ifdef __va_copy
672 __va_copy(count, vargs);
673 #else
674 count = vargs;
675 #endif
676 #endif
677 /* step 1: count the number of %S/%R/%s format specifications
678 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
679 * objects once during step 3 and put the result in an array) */
680 for (f = format; *f; f++) {
681 if (*f == '%') {
682 if (*(f+1)=='%')
683 continue;
684 if (*(f+1)=='S' || *(f+1)=='R')
685 ++callcount;
686 while (isdigit((unsigned)*f))
687 width = (width*10) + *f++ - '0';
688 while (*++f && *f != '%' && !isalpha((unsigned)*f))
690 if (*f == 's')
691 ++callcount;
694 /* step 2: allocate memory for the results of
695 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
696 if (callcount) {
697 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
698 if (!callresults) {
699 PyErr_NoMemory();
700 return NULL;
702 callresult = callresults;
704 /* step 3: figure out how large a buffer we need */
705 for (f = format; *f; f++) {
706 if (*f == '%') {
707 const char* p = f;
708 width = 0;
709 while (isdigit((unsigned)*f))
710 width = (width*10) + *f++ - '0';
711 while (*++f && *f != '%' && !isalpha((unsigned)*f))
714 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
715 * they don't affect the amount of space we reserve.
717 if ((*f == 'l' || *f == 'z') &&
718 (f[1] == 'd' || f[1] == 'u'))
719 ++f;
721 switch (*f) {
722 case 'c':
723 (void)va_arg(count, int);
724 /* fall through... */
725 case '%':
726 n++;
727 break;
728 case 'd': case 'u': case 'i': case 'x':
729 (void) va_arg(count, int);
730 /* 20 bytes is enough to hold a 64-bit
731 integer. Decimal takes the most space.
732 This isn't enough for octal.
733 If a width is specified we need more
734 (which we allocate later). */
735 if (width < 20)
736 width = 20;
737 n += width;
738 if (abuffersize < width)
739 abuffersize = width;
740 break;
741 case 's':
743 /* UTF-8 */
744 const char *s = va_arg(count, const char*);
745 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
746 if (!str)
747 goto fail;
748 n += PyUnicode_GET_SIZE(str);
749 /* Remember the str and switch to the next slot */
750 *callresult++ = str;
751 break;
753 case 'U':
755 PyObject *obj = va_arg(count, PyObject *);
756 assert(obj && PyUnicode_Check(obj));
757 n += PyUnicode_GET_SIZE(obj);
758 break;
760 case 'V':
762 PyObject *obj = va_arg(count, PyObject *);
763 const char *str = va_arg(count, const char *);
764 assert(obj || str);
765 assert(!obj || PyUnicode_Check(obj));
766 if (obj)
767 n += PyUnicode_GET_SIZE(obj);
768 else
769 n += strlen(str);
770 break;
772 case 'S':
774 PyObject *obj = va_arg(count, PyObject *);
775 PyObject *str;
776 assert(obj);
777 str = PyObject_Str(obj);
778 if (!str)
779 goto fail;
780 n += PyUnicode_GET_SIZE(str);
781 /* Remember the str and switch to the next slot */
782 *callresult++ = str;
783 break;
785 case 'R':
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *repr;
789 assert(obj);
790 repr = PyObject_Repr(obj);
791 if (!repr)
792 goto fail;
793 n += PyUnicode_GET_SIZE(repr);
794 /* Remember the repr and switch to the next slot */
795 *callresult++ = repr;
796 break;
798 case 'p':
799 (void) va_arg(count, int);
800 /* maximum 64-bit pointer representation:
801 * 0xffffffffffffffff
802 * so 19 characters is enough.
803 * XXX I count 18 -- what's the extra for?
805 n += 19;
806 break;
807 default:
808 /* if we stumble upon an unknown
809 formatting code, copy the rest of
810 the format string to the output
811 string. (we cannot just skip the
812 code, since there's no way to know
813 what's in the argument list) */
814 n += strlen(p);
815 goto expand;
817 } else
818 n++;
820 expand:
821 if (abuffersize > 20) {
822 abuffer = PyObject_Malloc(abuffersize);
823 if (!abuffer) {
824 PyErr_NoMemory();
825 goto fail;
827 realbuffer = abuffer;
829 else
830 realbuffer = buffer;
831 /* step 4: fill the buffer */
832 /* Since we've analyzed how much space we need for the worst case,
833 we don't have to resize the string.
834 There can be no errors beyond this point. */
835 string = PyUnicode_FromUnicode(NULL, n);
836 if (!string)
837 goto fail;
839 s = PyUnicode_AS_UNICODE(string);
840 callresult = callresults;
842 for (f = format; *f; f++) {
843 if (*f == '%') {
844 const char* p = f++;
845 int longflag = 0;
846 int size_tflag = 0;
847 zeropad = (*f == '0');
848 /* parse the width.precision part */
849 width = 0;
850 while (isdigit((unsigned)*f))
851 width = (width*10) + *f++ - '0';
852 precision = 0;
853 if (*f == '.') {
854 f++;
855 while (isdigit((unsigned)*f))
856 precision = (precision*10) + *f++ - '0';
858 /* handle the long flag, but only for %ld and %lu.
859 others can be added when necessary. */
860 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
861 longflag = 1;
862 ++f;
864 /* handle the size_t flag. */
865 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
866 size_tflag = 1;
867 ++f;
870 switch (*f) {
871 case 'c':
872 *s++ = va_arg(vargs, int);
873 break;
874 case 'd':
875 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
876 if (longflag)
877 sprintf(realbuffer, fmt, va_arg(vargs, long));
878 else if (size_tflag)
879 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
880 else
881 sprintf(realbuffer, fmt, va_arg(vargs, int));
882 appendstring(realbuffer);
883 break;
884 case 'u':
885 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
886 if (longflag)
887 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
888 else if (size_tflag)
889 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
890 else
891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
892 appendstring(realbuffer);
893 break;
894 case 'i':
895 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
896 sprintf(realbuffer, fmt, va_arg(vargs, int));
897 appendstring(realbuffer);
898 break;
899 case 'x':
900 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 's':
906 /* unused, since we already have the result */
907 (void) va_arg(vargs, char *);
908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
909 PyUnicode_GET_SIZE(*callresult));
910 s += PyUnicode_GET_SIZE(*callresult);
911 /* We're done with the unicode()/repr() => forget it */
912 Py_DECREF(*callresult);
913 /* switch to next unicode()/repr() result */
914 ++callresult;
915 break;
917 case 'U':
919 PyObject *obj = va_arg(vargs, PyObject *);
920 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
922 s += size;
923 break;
925 case 'V':
927 PyObject *obj = va_arg(vargs, PyObject *);
928 const char *str = va_arg(vargs, const char *);
929 if (obj) {
930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
933 } else {
934 appendstring(str);
936 break;
938 case 'S':
939 case 'R':
941 Py_UNICODE *ucopy;
942 Py_ssize_t usize;
943 Py_ssize_t upos;
944 /* unused, since we already have the result */
945 (void) va_arg(vargs, PyObject *);
946 ucopy = PyUnicode_AS_UNICODE(*callresult);
947 usize = PyUnicode_GET_SIZE(*callresult);
948 for (upos = 0; upos<usize;)
949 *s++ = ucopy[upos++];
950 /* We're done with the unicode()/repr() => forget it */
951 Py_DECREF(*callresult);
952 /* switch to next unicode()/repr() result */
953 ++callresult;
954 break;
956 case 'p':
957 sprintf(buffer, "%p", va_arg(vargs, void*));
958 /* %p is ill-defined: ensure leading 0x. */
959 if (buffer[1] == 'X')
960 buffer[1] = 'x';
961 else if (buffer[1] != 'x') {
962 memmove(buffer+2, buffer, strlen(buffer)+1);
963 buffer[0] = '0';
964 buffer[1] = 'x';
966 appendstring(buffer);
967 break;
968 case '%':
969 *s++ = '%';
970 break;
971 default:
972 appendstring(p);
973 goto end;
975 } else
976 *s++ = *f;
979 end:
980 if (callresults)
981 PyObject_Free(callresults);
982 if (abuffer)
983 PyObject_Free(abuffer);
984 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
985 return string;
986 fail:
987 if (callresults) {
988 PyObject **callresult2 = callresults;
989 while (callresult2 < callresult) {
990 Py_DECREF(*callresult2);
991 ++callresult2;
993 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 return NULL;
1000 #undef appendstring
1002 PyObject *
1003 PyUnicode_FromFormat(const char *format, ...)
1005 PyObject* ret;
1006 va_list vargs;
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009 va_start(vargs, format);
1010 #else
1011 va_start(vargs);
1012 #endif
1013 ret = PyUnicode_FromFormatV(format, vargs);
1014 va_end(vargs);
1015 return ret;
1018 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1019 wchar_t *w,
1020 Py_ssize_t size)
1022 if (unicode == NULL) {
1023 PyErr_BadInternalCall();
1024 return -1;
1027 /* If possible, try to copy the 0-termination as well */
1028 if (size > PyUnicode_GET_SIZE(unicode))
1029 size = PyUnicode_GET_SIZE(unicode) + 1;
1031 #ifdef HAVE_USABLE_WCHAR_T
1032 memcpy(w, unicode->str, size * sizeof(wchar_t));
1033 #else
1035 register Py_UNICODE *u;
1036 register Py_ssize_t i;
1037 u = PyUnicode_AS_UNICODE(unicode);
1038 for (i = size; i > 0; i--)
1039 *w++ = *u++;
1041 #endif
1043 if (size > PyUnicode_GET_SIZE(unicode))
1044 return PyUnicode_GET_SIZE(unicode);
1045 else
1046 return size;
1049 #endif
1051 PyObject *PyUnicode_FromOrdinal(int ordinal)
1053 Py_UNICODE s[1];
1055 #ifdef Py_UNICODE_WIDE
1056 if (ordinal < 0 || ordinal > 0x10ffff) {
1057 PyErr_SetString(PyExc_ValueError,
1058 "unichr() arg not in range(0x110000) "
1059 "(wide Python build)");
1060 return NULL;
1062 #else
1063 if (ordinal < 0 || ordinal > 0xffff) {
1064 PyErr_SetString(PyExc_ValueError,
1065 "unichr() arg not in range(0x10000) "
1066 "(narrow Python build)");
1067 return NULL;
1069 #endif
1071 s[0] = (Py_UNICODE)ordinal;
1072 return PyUnicode_FromUnicode(s, 1);
1075 PyObject *PyUnicode_FromObject(register PyObject *obj)
1077 /* XXX Perhaps we should make this API an alias of
1078 PyObject_Unicode() instead ?! */
1079 if (PyUnicode_CheckExact(obj)) {
1080 Py_INCREF(obj);
1081 return obj;
1083 if (PyUnicode_Check(obj)) {
1084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087 PyUnicode_GET_SIZE(obj));
1089 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1092 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1093 const char *encoding,
1094 const char *errors)
1096 const char *s = NULL;
1097 Py_ssize_t len;
1098 PyObject *v;
1100 if (obj == NULL) {
1101 PyErr_BadInternalCall();
1102 return NULL;
1105 #if 0
1106 /* For b/w compatibility we also accept Unicode objects provided
1107 that no encodings is given and then redirect to
1108 PyObject_Unicode() which then applies the additional logic for
1109 Unicode subclasses.
1111 NOTE: This API should really only be used for object which
1112 represent *encoded* Unicode !
1115 if (PyUnicode_Check(obj)) {
1116 if (encoding) {
1117 PyErr_SetString(PyExc_TypeError,
1118 "decoding Unicode is not supported");
1119 return NULL;
1121 return PyObject_Unicode(obj);
1123 #else
1124 if (PyUnicode_Check(obj)) {
1125 PyErr_SetString(PyExc_TypeError,
1126 "decoding Unicode is not supported");
1127 return NULL;
1129 #endif
1131 /* Coerce object */
1132 if (PyString_Check(obj)) {
1133 s = PyString_AS_STRING(obj);
1134 len = PyString_GET_SIZE(obj);
1136 else if (PyByteArray_Check(obj)) {
1137 /* Python 2.x specific */
1138 PyErr_Format(PyExc_TypeError,
1139 "decoding bytearray is not supported");
1140 return NULL;
1142 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1143 /* Overwrite the error message with something more useful in
1144 case of a TypeError. */
1145 if (PyErr_ExceptionMatches(PyExc_TypeError))
1146 PyErr_Format(PyExc_TypeError,
1147 "coercing to Unicode: need string or buffer, "
1148 "%.80s found",
1149 Py_TYPE(obj)->tp_name);
1150 goto onError;
1153 /* Convert to Unicode */
1154 if (len == 0) {
1155 Py_INCREF(unicode_empty);
1156 v = (PyObject *)unicode_empty;
1158 else
1159 v = PyUnicode_Decode(s, len, encoding, errors);
1161 return v;
1163 onError:
1164 return NULL;
1167 PyObject *PyUnicode_Decode(const char *s,
1168 Py_ssize_t size,
1169 const char *encoding,
1170 const char *errors)
1172 PyObject *buffer = NULL, *unicode;
1174 if (encoding == NULL)
1175 encoding = PyUnicode_GetDefaultEncoding();
1177 /* Shortcuts for common default encodings */
1178 if (strcmp(encoding, "utf-8") == 0)
1179 return PyUnicode_DecodeUTF8(s, size, errors);
1180 else if (strcmp(encoding, "latin-1") == 0)
1181 return PyUnicode_DecodeLatin1(s, size, errors);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183 else if (strcmp(encoding, "mbcs") == 0)
1184 return PyUnicode_DecodeMBCS(s, size, errors);
1185 #endif
1186 else if (strcmp(encoding, "ascii") == 0)
1187 return PyUnicode_DecodeASCII(s, size, errors);
1189 /* Decode via the codec registry */
1190 buffer = PyBuffer_FromMemory((void *)s, size);
1191 if (buffer == NULL)
1192 goto onError;
1193 unicode = PyCodec_Decode(buffer, encoding, errors);
1194 if (unicode == NULL)
1195 goto onError;
1196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_Format(PyExc_TypeError,
1198 "decoder did not return an unicode object (type=%.400s)",
1199 Py_TYPE(unicode)->tp_name);
1200 Py_DECREF(unicode);
1201 goto onError;
1203 Py_DECREF(buffer);
1204 return unicode;
1206 onError:
1207 Py_XDECREF(buffer);
1208 return NULL;
1211 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212 const char *encoding,
1213 const char *errors)
1215 PyObject *v;
1217 if (!PyUnicode_Check(unicode)) {
1218 PyErr_BadArgument();
1219 goto onError;
1222 if (encoding == NULL)
1223 encoding = PyUnicode_GetDefaultEncoding();
1225 /* Decode via the codec registry */
1226 v = PyCodec_Decode(unicode, encoding, errors);
1227 if (v == NULL)
1228 goto onError;
1229 return v;
1231 onError:
1232 return NULL;
1235 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
1240 PyObject *v, *unicode;
1242 unicode = PyUnicode_FromUnicode(s, size);
1243 if (unicode == NULL)
1244 return NULL;
1245 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246 Py_DECREF(unicode);
1247 return v;
1250 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251 const char *encoding,
1252 const char *errors)
1254 PyObject *v;
1256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 goto onError;
1261 if (encoding == NULL)
1262 encoding = PyUnicode_GetDefaultEncoding();
1264 /* Encode via the codec registry */
1265 v = PyCodec_Encode(unicode, encoding, errors);
1266 if (v == NULL)
1267 goto onError;
1268 return v;
1270 onError:
1271 return NULL;
1274 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275 const char *encoding,
1276 const char *errors)
1278 PyObject *v;
1280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_BadArgument();
1282 goto onError;
1285 if (encoding == NULL)
1286 encoding = PyUnicode_GetDefaultEncoding();
1288 /* Shortcuts for common default encodings */
1289 if (errors == NULL) {
1290 if (strcmp(encoding, "utf-8") == 0)
1291 return PyUnicode_AsUTF8String(unicode);
1292 else if (strcmp(encoding, "latin-1") == 0)
1293 return PyUnicode_AsLatin1String(unicode);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295 else if (strcmp(encoding, "mbcs") == 0)
1296 return PyUnicode_AsMBCSString(unicode);
1297 #endif
1298 else if (strcmp(encoding, "ascii") == 0)
1299 return PyUnicode_AsASCIIString(unicode);
1302 /* Encode via the codec registry */
1303 v = PyCodec_Encode(unicode, encoding, errors);
1304 if (v == NULL)
1305 goto onError;
1306 if (!PyString_Check(v)) {
1307 PyErr_Format(PyExc_TypeError,
1308 "encoder did not return a string object (type=%.400s)",
1309 Py_TYPE(v)->tp_name);
1310 Py_DECREF(v);
1311 goto onError;
1313 return v;
1315 onError:
1316 return NULL;
1319 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1320 const char *errors)
1322 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1324 if (v)
1325 return v;
1326 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327 if (v && errors == NULL)
1328 ((PyUnicodeObject *)unicode)->defenc = v;
1329 return v;
1332 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1334 if (!PyUnicode_Check(unicode)) {
1335 PyErr_BadArgument();
1336 goto onError;
1338 return PyUnicode_AS_UNICODE(unicode);
1340 onError:
1341 return NULL;
1344 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1346 if (!PyUnicode_Check(unicode)) {
1347 PyErr_BadArgument();
1348 goto onError;
1350 return PyUnicode_GET_SIZE(unicode);
1352 onError:
1353 return -1;
1356 const char *PyUnicode_GetDefaultEncoding(void)
1358 return unicode_default_encoding;
1361 int PyUnicode_SetDefaultEncoding(const char *encoding)
1363 PyObject *v;
1365 /* Make sure the encoding is valid. As side effect, this also
1366 loads the encoding into the codec registry cache. */
1367 v = _PyCodec_Lookup(encoding);
1368 if (v == NULL)
1369 goto onError;
1370 Py_DECREF(v);
1371 strncpy(unicode_default_encoding,
1372 encoding,
1373 sizeof(unicode_default_encoding));
1374 return 0;
1376 onError:
1377 return -1;
1380 /* error handling callback helper:
1381 build arguments, call the callback and check the arguments,
1382 if no exception occurred, copy the replacement to the output
1383 and adjust various state variables.
1384 return 0 on success, -1 on error
1387 static
1388 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1389 const char *encoding, const char *reason,
1390 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1394 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1396 PyObject *restuple = NULL;
1397 PyObject *repunicode = NULL;
1398 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399 Py_ssize_t requiredsize;
1400 Py_ssize_t newpos;
1401 Py_UNICODE *repptr;
1402 Py_ssize_t repsize;
1403 int res = -1;
1405 if (*errorHandler == NULL) {
1406 *errorHandler = PyCodec_LookupError(errors);
1407 if (*errorHandler == NULL)
1408 goto onError;
1411 if (*exceptionObject == NULL) {
1412 *exceptionObject = PyUnicodeDecodeError_Create(
1413 encoding, input, insize, *startinpos, *endinpos, reason);
1414 if (*exceptionObject == NULL)
1415 goto onError;
1417 else {
1418 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419 goto onError;
1420 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421 goto onError;
1422 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423 goto onError;
1426 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427 if (restuple == NULL)
1428 goto onError;
1429 if (!PyTuple_Check(restuple)) {
1430 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1431 goto onError;
1433 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1434 goto onError;
1435 if (newpos<0)
1436 newpos = insize+newpos;
1437 if (newpos<0 || newpos>insize) {
1438 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439 goto onError;
1442 /* need more space? (at least enough for what we
1443 have+the replacement+the rest of the string (starting
1444 at the new input position), so we won't have to check space
1445 when there are no errors in the rest of the string) */
1446 repptr = PyUnicode_AS_UNICODE(repunicode);
1447 repsize = PyUnicode_GET_SIZE(repunicode);
1448 requiredsize = *outpos + repsize + insize-newpos;
1449 if (requiredsize > outsize) {
1450 if (requiredsize<2*outsize)
1451 requiredsize = 2*outsize;
1452 if (_PyUnicode_Resize(output, requiredsize) < 0)
1453 goto onError;
1454 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1456 *endinpos = newpos;
1457 *inptr = input + newpos;
1458 Py_UNICODE_COPY(*outptr, repptr, repsize);
1459 *outptr += repsize;
1460 *outpos += repsize;
1461 /* we made it! */
1462 res = 0;
1464 onError:
1465 Py_XDECREF(restuple);
1466 return res;
1469 /* --- UTF-7 Codec -------------------------------------------------------- */
1471 /* See RFC2152 for details. We encode conservatively and decode liberally. */
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
   Written as explicit ASCII range tests rather than isalnum(): the
   result must not depend on the C locale, and the argument may be a
   Py_UNICODE value outside the range isalnum() accepts. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first, so the table is never indexed with NUL
 * or a value of 128 or more. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
1547 PyObject *PyUnicode_DecodeUTF7(const char *s,
1548 Py_ssize_t size,
1549 const char *errors)
1551 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1554 /* The decoder. The only state we preserve is our read position,
1555 * i.e. how many characters we have consumed. So if we end in the
1556 * middle of a shift sequence we have to back off the read position
1557 * and the output to the beginning of the sequence, otherwise we lose
1558 * all the shift state (seen bits, number of bits seen, high
1559 * surrogate). */
/* Parameters: s/size give the input bytes, errors names the error
 * handler passed on to unicode_decode_call_errorhandler(), and consumed
 * may be NULL.  When consumed is non-NULL it receives the number of
 * input bytes accounted for; input ending inside a shift sequence is
 * then rolled back instead of being reported as an error.
 * Returns the decoded unicode object, or NULL when the error handler
 * or the final resize fails. */
1561 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1562 Py_ssize_t size,
1563 const char *errors,
1564 Py_ssize_t *consumed)
1566 const char *starts = s;
1567 Py_ssize_t startinpos;
1568 Py_ssize_t endinpos;
1569 Py_ssize_t outpos;
1570 const char *e;
1571 PyUnicodeObject *unicode;
1572 Py_UNICODE *p;
1573 const char *errmsg = "";
/* inShift: non-zero while inside a '+...' base-64 section */
1574 int inShift = 0;
/* shiftOutStart: output position when the current shift sequence began */
1575 Py_UNICODE *shiftOutStart;
/* base64bits/base64buffer: count and value of the bits not yet emitted */
1576 unsigned int base64bits = 0;
1577 unsigned long base64buffer = 0;
/* surrogate: pending high surrogate waiting for its low half, else 0 */
1578 Py_UNICODE surrogate = 0;
1579 PyObject *errorHandler = NULL;
1580 PyObject *exc = NULL;
/* The output is allocated with one unit per input byte and trimmed to
   the units actually written before returning. */
1582 unicode = _PyUnicode_New(size);
1583 if (!unicode)
1584 return NULL;
1585 if (size == 0) {
1586 if (consumed)
1587 *consumed = 0;
1588 return (PyObject *)unicode;
1591 p = unicode->str;
1592 shiftOutStart = p;
1593 e = s + size;
1595 while (s < e) {
1596 Py_UNICODE ch = (unsigned char) *s;
1598 if (inShift) { /* in a base-64 section */
1599 if (IS_BASE64(ch)) { /* consume a base-64 character */
1600 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1601 base64bits += 6;
1602 s++;
1603 if (base64bits >= 16) {
1604 /* we have enough bits for a UTF-16 value */
1605 Py_UNICODE outCh = (Py_UNICODE)
1606 (base64buffer >> (base64bits-16));
1607 base64bits -= 16;
1608 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1609 if (surrogate) {
1610 /* expecting a second surrogate */
1611 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* wide builds store the combined code point, narrow builds the pair */
1612 #ifdef Py_UNICODE_WIDE
1613 *p++ = (((surrogate & 0x3FF)<<10)
1614 | (outCh & 0x3FF)) + 0x10000;
1615 #else
1616 *p++ = surrogate;
1617 *p++ = outCh;
1618 #endif
1619 surrogate = 0;
1621 else {
1622 surrogate = 0;
1623 errmsg = "second surrogate missing";
1624 goto utf7Error;
1627 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1628 /* first surrogate */
1629 surrogate = outCh;
1631 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1632 errmsg = "unexpected second surrogate";
1633 goto utf7Error;
1635 else {
1636 *p++ = outCh;
1640 else { /* now leaving a base-64 section */
1641 inShift = 0;
1642 s++;
1643 if (surrogate) {
1644 errmsg = "second surrogate missing at end of shift sequence";
1645 goto utf7Error;
1647 if (base64bits > 0) { /* left-over bits */
1648 if (base64bits >= 6) {
1649 /* We've seen at least one base-64 character */
1650 errmsg = "partial character in shift sequence";
1651 goto utf7Error;
1653 else {
1654 /* Some bits remain; they should be zero */
1655 if (base64buffer != 0) {
1656 errmsg = "non-zero padding bits in shift sequence";
1657 goto utf7Error;
1661 if (ch != '-') {
1662 /* '-' is absorbed; other terminating
1663 characters are preserved */
1664 *p++ = ch;
1668 else if ( ch == '+' ) {
/* remember where the sequence starts: used for error positions and
   for *consumed when the input ends inside the sequence */
1669 startinpos = s-starts;
1670 s++; /* consume '+' */
1671 if (s < e && *s == '-') { /* '+-' encodes '+' */
1672 s++;
1673 *p++ = '+';
1675 else { /* begin base64-encoded section */
1676 inShift = 1;
1677 shiftOutStart = p;
1678 base64bits = 0;
1681 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1682 *p++ = ch;
1683 s++;
1685 else {
1686 startinpos = s-starts;
1687 s++;
1688 errmsg = "unexpected special character";
1689 goto utf7Error;
1691 continue;
/* Error path, reached only by goto from inside the loop body.  On
   success the handler repositions s and p and the loop continues. */
1692 utf7Error:
1693 outpos = p-PyUnicode_AS_UNICODE(unicode);
1694 endinpos = s-starts;
1695 if (unicode_decode_call_errorhandler(
1696 errors, &errorHandler,
1697 "utf7", errmsg,
1698 starts, size, &startinpos, &endinpos, &exc, &s,
1699 &unicode, &outpos, &p))
1700 goto onError;
1703 /* end of string */
1705 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1706 /* if we're in an inconsistent state, that's an error */
1707 if (surrogate ||
1708 (base64bits >= 6) ||
1709 (base64bits > 0 && base64buffer != 0)) {
1710 outpos = p-PyUnicode_AS_UNICODE(unicode);
1711 endinpos = size;
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "utf7", "unterminated shift sequence",
1715 starts, size, &startinpos, &endinpos, &exc, &s,
1716 &unicode, &outpos, &p))
1717 goto onError;
1721 /* return state */
1722 if (consumed) {
1723 if (inShift) {
1724 p = shiftOutStart; /* back off output */
1725 *consumed = startinpos;
1727 else {
1728 *consumed = s-starts;
/* trim the output to the units actually written */
1732 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1733 goto onError;
1735 Py_XDECREF(errorHandler);
1736 Py_XDECREF(exc);
1737 return (PyObject *)unicode;
1739 onError:
1740 Py_XDECREF(errorHandler);
1741 Py_XDECREF(exc);
1742 Py_DECREF(unicode);
1743 return NULL;
1747 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1748 Py_ssize_t size,
1749 int base64SetO,
1750 int base64WhiteSpace,
1751 const char *errors)
1753 PyObject *v;
1754 /* It might be possible to tighten this worst case */
1755 Py_ssize_t allocated = 8 * size;
1756 int inShift = 0;
1757 Py_ssize_t i = 0;
1758 unsigned int base64bits = 0;
1759 unsigned long base64buffer = 0;
1760 char * out;
1761 char * start;
1763 if (allocated / 8 != size)
1764 return PyErr_NoMemory();
1766 if (size == 0)
1767 return PyString_FromStringAndSize(NULL, 0);
1769 v = PyString_FromStringAndSize(NULL, allocated);
1770 if (v == NULL)
1771 return NULL;
1773 start = out = PyString_AS_STRING(v);
1774 for (;i < size; ++i) {
1775 Py_UNICODE ch = s[i];
1777 if (inShift) {
1778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1779 /* shifting out */
1780 if (base64bits) { /* output remaining bits */
1781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1782 base64buffer = 0;
1783 base64bits = 0;
1785 inShift = 0;
1786 /* Characters not in the BASE64 set implicitly unshift the sequence
1787 so no '-' is required, except if the character is itself a '-' */
1788 if (IS_BASE64(ch) || ch == '-') {
1789 *out++ = '-';
1791 *out++ = (char) ch;
1793 else {
1794 goto encode_char;
1797 else { /* not in a shift sequence */
1798 if (ch == '+') {
1799 *out++ = '+';
1800 *out++ = '-';
1802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1803 *out++ = (char) ch;
1805 else {
1806 *out++ = '+';
1807 inShift = 1;
1808 goto encode_char;
1811 continue;
1812 encode_char:
1813 #ifdef Py_UNICODE_WIDE
1814 if (ch >= 0x10000) {
1815 /* code first surrogate */
1816 base64bits += 16;
1817 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1818 while (base64bits >= 6) {
1819 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1820 base64bits -= 6;
1822 /* prepare second surrogate */
1823 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1825 #endif
1826 base64bits += 16;
1827 base64buffer = (base64buffer << 16) | ch;
1828 while (base64bits >= 6) {
1829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1830 base64bits -= 6;
1833 if (base64bits)
1834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1835 if (inShift)
1836 *out++ = '-';
1838 _PyString_Resize(&v, out - start);
1839 return v;
/* The base-64 and direct-character helper macros are private to the
   UTF-7 codec; remove their definitions so that nothing later in the
   file can pick them up. */
1842 #undef IS_BASE64
1843 #undef FROM_BASE64
1844 #undef TO_BASE64
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Sequence length indexed by the first byte of a UTF-8 sequence.  The
   table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1872 PyObject *PyUnicode_DecodeUTF8(const char *s,
1873 Py_ssize_t size,
1874 const char *errors)
1876 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1879 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1880 Py_ssize_t size,
1881 const char *errors,
1882 Py_ssize_t *consumed)
1884 const char *starts = s;
1885 int n;
1886 Py_ssize_t startinpos;
1887 Py_ssize_t endinpos;
1888 Py_ssize_t outpos;
1889 const char *e;
1890 PyUnicodeObject *unicode;
1891 Py_UNICODE *p;
1892 const char *errmsg = "";
1893 PyObject *errorHandler = NULL;
1894 PyObject *exc = NULL;
1896 /* Note: size will always be longer than the resulting Unicode
1897 character count */
1898 unicode = _PyUnicode_New(size);
1899 if (!unicode)
1900 return NULL;
1901 if (size == 0) {
1902 if (consumed)
1903 *consumed = 0;
1904 return (PyObject *)unicode;
1907 /* Unpack UTF-8 encoded data */
1908 p = unicode->str;
1909 e = s + size;
1911 while (s < e) {
1912 Py_UCS4 ch = (unsigned char)*s;
1914 if (ch < 0x80) {
1915 *p++ = (Py_UNICODE)ch;
1916 s++;
1917 continue;
1920 n = utf8_code_length[ch];
1922 if (s + n > e) {
1923 if (consumed)
1924 break;
1925 else {
1926 errmsg = "unexpected end of data";
1927 startinpos = s-starts;
1928 endinpos = size;
1929 goto utf8Error;
1933 switch (n) {
1935 case 0:
1936 errmsg = "unexpected code byte";
1937 startinpos = s-starts;
1938 endinpos = startinpos+1;
1939 goto utf8Error;
1941 case 1:
1942 errmsg = "internal error";
1943 startinpos = s-starts;
1944 endinpos = startinpos+1;
1945 goto utf8Error;
1947 case 2:
1948 if ((s[1] & 0xc0) != 0x80) {
1949 errmsg = "invalid data";
1950 startinpos = s-starts;
1951 endinpos = startinpos+2;
1952 goto utf8Error;
1954 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1955 if (ch < 0x80) {
1956 startinpos = s-starts;
1957 endinpos = startinpos+2;
1958 errmsg = "illegal encoding";
1959 goto utf8Error;
1961 else
1962 *p++ = (Py_UNICODE)ch;
1963 break;
1965 case 3:
1966 if ((s[1] & 0xc0) != 0x80 ||
1967 (s[2] & 0xc0) != 0x80) {
1968 errmsg = "invalid data";
1969 startinpos = s-starts;
1970 endinpos = startinpos+3;
1971 goto utf8Error;
1973 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1974 if (ch < 0x0800) {
1975 /* Note: UTF-8 encodings of surrogates are considered
1976 legal UTF-8 sequences;
1978 XXX For wide builds (UCS-4) we should probably try
1979 to recombine the surrogates into a single code
1980 unit.
1982 errmsg = "illegal encoding";
1983 startinpos = s-starts;
1984 endinpos = startinpos+3;
1985 goto utf8Error;
1987 else
1988 *p++ = (Py_UNICODE)ch;
1989 break;
1991 case 4:
1992 if ((s[1] & 0xc0) != 0x80 ||
1993 (s[2] & 0xc0) != 0x80 ||
1994 (s[3] & 0xc0) != 0x80) {
1995 errmsg = "invalid data";
1996 startinpos = s-starts;
1997 endinpos = startinpos+4;
1998 goto utf8Error;
2000 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2001 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2002 /* validate and convert to UTF-16 */
2003 if ((ch < 0x10000) /* minimum value allowed for 4
2004 byte encoding */
2005 || (ch > 0x10ffff)) /* maximum value allowed for
2006 UTF-16 */
2008 errmsg = "illegal encoding";
2009 startinpos = s-starts;
2010 endinpos = startinpos+4;
2011 goto utf8Error;
2013 #ifdef Py_UNICODE_WIDE
2014 *p++ = (Py_UNICODE)ch;
2015 #else
2016 /* compute and append the two surrogates: */
2018 /* translate from 10000..10FFFF to 0..FFFF */
2019 ch -= 0x10000;
2021 /* high surrogate = top 10 bits added to D800 */
2022 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2024 /* low surrogate = bottom 10 bits added to DC00 */
2025 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2026 #endif
2027 break;
2029 default:
2030 /* Other sizes are only needed for UCS-4 */
2031 errmsg = "unsupported Unicode code range";
2032 startinpos = s-starts;
2033 endinpos = startinpos+n;
2034 goto utf8Error;
2036 s += n;
2037 continue;
2039 utf8Error:
2040 outpos = p-PyUnicode_AS_UNICODE(unicode);
2041 if (unicode_decode_call_errorhandler(
2042 errors, &errorHandler,
2043 "utf8", errmsg,
2044 starts, size, &startinpos, &endinpos, &exc, &s,
2045 &unicode, &outpos, &p))
2046 goto onError;
2048 if (consumed)
2049 *consumed = s-starts;
2051 /* Adjust length */
2052 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2053 goto onError;
2055 Py_XDECREF(errorHandler);
2056 Py_XDECREF(exc);
2057 return (PyObject *)unicode;
2059 onError:
2060 Py_XDECREF(errorHandler);
2061 Py_XDECREF(exc);
2062 Py_DECREF(unicode);
2063 return NULL;
2066 /* Allocation strategy: if the string is short, convert into a stack buffer
2067 and allocate exactly as much space needed at the end. Else allocate the
2068 maximum possible needed (4 result bytes per Unicode character), and return
2069 the excess memory at the end.
2071 PyObject *
2072 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2073 Py_ssize_t size,
2074 const char *errors)
2076 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2078 Py_ssize_t i; /* index into s of next input byte */
2079 PyObject *v; /* result string object */
2080 char *p; /* next free byte in output buffer */
2081 Py_ssize_t nallocated; /* number of result bytes allocated */
2082 Py_ssize_t nneeded; /* number of result bytes needed */
2083 char stackbuf[MAX_SHORT_UNICHARS * 4];
2085 assert(s != NULL);
2086 assert(size >= 0);
2088 if (size <= MAX_SHORT_UNICHARS) {
2089 /* Write into the stack buffer; nallocated can't overflow.
2090 * At the end, we'll allocate exactly as much heap space as it
2091 * turns out we need.
2093 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2094 v = NULL; /* will allocate after we're done */
2095 p = stackbuf;
2097 else {
2098 /* Overallocate on the heap, and give the excess back at the end. */
2099 nallocated = size * 4;
2100 if (nallocated / 4 != size) /* overflow! */
2101 return PyErr_NoMemory();
2102 v = PyString_FromStringAndSize(NULL, nallocated);
2103 if (v == NULL)
2104 return NULL;
2105 p = PyString_AS_STRING(v);
2108 for (i = 0; i < size;) {
2109 Py_UCS4 ch = s[i++];
2111 if (ch < 0x80)
2112 /* Encode ASCII */
2113 *p++ = (char) ch;
2115 else if (ch < 0x0800) {
2116 /* Encode Latin-1 */
2117 *p++ = (char)(0xc0 | (ch >> 6));
2118 *p++ = (char)(0x80 | (ch & 0x3f));
2120 else {
2121 /* Encode UCS2 Unicode ordinals */
2122 if (ch < 0x10000) {
2123 /* Special case: check for high surrogate */
2124 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2125 Py_UCS4 ch2 = s[i];
2126 /* Check for low surrogate and combine the two to
2127 form a UCS4 value */
2128 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2129 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2130 i++;
2131 goto encodeUCS4;
2133 /* Fall through: handles isolated high surrogates */
2135 *p++ = (char)(0xe0 | (ch >> 12));
2136 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2137 *p++ = (char)(0x80 | (ch & 0x3f));
2138 continue;
2140 encodeUCS4:
2141 /* Encode UCS4 Unicode ordinals */
2142 *p++ = (char)(0xf0 | (ch >> 18));
2143 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2144 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2145 *p++ = (char)(0x80 | (ch & 0x3f));
2149 if (v == NULL) {
2150 /* This was stack allocated. */
2151 nneeded = p - stackbuf;
2152 assert(nneeded <= nallocated);
2153 v = PyString_FromStringAndSize(stackbuf, nneeded);
2155 else {
2156 /* Cut back to size actually needed. */
2157 nneeded = p - PyString_AS_STRING(v);
2158 assert(nneeded <= nallocated);
2159 _PyString_Resize(&v, nneeded);
2161 return v;
2163 #undef MAX_SHORT_UNICHARS
2166 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2172 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode),
2174 NULL);
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
2179 PyObject *
2180 PyUnicode_DecodeUTF32(const char *s,
2181 Py_ssize_t size,
2182 const char *errors,
2183 int *byteorder)
2185 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188 PyObject *
2189 PyUnicode_DecodeUTF32Stateful(const char *s,
2190 Py_ssize_t size,
2191 const char *errors,
2192 int *byteorder,
2193 Py_ssize_t *consumed)
2195 const char *starts = s;
2196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
2199 PyUnicodeObject *unicode;
2200 Py_UNICODE *p;
2201 #ifndef Py_UNICODE_WIDE
2202 int i, pairs;
2203 #else
2204 const int pairs = 0;
2205 #endif
2206 const unsigned char *q, *e;
2207 int bo = 0; /* assume native ordering by default */
2208 const char *errmsg = "";
2209 /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211 int iorder[] = {0, 1, 2, 3};
2212 #else
2213 int iorder[] = {3, 2, 1, 0};
2214 #endif
2215 PyObject *errorHandler = NULL;
2216 PyObject *exc = NULL;
2217 /* On narrow builds we split characters outside the BMP into two
2218 codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220 for (i = pairs = 0; i < size/4; i++)
2221 if (((Py_UCS4 *)s)[i] >= 0x10000)
2222 pairs++;
2223 #endif
2225 /* This might be one to much, because of a BOM */
2226 unicode = _PyUnicode_New((size+3)/4+pairs);
2227 if (!unicode)
2228 return NULL;
2229 if (size == 0)
2230 return (PyObject *)unicode;
2232 /* Unpack UTF-32 encoded data */
2233 p = unicode->str;
2234 q = (unsigned char *)s;
2235 e = q + size;
2237 if (byteorder)
2238 bo = *byteorder;
2240 /* Check for BOM marks (U+FEFF) in the input and adjust current
2241 byte order setting accordingly. In native mode, the leading BOM
2242 mark is skipped, in all other modes, it is copied to the output
2243 stream as-is (giving a ZWNBSP character). */
2244 if (bo == 0) {
2245 if (size >= 4) {
2246 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2247 (q[iorder[1]] << 8) | q[iorder[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249 if (bom == 0x0000FEFF) {
2250 q += 4;
2251 bo = -1;
2253 else if (bom == 0xFFFE0000) {
2254 q += 4;
2255 bo = 1;
2257 #else
2258 if (bom == 0x0000FEFF) {
2259 q += 4;
2260 bo = 1;
2262 else if (bom == 0xFFFE0000) {
2263 q += 4;
2264 bo = -1;
2266 #endif
2270 if (bo == -1) {
2271 /* force LE */
2272 iorder[0] = 0;
2273 iorder[1] = 1;
2274 iorder[2] = 2;
2275 iorder[3] = 3;
2277 else if (bo == 1) {
2278 /* force BE */
2279 iorder[0] = 3;
2280 iorder[1] = 2;
2281 iorder[2] = 1;
2282 iorder[3] = 0;
2285 while (q < e) {
2286 Py_UCS4 ch;
2287 /* remaining bytes at the end? (size should be divisible by 4) */
2288 if (e-q<4) {
2289 if (consumed)
2290 break;
2291 errmsg = "truncated data";
2292 startinpos = ((const char *)q)-starts;
2293 endinpos = ((const char *)e)-starts;
2294 goto utf32Error;
2295 /* The remaining input chars are ignored if the callback
2296 chooses to skip the input */
2298 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2299 (q[iorder[1]] << 8) | q[iorder[0]];
2301 if (ch >= 0x110000)
2303 errmsg = "codepoint not in range(0x110000)";
2304 startinpos = ((const char *)q)-starts;
2305 endinpos = startinpos+4;
2306 goto utf32Error;
2308 #ifndef Py_UNICODE_WIDE
2309 if (ch >= 0x10000)
2311 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2312 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2314 else
2315 #endif
2316 *p++ = ch;
2317 q += 4;
2318 continue;
2319 utf32Error:
2320 outpos = p-PyUnicode_AS_UNICODE(unicode);
2321 if (unicode_decode_call_errorhandler(
2322 errors, &errorHandler,
2323 "utf32", errmsg,
2324 starts, size, &startinpos, &endinpos, &exc, &s,
2325 &unicode, &outpos, &p))
2326 goto onError;
2329 if (byteorder)
2330 *byteorder = bo;
2332 if (consumed)
2333 *consumed = (const char *)q-starts;
2335 /* Adjust length */
2336 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2337 goto onError;
2339 Py_XDECREF(errorHandler);
2340 Py_XDECREF(exc);
2341 return (PyObject *)unicode;
2343 onError:
2344 Py_DECREF(unicode);
2345 Py_XDECREF(errorHandler);
2346 Py_XDECREF(exc);
2347 return NULL;
2350 PyObject *
2351 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2352 Py_ssize_t size,
2353 const char *errors,
2354 int byteorder)
2356 PyObject *v;
2357 unsigned char *p;
2358 Py_ssize_t nsize, bytesize;
2359 #ifndef Py_UNICODE_WIDE
2360 Py_ssize_t i, pairs;
2361 #else
2362 const int pairs = 0;
2363 #endif
2364 /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder[] = {0, 1, 2, 3};
2367 #else
2368 int iorder[] = {3, 2, 1, 0};
2369 #endif
2371 #define STORECHAR(CH) \
2372 do { \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2377 p += 4; \
2378 } while(0)
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383 for (i = pairs = 0; i < size-1; i++)
2384 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386 pairs++;
2387 #endif
2388 nsize = (size - pairs + (byteorder == 0));
2389 bytesize = nsize * 4;
2390 if (bytesize / 4 != nsize)
2391 return PyErr_NoMemory();
2392 v = PyString_FromStringAndSize(NULL, bytesize);
2393 if (v == NULL)
2394 return NULL;
2396 p = (unsigned char *)PyString_AS_STRING(v);
2397 if (byteorder == 0)
2398 STORECHAR(0xFEFF);
2399 if (size == 0)
2400 return v;
2402 if (byteorder == -1) {
2403 /* force LE */
2404 iorder[0] = 0;
2405 iorder[1] = 1;
2406 iorder[2] = 2;
2407 iorder[3] = 3;
2409 else if (byteorder == 1) {
2410 /* force BE */
2411 iorder[0] = 3;
2412 iorder[1] = 2;
2413 iorder[2] = 1;
2414 iorder[3] = 0;
2417 while (size-- > 0) {
2418 Py_UCS4 ch = *s++;
2419 #ifndef Py_UNICODE_WIDE
2420 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2421 Py_UCS4 ch2 = *s;
2422 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2423 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2424 s++;
2425 size--;
2428 #endif
2429 STORECHAR(ch);
2431 return v;
2432 #undef STORECHAR
/* Encode a unicode object with PyUnicode_EncodeUTF32(), passing NULL as
   the error handler name.  Returns NULL after PyErr_BadArgument() when
   the argument is not a unicode object.
   NOTE(review): the line carrying the byteorder argument is not visible
   in this excerpt -- confirm its value against the full file. */
2435 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2437 if (!PyUnicode_Check(unicode)) {
2438 PyErr_BadArgument();
2439 return NULL;
2441 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2442 PyUnicode_GET_SIZE(unicode),
2443 NULL,
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
2449 PyObject *
2450 PyUnicode_DecodeUTF16(const char *s,
2451 Py_ssize_t size,
2452 const char *errors,
2453 int *byteorder)
2455 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2458 PyObject *
2459 PyUnicode_DecodeUTF16Stateful(const char *s,
2460 Py_ssize_t size,
2461 const char *errors,
2462 int *byteorder,
2463 Py_ssize_t *consumed)
2465 const char *starts = s;
2466 Py_ssize_t startinpos;
2467 Py_ssize_t endinpos;
2468 Py_ssize_t outpos;
2469 PyUnicodeObject *unicode;
2470 Py_UNICODE *p;
2471 const unsigned char *q, *e;
2472 int bo = 0; /* assume native ordering by default */
2473 const char *errmsg = "";
2474 /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476 int ihi = 1, ilo = 0;
2477 #else
2478 int ihi = 0, ilo = 1;
2479 #endif
2480 PyObject *errorHandler = NULL;
2481 PyObject *exc = NULL;
2483 /* Note: size will always be longer than the resulting Unicode
2484 character count */
2485 unicode = _PyUnicode_New(size);
2486 if (!unicode)
2487 return NULL;
2488 if (size == 0)
2489 return (PyObject *)unicode;
2491 /* Unpack UTF-16 encoded data */
2492 p = unicode->str;
2493 q = (unsigned char *)s;
2494 e = q + size;
2496 if (byteorder)
2497 bo = *byteorder;
2499 /* Check for BOM marks (U+FEFF) in the input and adjust current
2500 byte order setting accordingly. In native mode, the leading BOM
2501 mark is skipped, in all other modes, it is copied to the output
2502 stream as-is (giving a ZWNBSP character). */
2503 if (bo == 0) {
2504 if (size >= 2) {
2505 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507 if (bom == 0xFEFF) {
2508 q += 2;
2509 bo = -1;
2511 else if (bom == 0xFFFE) {
2512 q += 2;
2513 bo = 1;
2515 #else
2516 if (bom == 0xFEFF) {
2517 q += 2;
2518 bo = 1;
2520 else if (bom == 0xFFFE) {
2521 q += 2;
2522 bo = -1;
2524 #endif
2528 if (bo == -1) {
2529 /* force LE */
2530 ihi = 1;
2531 ilo = 0;
2533 else if (bo == 1) {
2534 /* force BE */
2535 ihi = 0;
2536 ilo = 1;
2539 while (q < e) {
2540 Py_UNICODE ch;
2541 /* remaining bytes at the end? (size should be even) */
2542 if (e-q<2) {
2543 if (consumed)
2544 break;
2545 errmsg = "truncated data";
2546 startinpos = ((const char *)q)-starts;
2547 endinpos = ((const char *)e)-starts;
2548 goto utf16Error;
2549 /* The remaining input chars are ignored if the callback
2550 chooses to skip the input */
2552 ch = (q[ihi] << 8) | q[ilo];
2554 q += 2;
2556 if (ch < 0xD800 || ch > 0xDFFF) {
2557 *p++ = ch;
2558 continue;
2561 /* UTF-16 code pair: */
2562 if (q >= e) {
2563 errmsg = "unexpected end of data";
2564 startinpos = (((const char *)q)-2)-starts;
2565 endinpos = ((const char *)e)-starts;
2566 goto utf16Error;
2568 if (0xD800 <= ch && ch <= 0xDBFF) {
2569 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2570 q += 2;
2571 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2573 *p++ = ch;
2574 *p++ = ch2;
2575 #else
2576 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2577 #endif
2578 continue;
2580 else {
2581 errmsg = "illegal UTF-16 surrogate";
2582 startinpos = (((const char *)q)-4)-starts;
2583 endinpos = startinpos+2;
2584 goto utf16Error;
2588 errmsg = "illegal encoding";
2589 startinpos = (((const char *)q)-2)-starts;
2590 endinpos = startinpos+2;
2591 /* Fall through to report the error */
2593 utf16Error:
2594 outpos = p-PyUnicode_AS_UNICODE(unicode);
2595 if (unicode_decode_call_errorhandler(
2596 errors, &errorHandler,
2597 "utf16", errmsg,
2598 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2599 &unicode, &outpos, &p))
2600 goto onError;
2603 if (byteorder)
2604 *byteorder = bo;
2606 if (consumed)
2607 *consumed = (const char *)q-starts;
2609 /* Adjust length */
2610 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2611 goto onError;
2613 Py_XDECREF(errorHandler);
2614 Py_XDECREF(exc);
2615 return (PyObject *)unicode;
2617 onError:
2618 Py_DECREF(unicode);
2619 Py_XDECREF(errorHandler);
2620 Py_XDECREF(exc);
2621 return NULL;
2624 PyObject *
2625 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2626 Py_ssize_t size,
2627 const char *errors,
2628 int byteorder)
2630 PyObject *v;
2631 unsigned char *p;
2632 Py_ssize_t nsize, bytesize;
2633 #ifdef Py_UNICODE_WIDE
2634 Py_ssize_t i, pairs;
2635 #else
2636 const int pairs = 0;
2637 #endif
2638 /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi = 1, ilo = 0;
2641 #else
2642 int ihi = 0, ilo = 1;
2643 #endif
2645 #define STORECHAR(CH) \
2646 do { \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2649 p += 2; \
2650 } while(0)
2652 #ifdef Py_UNICODE_WIDE
2653 for (i = pairs = 0; i < size; i++)
2654 if (s[i] >= 0x10000)
2655 pairs++;
2656 #endif
2657 /* 2 * (size + pairs + (byteorder == 0)) */
2658 if (size > PY_SSIZE_T_MAX ||
2659 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2660 return PyErr_NoMemory();
2661 nsize = size + pairs + (byteorder == 0);
2662 bytesize = nsize * 2;
2663 if (bytesize / 2 != nsize)
2664 return PyErr_NoMemory();
2665 v = PyString_FromStringAndSize(NULL, bytesize);
2666 if (v == NULL)
2667 return NULL;
2669 p = (unsigned char *)PyString_AS_STRING(v);
2670 if (byteorder == 0)
2671 STORECHAR(0xFEFF);
2672 if (size == 0)
2673 return v;
2675 if (byteorder == -1) {
2676 /* force LE */
2677 ihi = 1;
2678 ilo = 0;
2680 else if (byteorder == 1) {
2681 /* force BE */
2682 ihi = 0;
2683 ilo = 1;
2686 while (size-- > 0) {
2687 Py_UNICODE ch = *s++;
2688 Py_UNICODE ch2 = 0;
2689 #ifdef Py_UNICODE_WIDE
2690 if (ch >= 0x10000) {
2691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692 ch = 0xD800 | ((ch-0x10000) >> 10);
2694 #endif
2695 STORECHAR(ch);
2696 if (ch2)
2697 STORECHAR(ch2);
2699 return v;
2700 #undef STORECHAR
2703 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2705 if (!PyUnicode_Check(unicode)) {
2706 PyErr_BadArgument();
2707 return NULL;
2709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2710 PyUnicode_GET_SIZE(unicode),
2711 NULL,
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
/* Character-name lookup API obtained from the "unicodedata" module.  It stays
   NULL until PyUnicode_DecodeUnicodeEscape() meets its first \N escape and
   loads it from that module's "ucnhash_CAPI" attribute. */
2717 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
/* Decode `size` bytes at `s` written in the "unicodeescape" encoding into a
   new Unicode object.

   Bytes other than a backslash are copied as code points 0..255.  After a
   backslash the following forms are recognised: backslash-newline (produces
   nothing), the single-character escapes \\ \' \" \b \f \t \n \r \v \a,
   octal escapes of up to three digits, \xXX, \uXXXX, \UXXXXXXXX and
   \N{name}.  Any other character after a backslash is copied together with
   the backslash.

   Malformed escapes are passed to unicode_decode_call_errorhandler() using
   the handler named by `errors`.  Returns the new object, or NULL with an
   exception set. */
2719 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2720 Py_ssize_t size,
2721 const char *errors)
2723 const char *starts = s;
2724 Py_ssize_t startinpos;
2725 Py_ssize_t endinpos;
2726 Py_ssize_t outpos;
2727 int i;
2728 PyUnicodeObject *v;
2729 Py_UNICODE *p;
2730 const char *end;
2731 char* message;
2732 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2733 PyObject *errorHandler = NULL;
2734 PyObject *exc = NULL;
2736 /* Escaped strings will always be longer than the resulting
2737 Unicode string, so we start with size here and then reduce the
2738 length after conversion to the true value.
2739 (but if the error callback returns a long replacement string
2740 we'll have to allocate more space) */
2741 v = _PyUnicode_New(size);
2742 if (v == NULL)
2743 goto onError;
2744 if (size == 0)
2745 return (PyObject *)v;
2747 p = PyUnicode_AS_UNICODE(v);
2748 end = s + size;
2750 while (s < end) {
2751 unsigned char c;
2752 Py_UNICODE x;
2753 int digits;
2755 /* Non-escape characters are interpreted as Unicode ordinals */
2756 if (*s != '\\') {
2757 *p++ = (unsigned char) *s++;
2758 continue;
2761 startinpos = s-starts;
2762 /* \ - Escapes */
2763 s++;
/* NOTE(review): *s is read before s is compared with end, so a backslash in
   the last position reads s[size].  Presumably callers always pass a
   NUL-terminated buffer -- confirm.  In that case c is forced to '\0' so the
   switch lands in the default branch. */
2764 c = *s++;
2765 if (s > end)
2766 c = '\0'; /* Invalid after \ */
2767 switch (c) {
2769 /* \x escapes */
2770 case '\n': break;
2771 case '\\': *p++ = '\\'; break;
2772 case '\'': *p++ = '\''; break;
2773 case '\"': *p++ = '\"'; break;
2774 case 'b': *p++ = '\b'; break;
2775 case 'f': *p++ = '\014'; break; /* FF */
2776 case 't': *p++ = '\t'; break;
2777 case 'n': *p++ = '\n'; break;
2778 case 'r': *p++ = '\r'; break;
2779 case 'v': *p++ = '\013'; break; /* VT */
2780 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2782 /* \OOO (octal) escapes */
2783 case '0': case '1': case '2': case '3':
2784 case '4': case '5': case '6': case '7':
2785 x = s[-1] - '0';
2786 if (s < end && '0' <= *s && *s <= '7') {
2787 x = (x<<3) + *s++ - '0';
2788 if (s < end && '0' <= *s && *s <= '7')
2789 x = (x<<3) + *s++ - '0';
2791 *p++ = x;
2792 break;
2794 /* hex escapes */
2795 /* \xXX */
2796 case 'x':
2797 digits = 2;
2798 message = "truncated \\xXX escape";
2799 goto hexescape;
2801 /* \uXXXX */
2802 case 'u':
2803 digits = 4;
2804 message = "truncated \\uXXXX escape";
2805 goto hexescape;
2807 /* \UXXXXXXXX */
2808 case 'U':
2809 digits = 8;
2810 message = "truncated \\UXXXXXXXX escape";
/* Shared tail of the \x, \u and \U cases: `digits` hex digits are expected
   at s and `message` is the error text used when one of them is not a hex
   digit. */
2811 hexescape:
2812 chr = 0;
2813 outpos = p-PyUnicode_AS_UNICODE(v);
2814 if (s+digits>end) {
2815 endinpos = size;
2816 if (unicode_decode_call_errorhandler(
2817 errors, &errorHandler,
2818 "unicodeescape", "end of string in escape sequence",
2819 starts, size, &startinpos, &endinpos, &exc, &s,
2820 &v, &outpos, &p))
2821 goto onError;
2822 goto nextByte;
2824 for (i = 0; i < digits; ++i) {
2825 c = (unsigned char) s[i];
2826 if (!isxdigit(c)) {
2827 endinpos = (s+i+1)-starts;
2828 if (unicode_decode_call_errorhandler(
2829 errors, &errorHandler,
2830 "unicodeescape", message,
2831 starts, size, &startinpos, &endinpos, &exc, &s,
2832 &v, &outpos, &p))
2833 goto onError;
2834 goto nextByte;
2836 chr = (chr<<4) & ~0xF;
2837 if (c >= '0' && c <= '9')
2838 chr += c - '0';
2839 else if (c >= 'a' && c <= 'f')
2840 chr += 10 + c - 'a';
2841 else
2842 chr += 10 + c - 'A';
2844 s += i;
2845 if (chr == 0xffffffff && PyErr_Occurred())
2846 /* _decoding_error will have already written into the
2847 target buffer. */
2848 break;
/* Also entered from the \N case once the name lookup has filled in chr. */
2849 store:
2850 /* when we get here, chr is a 32-bit unicode character */
2851 if (chr <= 0xffff)
2852 /* UCS-2 character */
2853 *p++ = (Py_UNICODE) chr;
2854 else if (chr <= 0x10ffff) {
2855 /* UCS-4 character. Either store directly, or as
2856 surrogate pair. */
2857 #ifdef Py_UNICODE_WIDE
2858 *p++ = chr;
2859 #else
2860 chr -= 0x10000L;
2861 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2862 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2863 #endif
2864 } else {
2865 endinpos = s-starts;
2866 outpos = p-PyUnicode_AS_UNICODE(v);
2867 if (unicode_decode_call_errorhandler(
2868 errors, &errorHandler,
2869 "unicodeescape", "illegal Unicode character",
2870 starts, size, &startinpos, &endinpos, &exc, &s,
2871 &v, &outpos, &p))
2872 goto onError;
2874 break;
2876 /* \N{name} */
2877 case 'N':
2878 message = "malformed \\N character escape";
/* The name-lookup API is fetched from the unicodedata module the first time
   a \N escape is seen and cached in the file-level ucnhash_CAPI pointer. */
2879 if (ucnhash_CAPI == NULL) {
2880 /* load the unicode data module */
2881 PyObject *m, *api;
2882 m = PyImport_ImportModuleNoBlock("unicodedata");
2883 if (m == NULL)
2884 goto ucnhashError;
2885 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2886 Py_DECREF(m);
2887 if (api == NULL)
2888 goto ucnhashError;
2889 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2890 Py_DECREF(api);
2891 if (ucnhash_CAPI == NULL)
2892 goto ucnhashError;
/* NOTE(review): both the test below and the loop condition that follows
   dereference s before comparing it with end; when the escape sits at the
   very end of the input this reads s[size].  Presumably safe only for
   NUL-terminated input -- confirm. */
2894 if (*s == '{') {
2895 const char *start = s+1;
2896 /* look for the closing brace */
2897 while (*s != '}' && s < end)
2898 s++;
2899 if (s > start && s < end && *s == '}') {
2900 /* found a name. look it up in the unicode database */
2901 message = "unknown Unicode character name";
2902 s++;
2903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904 goto store;
2907 endinpos = s-starts;
2908 outpos = p-PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler,
2911 "unicodeescape", message,
2912 starts, size, &startinpos, &endinpos, &exc, &s,
2913 &v, &outpos, &p))
2914 goto onError;
2915 break;
/* Either a backslash at the very end of the input (reported as an error) or
   an unrecognised escape, which is copied as the backslash plus the
   character that followed it. */
2917 default:
2918 if (s > end) {
2919 message = "\\ at end of string";
2920 s--;
2921 endinpos = s-starts;
2922 outpos = p-PyUnicode_AS_UNICODE(v);
2923 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler,
2925 "unicodeescape", message,
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 &v, &outpos, &p))
2928 goto onError;
2930 else {
2931 *p++ = '\\';
2932 *p++ = (unsigned char)s[-1];
2934 break;
/* Jump target used by the hex-escape error paths to resume the main loop
   after the error handler has run. */
2936 nextByte:
/* Shrink the result to the number of characters actually produced. */
2939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940 goto onError;
2941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
2943 return (PyObject *)v;
/* Reached when the unicodedata module or its C API could not be loaded. */
2945 ucnhashError:
2946 PyErr_SetString(
2947 PyExc_UnicodeError,
2948 "\\N escapes not supported (can't load unicodedata module)"
2950 Py_XDECREF(v);
2951 Py_XDECREF(errorHandler);
2952 Py_XDECREF(exc);
2953 return NULL;
2955 onError:
2956 Py_XDECREF(v);
2957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
2959 return NULL;
2962 /* Return a Unicode-Escape string version of the Unicode object.
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2965 appropriate.
2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970 Py_ssize_t size,
2971 Py_UNICODE ch)
2973 /* like wcschr, but doesn't stop at NULL characters */
2975 while (size-- > 0) {
2976 if (*s == ch)
2977 return s;
2978 s++;
2981 return NULL;
2984 static
2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986 Py_ssize_t size,
2987 int quotes)
2989 PyObject *repr;
2990 char *p;
2992 static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10;
2995 #else
2996 const Py_ssize_t expandsize = 6;
2997 #endif
2999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3003 /* Initial allocation is based on the longest-possible unichr
3004 escape.
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape.
3017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018 return PyErr_NoMemory();
3020 repr = PyString_FromStringAndSize(NULL,
3022 + expandsize*size
3023 + 1);
3024 if (repr == NULL)
3025 return NULL;
3027 p = PyString_AS_STRING(repr);
3029 if (quotes) {
3030 *p++ = 'u';
3031 *p++ = (findchar(s, size, '\'') &&
3032 !findchar(s, size, '"')) ? '"' : '\'';
3034 while (size-- > 0) {
3035 Py_UNICODE ch = *s++;
3037 /* Escape quotes and backslashes */
3038 if ((quotes &&
3039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040 *p++ = '\\';
3041 *p++ = (char) ch;
3042 continue;
3045 #ifdef Py_UNICODE_WIDE
3046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) {
3048 *p++ = '\\';
3049 *p++ = 'U';
3050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057 *p++ = hexdigit[ch & 0x0000000F];
3058 continue;
3060 #else
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2;
3064 Py_UCS4 ucs;
3066 ch2 = *s++;
3067 size--;
3068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\';
3071 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F];
3080 continue;
3082 /* Fall through: isolated surrogates are copied as-is */
3083 s--;
3084 size++;
3086 #endif
3088 /* Map 16-bit characters to '\uxxxx' */
3089 if (ch >= 256) {
3090 *p++ = '\\';
3091 *p++ = 'u';
3092 *p++ = hexdigit[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F];
3098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') {
3100 *p++ = '\\';
3101 *p++ = 't';
3103 else if (ch == '\n') {
3104 *p++ = '\\';
3105 *p++ = 'n';
3107 else if (ch == '\r') {
3108 *p++ = '\\';
3109 *p++ = 'r';
3112 /* Map non-printable US ASCII to '\xhh' */
3113 else if (ch < ' ' || ch >= 0x7F) {
3114 *p++ = '\\';
3115 *p++ = 'x';
3116 *p++ = hexdigit[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F];
3120 /* Copy everything else as-is */
3121 else
3122 *p++ = (char) ch;
3124 if (quotes)
3125 *p++ = PyString_AS_STRING(repr)[1];
3127 *p = '\0';
3128 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3129 return repr;
3132 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3133 Py_ssize_t size)
3135 return unicodeescape_string(s, size, 0);
3138 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 return NULL;
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3145 PyUnicode_GET_SIZE(unicode));
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
/* Decode `size` bytes at `s` written in the "rawunicodeescape" encoding into
   a new Unicode object.

   Every byte is copied as a code point 0..255, except that a run of
   backslashes of odd length followed by 'u' or 'U' introduces a \uXXXX
   (4 hex digits) or \UXXXXXXXX (8 hex digits) escape.  Malformed or
   out-of-range escapes are passed to unicode_decode_call_errorhandler()
   using the handler named by `errors`.  Returns the new object, or NULL
   with an exception set. */
3150 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3151 Py_ssize_t size,
3152 const char *errors)
3154 const char *starts = s;
3155 Py_ssize_t startinpos;
3156 Py_ssize_t endinpos;
3157 Py_ssize_t outpos;
3158 PyUnicodeObject *v;
3159 Py_UNICODE *p;
3160 const char *end;
3161 const char *bs;
3162 PyObject *errorHandler = NULL;
3163 PyObject *exc = NULL;
3165 /* Escaped strings will always be longer than the resulting
3166 Unicode string, so we start with size here and then reduce the
3167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
3169 v = _PyUnicode_New(size);
3170 if (v == NULL)
3171 goto onError;
3172 if (size == 0)
3173 return (PyObject *)v;
3174 p = PyUnicode_AS_UNICODE(v);
3175 end = s + size;
3176 while (s < end) {
3177 unsigned char c;
3178 Py_UCS4 x;
3179 int i;
3180 int count;
3182 /* Non-escape characters are interpreted as Unicode ordinals */
3183 if (*s != '\\') {
3184 *p++ = (unsigned char)*s++;
3185 continue;
3187 startinpos = s-starts;
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes if odd */
/* The whole run of backslashes is copied to the output first; bs remembers
   where the run started so that its length can be tested afterwards. */
3191 bs = s;
3192 for (;s < end;) {
3193 if (*s != '\\')
3194 break;
3195 *p++ = (unsigned char)*s++;
3197 if (((s - bs) & 1) == 0 ||
3198 s >= end ||
3199 (*s != 'u' && *s != 'U')) {
3200 continue;
/* An escape follows: take back the last copied backslash, which belongs to
   the escape itself. */
3202 p--;
3203 count = *s=='u' ? 4 : 8;
3204 s++;
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207 outpos = p-PyUnicode_AS_UNICODE(v);
/* NOTE(review): this loop never compares s with end; it appears to rely on a
   non-hex byte (such as a terminating NUL) following a truncated escape --
   confirm that callers guarantee this.  The same "truncated \uXXXX" message
   is also used for \U escapes. */
3208 for (x = 0, i = 0; i < count; ++i, ++s) {
3209 c = (unsigned char)*s;
3210 if (!isxdigit(c)) {
3211 endinpos = s-starts;
3212 if (unicode_decode_call_errorhandler(
3213 errors, &errorHandler,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts, size, &startinpos, &endinpos, &exc, &s,
3216 &v, &outpos, &p))
3217 goto onError;
3218 goto nextByte;
3220 x = (x<<4) & ~0xF;
3221 if (c >= '0' && c <= '9')
3222 x += c - '0';
3223 else if (c >= 'a' && c <= 'f')
3224 x += 10 + c - 'a';
3225 else
3226 x += 10 + c - 'A';
3228 if (x <= 0xffff)
3229 /* UCS-2 character */
3230 *p++ = (Py_UNICODE) x;
3231 else if (x <= 0x10ffff) {
3232 /* UCS-4 character. Either store directly, or as
3233 surrogate pair. */
3234 #ifdef Py_UNICODE_WIDE
3235 *p++ = (Py_UNICODE) x;
3236 #else
3237 x -= 0x10000L;
3238 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3240 #endif
3241 } else {
3242 endinpos = s-starts;
3243 outpos = p-PyUnicode_AS_UNICODE(v);
3244 if (unicode_decode_call_errorhandler(
3245 errors, &errorHandler,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247 starts, size, &startinpos, &endinpos, &exc, &s,
3248 &v, &outpos, &p))
3249 goto onError;
/* Jump target used after the error handler has dealt with a bad hex digit. */
3251 nextByte:
/* Shrink the result to the number of characters actually produced. */
3254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3255 goto onError;
3256 Py_XDECREF(errorHandler);
3257 Py_XDECREF(exc);
3258 return (PyObject *)v;
3260 onError:
3261 Py_XDECREF(v);
3262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
3264 return NULL;
3267 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3268 Py_ssize_t size)
3270 PyObject *repr;
3271 char *p;
3272 char *q;
3274 static const char *hexdigit = "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276 const Py_ssize_t expandsize = 10;
3277 #else
3278 const Py_ssize_t expandsize = 6;
3279 #endif
3281 if (size > PY_SSIZE_T_MAX / expandsize)
3282 return PyErr_NoMemory();
3284 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3285 if (repr == NULL)
3286 return NULL;
3287 if (size == 0)
3288 return repr;
3290 p = q = PyString_AS_STRING(repr);
3291 while (size-- > 0) {
3292 Py_UNICODE ch = *s++;
3293 #ifdef Py_UNICODE_WIDE
3294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch >= 0x10000) {
3296 *p++ = '\\';
3297 *p++ = 'U';
3298 *p++ = hexdigit[(ch >> 28) & 0xf];
3299 *p++ = hexdigit[(ch >> 24) & 0xf];
3300 *p++ = hexdigit[(ch >> 20) & 0xf];
3301 *p++ = hexdigit[(ch >> 16) & 0xf];
3302 *p++ = hexdigit[(ch >> 12) & 0xf];
3303 *p++ = hexdigit[(ch >> 8) & 0xf];
3304 *p++ = hexdigit[(ch >> 4) & 0xf];
3305 *p++ = hexdigit[ch & 15];
3307 else
3308 #else
3309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch >= 0xD800 && ch < 0xDC00) {
3311 Py_UNICODE ch2;
3312 Py_UCS4 ucs;
3314 ch2 = *s++;
3315 size--;
3316 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318 *p++ = '\\';
3319 *p++ = 'U';
3320 *p++ = hexdigit[(ucs >> 28) & 0xf];
3321 *p++ = hexdigit[(ucs >> 24) & 0xf];
3322 *p++ = hexdigit[(ucs >> 20) & 0xf];
3323 *p++ = hexdigit[(ucs >> 16) & 0xf];
3324 *p++ = hexdigit[(ucs >> 12) & 0xf];
3325 *p++ = hexdigit[(ucs >> 8) & 0xf];
3326 *p++ = hexdigit[(ucs >> 4) & 0xf];
3327 *p++ = hexdigit[ucs & 0xf];
3328 continue;
3330 /* Fall through: isolated surrogates are copied as-is */
3331 s--;
3332 size++;
3334 #endif
3335 /* Map 16-bit characters to '\uxxxx' */
3336 if (ch >= 256) {
3337 *p++ = '\\';
3338 *p++ = 'u';
3339 *p++ = hexdigit[(ch >> 12) & 0xf];
3340 *p++ = hexdigit[(ch >> 8) & 0xf];
3341 *p++ = hexdigit[(ch >> 4) & 0xf];
3342 *p++ = hexdigit[ch & 15];
3344 /* Copy everything else as-is */
3345 else
3346 *p++ = (char) ch;
3348 *p = '\0';
3349 _PyString_Resize(&repr, p - q);
3350 return repr;
3353 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3355 if (!PyUnicode_Check(unicode)) {
3356 PyErr_BadArgument();
3357 return NULL;
3359 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3360 PyUnicode_GET_SIZE(unicode));
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3365 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3366 Py_ssize_t size,
3367 const char *errors)
3369 const char *starts = s;
3370 Py_ssize_t startinpos;
3371 Py_ssize_t endinpos;
3372 Py_ssize_t outpos;
3373 PyUnicodeObject *v;
3374 Py_UNICODE *p;
3375 const char *end;
3376 const char *reason;
3377 PyObject *errorHandler = NULL;
3378 PyObject *exc = NULL;
3380 #ifdef Py_UNICODE_WIDE
3381 Py_UNICODE unimax = PyUnicode_GetMax();
3382 #endif
3384 /* XXX overflow detection missing */
3385 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3386 if (v == NULL)
3387 goto onError;
3388 if (PyUnicode_GetSize((PyObject *)v) == 0)
3389 return (PyObject *)v;
3390 p = PyUnicode_AS_UNICODE(v);
3391 end = s + size;
3393 while (s < end) {
3394 memcpy(p, s, sizeof(Py_UNICODE));
3395 /* We have to sanity check the raw data, otherwise doom looms for
3396 some malformed UCS-4 data. */
3397 if (
3398 #ifdef Py_UNICODE_WIDE
3399 *p > unimax || *p < 0 ||
3400 #endif
3401 end-s < Py_UNICODE_SIZE
3404 startinpos = s - starts;
3405 if (end-s < Py_UNICODE_SIZE) {
3406 endinpos = end-starts;
3407 reason = "truncated input";
3409 else {
3410 endinpos = s - starts + Py_UNICODE_SIZE;
3411 reason = "illegal code point (> 0x10FFFF)";
3413 outpos = p - PyUnicode_AS_UNICODE(v);
3414 if (unicode_decode_call_errorhandler(
3415 errors, &errorHandler,
3416 "unicode_internal", reason,
3417 starts, size, &startinpos, &endinpos, &exc, &s,
3418 &v, &outpos, &p)) {
3419 goto onError;
3422 else {
3423 p++;
3424 s += Py_UNICODE_SIZE;
3428 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3429 goto onError;
3430 Py_XDECREF(errorHandler);
3431 Py_XDECREF(exc);
3432 return (PyObject *)v;
3434 onError:
3435 Py_XDECREF(v);
3436 Py_XDECREF(errorHandler);
3437 Py_XDECREF(exc);
3438 return NULL;
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3443 PyObject *PyUnicode_DecodeLatin1(const char *s,
3444 Py_ssize_t size,
3445 const char *errors)
3447 PyUnicodeObject *v;
3448 Py_UNICODE *p;
3450 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3451 if (size == 1) {
3452 Py_UNICODE r = *(unsigned char*)s;
3453 return PyUnicode_FromUnicode(&r, 1);
3456 v = _PyUnicode_New(size);
3457 if (v == NULL)
3458 goto onError;
3459 if (size == 0)
3460 return (PyObject *)v;
3461 p = PyUnicode_AS_UNICODE(v);
3462 while (size-- > 0)
3463 *p++ = (unsigned char)*s++;
3464 return (PyObject *)v;
3466 onError:
3467 Py_XDECREF(v);
3468 return NULL;
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject **exceptionObject,
3473 const char *encoding,
3474 const Py_UNICODE *unicode, Py_ssize_t size,
3475 Py_ssize_t startpos, Py_ssize_t endpos,
3476 const char *reason)
3478 if (*exceptionObject == NULL) {
3479 *exceptionObject = PyUnicodeEncodeError_Create(
3480 encoding, unicode, size, startpos, endpos, reason);
3482 else {
3483 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3484 goto onError;
3485 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3486 goto onError;
3487 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3488 goto onError;
3489 return;
3490 onError:
3491 Py_DECREF(*exceptionObject);
3492 *exceptionObject = NULL;
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject **exceptionObject,
3498 const char *encoding,
3499 const Py_UNICODE *unicode, Py_ssize_t size,
3500 Py_ssize_t startpos, Py_ssize_t endpos,
3501 const char *reason)
3503 make_encode_exception(exceptionObject,
3504 encoding, unicode, size, startpos, endpos, reason);
3505 if (*exceptionObject != NULL)
3506 PyCodec_StrictErrors(*exceptionObject);
3509 /* error handling callback helper:
3510 build arguments, call the callback and check the arguments,
3511 put the result into newpos and return the replacement string, which
3512 has to be freed by the caller */
3513 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3514 PyObject **errorHandler,
3515 const char *encoding, const char *reason,
3516 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3517 Py_ssize_t startpos, Py_ssize_t endpos,
3518 Py_ssize_t *newpos)
3520 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3522 PyObject *restuple;
3523 PyObject *resunicode;
3525 if (*errorHandler == NULL) {
3526 *errorHandler = PyCodec_LookupError(errors);
3527 if (*errorHandler == NULL)
3528 return NULL;
3531 make_encode_exception(exceptionObject,
3532 encoding, unicode, size, startpos, endpos, reason);
3533 if (*exceptionObject == NULL)
3534 return NULL;
3536 restuple = PyObject_CallFunctionObjArgs(
3537 *errorHandler, *exceptionObject, NULL);
3538 if (restuple == NULL)
3539 return NULL;
3540 if (!PyTuple_Check(restuple)) {
3541 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3542 Py_DECREF(restuple);
3543 return NULL;
3545 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3546 &resunicode, newpos)) {
3547 Py_DECREF(restuple);
3548 return NULL;
3550 if (*newpos<0)
3551 *newpos = size+*newpos;
3552 if (*newpos<0 || *newpos>size) {
3553 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3554 Py_DECREF(restuple);
3555 return NULL;
3557 Py_INCREF(resunicode);
3558 Py_DECREF(restuple);
3559 return resunicode;
3562 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3563 Py_ssize_t size,
3564 const char *errors,
3565 int limit)
3567 /* output object */
3568 PyObject *res;
3569 /* pointers to the beginning and end+1 of input */
3570 const Py_UNICODE *startp = p;
3571 const Py_UNICODE *endp = p + size;
3572 /* pointer to the beginning of the unencodable characters */
3573 /* const Py_UNICODE *badp = NULL; */
3574 /* pointer into the output */
3575 char *str;
3576 /* current output position */
3577 Py_ssize_t respos = 0;
3578 Py_ssize_t ressize;
3579 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3580 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581 PyObject *errorHandler = NULL;
3582 PyObject *exc = NULL;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585 int known_errorHandler = -1;
3587 /* allocate enough for a simple encoding without
3588 replacements, if we need more, we'll resize */
3589 res = PyString_FromStringAndSize(NULL, size);
3590 if (res == NULL)
3591 goto onError;
3592 if (size == 0)
3593 return res;
3594 str = PyString_AS_STRING(res);
3595 ressize = size;
3597 while (p<endp) {
3598 Py_UNICODE c = *p;
3600 /* can we encode this? */
3601 if (c<limit) {
3602 /* no overflow check, because we know that the space is enough */
3603 *str++ = (char)c;
3604 ++p;
3606 else {
3607 Py_ssize_t unicodepos = p-startp;
3608 Py_ssize_t requiredsize;
3609 PyObject *repunicode;
3610 Py_ssize_t repsize;
3611 Py_ssize_t newpos;
3612 Py_ssize_t respos;
3613 Py_UNICODE *uni2;
3614 /* startpos for collecting unencodable chars */
3615 const Py_UNICODE *collstart = p;
3616 const Py_UNICODE *collend = p;
3617 /* find all unecodable characters */
3618 while ((collend < endp) && ((*collend)>=limit))
3619 ++collend;
3620 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621 if (known_errorHandler==-1) {
3622 if ((errors==NULL) || (!strcmp(errors, "strict")))
3623 known_errorHandler = 1;
3624 else if (!strcmp(errors, "replace"))
3625 known_errorHandler = 2;
3626 else if (!strcmp(errors, "ignore"))
3627 known_errorHandler = 3;
3628 else if (!strcmp(errors, "xmlcharrefreplace"))
3629 known_errorHandler = 4;
3630 else
3631 known_errorHandler = 0;
3633 switch (known_errorHandler) {
3634 case 1: /* strict */
3635 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3636 goto onError;
3637 case 2: /* replace */
3638 while (collstart++<collend)
3639 *str++ = '?'; /* fall through */
3640 case 3: /* ignore */
3641 p = collend;
3642 break;
3643 case 4: /* xmlcharrefreplace */
3644 respos = str-PyString_AS_STRING(res);
3645 /* determine replacement size (temporarily (mis)uses p) */
3646 for (p = collstart, repsize = 0; p < collend; ++p) {
3647 if (*p<10)
3648 repsize += 2+1+1;
3649 else if (*p<100)
3650 repsize += 2+2+1;
3651 else if (*p<1000)
3652 repsize += 2+3+1;
3653 else if (*p<10000)
3654 repsize += 2+4+1;
3655 #ifndef Py_UNICODE_WIDE
3656 else
3657 repsize += 2+5+1;
3658 #else
3659 else if (*p<100000)
3660 repsize += 2+5+1;
3661 else if (*p<1000000)
3662 repsize += 2+6+1;
3663 else
3664 repsize += 2+7+1;
3665 #endif
3667 requiredsize = respos+repsize+(endp-collend);
3668 if (requiredsize > ressize) {
3669 if (requiredsize<2*ressize)
3670 requiredsize = 2*ressize;
3671 if (_PyString_Resize(&res, requiredsize))
3672 goto onError;
3673 str = PyString_AS_STRING(res) + respos;
3674 ressize = requiredsize;
3676 /* generate replacement (temporarily (mis)uses p) */
3677 for (p = collstart; p < collend; ++p) {
3678 str += sprintf(str, "&#%d;", (int)*p);
3680 p = collend;
3681 break;
3682 default:
3683 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3684 encoding, reason, startp, size, &exc,
3685 collstart-startp, collend-startp, &newpos);
3686 if (repunicode == NULL)
3687 goto onError;
3688 /* need more space? (at least enough for what we have+the
3689 replacement+the rest of the string, so we won't have to
3690 check space for encodable characters) */
3691 respos = str-PyString_AS_STRING(res);
3692 repsize = PyUnicode_GET_SIZE(repunicode);
3693 requiredsize = respos+repsize+(endp-collend);
3694 if (requiredsize > ressize) {
3695 if (requiredsize<2*ressize)
3696 requiredsize = 2*ressize;
3697 if (_PyString_Resize(&res, requiredsize)) {
3698 Py_DECREF(repunicode);
3699 goto onError;
3701 str = PyString_AS_STRING(res) + respos;
3702 ressize = requiredsize;
3704 /* check if there is anything unencodable in the replacement
3705 and copy it to the output */
3706 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3707 c = *uni2;
3708 if (c >= limit) {
3709 raise_encode_exception(&exc, encoding, startp, size,
3710 unicodepos, unicodepos+1, reason);
3711 Py_DECREF(repunicode);
3712 goto onError;
3714 *str = (char)c;
3716 p = startp + newpos;
3717 Py_DECREF(repunicode);
3721 /* Resize if we allocated to much */
3722 respos = str-PyString_AS_STRING(res);
3723 if (respos<ressize)
3724 /* If this falls res will be NULL */
3725 _PyString_Resize(&res, respos);
3726 Py_XDECREF(errorHandler);
3727 Py_XDECREF(exc);
3728 return res;
3730 onError:
3731 Py_XDECREF(res);
3732 Py_XDECREF(errorHandler);
3733 Py_XDECREF(exc);
3734 return NULL;
3737 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3738 Py_ssize_t size,
3739 const char *errors)
3741 return unicode_encode_ucs1(p, size, errors, 256);
3744 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3746 if (!PyUnicode_Check(unicode)) {
3747 PyErr_BadArgument();
3748 return NULL;
3750 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3751 PyUnicode_GET_SIZE(unicode),
3752 NULL);
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3757 PyObject *PyUnicode_DecodeASCII(const char *s,
3758 Py_ssize_t size,
3759 const char *errors)
3761 const char *starts = s;
3762 PyUnicodeObject *v;
3763 Py_UNICODE *p;
3764 Py_ssize_t startinpos;
3765 Py_ssize_t endinpos;
3766 Py_ssize_t outpos;
3767 const char *e;
3768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
3771 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772 if (size == 1 && *(unsigned char*)s < 128) {
3773 Py_UNICODE r = *(unsigned char*)s;
3774 return PyUnicode_FromUnicode(&r, 1);
3777 v = _PyUnicode_New(size);
3778 if (v == NULL)
3779 goto onError;
3780 if (size == 0)
3781 return (PyObject *)v;
3782 p = PyUnicode_AS_UNICODE(v);
3783 e = s + size;
3784 while (s < e) {
3785 register unsigned char c = (unsigned char)*s;
3786 if (c < 128) {
3787 *p++ = c;
3788 ++s;
3790 else {
3791 startinpos = s-starts;
3792 endinpos = startinpos + 1;
3793 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3794 if (unicode_decode_call_errorhandler(
3795 errors, &errorHandler,
3796 "ascii", "ordinal not in range(128)",
3797 starts, size, &startinpos, &endinpos, &exc, &s,
3798 &v, &outpos, &p))
3799 goto onError;
3802 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3803 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3804 goto onError;
3805 Py_XDECREF(errorHandler);
3806 Py_XDECREF(exc);
3807 return (PyObject *)v;
3809 onError:
3810 Py_XDECREF(v);
3811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
3813 return NULL;
3816 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3817 Py_ssize_t size,
3818 const char *errors)
3820 return unicode_encode_ucs1(p, size, errors, 128);
3823 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3825 if (!PyUnicode_Check(unicode)) {
3826 PyErr_BadArgument();
3827 return NULL;
3829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3830 PyUnicode_GET_SIZE(unicode),
3831 NULL);
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
3838 #if SIZEOF_INT < SIZEOF_SIZE_T
3839 #define NEED_RETRY
3840 #endif
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843 a) it assumes an incomplete character consists of a single byte, and
3844 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845 encodings, see IsDBCSLeadByteEx documentation. */
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that starts
   a character (rather than being the trail byte of the preceding one),
   0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);
    if (before == cur)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return cur - before == 2;
}
3859 * Decode MBCS string into unicode object. If 'final' is set, converts
3860 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3862 static int decode_mbcs(PyUnicodeObject **v,
3863 const char *s, /* MBCS string */
3864 int size, /* sizeof MBCS string */
3865 int final)
3867 Py_UNICODE *p;
3868 Py_ssize_t n = 0;
3869 int usize = 0;
3871 assert(size >= 0);
3873 /* Skip trailing lead-byte unless 'final' is set */
3874 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3875 --size;
3877 /* First get the size of the result */
3878 if (size > 0) {
3879 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3880 if (usize == 0) {
3881 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882 return -1;
3886 if (*v == NULL) {
3887 /* Create unicode object */
3888 *v = _PyUnicode_New(usize);
3889 if (*v == NULL)
3890 return -1;
3892 else {
3893 /* Extend unicode object */
3894 n = PyUnicode_GET_SIZE(*v);
3895 if (_PyUnicode_Resize(v, n + usize) < 0)
3896 return -1;
3899 /* Do the conversion */
3900 if (size > 0) {
3901 p = PyUnicode_AS_UNICODE(*v) + n;
3902 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3903 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3904 return -1;
3908 return size;
3911 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3912 Py_ssize_t size,
3913 const char *errors,
3914 Py_ssize_t *consumed)
3916 PyUnicodeObject *v = NULL;
3917 int done;
3919 if (consumed)
3920 *consumed = 0;
3922 #ifdef NEED_RETRY
3923 retry:
3924 if (size > INT_MAX)
3925 done = decode_mbcs(&v, s, INT_MAX, 0);
3926 else
3927 #endif
3928 done = decode_mbcs(&v, s, (int)size, !consumed);
3930 if (done < 0) {
3931 Py_XDECREF(v);
3932 return NULL;
3935 if (consumed)
3936 *consumed += done;
3938 #ifdef NEED_RETRY
3939 if (size > INT_MAX) {
3940 s += done;
3941 size -= done;
3942 goto retry;
3944 #endif
3946 return (PyObject *)v;
3949 PyObject *PyUnicode_DecodeMBCS(const char *s,
3950 Py_ssize_t size,
3951 const char *errors)
3953 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3957 * Convert unicode into string object (MBCS).
3958 * Returns 0 if succeed, -1 otherwise.
3960 static int encode_mbcs(PyObject **repr,
3961 const Py_UNICODE *p, /* unicode */
3962 int size) /* size of unicode */
3964 int mbcssize = 0;
3965 Py_ssize_t n = 0;
3967 assert(size >= 0);
3969 /* First get the size of the result */
3970 if (size > 0) {
3971 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3972 if (mbcssize == 0) {
3973 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3974 return -1;
3978 if (*repr == NULL) {
3979 /* Create string object */
3980 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3981 if (*repr == NULL)
3982 return -1;
3984 else {
3985 /* Extend string object */
3986 n = PyString_Size(*repr);
3987 if (_PyString_Resize(repr, n + mbcssize) < 0)
3988 return -1;
3991 /* Do the conversion */
3992 if (size > 0) {
3993 char *s = PyString_AS_STRING(*repr) + n;
3994 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3995 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3996 return -1;
4000 return 0;
4003 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4004 Py_ssize_t size,
4005 const char *errors)
4007 PyObject *repr = NULL;
4008 int ret;
4010 #ifdef NEED_RETRY
4011 retry:
4012 if (size > INT_MAX)
4013 ret = encode_mbcs(&repr, p, INT_MAX);
4014 else
4015 #endif
4016 ret = encode_mbcs(&repr, p, (int)size);
4018 if (ret < 0) {
4019 Py_XDECREF(repr);
4020 return NULL;
4023 #ifdef NEED_RETRY
4024 if (size > INT_MAX) {
4025 p += INT_MAX;
4026 size -= INT_MAX;
4027 goto retry;
4029 #endif
4031 return repr;
4034 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4036 if (!PyUnicode_Check(unicode)) {
4037 PyErr_BadArgument();
4038 return NULL;
4040 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4041 PyUnicode_GET_SIZE(unicode),
4042 NULL);
4045 #undef NEED_RETRY
4047 #endif /* MS_WINDOWS */
4049 /* --- Character Mapping Codec -------------------------------------------- */
4051 PyObject *PyUnicode_DecodeCharmap(const char *s,
4052 Py_ssize_t size,
4053 PyObject *mapping,
4054 const char *errors)
4056 const char *starts = s;
4057 Py_ssize_t startinpos;
4058 Py_ssize_t endinpos;
4059 Py_ssize_t outpos;
4060 const char *e;
4061 PyUnicodeObject *v;
4062 Py_UNICODE *p;
4063 Py_ssize_t extrachars = 0;
4064 PyObject *errorHandler = NULL;
4065 PyObject *exc = NULL;
4066 Py_UNICODE *mapstring = NULL;
4067 Py_ssize_t maplen = 0;
4069 /* Default to Latin-1 */
4070 if (mapping == NULL)
4071 return PyUnicode_DecodeLatin1(s, size, errors);
4073 v = _PyUnicode_New(size);
4074 if (v == NULL)
4075 goto onError;
4076 if (size == 0)
4077 return (PyObject *)v;
4078 p = PyUnicode_AS_UNICODE(v);
4079 e = s + size;
4080 if (PyUnicode_CheckExact(mapping)) {
4081 mapstring = PyUnicode_AS_UNICODE(mapping);
4082 maplen = PyUnicode_GET_SIZE(mapping);
4083 while (s < e) {
4084 unsigned char ch = *s;
4085 Py_UNICODE x = 0xfffe; /* illegal value */
4087 if (ch < maplen)
4088 x = mapstring[ch];
4090 if (x == 0xfffe) {
4091 /* undefined mapping */
4092 outpos = p-PyUnicode_AS_UNICODE(v);
4093 startinpos = s-starts;
4094 endinpos = startinpos+1;
4095 if (unicode_decode_call_errorhandler(
4096 errors, &errorHandler,
4097 "charmap", "character maps to <undefined>",
4098 starts, size, &startinpos, &endinpos, &exc, &s,
4099 &v, &outpos, &p)) {
4100 goto onError;
4102 continue;
4104 *p++ = x;
4105 ++s;
4108 else {
4109 while (s < e) {
4110 unsigned char ch = *s;
4111 PyObject *w, *x;
4113 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114 w = PyInt_FromLong((long)ch);
4115 if (w == NULL)
4116 goto onError;
4117 x = PyObject_GetItem(mapping, w);
4118 Py_DECREF(w);
4119 if (x == NULL) {
4120 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4121 /* No mapping found means: mapping is undefined. */
4122 PyErr_Clear();
4123 x = Py_None;
4124 Py_INCREF(x);
4125 } else
4126 goto onError;
4129 /* Apply mapping */
4130 if (PyInt_Check(x)) {
4131 long value = PyInt_AS_LONG(x);
4132 if (value < 0 || value > 65535) {
4133 PyErr_SetString(PyExc_TypeError,
4134 "character mapping must be in range(65536)");
4135 Py_DECREF(x);
4136 goto onError;
4138 *p++ = (Py_UNICODE)value;
4140 else if (x == Py_None) {
4141 /* undefined mapping */
4142 outpos = p-PyUnicode_AS_UNICODE(v);
4143 startinpos = s-starts;
4144 endinpos = startinpos+1;
4145 if (unicode_decode_call_errorhandler(
4146 errors, &errorHandler,
4147 "charmap", "character maps to <undefined>",
4148 starts, size, &startinpos, &endinpos, &exc, &s,
4149 &v, &outpos, &p)) {
4150 Py_DECREF(x);
4151 goto onError;
4153 Py_DECREF(x);
4154 continue;
4156 else if (PyUnicode_Check(x)) {
4157 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4159 if (targetsize == 1)
4160 /* 1-1 mapping */
4161 *p++ = *PyUnicode_AS_UNICODE(x);
4163 else if (targetsize > 1) {
4164 /* 1-n mapping */
4165 if (targetsize > extrachars) {
4166 /* resize first */
4167 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4168 Py_ssize_t needed = (targetsize - extrachars) + \
4169 (targetsize << 2);
4170 extrachars += needed;
4171 /* XXX overflow detection missing */
4172 if (_PyUnicode_Resize(&v,
4173 PyUnicode_GET_SIZE(v) + needed) < 0) {
4174 Py_DECREF(x);
4175 goto onError;
4177 p = PyUnicode_AS_UNICODE(v) + oldpos;
4179 Py_UNICODE_COPY(p,
4180 PyUnicode_AS_UNICODE(x),
4181 targetsize);
4182 p += targetsize;
4183 extrachars -= targetsize;
4185 /* 1-0 mapping: skip the character */
4187 else {
4188 /* wrong return value */
4189 PyErr_SetString(PyExc_TypeError,
4190 "character mapping must return integer, None or unicode");
4191 Py_DECREF(x);
4192 goto onError;
4194 Py_DECREF(x);
4195 ++s;
4198 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4199 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4200 goto onError;
4201 Py_XDECREF(errorHandler);
4202 Py_XDECREF(exc);
4203 return (PyObject *)v;
4205 onError:
4206 Py_XDECREF(errorHandler);
4207 Py_XDECREF(exc);
4208 Py_XDECREF(v);
4209 return NULL;
4212 /* Charmap encoding: the lookup table */
4214 struct encoding_map{
4215 PyObject_HEAD
4216 unsigned char level1[32];
4217 int count2, count3;
4218 unsigned char level23[1];
4221 static PyObject*
4222 encoding_map_size(PyObject *obj, PyObject* args)
4224 struct encoding_map *map = (struct encoding_map*)obj;
4225 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4226 128*map->count3);
4229 static PyMethodDef encoding_map_methods[] = {
4230 {"size", encoding_map_size, METH_NOARGS,
4231 PyDoc_STR("Return the size (in bytes) of this object") },
4232 { 0 }
4235 static void
4236 encoding_map_dealloc(PyObject* o)
4238 PyObject_FREE(o);
4241 static PyTypeObject EncodingMapType = {
4242 PyVarObject_HEAD_INIT(NULL, 0)
4243 "EncodingMap", /*tp_name*/
4244 sizeof(struct encoding_map), /*tp_basicsize*/
4245 0, /*tp_itemsize*/
4246 /* methods */
4247 encoding_map_dealloc, /*tp_dealloc*/
4248 0, /*tp_print*/
4249 0, /*tp_getattr*/
4250 0, /*tp_setattr*/
4251 0, /*tp_compare*/
4252 0, /*tp_repr*/
4253 0, /*tp_as_number*/
4254 0, /*tp_as_sequence*/
4255 0, /*tp_as_mapping*/
4256 0, /*tp_hash*/
4257 0, /*tp_call*/
4258 0, /*tp_str*/
4259 0, /*tp_getattro*/
4260 0, /*tp_setattro*/
4261 0, /*tp_as_buffer*/
4262 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4263 0, /*tp_doc*/
4264 0, /*tp_traverse*/
4265 0, /*tp_clear*/
4266 0, /*tp_richcompare*/
4267 0, /*tp_weaklistoffset*/
4268 0, /*tp_iter*/
4269 0, /*tp_iternext*/
4270 encoding_map_methods, /*tp_methods*/
4271 0, /*tp_members*/
4272 0, /*tp_getset*/
4273 0, /*tp_base*/
4274 0, /*tp_dict*/
4275 0, /*tp_descr_get*/
4276 0, /*tp_descr_set*/
4277 0, /*tp_dictoffset*/
4278 0, /*tp_init*/
4279 0, /*tp_alloc*/
4280 0, /*tp_new*/
4281 0, /*tp_free*/
4282 0, /*tp_is_gc*/
4285 PyObject*
4286 PyUnicode_BuildEncodingMap(PyObject* string)
4288 Py_UNICODE *decode;
4289 PyObject *result;
4290 struct encoding_map *mresult;
4291 int i;
4292 int need_dict = 0;
4293 unsigned char level1[32];
4294 unsigned char level2[512];
4295 unsigned char *mlevel1, *mlevel2, *mlevel3;
4296 int count2 = 0, count3 = 0;
4298 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4299 PyErr_BadArgument();
4300 return NULL;
4302 decode = PyUnicode_AS_UNICODE(string);
4303 memset(level1, 0xFF, sizeof level1);
4304 memset(level2, 0xFF, sizeof level2);
4306 /* If there isn't a one-to-one mapping of NULL to \0,
4307 or if there are non-BMP characters, we need to use
4308 a mapping dictionary. */
4309 if (decode[0] != 0)
4310 need_dict = 1;
4311 for (i = 1; i < 256; i++) {
4312 int l1, l2;
4313 if (decode[i] == 0
4314 #ifdef Py_UNICODE_WIDE
4315 || decode[i] > 0xFFFF
4316 #endif
4318 need_dict = 1;
4319 break;
4321 if (decode[i] == 0xFFFE)
4322 /* unmapped character */
4323 continue;
4324 l1 = decode[i] >> 11;
4325 l2 = decode[i] >> 7;
4326 if (level1[l1] == 0xFF)
4327 level1[l1] = count2++;
4328 if (level2[l2] == 0xFF)
4329 level2[l2] = count3++;
4332 if (count2 >= 0xFF || count3 >= 0xFF)
4333 need_dict = 1;
4335 if (need_dict) {
4336 PyObject *result = PyDict_New();
4337 PyObject *key, *value;
4338 if (!result)
4339 return NULL;
4340 for (i = 0; i < 256; i++) {
4341 key = value = NULL;
4342 key = PyInt_FromLong(decode[i]);
4343 value = PyInt_FromLong(i);
4344 if (!key || !value)
4345 goto failed1;
4346 if (PyDict_SetItem(result, key, value) == -1)
4347 goto failed1;
4348 Py_DECREF(key);
4349 Py_DECREF(value);
4351 return result;
4352 failed1:
4353 Py_XDECREF(key);
4354 Py_XDECREF(value);
4355 Py_DECREF(result);
4356 return NULL;
4359 /* Create a three-level trie */
4360 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4361 16*count2 + 128*count3 - 1);
4362 if (!result)
4363 return PyErr_NoMemory();
4364 PyObject_Init(result, &EncodingMapType);
4365 mresult = (struct encoding_map*)result;
4366 mresult->count2 = count2;
4367 mresult->count3 = count3;
4368 mlevel1 = mresult->level1;
4369 mlevel2 = mresult->level23;
4370 mlevel3 = mresult->level23 + 16*count2;
4371 memcpy(mlevel1, level1, 32);
4372 memset(mlevel2, 0xFF, 16*count2);
4373 memset(mlevel3, 0, 128*count3);
4374 count3 = 0;
4375 for (i = 1; i < 256; i++) {
4376 int o1, o2, o3, i2, i3;
4377 if (decode[i] == 0xFFFE)
4378 /* unmapped character */
4379 continue;
4380 o1 = decode[i]>>11;
4381 o2 = (decode[i]>>7) & 0xF;
4382 i2 = 16*mlevel1[o1] + o2;
4383 if (mlevel2[i2] == 0xFF)
4384 mlevel2[i2] = count3++;
4385 o3 = decode[i] & 0x7F;
4386 i3 = 128*mlevel2[i2] + o3;
4387 mlevel3[i3] = i;
4389 return result;
4392 static int
4393 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4395 struct encoding_map *map = (struct encoding_map*)mapping;
4396 int l1 = c>>11;
4397 int l2 = (c>>7) & 0xF;
4398 int l3 = c & 0x7F;
4399 int i;
4401 #ifdef Py_UNICODE_WIDE
4402 if (c > 0xFFFF) {
4403 return -1;
4405 #endif
4406 if (c == 0)
4407 return 0;
4408 /* level 1*/
4409 i = map->level1[l1];
4410 if (i == 0xFF) {
4411 return -1;
4413 /* level 2*/
4414 i = map->level23[16*i+l2];
4415 if (i == 0xFF) {
4416 return -1;
4418 /* level 3 */
4419 i = map->level23[16*map->count2 + 128*i + l3];
4420 if (i == 0) {
4421 return -1;
4423 return i;
4426 /* Lookup the character ch in the mapping. If the character
4427 can't be found, Py_None is returned (or NULL, if another
4428 error occurred). */
4429 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4431 PyObject *w = PyInt_FromLong((long)c);
4432 PyObject *x;
4434 if (w == NULL)
4435 return NULL;
4436 x = PyObject_GetItem(mapping, w);
4437 Py_DECREF(w);
4438 if (x == NULL) {
4439 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4440 /* No mapping found means: mapping is undefined. */
4441 PyErr_Clear();
4442 x = Py_None;
4443 Py_INCREF(x);
4444 return x;
4445 } else
4446 return NULL;
4448 else if (x == Py_None)
4449 return x;
4450 else if (PyInt_Check(x)) {
4451 long value = PyInt_AS_LONG(x);
4452 if (value < 0 || value > 255) {
4453 PyErr_SetString(PyExc_TypeError,
4454 "character mapping must be in range(256)");
4455 Py_DECREF(x);
4456 return NULL;
4458 return x;
4460 else if (PyString_Check(x))
4461 return x;
4462 else {
4463 /* wrong return value */
4464 PyErr_SetString(PyExc_TypeError,
4465 "character mapping must return integer, None or str");
4466 Py_DECREF(x);
4467 return NULL;
4471 static int
4472 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4474 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4475 /* exponentially overallocate to minimize reallocations */
4476 if (requiredsize < 2*outsize)
4477 requiredsize = 2*outsize;
4478 if (_PyString_Resize(outobj, requiredsize)) {
4479 return 0;
4481 return 1;
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
4487 /* lookup the character, put the result in the output string and adjust
4488 various state variables. Reallocate the output string if not enough
4489 space is available. Return a new reference to the object that
4490 was put in the output buffer, or Py_None, if the mapping was undefined
4491 (in which case no character was written) or NULL, if a
4492 reallocation error occurred. The caller must decref the result */
4493 static
4494 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4495 PyObject **outobj, Py_ssize_t *outpos)
4497 PyObject *rep;
4498 char *outstart;
4499 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4501 if (Py_TYPE(mapping) == &EncodingMapType) {
4502 int res = encoding_map_lookup(c, mapping);
4503 Py_ssize_t requiredsize = *outpos+1;
4504 if (res == -1)
4505 return enc_FAILED;
4506 if (outsize<requiredsize)
4507 if (!charmapencode_resize(outobj, outpos, requiredsize))
4508 return enc_EXCEPTION;
4509 outstart = PyString_AS_STRING(*outobj);
4510 outstart[(*outpos)++] = (char)res;
4511 return enc_SUCCESS;
4514 rep = charmapencode_lookup(c, mapping);
4515 if (rep==NULL)
4516 return enc_EXCEPTION;
4517 else if (rep==Py_None) {
4518 Py_DECREF(rep);
4519 return enc_FAILED;
4520 } else {
4521 if (PyInt_Check(rep)) {
4522 Py_ssize_t requiredsize = *outpos+1;
4523 if (outsize<requiredsize)
4524 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4525 Py_DECREF(rep);
4526 return enc_EXCEPTION;
4528 outstart = PyString_AS_STRING(*outobj);
4529 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4531 else {
4532 const char *repchars = PyString_AS_STRING(rep);
4533 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4534 Py_ssize_t requiredsize = *outpos+repsize;
4535 if (outsize<requiredsize)
4536 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4537 Py_DECREF(rep);
4538 return enc_EXCEPTION;
4540 outstart = PyString_AS_STRING(*outobj);
4541 memcpy(outstart + *outpos, repchars, repsize);
4542 *outpos += repsize;
4545 Py_DECREF(rep);
4546 return enc_SUCCESS;
4549 /* handle an error in PyUnicode_EncodeCharmap
4550 Return 0 on success, -1 on error */
4551 static
4552 int charmap_encoding_error(
4553 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4554 PyObject **exceptionObject,
4555 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4556 PyObject **res, Py_ssize_t *respos)
4558 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4559 Py_ssize_t repsize;
4560 Py_ssize_t newpos;
4561 Py_UNICODE *uni2;
4562 /* startpos for collecting unencodable chars */
4563 Py_ssize_t collstartpos = *inpos;
4564 Py_ssize_t collendpos = *inpos+1;
4565 Py_ssize_t collpos;
4566 char *encoding = "charmap";
4567 char *reason = "character maps to <undefined>";
4568 charmapencode_result x;
4570 /* find all unencodable characters */
4571 while (collendpos < size) {
4572 PyObject *rep;
4573 if (Py_TYPE(mapping) == &EncodingMapType) {
4574 int res = encoding_map_lookup(p[collendpos], mapping);
4575 if (res != -1)
4576 break;
4577 ++collendpos;
4578 continue;
4581 rep = charmapencode_lookup(p[collendpos], mapping);
4582 if (rep==NULL)
4583 return -1;
4584 else if (rep!=Py_None) {
4585 Py_DECREF(rep);
4586 break;
4588 Py_DECREF(rep);
4589 ++collendpos;
4591 /* cache callback name lookup
4592 * (if not done yet, i.e. it's the first error) */
4593 if (*known_errorHandler==-1) {
4594 if ((errors==NULL) || (!strcmp(errors, "strict")))
4595 *known_errorHandler = 1;
4596 else if (!strcmp(errors, "replace"))
4597 *known_errorHandler = 2;
4598 else if (!strcmp(errors, "ignore"))
4599 *known_errorHandler = 3;
4600 else if (!strcmp(errors, "xmlcharrefreplace"))
4601 *known_errorHandler = 4;
4602 else
4603 *known_errorHandler = 0;
4605 switch (*known_errorHandler) {
4606 case 1: /* strict */
4607 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4608 return -1;
4609 case 2: /* replace */
4610 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4611 x = charmapencode_output('?', mapping, res, respos);
4612 if (x==enc_EXCEPTION) {
4613 return -1;
4615 else if (x==enc_FAILED) {
4616 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617 return -1;
4620 /* fall through */
4621 case 3: /* ignore */
4622 *inpos = collendpos;
4623 break;
4624 case 4: /* xmlcharrefreplace */
4625 /* generate replacement (temporarily (mis)uses p) */
4626 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4627 char buffer[2+29+1+1];
4628 char *cp;
4629 sprintf(buffer, "&#%d;", (int)p[collpos]);
4630 for (cp = buffer; *cp; ++cp) {
4631 x = charmapencode_output(*cp, mapping, res, respos);
4632 if (x==enc_EXCEPTION)
4633 return -1;
4634 else if (x==enc_FAILED) {
4635 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4636 return -1;
4640 *inpos = collendpos;
4641 break;
4642 default:
4643 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4644 encoding, reason, p, size, exceptionObject,
4645 collstartpos, collendpos, &newpos);
4646 if (repunicode == NULL)
4647 return -1;
4648 /* generate replacement */
4649 repsize = PyUnicode_GET_SIZE(repunicode);
4650 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4651 x = charmapencode_output(*uni2, mapping, res, respos);
4652 if (x==enc_EXCEPTION) {
4653 return -1;
4655 else if (x==enc_FAILED) {
4656 Py_DECREF(repunicode);
4657 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658 return -1;
4661 *inpos = newpos;
4662 Py_DECREF(repunicode);
4664 return 0;
4667 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4668 Py_ssize_t size,
4669 PyObject *mapping,
4670 const char *errors)
4672 /* output object */
4673 PyObject *res = NULL;
4674 /* current input position */
4675 Py_ssize_t inpos = 0;
4676 /* current output position */
4677 Py_ssize_t respos = 0;
4678 PyObject *errorHandler = NULL;
4679 PyObject *exc = NULL;
4680 /* the following variable is used for caching string comparisons
4681 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682 * 3=ignore, 4=xmlcharrefreplace */
4683 int known_errorHandler = -1;
4685 /* Default to Latin-1 */
4686 if (mapping == NULL)
4687 return PyUnicode_EncodeLatin1(p, size, errors);
4689 /* allocate enough for a simple encoding without
4690 replacements, if we need more, we'll resize */
4691 res = PyString_FromStringAndSize(NULL, size);
4692 if (res == NULL)
4693 goto onError;
4694 if (size == 0)
4695 return res;
4697 while (inpos<size) {
4698 /* try to encode it */
4699 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4700 if (x==enc_EXCEPTION) /* error */
4701 goto onError;
4702 if (x==enc_FAILED) { /* unencodable character */
4703 if (charmap_encoding_error(p, size, &inpos, mapping,
4704 &exc,
4705 &known_errorHandler, &errorHandler, errors,
4706 &res, &respos)) {
4707 goto onError;
4710 else
4711 /* done with this character => adjust input position */
4712 ++inpos;
4715 /* Resize if we allocated to much */
4716 if (respos<PyString_GET_SIZE(res)) {
4717 if (_PyString_Resize(&res, respos))
4718 goto onError;
4720 Py_XDECREF(exc);
4721 Py_XDECREF(errorHandler);
4722 return res;
4724 onError:
4725 Py_XDECREF(res);
4726 Py_XDECREF(exc);
4727 Py_XDECREF(errorHandler);
4728 return NULL;
4731 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4732 PyObject *mapping)
4734 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4735 PyErr_BadArgument();
4736 return NULL;
4738 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4739 PyUnicode_GET_SIZE(unicode),
4740 mapping,
4741 NULL);
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject **exceptionObject,
4746 const Py_UNICODE *unicode, Py_ssize_t size,
4747 Py_ssize_t startpos, Py_ssize_t endpos,
4748 const char *reason)
4750 if (*exceptionObject == NULL) {
4751 *exceptionObject = PyUnicodeTranslateError_Create(
4752 unicode, size, startpos, endpos, reason);
4754 else {
4755 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4756 goto onError;
4757 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4758 goto onError;
4759 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4760 goto onError;
4761 return;
4762 onError:
4763 Py_DECREF(*exceptionObject);
4764 *exceptionObject = NULL;
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject **exceptionObject,
4770 const Py_UNICODE *unicode, Py_ssize_t size,
4771 Py_ssize_t startpos, Py_ssize_t endpos,
4772 const char *reason)
4774 make_translate_exception(exceptionObject,
4775 unicode, size, startpos, endpos, reason);
4776 if (*exceptionObject != NULL)
4777 PyCodec_StrictErrors(*exceptionObject);
4780 /* error handling callback helper:
4781 build arguments, call the callback and check the arguments,
4782 put the result into newpos and return the replacement string, which
4783 has to be freed by the caller */
4784 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4785 PyObject **errorHandler,
4786 const char *reason,
4787 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4788 Py_ssize_t startpos, Py_ssize_t endpos,
4789 Py_ssize_t *newpos)
4791 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4793 Py_ssize_t i_newpos;
4794 PyObject *restuple;
4795 PyObject *resunicode;
4797 if (*errorHandler == NULL) {
4798 *errorHandler = PyCodec_LookupError(errors);
4799 if (*errorHandler == NULL)
4800 return NULL;
4803 make_translate_exception(exceptionObject,
4804 unicode, size, startpos, endpos, reason);
4805 if (*exceptionObject == NULL)
4806 return NULL;
4808 restuple = PyObject_CallFunctionObjArgs(
4809 *errorHandler, *exceptionObject, NULL);
4810 if (restuple == NULL)
4811 return NULL;
4812 if (!PyTuple_Check(restuple)) {
4813 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4814 Py_DECREF(restuple);
4815 return NULL;
4817 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4818 &resunicode, &i_newpos)) {
4819 Py_DECREF(restuple);
4820 return NULL;
4822 if (i_newpos<0)
4823 *newpos = size+i_newpos;
4824 else
4825 *newpos = i_newpos;
4826 if (*newpos<0 || *newpos>size) {
4827 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4828 Py_DECREF(restuple);
4829 return NULL;
4831 Py_INCREF(resunicode);
4832 Py_DECREF(restuple);
4833 return resunicode;
4836 /* Lookup the character ch in the mapping and put the result in result,
4837 which must be decrefed by the caller.
4838 Return 0 on success, -1 on error */
4839 static
4840 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4842 PyObject *w = PyInt_FromLong((long)c);
4843 PyObject *x;
4845 if (w == NULL)
4846 return -1;
4847 x = PyObject_GetItem(mapping, w);
4848 Py_DECREF(w);
4849 if (x == NULL) {
4850 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4851 /* No mapping found means: use 1:1 mapping. */
4852 PyErr_Clear();
4853 *result = NULL;
4854 return 0;
4855 } else
4856 return -1;
4858 else if (x == Py_None) {
4859 *result = x;
4860 return 0;
4862 else if (PyInt_Check(x)) {
4863 long value = PyInt_AS_LONG(x);
4864 long max = PyUnicode_GetMax();
4865 if (value < 0 || value > max) {
4866 PyErr_Format(PyExc_TypeError,
4867 "character mapping must be in range(0x%lx)", max+1);
4868 Py_DECREF(x);
4869 return -1;
4871 *result = x;
4872 return 0;
4874 else if (PyUnicode_Check(x)) {
4875 *result = x;
4876 return 0;
4878 else {
4879 /* wrong return value */
4880 PyErr_SetString(PyExc_TypeError,
4881 "character mapping must return integer, None or unicode");
4882 Py_DECREF(x);
4883 return -1;
4886 /* ensure that *outobj is at least requiredsize characters long,
4887 if not reallocate and adjust various state variables.
4888 Return 0 on success, -1 on error */
4889 static
4890 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4891 Py_ssize_t requiredsize)
4893 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4894 if (requiredsize > oldsize) {
4895 /* remember old output position */
4896 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4897 /* exponentially overallocate to minimize reallocations */
4898 if (requiredsize < 2 * oldsize)
4899 requiredsize = 2 * oldsize;
4900 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4901 return -1;
4902 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4904 return 0;
4906 /* lookup the character, put the result in the output string and adjust
4907 various state variables. Return a new reference to the object that
4908 was put in the output buffer in *result, or Py_None, if the mapping was
4909 undefined (in which case no character was written).
4910 The called must decref result.
4911 Return 0 on success, -1 on error. */
4912 static
4913 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4914 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4915 PyObject **res)
4917 if (charmaptranslate_lookup(*curinp, mapping, res))
4918 return -1;
4919 if (*res==NULL) {
4920 /* not found => default to 1:1 mapping */
4921 *(*outp)++ = *curinp;
4923 else if (*res==Py_None)
4925 else if (PyInt_Check(*res)) {
4926 /* no overflow check, because we know that the space is enough */
4927 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4929 else if (PyUnicode_Check(*res)) {
4930 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4931 if (repsize==1) {
4932 /* no overflow check, because we know that the space is enough */
4933 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4935 else if (repsize!=0) {
4936 /* more than one character */
4937 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4938 (insize - (curinp-startinp)) +
4939 repsize - 1;
4940 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4941 return -1;
4942 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4943 *outp += repsize;
4946 else
4947 return -1;
4948 return 0;
/* Translate the size characters starting at p through mapping and return
   the result as a new Unicode object, or NULL on error.

   A NULL mapping is rejected with PyErr_BadArgument(). Each character is
   looked up via charmaptranslate_output(); a lookup result of Py_None
   marks the character as untranslatable. Runs of untranslatable
   characters are collected and handled according to errors:
     NULL / "strict"       raise via raise_translate_exception()
     "replace"             write one '?' per character
     "ignore"              drop the characters
     "xmlcharrefreplace"   write "&#N;" per character
     anything else         call unicode_translate_call_errorhandler()
   The output starts out as long as the input and is grown on demand and
   trimmed at the end. */
4951 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4952 Py_ssize_t size,
4953 PyObject *mapping,
4954 const char *errors)
4956 /* output object */
4957 PyObject *res = NULL;
4958 /* pointers to the beginning and end+1 of input */
4959 const Py_UNICODE *startp = p;
4960 const Py_UNICODE *endp = p + size;
4961 /* pointer into the output */
4962 Py_UNICODE *str;
4963 /* current output position */
4964 Py_ssize_t respos = 0;
4965 char *reason = "character maps to <undefined>";
4966 PyObject *errorHandler = NULL;
4967 PyObject *exc = NULL;
4968 /* the following variable is used for caching string comparisons
4969 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970 * 3=ignore, 4=xmlcharrefreplace */
4971 int known_errorHandler = -1;
4973 if (mapping == NULL) {
4974 PyErr_BadArgument();
4975 return NULL;
4978 /* allocate enough for a simple 1:1 translation without
4979 replacements, if we need more, we'll resize */
4980 res = PyUnicode_FromUnicode(NULL, size);
4981 if (res == NULL)
4982 goto onError;
4983 if (size == 0)
4984 return res;
4985 str = PyUnicode_AS_UNICODE(res);
4987 while (p<endp) {
4988 /* try to encode it */
4989 PyObject *x = NULL;
4990 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4991 Py_XDECREF(x);
4992 goto onError;
/* x is released here; below it is only compared against Py_None,
   never dereferenced */
4994 Py_XDECREF(x);
4995 if (x!=Py_None) /* it worked => adjust input pointer */
4996 ++p;
4997 else { /* untranslatable character */
4998 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4999 Py_ssize_t repsize;
5000 Py_ssize_t newpos;
5001 Py_UNICODE *uni2;
5002 /* startpos for collecting untranslatable chars */
5003 const Py_UNICODE *collstart = p;
5004 const Py_UNICODE *collend = p+1;
5005 const Py_UNICODE *coll;
5007 /* find all untranslatable characters */
5008 while (collend < endp) {
5009 if (charmaptranslate_lookup(*collend, mapping, &x))
5010 goto onError;
5011 Py_XDECREF(x);
5012 if (x!=Py_None)
5013 break;
5014 ++collend;
5016 /* cache callback name lookup
5017 * (if not done yet, i.e. it's the first error) */
5018 if (known_errorHandler==-1) {
5019 if ((errors==NULL) || (!strcmp(errors, "strict")))
5020 known_errorHandler = 1;
5021 else if (!strcmp(errors, "replace"))
5022 known_errorHandler = 2;
5023 else if (!strcmp(errors, "ignore"))
5024 known_errorHandler = 3;
5025 else if (!strcmp(errors, "xmlcharrefreplace"))
5026 known_errorHandler = 4;
5027 else
5028 known_errorHandler = 0;
5030 switch (known_errorHandler) {
5031 case 1: /* strict */
5032 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5033 goto onError;
5034 case 2: /* replace */
5035 /* No need to check for space, this is a 1:1 replacement */
5036 for (coll = collstart; coll<collend; ++coll)
5037 *str++ = '?';
5038 /* fall through */
5039 case 3: /* ignore */
5040 p = collend;
5041 break;
5042 case 4: /* xmlcharrefreplace */
5043 /* generate replacement (temporarily (mis)uses p) */
5044 for (p = collstart; p < collend; ++p) {
5045 char buffer[2+29+1+1];
5046 char *cp;
5047 sprintf(buffer, "&#%d;", (int)*p);
/* make room for what is already written, this reference and the
   input that still follows the collected run */
5048 if (charmaptranslate_makespace(&res, &str,
5049 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5050 goto onError;
5051 for (cp = buffer; *cp; ++cp)
5052 *str++ = *cp;
5054 p = collend;
5055 break;
5056 default:
5057 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5058 reason, startp, size, &exc,
5059 collstart-startp, collend-startp, &newpos);
5060 if (repunicode == NULL)
5061 goto onError;
5062 /* generate replacement */
5063 repsize = PyUnicode_GET_SIZE(repunicode);
5064 if (charmaptranslate_makespace(&res, &str,
5065 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5066 Py_DECREF(repunicode);
5067 goto onError;
5069 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5070 *str++ = *uni2;
/* continue at the position reported back by the error handler */
5071 p = startp + newpos;
5072 Py_DECREF(repunicode);
5076 /* Resize if we allocated too much */
5077 respos = str-PyUnicode_AS_UNICODE(res);
5078 if (respos<PyUnicode_GET_SIZE(res)) {
5079 if (PyUnicode_Resize(&res, respos) < 0)
5080 goto onError;
5082 Py_XDECREF(exc);
5083 Py_XDECREF(errorHandler);
5084 return res;
/* common failure exit: drop the partial result and the cached
   exception / handler objects */
5086 onError:
5087 Py_XDECREF(res);
5088 Py_XDECREF(exc);
5089 Py_XDECREF(errorHandler);
5090 return NULL;
5093 PyObject *PyUnicode_Translate(PyObject *str,
5094 PyObject *mapping,
5095 const char *errors)
5097 PyObject *result;
5099 str = PyUnicode_FromObject(str);
5100 if (str == NULL)
5101 goto onError;
5102 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5103 PyUnicode_GET_SIZE(str),
5104 mapping,
5105 errors);
5106 Py_DECREF(str);
5107 return result;
5109 onError:
5110 Py_XDECREF(str);
5111 return NULL;
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5116 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5117 Py_ssize_t length,
5118 char *output,
5119 const char *errors)
5121 Py_UNICODE *p, *end;
5122 PyObject *errorHandler = NULL;
5123 PyObject *exc = NULL;
5124 const char *encoding = "decimal";
5125 const char *reason = "invalid decimal Unicode string";
5126 /* the following variable is used for caching string comparisons
5127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128 int known_errorHandler = -1;
5130 if (output == NULL) {
5131 PyErr_BadArgument();
5132 return -1;
5135 p = s;
5136 end = s + length;
5137 while (p < end) {
5138 register Py_UNICODE ch = *p;
5139 int decimal;
5140 PyObject *repunicode;
5141 Py_ssize_t repsize;
5142 Py_ssize_t newpos;
5143 Py_UNICODE *uni2;
5144 Py_UNICODE *collstart;
5145 Py_UNICODE *collend;
5147 if (Py_UNICODE_ISSPACE(ch)) {
5148 *output++ = ' ';
5149 ++p;
5150 continue;
5152 decimal = Py_UNICODE_TODECIMAL(ch);
5153 if (decimal >= 0) {
5154 *output++ = '0' + decimal;
5155 ++p;
5156 continue;
5158 if (0 < ch && ch < 256) {
5159 *output++ = (char)ch;
5160 ++p;
5161 continue;
5163 /* All other characters are considered unencodable */
5164 collstart = p;
5165 collend = p+1;
5166 while (collend < end) {
5167 if ((0 < *collend && *collend < 256) ||
5168 !Py_UNICODE_ISSPACE(*collend) ||
5169 Py_UNICODE_TODECIMAL(*collend))
5170 break;
5172 /* cache callback name lookup
5173 * (if not done yet, i.e. it's the first error) */
5174 if (known_errorHandler==-1) {
5175 if ((errors==NULL) || (!strcmp(errors, "strict")))
5176 known_errorHandler = 1;
5177 else if (!strcmp(errors, "replace"))
5178 known_errorHandler = 2;
5179 else if (!strcmp(errors, "ignore"))
5180 known_errorHandler = 3;
5181 else if (!strcmp(errors, "xmlcharrefreplace"))
5182 known_errorHandler = 4;
5183 else
5184 known_errorHandler = 0;
5186 switch (known_errorHandler) {
5187 case 1: /* strict */
5188 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5189 goto onError;
5190 case 2: /* replace */
5191 for (p = collstart; p < collend; ++p)
5192 *output++ = '?';
5193 /* fall through */
5194 case 3: /* ignore */
5195 p = collend;
5196 break;
5197 case 4: /* xmlcharrefreplace */
5198 /* generate replacement (temporarily (mis)uses p) */
5199 for (p = collstart; p < collend; ++p)
5200 output += sprintf(output, "&#%d;", (int)*p);
5201 p = collend;
5202 break;
5203 default:
5204 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5205 encoding, reason, s, length, &exc,
5206 collstart-s, collend-s, &newpos);
5207 if (repunicode == NULL)
5208 goto onError;
5209 /* generate replacement */
5210 repsize = PyUnicode_GET_SIZE(repunicode);
5211 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5212 Py_UNICODE ch = *uni2;
5213 if (Py_UNICODE_ISSPACE(ch))
5214 *output++ = ' ';
5215 else {
5216 decimal = Py_UNICODE_TODECIMAL(ch);
5217 if (decimal >= 0)
5218 *output++ = '0' + decimal;
5219 else if (0 < ch && ch < 256)
5220 *output++ = (char)ch;
5221 else {
5222 Py_DECREF(repunicode);
5223 raise_encode_exception(&exc, encoding,
5224 s, length, collstart-s, collend-s, reason);
5225 goto onError;
5229 p = s + newpos;
5230 Py_DECREF(repunicode);
5233 /* 0-terminate the output string */
5234 *output++ = '\0';
5235 Py_XDECREF(exc);
5236 Py_XDECREF(errorHandler);
5237 return 0;
5239 onError:
5240 Py_XDECREF(exc);
5241 Py_XDECREF(errorHandler);
5242 return -1;
5245 /* --- Helpers ------------------------------------------------------------ */
5247 #include "stringlib/unicodedefs.h"
5249 #define FROM_UNICODE
5251 #include "stringlib/fastsearch.h"
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
/* helper macro to fixup start/end slice values

   Operates on the variables `start` and `end` of the calling scope:
   a negative start is taken relative to (obj)->length and then clamped
   to 0; end is capped at (obj)->length, a negative end is taken
   relative to the length and then clamped to 0.

   The body is wrapped in do { } while (0) so that the macro expands to
   a single statement and can be used safely in an unbraced if/else;
   invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5270 Py_ssize_t PyUnicode_Count(PyObject *str,
5271 PyObject *substr,
5272 Py_ssize_t start,
5273 Py_ssize_t end)
5275 Py_ssize_t result;
5276 PyUnicodeObject* str_obj;
5277 PyUnicodeObject* sub_obj;
5279 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5280 if (!str_obj)
5281 return -1;
5282 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5283 if (!sub_obj) {
5284 Py_DECREF(str_obj);
5285 return -1;
5288 FIX_START_END(str_obj);
5290 result = stringlib_count(
5291 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5294 Py_DECREF(sub_obj);
5295 Py_DECREF(str_obj);
5297 return result;
5300 Py_ssize_t PyUnicode_Find(PyObject *str,
5301 PyObject *sub,
5302 Py_ssize_t start,
5303 Py_ssize_t end,
5304 int direction)
5306 Py_ssize_t result;
5308 str = PyUnicode_FromObject(str);
5309 if (!str)
5310 return -2;
5311 sub = PyUnicode_FromObject(sub);
5312 if (!sub) {
5313 Py_DECREF(str);
5314 return -2;
5317 if (direction > 0)
5318 result = stringlib_find_slice(
5319 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5320 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5321 start, end
5323 else
5324 result = stringlib_rfind_slice(
5325 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5326 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5327 start, end
5330 Py_DECREF(str);
5331 Py_DECREF(sub);
5333 return result;
5336 static
5337 int tailmatch(PyUnicodeObject *self,
5338 PyUnicodeObject *substring,
5339 Py_ssize_t start,
5340 Py_ssize_t end,
5341 int direction)
5343 if (substring->length == 0)
5344 return 1;
5346 FIX_START_END(self);
5348 end -= substring->length;
5349 if (end < start)
5350 return 0;
5352 if (direction > 0) {
5353 if (Py_UNICODE_MATCH(self, end, substring))
5354 return 1;
5355 } else {
5356 if (Py_UNICODE_MATCH(self, start, substring))
5357 return 1;
5360 return 0;
5363 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5364 PyObject *substr,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
5369 Py_ssize_t result;
5371 str = PyUnicode_FromObject(str);
5372 if (str == NULL)
5373 return -1;
5374 substr = PyUnicode_FromObject(substr);
5375 if (substr == NULL) {
5376 Py_DECREF(str);
5377 return -1;
5380 result = tailmatch((PyUnicodeObject *)str,
5381 (PyUnicodeObject *)substr,
5382 start, end, direction);
5383 Py_DECREF(str);
5384 Py_DECREF(substr);
5385 return result;
5388 /* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5391 static
5392 PyObject *fixup(PyUnicodeObject *self,
5393 int (*fixfct)(PyUnicodeObject *s))
5396 PyUnicodeObject *u;
5398 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5399 if (u == NULL)
5400 return NULL;
5402 Py_UNICODE_COPY(u->str, self->str, self->length);
5404 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5408 Py_INCREF(self);
5409 Py_DECREF(u);
5410 return (PyObject*) self;
5412 return (PyObject*) u;
5415 static
5416 int fixupper(PyUnicodeObject *self)
5418 Py_ssize_t len = self->length;
5419 Py_UNICODE *s = self->str;
5420 int status = 0;
5422 while (len-- > 0) {
5423 register Py_UNICODE ch;
5425 ch = Py_UNICODE_TOUPPER(*s);
5426 if (ch != *s) {
5427 status = 1;
5428 *s = ch;
5430 s++;
5433 return status;
5436 static
5437 int fixlower(PyUnicodeObject *self)
5439 Py_ssize_t len = self->length;
5440 Py_UNICODE *s = self->str;
5441 int status = 0;
5443 while (len-- > 0) {
5444 register Py_UNICODE ch;
5446 ch = Py_UNICODE_TOLOWER(*s);
5447 if (ch != *s) {
5448 status = 1;
5449 *s = ch;
5451 s++;
5454 return status;
5457 static
5458 int fixswapcase(PyUnicodeObject *self)
5460 Py_ssize_t len = self->length;
5461 Py_UNICODE *s = self->str;
5462 int status = 0;
5464 while (len-- > 0) {
5465 if (Py_UNICODE_ISUPPER(*s)) {
5466 *s = Py_UNICODE_TOLOWER(*s);
5467 status = 1;
5468 } else if (Py_UNICODE_ISLOWER(*s)) {
5469 *s = Py_UNICODE_TOUPPER(*s);
5470 status = 1;
5472 s++;
5475 return status;
5478 static
5479 int fixcapitalize(PyUnicodeObject *self)
5481 Py_ssize_t len = self->length;
5482 Py_UNICODE *s = self->str;
5483 int status = 0;
5485 if (len == 0)
5486 return 0;
5487 if (Py_UNICODE_ISLOWER(*s)) {
5488 *s = Py_UNICODE_TOUPPER(*s);
5489 status = 1;
5491 s++;
5492 while (--len > 0) {
5493 if (Py_UNICODE_ISUPPER(*s)) {
5494 *s = Py_UNICODE_TOLOWER(*s);
5495 status = 1;
5497 s++;
5499 return status;
5502 static
5503 int fixtitle(PyUnicodeObject *self)
5505 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506 register Py_UNICODE *e;
5507 int previous_is_cased;
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self) == 1) {
5511 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512 if (*p != ch) {
5513 *p = ch;
5514 return 1;
5516 else
5517 return 0;
5520 e = p + PyUnicode_GET_SIZE(self);
5521 previous_is_cased = 0;
5522 for (; p < e; p++) {
5523 register const Py_UNICODE ch = *p;
5525 if (previous_is_cased)
5526 *p = Py_UNICODE_TOLOWER(ch);
5527 else
5528 *p = Py_UNICODE_TOTITLE(ch);
5530 if (Py_UNICODE_ISLOWER(ch) ||
5531 Py_UNICODE_ISUPPER(ch) ||
5532 Py_UNICODE_ISTITLE(ch))
5533 previous_is_cased = 1;
5534 else
5535 previous_is_cased = 0;
5537 return 1;
/* Join the items of seq into one Unicode object, placing separator
   between consecutive items (a single blank when separator is NULL).

   seq is first turned into a fast sequence. An empty sequence yields an
   empty result, and a one-item sequence holding an exact unicode object
   yields that object itself. Otherwise every item must be a str or
   unicode object (TypeError if not) and is converted with
   PyUnicode_FromObject(). The result buffer starts at 100 characters
   and is doubled until the next item fits; size arithmetic that goes
   negative is reported as OverflowError.

   Returns a new reference, or NULL on error. */
5540 PyObject *
5541 PyUnicode_Join(PyObject *separator, PyObject *seq)
5543 PyObject *internal_separator = NULL;
5544 const Py_UNICODE blank = ' ';
5545 const Py_UNICODE *sep = &blank;
5546 Py_ssize_t seplen = 1;
5547 PyUnicodeObject *res = NULL; /* the result */
5548 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used; /* # used bytes */
5550 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5551 PyObject *fseq; /* PySequence_Fast(seq) */
5552 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5553 PyObject *item;
5554 Py_ssize_t i;
5556 fseq = PySequence_Fast(seq, "");
5557 if (fseq == NULL) {
5558 return NULL;
5561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5566 * is invariant.
5568 seqlen = PySequence_Fast_GET_SIZE(fseq);
5569 /* If empty sequence, return u"". */
5570 if (seqlen == 0) {
5571 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5572 goto Done;
5574 /* If singleton sequence with an exact Unicode, return that. */
5575 if (seqlen == 1) {
5576 item = PySequence_Fast_GET_ITEM(fseq, 0);
5577 if (PyUnicode_CheckExact(item)) {
5578 Py_INCREF(item);
5579 res = (PyUnicodeObject *)item;
5580 goto Done;
5584 /* At least two items to join, or one that isn't exact Unicode. */
5585 if (seqlen > 1) {
5586 /* Set up sep and seplen -- they're needed. */
5587 if (separator == NULL) {
5588 sep = &blank;
5589 seplen = 1;
5591 else {
5592 internal_separator = PyUnicode_FromObject(separator);
5593 if (internal_separator == NULL)
5594 goto onError;
5595 sep = PyUnicode_AS_UNICODE(internal_separator);
5596 seplen = PyUnicode_GET_SIZE(internal_separator);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen = PySequence_Fast_GET_SIZE(fseq);
5602 /* Get space. */
5603 res = _PyUnicode_New(res_alloc);
5604 if (res == NULL)
5605 goto onError;
5606 res_p = PyUnicode_AS_UNICODE(res);
5607 res_used = 0;
/* Append each item, followed by the separator for every item but the
   last, growing res as needed. */
5609 for (i = 0; i < seqlen; ++i) {
5610 Py_ssize_t itemlen;
5611 Py_ssize_t new_res_used;
5613 item = PySequence_Fast_GET_ITEM(fseq, i);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5616 PyErr_Format(PyExc_TypeError,
5617 "sequence item %zd: expected string or Unicode,"
5618 " %.80s found",
5619 i, Py_TYPE(item)->tp_name);
/* item is still a borrowed reference here, so plain onError is right */
5620 goto onError;
5622 item = PyUnicode_FromObject(item);
5623 if (item == NULL)
5624 goto onError;
5625 /* We own a reference to item from here on. */
5627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen = PySequence_Fast_GET_SIZE(fseq);
5630 /* Make sure we have enough space for the separator and the item. */
5631 itemlen = PyUnicode_GET_SIZE(item);
5632 new_res_used = res_used + itemlen;
/* a sum that went negative is treated as overflow */
5633 if (new_res_used < 0)
5634 goto Overflow;
5635 if (i < seqlen - 1) {
5636 new_res_used += seplen;
5637 if (new_res_used < 0)
5638 goto Overflow;
5640 if (new_res_used > res_alloc) {
5641 /* double allocated size until it's big enough */
5642 do {
5643 res_alloc += res_alloc;
5644 if (res_alloc <= 0)
5645 goto Overflow;
5646 } while (new_res_used > res_alloc);
5647 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5648 Py_DECREF(item);
5649 goto onError;
/* re-derive the write pointer from the resized object */
5651 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5656 res_p += itemlen;
5657 if (i < seqlen - 1) {
5658 Py_UNICODE_COPY(res_p, sep, seplen);
5659 res_p += seplen;
5661 Py_DECREF(item);
5662 res_used = new_res_used;
5665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5668 if (_PyUnicode_Resize(&res, res_used) < 0)
5669 goto onError;
5671 Done:
5672 Py_XDECREF(internal_separator);
5673 Py_DECREF(fseq);
5674 return (PyObject *)res;
5676 Overflow:
5677 PyErr_SetString(PyExc_OverflowError,
5678 "join() result is too long for a Python string");
5679 Py_DECREF(item);
5680 /* fall through */
5682 onError:
5683 Py_XDECREF(internal_separator);
5684 Py_DECREF(fseq);
5685 Py_XDECREF(res);
5686 return NULL;
5689 static
5690 PyUnicodeObject *pad(PyUnicodeObject *self,
5691 Py_ssize_t left,
5692 Py_ssize_t right,
5693 Py_UNICODE fill)
5695 PyUnicodeObject *u;
5697 if (left < 0)
5698 left = 0;
5699 if (right < 0)
5700 right = 0;
5702 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5703 Py_INCREF(self);
5704 return self;
5707 if (left > PY_SSIZE_T_MAX - self->length ||
5708 right > PY_SSIZE_T_MAX - (left + self->length)) {
5709 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710 return NULL;
5712 u = _PyUnicode_New(left + self->length + right);
5713 if (u) {
5714 if (left)
5715 Py_UNICODE_FILL(u->str, fill, left);
5716 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717 if (right)
5718 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5721 return u;
/* Append data[left:right] as a new Unicode object to `list`.

   Relies on names of the calling function: the PyObject *str scratch
   variable, the `list` being built and an `onError` label that is
   jumped to when either the object creation or the append fails.

   The body is wrapped in do { } while (0) so that the macro expands to
   a single statement (the old form ended in a dangling `else`); invoke
   it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5735 static
5736 PyObject *split_whitespace(PyUnicodeObject *self,
5737 PyObject *list,
5738 Py_ssize_t maxcount)
5740 register Py_ssize_t i;
5741 register Py_ssize_t j;
5742 Py_ssize_t len = self->length;
5743 PyObject *str;
5744 register const Py_UNICODE *buf = self->str;
5746 for (i = j = 0; i < len; ) {
5747 /* find a token */
5748 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5749 i++;
5750 j = i;
5751 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5752 i++;
5753 if (j < i) {
5754 if (maxcount-- <= 0)
5755 break;
5756 SPLIT_APPEND(buf, j, i);
5757 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5758 i++;
5759 j = i;
5762 if (j < len) {
5763 SPLIT_APPEND(buf, j, len);
5765 return list;
5767 onError:
5768 Py_DECREF(list);
5769 return NULL;
5772 PyObject *PyUnicode_Splitlines(PyObject *string,
5773 int keepends)
5775 register Py_ssize_t i;
5776 register Py_ssize_t j;
5777 Py_ssize_t len;
5778 PyObject *list;
5779 PyObject *str;
5780 Py_UNICODE *data;
5782 string = PyUnicode_FromObject(string);
5783 if (string == NULL)
5784 return NULL;
5785 data = PyUnicode_AS_UNICODE(string);
5786 len = PyUnicode_GET_SIZE(string);
5788 list = PyList_New(0);
5789 if (!list)
5790 goto onError;
5792 for (i = j = 0; i < len; ) {
5793 Py_ssize_t eol;
5795 /* Find a line and append it */
5796 while (i < len && !BLOOM_LINEBREAK(data[i]))
5797 i++;
5799 /* Skip the line break reading CRLF as one line break */
5800 eol = i;
5801 if (i < len) {
5802 if (data[i] == '\r' && i + 1 < len &&
5803 data[i+1] == '\n')
5804 i += 2;
5805 else
5806 i++;
5807 if (keepends)
5808 eol = i;
5810 SPLIT_APPEND(data, j, eol);
5811 j = i;
5813 if (j < len) {
5814 SPLIT_APPEND(data, j, len);
5817 Py_DECREF(string);
5818 return list;
5820 onError:
5821 Py_XDECREF(list);
5822 Py_DECREF(string);
5823 return NULL;
5826 static
5827 PyObject *split_char(PyUnicodeObject *self,
5828 PyObject *list,
5829 Py_UNICODE ch,
5830 Py_ssize_t maxcount)
5832 register Py_ssize_t i;
5833 register Py_ssize_t j;
5834 Py_ssize_t len = self->length;
5835 PyObject *str;
5836 register const Py_UNICODE *buf = self->str;
5838 for (i = j = 0; i < len; ) {
5839 if (buf[i] == ch) {
5840 if (maxcount-- <= 0)
5841 break;
5842 SPLIT_APPEND(buf, j, i);
5843 i = j = i + 1;
5844 } else
5845 i++;
5847 if (j <= len) {
5848 SPLIT_APPEND(buf, j, len);
5850 return list;
5852 onError:
5853 Py_DECREF(list);
5854 return NULL;
5857 static
5858 PyObject *split_substring(PyUnicodeObject *self,
5859 PyObject *list,
5860 PyUnicodeObject *substring,
5861 Py_ssize_t maxcount)
5863 register Py_ssize_t i;
5864 register Py_ssize_t j;
5865 Py_ssize_t len = self->length;
5866 Py_ssize_t sublen = substring->length;
5867 PyObject *str;
5869 for (i = j = 0; i <= len - sublen; ) {
5870 if (Py_UNICODE_MATCH(self, i, substring)) {
5871 if (maxcount-- <= 0)
5872 break;
5873 SPLIT_APPEND(self->str, j, i);
5874 i = j = i + sublen;
5875 } else
5876 i++;
5878 if (j <= len) {
5879 SPLIT_APPEND(self->str, j, len);
5881 return list;
5883 onError:
5884 Py_DECREF(list);
5885 return NULL;
5888 static
5889 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5890 PyObject *list,
5891 Py_ssize_t maxcount)
5893 register Py_ssize_t i;
5894 register Py_ssize_t j;
5895 Py_ssize_t len = self->length;
5896 PyObject *str;
5897 register const Py_UNICODE *buf = self->str;
5899 for (i = j = len - 1; i >= 0; ) {
5900 /* find a token */
5901 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5902 i--;
5903 j = i;
5904 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5905 i--;
5906 if (j > i) {
5907 if (maxcount-- <= 0)
5908 break;
5909 SPLIT_APPEND(buf, i + 1, j + 1);
5910 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5911 i--;
5912 j = i;
5915 if (j >= 0) {
5916 SPLIT_APPEND(buf, 0, j + 1);
5918 if (PyList_Reverse(list) < 0)
5919 goto onError;
5920 return list;
5922 onError:
5923 Py_DECREF(list);
5924 return NULL;
5927 static
5928 PyObject *rsplit_char(PyUnicodeObject *self,
5929 PyObject *list,
5930 Py_UNICODE ch,
5931 Py_ssize_t maxcount)
5933 register Py_ssize_t i;
5934 register Py_ssize_t j;
5935 Py_ssize_t len = self->length;
5936 PyObject *str;
5937 register const Py_UNICODE *buf = self->str;
5939 for (i = j = len - 1; i >= 0; ) {
5940 if (buf[i] == ch) {
5941 if (maxcount-- <= 0)
5942 break;
5943 SPLIT_APPEND(buf, i + 1, j + 1);
5944 j = i = i - 1;
5945 } else
5946 i--;
5948 if (j >= -1) {
5949 SPLIT_APPEND(buf, 0, j + 1);
5951 if (PyList_Reverse(list) < 0)
5952 goto onError;
5953 return list;
5955 onError:
5956 Py_DECREF(list);
5957 return NULL;
5960 static
5961 PyObject *rsplit_substring(PyUnicodeObject *self,
5962 PyObject *list,
5963 PyUnicodeObject *substring,
5964 Py_ssize_t maxcount)
5966 register Py_ssize_t i;
5967 register Py_ssize_t j;
5968 Py_ssize_t len = self->length;
5969 Py_ssize_t sublen = substring->length;
5970 PyObject *str;
5972 for (i = len - sublen, j = len; i >= 0; ) {
5973 if (Py_UNICODE_MATCH(self, i, substring)) {
5974 if (maxcount-- <= 0)
5975 break;
5976 SPLIT_APPEND(self->str, i + sublen, j);
5977 j = i;
5978 i -= sublen;
5979 } else
5980 i--;
5982 if (j >= 0) {
5983 SPLIT_APPEND(self->str, 0, j);
5985 if (PyList_Reverse(list) < 0)
5986 goto onError;
5987 return list;
5989 onError:
5990 Py_DECREF(list);
5991 return NULL;
5994 #undef SPLIT_APPEND
5996 static
5997 PyObject *split(PyUnicodeObject *self,
5998 PyUnicodeObject *substring,
5999 Py_ssize_t maxcount)
6001 PyObject *list;
6003 if (maxcount < 0)
6004 maxcount = PY_SSIZE_T_MAX;
6006 list = PyList_New(0);
6007 if (!list)
6008 return NULL;
6010 if (substring == NULL)
6011 return split_whitespace(self,list,maxcount);
6013 else if (substring->length == 1)
6014 return split_char(self,list,substring->str[0],maxcount);
6016 else if (substring->length == 0) {
6017 Py_DECREF(list);
6018 PyErr_SetString(PyExc_ValueError, "empty separator");
6019 return NULL;
6021 else
6022 return split_substring(self,list,substring,maxcount);
6025 static
6026 PyObject *rsplit(PyUnicodeObject *self,
6027 PyUnicodeObject *substring,
6028 Py_ssize_t maxcount)
6030 PyObject *list;
6032 if (maxcount < 0)
6033 maxcount = PY_SSIZE_T_MAX;
6035 list = PyList_New(0);
6036 if (!list)
6037 return NULL;
6039 if (substring == NULL)
6040 return rsplit_whitespace(self,list,maxcount);
6042 else if (substring->length == 1)
6043 return rsplit_char(self,list,substring->str[0],maxcount);
6045 else if (substring->length == 0) {
6046 Py_DECREF(list);
6047 PyErr_SetString(PyExc_ValueError, "empty separator");
6048 return NULL;
6050 else
6051 return rsplit_substring(self,list,substring,maxcount);
/* Return a copy of self in which at most maxcount occurrences of str1
   are replaced by str2 (a negative maxcount means no limit).

   Two strategies are used:
   - str1 and str2 have the same length: self is copied and the matches
     are overwritten in place (with a dedicated path for single
     characters).
   - the lengths differ: the matches are counted first, the result size
     is computed with overflow checks, and the result is assembled piece
     by piece. An empty str1 interleaves str2 with the characters of
     self.

   When nothing is replaced, self itself is returned if it is an exact
   unicode object, otherwise a plain copy. Returns NULL on error. */
6054 static
6055 PyObject *replace(PyUnicodeObject *self,
6056 PyUnicodeObject *str1,
6057 PyUnicodeObject *str2,
6058 Py_ssize_t maxcount)
6060 PyUnicodeObject *u;
6062 if (maxcount < 0)
6063 maxcount = PY_SSIZE_T_MAX;
6065 if (str1->length == str2->length) {
6066 /* same length */
6067 Py_ssize_t i;
6068 if (str1->length == 1) {
6069 /* replace characters */
6070 Py_UNICODE u1, u2;
6071 if (!findchar(self->str, self->length, str1->str[0]))
6072 goto nothing;
6073 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6074 if (!u)
6075 return NULL;
6076 Py_UNICODE_COPY(u->str, self->str, self->length);
6077 u1 = str1->str[0];
6078 u2 = str2->str[0];
6079 for (i = 0; i < u->length; i++)
6080 if (u->str[i] == u1) {
6081 if (--maxcount < 0)
6082 break;
6083 u->str[i] = u2;
6085 } else {
/* locate the first match up front so that the copy can be skipped
   entirely when there is none */
6086 i = fastsearch(
6087 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6089 if (i < 0)
6090 goto nothing;
6091 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6092 if (!u)
6093 return NULL;
6094 Py_UNICODE_COPY(u->str, self->str, self->length);
/* matches are searched in self, the replacement is written into u */
6095 while (i <= self->length - str1->length)
6096 if (Py_UNICODE_MATCH(self, i, str1)) {
6097 if (--maxcount < 0)
6098 break;
6099 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6100 i += str1->length;
6101 } else
6102 i++;
6104 } else {
6106 Py_ssize_t n, i, j, e;
6107 Py_ssize_t product, new_size, delta;
6108 Py_UNICODE *p;
6110 /* replace strings */
6111 n = stringlib_count(self->str, self->length, str1->str, str1->length);
6112 if (n > maxcount)
6113 n = maxcount;
6114 if (n == 0)
6115 goto nothing;
6116 /* new_size = self->length + n * (str2->length - str1->length)); */
6117 delta = (str2->length - str1->length);
6118 if (delta == 0) {
6119 new_size = self->length;
6120 } else {
6121 product = n * (str2->length - str1->length);
/* dividing the product back must give n again, otherwise the
   multiplication is treated as overflowed */
6122 if ((product / (str2->length - str1->length)) != n) {
6123 PyErr_SetString(PyExc_OverflowError,
6124 "replace string is too long");
6125 return NULL;
6127 new_size = self->length + product;
6128 if (new_size < 0) {
6129 PyErr_SetString(PyExc_OverflowError,
6130 "replace string is too long");
6131 return NULL;
6134 u = _PyUnicode_New(new_size);
6135 if (!u)
6136 return NULL;
6137 i = 0;
6138 p = u->str;
/* e is the last offset in self at which str1 can still start */
6139 e = self->length - str1->length;
6140 if (str1->length > 0) {
6141 while (n-- > 0) {
6142 /* look for next match */
6143 j = i;
6144 while (j <= e) {
6145 if (Py_UNICODE_MATCH(self, j, str1))
6146 break;
6147 j++;
6149 if (j > i) {
6150 if (j > e)
6151 break;
6152 /* copy unchanged part [i:j] */
6153 Py_UNICODE_COPY(p, self->str+i, j-i);
6154 p += j - i;
6156 /* copy substitution string */
6157 if (str2->length > 0) {
6158 Py_UNICODE_COPY(p, str2->str, str2->length);
6159 p += str2->length;
6161 i = j + str1->length;
6163 if (i < self->length)
6164 /* copy tail [i:] */
6165 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6166 } else {
6167 /* interleave */
6168 while (n > 0) {
6169 Py_UNICODE_COPY(p, str2->str, str2->length);
6170 p += str2->length;
6171 if (--n <= 0)
6172 break;
6173 *p++ = self->str[i++];
6175 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6178 return (PyObject *) u;
6180 nothing:
6181 /* nothing to replace; return original string (when possible) */
6182 if (PyUnicode_CheckExact(self)) {
6183 Py_INCREF(self);
6184 return (PyObject *) self;
6186 return PyUnicode_FromUnicode(self->str, self->length);
6189 /* --- Unicode Object Methods --------------------------------------------- */
6191 PyDoc_STRVAR(title__doc__,
6192 "S.title() -> unicode\n\
6194 Return a titlecased version of S, i.e. words start with title case\n\
6195 characters, all remaining cased characters have lower case.");
/* Implementation of S.title(): applies the fixtitle filter through
   fixup() and returns whatever fixup() returns. */
6197 static PyObject*
6198 unicode_title(PyUnicodeObject *self)
6200 return fixup(self, fixtitle);
6203 PyDoc_STRVAR(capitalize__doc__,
6204 "S.capitalize() -> unicode\n\
6206 Return a capitalized version of S, i.e. make the first character\n\
6207 have upper case.");
/* Implementation of S.capitalize(): applies the fixcapitalize filter
   through fixup() and returns whatever fixup() returns. */
6209 static PyObject*
6210 unicode_capitalize(PyUnicodeObject *self)
6212 return fixup(self, fixcapitalize);
/* Disabled code (compiled out by #if 0): a capwords() method that splits
   on whitespace, capitalizes every word and joins the words again with
   PyUnicode_Join() and a NULL separator. */
6215 #if 0
6216 PyDoc_STRVAR(capwords__doc__,
6217 "S.capwords() -> unicode\n\
6219 Apply .capitalize() to all words in S and return the result with\n\
6220 normalized whitespace (all whitespace strings are replaced by ' ').");
6222 static PyObject*
6223 unicode_capwords(PyUnicodeObject *self)
6225 PyObject *list;
6226 PyObject *item;
6227 Py_ssize_t i;
6229 /* Split into words */
6230 list = split(self, NULL, -1);
6231 if (!list)
6232 return NULL;
6234 /* Capitalize each word */
6235 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6236 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6237 fixcapitalize);
/* on failure item is NULL, which is also what gets returned below */
6238 if (item == NULL)
6239 goto onError;
/* swap the list slot over to the capitalized word */
6240 Py_DECREF(PyList_GET_ITEM(list, i));
6241 PyList_SET_ITEM(list, i, item);
6244 /* Join the words to form a new string */
6245 item = PyUnicode_Join(NULL, list);
/* shared exit for success and failure: the list is always released */
6247 onError:
6248 Py_DECREF(list);
6249 return (PyObject *)item;
6251 #endif
6253 /* Argument converter. Coerces to a single unicode character */
6255 static int
6256 convert_uc(PyObject *obj, void *addr)
6258 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6259 PyObject *uniobj;
6260 Py_UNICODE *unistr;
6262 uniobj = PyUnicode_FromObject(obj);
6263 if (uniobj == NULL) {
6264 PyErr_SetString(PyExc_TypeError,
6265 "The fill character cannot be converted to Unicode");
6266 return 0;
6268 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6269 PyErr_SetString(PyExc_TypeError,
6270 "The fill character must be exactly one character long");
6271 Py_DECREF(uniobj);
6272 return 0;
6274 unistr = PyUnicode_AS_UNICODE(uniobj);
6275 *fillcharloc = unistr[0];
6276 Py_DECREF(uniobj);
6277 return 1;
6280 PyDoc_STRVAR(center__doc__,
6281 "S.center(width[, fillchar]) -> unicode\n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6286 static PyObject *
6287 unicode_center(PyUnicodeObject *self, PyObject *args)
6289 Py_ssize_t marg, left;
6290 Py_ssize_t width;
6291 Py_UNICODE fillchar = ' ';
6293 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6294 return NULL;
6296 if (self->length >= width && PyUnicode_CheckExact(self)) {
6297 Py_INCREF(self);
6298 return (PyObject*) self;
6301 marg = width - self->length;
6302 left = marg / 2 + (marg & width & 1);
6304 return (PyObject*) pad(self, left, marg - left, fillchar);
6307 #if 0
6309 /* This code should go into some future Unicode collation support
6310 module. The basic comparison should compare ordinals on a naive
6311 basis (this is what Java does and thus Jython too). */
6313 /* speedy UTF-16 code point order comparison */
6314 /* gleaned from: */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6317 static short utf16Fixup[32] =
6319 0, 0, 0, 0, 0, 0, 0, 0,
6320 0, 0, 0, 0, 0, 0, 0, 0,
6321 0, 0, 0, 0, 0, 0, 0, 0,
6322 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6325 static int
6326 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6328 Py_ssize_t len1, len2;
6330 Py_UNICODE *s1 = str1->str;
6331 Py_UNICODE *s2 = str2->str;
6333 len1 = str1->length;
6334 len2 = str2->length;
6336 while (len1 > 0 && len2 > 0) {
6337 Py_UNICODE c1, c2;
6339 c1 = *s1++;
6340 c2 = *s2++;
6342 if (c1 > (1<<11) * 26)
6343 c1 += utf16Fixup[c1>>11];
6344 if (c2 > (1<<11) * 26)
6345 c2 += utf16Fixup[c2>>11];
6346 /* now c1 and c2 are in UTF-32-compatible order */
6348 if (c1 != c2)
6349 return (c1 < c2) ? -1 : 1;
6351 len1--; len2--;
6354 return (len1 < len2) ? -1 : (len1 != len2);
6357 #else
6359 static int
6360 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6362 register Py_ssize_t len1, len2;
6364 Py_UNICODE *s1 = str1->str;
6365 Py_UNICODE *s2 = str2->str;
6367 len1 = str1->length;
6368 len2 = str2->length;
6370 while (len1 > 0 && len2 > 0) {
6371 Py_UNICODE c1, c2;
6373 c1 = *s1++;
6374 c2 = *s2++;
6376 if (c1 != c2)
6377 return (c1 < c2) ? -1 : 1;
6379 len1--; len2--;
6382 return (len1 < len2) ? -1 : (len1 != len2);
6385 #endif
6387 int PyUnicode_Compare(PyObject *left,
6388 PyObject *right)
6390 PyUnicodeObject *u = NULL, *v = NULL;
6391 int result;
6393 /* Coerce the two arguments */
6394 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6395 if (u == NULL)
6396 goto onError;
6397 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6398 if (v == NULL)
6399 goto onError;
6401 /* Shortcut for empty or interned objects */
6402 if (v == u) {
6403 Py_DECREF(u);
6404 Py_DECREF(v);
6405 return 0;
6408 result = unicode_compare(u, v);
6410 Py_DECREF(u);
6411 Py_DECREF(v);
6412 return result;
6414 onError:
6415 Py_XDECREF(u);
6416 Py_XDECREF(v);
6417 return -1;
6420 PyObject *PyUnicode_RichCompare(PyObject *left,
6421 PyObject *right,
6422 int op)
6424 int result;
6426 result = PyUnicode_Compare(left, right);
6427 if (result == -1 && PyErr_Occurred())
6428 goto onError;
6430 /* Convert the return value to a Boolean */
6431 switch (op) {
6432 case Py_EQ:
6433 result = (result == 0);
6434 break;
6435 case Py_NE:
6436 result = (result != 0);
6437 break;
6438 case Py_LE:
6439 result = (result <= 0);
6440 break;
6441 case Py_GE:
6442 result = (result >= 0);
6443 break;
6444 case Py_LT:
6445 result = (result == -1);
6446 break;
6447 case Py_GT:
6448 result = (result == 1);
6449 break;
6451 return PyBool_FromLong(result);
6453 onError:
6455 /* Standard case
6457 Type errors mean that PyUnicode_FromObject() could not convert
6458 one of the arguments (usually the right hand side) to Unicode,
6459 ie. we can't handle the comparison request. However, it is
6460 possible that the other object knows a comparison method, which
6461 is why we return Py_NotImplemented to give the other object a
6462 chance.
6465 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6466 PyErr_Clear();
6467 Py_INCREF(Py_NotImplemented);
6468 return Py_NotImplemented;
6470 if (op != Py_EQ && op != Py_NE)
6471 return NULL;
6473 /* Equality comparison.
6475 This is a special case: we silence any PyExc_UnicodeDecodeError
6476 and instead turn it into a PyErr_UnicodeWarning.
6479 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6480 return NULL;
6481 PyErr_Clear();
6482 if (PyErr_Warn(PyExc_UnicodeWarning,
6483 (op == Py_EQ) ?
6484 "Unicode equal comparison "
6485 "failed to convert both arguments to Unicode - "
6486 "interpreting them as being unequal" :
6487 "Unicode unequal comparison "
6488 "failed to convert both arguments to Unicode - "
6489 "interpreting them as being unequal"
6490 ) < 0)
6491 return NULL;
6492 result = (op == Py_NE);
6493 return PyBool_FromLong(result);
6496 int PyUnicode_Contains(PyObject *container,
6497 PyObject *element)
6499 PyObject *str, *sub;
6500 int result;
6502 /* Coerce the two arguments */
6503 sub = PyUnicode_FromObject(element);
6504 if (!sub) {
6505 PyErr_SetString(PyExc_TypeError,
6506 "'in <string>' requires string as left operand");
6507 return -1;
6510 str = PyUnicode_FromObject(container);
6511 if (!str) {
6512 Py_DECREF(sub);
6513 return -1;
6516 result = stringlib_contains_obj(str, sub);
6518 Py_DECREF(str);
6519 Py_DECREF(sub);
6521 return result;
6524 /* Concat to string or Unicode object giving a new Unicode object. */
6526 PyObject *PyUnicode_Concat(PyObject *left,
6527 PyObject *right)
6529 PyUnicodeObject *u = NULL, *v = NULL, *w;
6531 /* Coerce the two arguments */
6532 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6533 if (u == NULL)
6534 goto onError;
6535 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6536 if (v == NULL)
6537 goto onError;
6539 /* Shortcuts */
6540 if (v == unicode_empty) {
6541 Py_DECREF(v);
6542 return (PyObject *)u;
6544 if (u == unicode_empty) {
6545 Py_DECREF(u);
6546 return (PyObject *)v;
6549 /* Concat the two Unicode strings */
6550 w = _PyUnicode_New(u->length + v->length);
6551 if (w == NULL)
6552 goto onError;
6553 Py_UNICODE_COPY(w->str, u->str, u->length);
6554 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6556 Py_DECREF(u);
6557 Py_DECREF(v);
6558 return (PyObject *)w;
6560 onError:
6561 Py_XDECREF(u);
6562 Py_XDECREF(v);
6563 return NULL;
6566 PyDoc_STRVAR(count__doc__,
6567 "S.count(sub[, start[, end]]) -> int\n\
6569 Return the number of non-overlapping occurrences of substring sub in\n\
6570 Unicode string S[start:end]. Optional arguments start and end are\n\
6571 interpreted as in slice notation.");
6573 static PyObject *
6574 unicode_count(PyUnicodeObject *self, PyObject *args)
6576 PyUnicodeObject *substring;
6577 Py_ssize_t start = 0;
6578 Py_ssize_t end = PY_SSIZE_T_MAX;
6579 PyObject *result;
6581 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6582 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6583 return NULL;
6585 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6586 (PyObject *)substring);
6587 if (substring == NULL)
6588 return NULL;
6590 FIX_START_END(self);
6592 result = PyInt_FromSsize_t(
6593 stringlib_count(self->str + start, end - start,
6594 substring->str, substring->length)
6597 Py_DECREF(substring);
6599 return result;
6602 PyDoc_STRVAR(encode__doc__,
6603 "S.encode([encoding[,errors]]) -> string or unicode\n\
6605 Encodes S using the codec registered for encoding. encoding defaults\n\
6606 to the default encoding. errors may be given to set a different error\n\
6607 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6608 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6609 'xmlcharrefreplace' as well as any other name registered with\n\
6610 codecs.register_error that can handle UnicodeEncodeErrors.");
6612 static PyObject *
6613 unicode_encode(PyUnicodeObject *self, PyObject *args)
6615 char *encoding = NULL;
6616 char *errors = NULL;
6617 PyObject *v;
6619 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6620 return NULL;
6621 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6622 if (v == NULL)
6623 goto onError;
6624 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6625 PyErr_Format(PyExc_TypeError,
6626 "encoder did not return a string/unicode object "
6627 "(type=%.400s)",
6628 Py_TYPE(v)->tp_name);
6629 Py_DECREF(v);
6630 return NULL;
6632 return v;
6634 onError:
6635 return NULL;
6638 PyDoc_STRVAR(decode__doc__,
6639 "S.decode([encoding[,errors]]) -> string or unicode\n\
6641 Decodes S using the codec registered for encoding. encoding defaults\n\
6642 to the default encoding. errors may be given to set a different error\n\
6643 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6644 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6645 as well as any other name registerd with codecs.register_error that is\n\
6646 able to handle UnicodeDecodeErrors.");
6648 static PyObject *
6649 unicode_decode(PyUnicodeObject *self, PyObject *args)
6651 char *encoding = NULL;
6652 char *errors = NULL;
6653 PyObject *v;
6655 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6656 return NULL;
6657 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6658 if (v == NULL)
6659 goto onError;
6660 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6661 PyErr_Format(PyExc_TypeError,
6662 "decoder did not return a string/unicode object "
6663 "(type=%.400s)",
6664 Py_TYPE(v)->tp_name);
6665 Py_DECREF(v);
6666 return NULL;
6668 return v;
6670 onError:
6671 return NULL;
6674 PyDoc_STRVAR(expandtabs__doc__,
6675 "S.expandtabs([tabsize]) -> unicode\n\
6677 Return a copy of S where all tab characters are expanded using spaces.\n\
6678 If tabsize is not given, a tab size of 8 characters is assumed.");
6680 static PyObject*
6681 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6683 Py_UNICODE *e;
6684 Py_UNICODE *p;
6685 Py_UNICODE *q;
6686 Py_UNICODE *qe;
6687 Py_ssize_t i, j, incr;
6688 PyUnicodeObject *u;
6689 int tabsize = 8;
6691 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6692 return NULL;
6694 /* First pass: determine size of output string */
6695 i = 0; /* chars up to and including most recent \n or \r */
6696 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6697 e = self->str + self->length; /* end of input */
6698 for (p = self->str; p < e; p++)
6699 if (*p == '\t') {
6700 if (tabsize > 0) {
6701 incr = tabsize - (j % tabsize); /* cannot overflow */
6702 if (j > PY_SSIZE_T_MAX - incr)
6703 goto overflow1;
6704 j += incr;
6707 else {
6708 if (j > PY_SSIZE_T_MAX - 1)
6709 goto overflow1;
6710 j++;
6711 if (*p == '\n' || *p == '\r') {
6712 if (i > PY_SSIZE_T_MAX - j)
6713 goto overflow1;
6714 i += j;
6715 j = 0;
6719 if (i > PY_SSIZE_T_MAX - j)
6720 goto overflow1;
6722 /* Second pass: create output string and fill it */
6723 u = _PyUnicode_New(i + j);
6724 if (!u)
6725 return NULL;
6727 j = 0; /* same as in first pass */
6728 q = u->str; /* next output char */
6729 qe = u->str + u->length; /* end of output */
6731 for (p = self->str; p < e; p++)
6732 if (*p == '\t') {
6733 if (tabsize > 0) {
6734 i = tabsize - (j % tabsize);
6735 j += i;
6736 while (i--) {
6737 if (q >= qe)
6738 goto overflow2;
6739 *q++ = ' ';
6743 else {
6744 if (q >= qe)
6745 goto overflow2;
6746 *q++ = *p;
6747 j++;
6748 if (*p == '\n' || *p == '\r')
6749 j = 0;
6752 return (PyObject*) u;
6754 overflow2:
6755 Py_DECREF(u);
6756 overflow1:
6757 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6758 return NULL;
6761 PyDoc_STRVAR(find__doc__,
6762 "S.find(sub [,start [,end]]) -> int\n\
6764 Return the lowest index in S where substring sub is found,\n\
6765 such that sub is contained within s[start:end]. Optional\n\
6766 arguments start and end are interpreted as in slice notation.\n\
6768 Return -1 on failure.");
6770 static PyObject *
6771 unicode_find(PyUnicodeObject *self, PyObject *args)
6773 PyObject *substring;
6774 Py_ssize_t start;
6775 Py_ssize_t end;
6776 Py_ssize_t result;
6778 if (!_ParseTupleFinds(args, &substring, &start, &end))
6779 return NULL;
6781 result = stringlib_find_slice(
6782 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6783 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6784 start, end
6787 Py_DECREF(substring);
6789 return PyInt_FromSsize_t(result);
6792 static PyObject *
6793 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6795 if (index < 0 || index >= self->length) {
6796 PyErr_SetString(PyExc_IndexError, "string index out of range");
6797 return NULL;
6800 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6803 static long
6804 unicode_hash(PyUnicodeObject *self)
6806 /* Since Unicode objects compare equal to their ASCII string
6807 counterparts, they should use the individual character values
6808 as basis for their hash value. This is needed to assure that
6809 strings and Unicode objects behave in the same way as
6810 dictionary keys. */
6812 register Py_ssize_t len;
6813 register Py_UNICODE *p;
6814 register long x;
6816 if (self->hash != -1)
6817 return self->hash;
6818 len = PyUnicode_GET_SIZE(self);
6819 p = PyUnicode_AS_UNICODE(self);
6820 x = *p << 7;
6821 while (--len >= 0)
6822 x = (1000003*x) ^ *p++;
6823 x ^= PyUnicode_GET_SIZE(self);
6824 if (x == -1)
6825 x = -2;
6826 self->hash = x;
6827 return x;
6830 PyDoc_STRVAR(index__doc__,
6831 "S.index(sub [,start [,end]]) -> int\n\
6833 Like S.find() but raise ValueError when the substring is not found.");
6835 static PyObject *
6836 unicode_index(PyUnicodeObject *self, PyObject *args)
6838 Py_ssize_t result;
6839 PyObject *substring;
6840 Py_ssize_t start;
6841 Py_ssize_t end;
6843 if (!_ParseTupleFinds(args, &substring, &start, &end))
6844 return NULL;
6846 result = stringlib_find_slice(
6847 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6848 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6849 start, end
6852 Py_DECREF(substring);
6854 if (result < 0) {
6855 PyErr_SetString(PyExc_ValueError, "substring not found");
6856 return NULL;
6859 return PyInt_FromSsize_t(result);
6862 PyDoc_STRVAR(islower__doc__,
6863 "S.islower() -> bool\n\
6865 Return True if all cased characters in S are lowercase and there is\n\
6866 at least one cased character in S, False otherwise.");
6868 static PyObject*
6869 unicode_islower(PyUnicodeObject *self)
6871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872 register const Py_UNICODE *e;
6873 int cased;
6875 /* Shortcut for single character strings */
6876 if (PyUnicode_GET_SIZE(self) == 1)
6877 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6879 /* Special case for empty strings */
6880 if (PyUnicode_GET_SIZE(self) == 0)
6881 return PyBool_FromLong(0);
6883 e = p + PyUnicode_GET_SIZE(self);
6884 cased = 0;
6885 for (; p < e; p++) {
6886 register const Py_UNICODE ch = *p;
6888 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6889 return PyBool_FromLong(0);
6890 else if (!cased && Py_UNICODE_ISLOWER(ch))
6891 cased = 1;
6893 return PyBool_FromLong(cased);
6896 PyDoc_STRVAR(isupper__doc__,
6897 "S.isupper() -> bool\n\
6899 Return True if all cased characters in S are uppercase and there is\n\
6900 at least one cased character in S, False otherwise.");
6902 static PyObject*
6903 unicode_isupper(PyUnicodeObject *self)
6905 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6906 register const Py_UNICODE *e;
6907 int cased;
6909 /* Shortcut for single character strings */
6910 if (PyUnicode_GET_SIZE(self) == 1)
6911 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6913 /* Special case for empty strings */
6914 if (PyUnicode_GET_SIZE(self) == 0)
6915 return PyBool_FromLong(0);
6917 e = p + PyUnicode_GET_SIZE(self);
6918 cased = 0;
6919 for (; p < e; p++) {
6920 register const Py_UNICODE ch = *p;
6922 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6923 return PyBool_FromLong(0);
6924 else if (!cased && Py_UNICODE_ISUPPER(ch))
6925 cased = 1;
6927 return PyBool_FromLong(cased);
6930 PyDoc_STRVAR(istitle__doc__,
6931 "S.istitle() -> bool\n\
6933 Return True if S is a titlecased string and there is at least one\n\
6934 character in S, i.e. upper- and titlecase characters may only\n\
6935 follow uncased characters and lowercase characters only cased ones.\n\
6936 Return False otherwise.");
6938 static PyObject*
6939 unicode_istitle(PyUnicodeObject *self)
6941 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6942 register const Py_UNICODE *e;
6943 int cased, previous_is_cased;
6945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self) == 1)
6947 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6948 (Py_UNICODE_ISUPPER(*p) != 0));
6950 /* Special case for empty strings */
6951 if (PyUnicode_GET_SIZE(self) == 0)
6952 return PyBool_FromLong(0);
6954 e = p + PyUnicode_GET_SIZE(self);
6955 cased = 0;
6956 previous_is_cased = 0;
6957 for (; p < e; p++) {
6958 register const Py_UNICODE ch = *p;
6960 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6961 if (previous_is_cased)
6962 return PyBool_FromLong(0);
6963 previous_is_cased = 1;
6964 cased = 1;
6966 else if (Py_UNICODE_ISLOWER(ch)) {
6967 if (!previous_is_cased)
6968 return PyBool_FromLong(0);
6969 previous_is_cased = 1;
6970 cased = 1;
6972 else
6973 previous_is_cased = 0;
6975 return PyBool_FromLong(cased);
6978 PyDoc_STRVAR(isspace__doc__,
6979 "S.isspace() -> bool\n\
6981 Return True if all characters in S are whitespace\n\
6982 and there is at least one character in S, False otherwise.");
6984 static PyObject*
6985 unicode_isspace(PyUnicodeObject *self)
6987 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6988 register const Py_UNICODE *e;
6990 /* Shortcut for single character strings */
6991 if (PyUnicode_GET_SIZE(self) == 1 &&
6992 Py_UNICODE_ISSPACE(*p))
6993 return PyBool_FromLong(1);
6995 /* Special case for empty strings */
6996 if (PyUnicode_GET_SIZE(self) == 0)
6997 return PyBool_FromLong(0);
6999 e = p + PyUnicode_GET_SIZE(self);
7000 for (; p < e; p++) {
7001 if (!Py_UNICODE_ISSPACE(*p))
7002 return PyBool_FromLong(0);
7004 return PyBool_FromLong(1);
7007 PyDoc_STRVAR(isalpha__doc__,
7008 "S.isalpha() -> bool\n\
7010 Return True if all characters in S are alphabetic\n\
7011 and there is at least one character in S, False otherwise.");
7013 static PyObject*
7014 unicode_isalpha(PyUnicodeObject *self)
7016 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7017 register const Py_UNICODE *e;
7019 /* Shortcut for single character strings */
7020 if (PyUnicode_GET_SIZE(self) == 1 &&
7021 Py_UNICODE_ISALPHA(*p))
7022 return PyBool_FromLong(1);
7024 /* Special case for empty strings */
7025 if (PyUnicode_GET_SIZE(self) == 0)
7026 return PyBool_FromLong(0);
7028 e = p + PyUnicode_GET_SIZE(self);
7029 for (; p < e; p++) {
7030 if (!Py_UNICODE_ISALPHA(*p))
7031 return PyBool_FromLong(0);
7033 return PyBool_FromLong(1);
7036 PyDoc_STRVAR(isalnum__doc__,
7037 "S.isalnum() -> bool\n\
7039 Return True if all characters in S are alphanumeric\n\
7040 and there is at least one character in S, False otherwise.");
7042 static PyObject*
7043 unicode_isalnum(PyUnicodeObject *self)
7045 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7046 register const Py_UNICODE *e;
7048 /* Shortcut for single character strings */
7049 if (PyUnicode_GET_SIZE(self) == 1 &&
7050 Py_UNICODE_ISALNUM(*p))
7051 return PyBool_FromLong(1);
7053 /* Special case for empty strings */
7054 if (PyUnicode_GET_SIZE(self) == 0)
7055 return PyBool_FromLong(0);
7057 e = p + PyUnicode_GET_SIZE(self);
7058 for (; p < e; p++) {
7059 if (!Py_UNICODE_ISALNUM(*p))
7060 return PyBool_FromLong(0);
7062 return PyBool_FromLong(1);
7065 PyDoc_STRVAR(isdecimal__doc__,
7066 "S.isdecimal() -> bool\n\
7068 Return True if there are only decimal characters in S,\n\
7069 False otherwise.");
7071 static PyObject*
7072 unicode_isdecimal(PyUnicodeObject *self)
7074 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7075 register const Py_UNICODE *e;
7077 /* Shortcut for single character strings */
7078 if (PyUnicode_GET_SIZE(self) == 1 &&
7079 Py_UNICODE_ISDECIMAL(*p))
7080 return PyBool_FromLong(1);
7082 /* Special case for empty strings */
7083 if (PyUnicode_GET_SIZE(self) == 0)
7084 return PyBool_FromLong(0);
7086 e = p + PyUnicode_GET_SIZE(self);
7087 for (; p < e; p++) {
7088 if (!Py_UNICODE_ISDECIMAL(*p))
7089 return PyBool_FromLong(0);
7091 return PyBool_FromLong(1);
7094 PyDoc_STRVAR(isdigit__doc__,
7095 "S.isdigit() -> bool\n\
7097 Return True if all characters in S are digits\n\
7098 and there is at least one character in S, False otherwise.");
7100 static PyObject*
7101 unicode_isdigit(PyUnicodeObject *self)
7103 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7104 register const Py_UNICODE *e;
7106 /* Shortcut for single character strings */
7107 if (PyUnicode_GET_SIZE(self) == 1 &&
7108 Py_UNICODE_ISDIGIT(*p))
7109 return PyBool_FromLong(1);
7111 /* Special case for empty strings */
7112 if (PyUnicode_GET_SIZE(self) == 0)
7113 return PyBool_FromLong(0);
7115 e = p + PyUnicode_GET_SIZE(self);
7116 for (; p < e; p++) {
7117 if (!Py_UNICODE_ISDIGIT(*p))
7118 return PyBool_FromLong(0);
7120 return PyBool_FromLong(1);
7123 PyDoc_STRVAR(isnumeric__doc__,
7124 "S.isnumeric() -> bool\n\
7126 Return True if there are only numeric characters in S,\n\
7127 False otherwise.");
7129 static PyObject*
7130 unicode_isnumeric(PyUnicodeObject *self)
7132 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7133 register const Py_UNICODE *e;
7135 /* Shortcut for single character strings */
7136 if (PyUnicode_GET_SIZE(self) == 1 &&
7137 Py_UNICODE_ISNUMERIC(*p))
7138 return PyBool_FromLong(1);
7140 /* Special case for empty strings */
7141 if (PyUnicode_GET_SIZE(self) == 0)
7142 return PyBool_FromLong(0);
7144 e = p + PyUnicode_GET_SIZE(self);
7145 for (; p < e; p++) {
7146 if (!Py_UNICODE_ISNUMERIC(*p))
7147 return PyBool_FromLong(0);
7149 return PyBool_FromLong(1);
7152 PyDoc_STRVAR(join__doc__,
7153 "S.join(sequence) -> unicode\n\
7155 Return a string which is the concatenation of the strings in the\n\
7156 sequence. The separator between elements is S.");
7158 static PyObject*
7159 unicode_join(PyObject *self, PyObject *data)
7161 return PyUnicode_Join(self, data);
7164 static Py_ssize_t
7165 unicode_length(PyUnicodeObject *self)
7167 return self->length;
7170 PyDoc_STRVAR(ljust__doc__,
7171 "S.ljust(width[, fillchar]) -> int\n\
7173 Return S left-justified in a Unicode string of length width. Padding is\n\
7174 done using the specified fill character (default is a space).");
7176 static PyObject *
7177 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7179 Py_ssize_t width;
7180 Py_UNICODE fillchar = ' ';
7182 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7183 return NULL;
7185 if (self->length >= width && PyUnicode_CheckExact(self)) {
7186 Py_INCREF(self);
7187 return (PyObject*) self;
7190 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7193 PyDoc_STRVAR(lower__doc__,
7194 "S.lower() -> unicode\n\
7196 Return a copy of the string S converted to lowercase.");
7198 static PyObject*
7199 unicode_lower(PyUnicodeObject *self)
7201 return fixup(self, fixlower);
/* Selectors telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7213 /* externally visible for str.strip(unicode) */
7214 PyObject *
7215 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7217 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7218 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7219 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7220 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7221 Py_ssize_t i, j;
7223 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7225 i = 0;
7226 if (striptype != RIGHTSTRIP) {
7227 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7228 i++;
7232 j = len;
7233 if (striptype != LEFTSTRIP) {
7234 do {
7235 j--;
7236 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7237 j++;
7240 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7241 Py_INCREF(self);
7242 return (PyObject*)self;
7244 else
7245 return PyUnicode_FromUnicode(s+i, j-i);
7249 static PyObject *
7250 do_strip(PyUnicodeObject *self, int striptype)
7252 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7253 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7255 i = 0;
7256 if (striptype != RIGHTSTRIP) {
7257 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7258 i++;
7262 j = len;
7263 if (striptype != LEFTSTRIP) {
7264 do {
7265 j--;
7266 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7267 j++;
7270 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7271 Py_INCREF(self);
7272 return (PyObject*)self;
7274 else
7275 return PyUnicode_FromUnicode(s+i, j-i);
7279 static PyObject *
7280 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7282 PyObject *sep = NULL;
7284 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7285 return NULL;
7287 if (sep != NULL && sep != Py_None) {
7288 if (PyUnicode_Check(sep))
7289 return _PyUnicode_XStrip(self, striptype, sep);
7290 else if (PyString_Check(sep)) {
7291 PyObject *res;
7292 sep = PyUnicode_FromObject(sep);
7293 if (sep==NULL)
7294 return NULL;
7295 res = _PyUnicode_XStrip(self, striptype, sep);
7296 Py_DECREF(sep);
7297 return res;
7299 else {
7300 PyErr_Format(PyExc_TypeError,
7301 "%s arg must be None, unicode or str",
7302 STRIPNAME(striptype));
7303 return NULL;
7307 return do_strip(self, striptype);
7311 PyDoc_STRVAR(strip__doc__,
7312 "S.strip([chars]) -> unicode\n\
7314 Return a copy of the string S with leading and trailing\n\
7315 whitespace removed.\n\
7316 If chars is given and not None, remove characters in chars instead.\n\
7317 If chars is a str, it will be converted to unicode before stripping");
7319 static PyObject *
7320 unicode_strip(PyUnicodeObject *self, PyObject *args)
7322 if (PyTuple_GET_SIZE(args) == 0)
7323 return do_strip(self, BOTHSTRIP); /* Common case */
7324 else
7325 return do_argstrip(self, BOTHSTRIP, args);
7329 PyDoc_STRVAR(lstrip__doc__,
7330 "S.lstrip([chars]) -> unicode\n\
7332 Return a copy of the string S with leading whitespace removed.\n\
7333 If chars is given and not None, remove characters in chars instead.\n\
7334 If chars is a str, it will be converted to unicode before stripping");
7336 static PyObject *
7337 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7339 if (PyTuple_GET_SIZE(args) == 0)
7340 return do_strip(self, LEFTSTRIP); /* Common case */
7341 else
7342 return do_argstrip(self, LEFTSTRIP, args);
7346 PyDoc_STRVAR(rstrip__doc__,
7347 "S.rstrip([chars]) -> unicode\n\
7349 Return a copy of the string S with trailing whitespace removed.\n\
7350 If chars is given and not None, remove characters in chars instead.\n\
7351 If chars is a str, it will be converted to unicode before stripping");
7353 static PyObject *
7354 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7356 if (PyTuple_GET_SIZE(args) == 0)
7357 return do_strip(self, RIGHTSTRIP); /* Common case */
7358 else
7359 return do_argstrip(self, RIGHTSTRIP, args);
7363 static PyObject*
7364 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7366 PyUnicodeObject *u;
7367 Py_UNICODE *p;
7368 Py_ssize_t nchars;
7369 size_t nbytes;
7371 if (len < 0)
7372 len = 0;
7374 if (len == 1 && PyUnicode_CheckExact(str)) {
7375 /* no repeat, return original string */
7376 Py_INCREF(str);
7377 return (PyObject*) str;
7380 /* ensure # of chars needed doesn't overflow int and # of bytes
7381 * needed doesn't overflow size_t
7383 nchars = len * str->length;
7384 if (len && nchars / len != str->length) {
7385 PyErr_SetString(PyExc_OverflowError,
7386 "repeated string is too long");
7387 return NULL;
7389 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7390 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7391 PyErr_SetString(PyExc_OverflowError,
7392 "repeated string is too long");
7393 return NULL;
7395 u = _PyUnicode_New(nchars);
7396 if (!u)
7397 return NULL;
7399 p = u->str;
7401 if (str->length == 1 && len > 0) {
7402 Py_UNICODE_FILL(p, str->str[0], len);
7403 } else {
7404 Py_ssize_t done = 0; /* number of characters copied this far */
7405 if (done < nchars) {
7406 Py_UNICODE_COPY(p, str->str, str->length);
7407 done = str->length;
7409 while (done < nchars) {
7410 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7411 Py_UNICODE_COPY(p+done, p, n);
7412 done += n;
7416 return (PyObject*) u;
7419 PyObject *PyUnicode_Replace(PyObject *obj,
7420 PyObject *subobj,
7421 PyObject *replobj,
7422 Py_ssize_t maxcount)
7424 PyObject *self;
7425 PyObject *str1;
7426 PyObject *str2;
7427 PyObject *result;
7429 self = PyUnicode_FromObject(obj);
7430 if (self == NULL)
7431 return NULL;
7432 str1 = PyUnicode_FromObject(subobj);
7433 if (str1 == NULL) {
7434 Py_DECREF(self);
7435 return NULL;
7437 str2 = PyUnicode_FromObject(replobj);
7438 if (str2 == NULL) {
7439 Py_DECREF(self);
7440 Py_DECREF(str1);
7441 return NULL;
7443 result = replace((PyUnicodeObject *)self,
7444 (PyUnicodeObject *)str1,
7445 (PyUnicodeObject *)str2,
7446 maxcount);
7447 Py_DECREF(self);
7448 Py_DECREF(str1);
7449 Py_DECREF(str2);
7450 return result;
7453 PyDoc_STRVAR(replace__doc__,
7454 "S.replace (old, new[, count]) -> unicode\n\
7456 Return a copy of S with all occurrences of substring\n\
7457 old replaced by new. If the optional argument count is\n\
7458 given, only the first count occurrences are replaced.");
7460 static PyObject*
7461 unicode_replace(PyUnicodeObject *self, PyObject *args)
7463 PyUnicodeObject *str1;
7464 PyUnicodeObject *str2;
7465 Py_ssize_t maxcount = -1;
7466 PyObject *result;
7468 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7469 return NULL;
7470 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7471 if (str1 == NULL)
7472 return NULL;
7473 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7474 if (str2 == NULL) {
7475 Py_DECREF(str1);
7476 return NULL;
7479 result = replace(self, str1, str2, maxcount);
7481 Py_DECREF(str1);
7482 Py_DECREF(str2);
7483 return result;
7486 static
7487 PyObject *unicode_repr(PyObject *unicode)
7489 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7490 PyUnicode_GET_SIZE(unicode),
7494 PyDoc_STRVAR(rfind__doc__,
7495 "S.rfind(sub [,start [,end]]) -> int\n\
7497 Return the highest index in S where substring sub is found,\n\
7498 such that sub is contained within s[start:end]. Optional\n\
7499 arguments start and end are interpreted as in slice notation.\n\
7501 Return -1 on failure.");
7503 static PyObject *
7504 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7506 PyObject *substring;
7507 Py_ssize_t start;
7508 Py_ssize_t end;
7509 Py_ssize_t result;
7511 if (!_ParseTupleFinds(args, &substring, &start, &end))
7512 return NULL;
7514 result = stringlib_rfind_slice(
7515 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7516 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7517 start, end
7520 Py_DECREF(substring);
7522 return PyInt_FromSsize_t(result);
/* Docstring text for the "rindex" entry of unicode_methods. */
7525 PyDoc_STRVAR(rindex__doc__,
7526 "S.rindex(sub [,start [,end]]) -> int\n\
7528 Like S.rfind() but raise ValueError when the substring is not found.");
7530 static PyObject *
7531 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7533 PyObject *substring;
7534 Py_ssize_t start;
7535 Py_ssize_t end;
7536 Py_ssize_t result;
7538 if (!_ParseTupleFinds(args, &substring, &start, &end))
7539 return NULL;
7541 result = stringlib_rfind_slice(
7542 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7543 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7544 start, end
7547 Py_DECREF(substring);
7549 if (result < 0) {
7550 PyErr_SetString(PyExc_ValueError, "substring not found");
7551 return NULL;
7553 return PyInt_FromSsize_t(result);
/* Docstring text for the "rjust" entry of unicode_methods. */
7556 PyDoc_STRVAR(rjust__doc__,
7557 "S.rjust(width[, fillchar]) -> unicode\n\
7559 Return S right-justified in a Unicode string of length width. Padding is\n\
7560 done using the specified fill character (default is a space).");
7562 static PyObject *
7563 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7565 Py_ssize_t width;
7566 Py_UNICODE fillchar = ' ';
7568 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7569 return NULL;
7571 if (self->length >= width && PyUnicode_CheckExact(self)) {
7572 Py_INCREF(self);
7573 return (PyObject*) self;
7576 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7579 static PyObject*
7580 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7582 /* standard clamping */
7583 if (start < 0)
7584 start = 0;
7585 if (end < 0)
7586 end = 0;
7587 if (end > self->length)
7588 end = self->length;
7589 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7590 /* full slice, return original string */
7591 Py_INCREF(self);
7592 return (PyObject*) self;
7594 if (start > end)
7595 start = end;
7596 /* copy slice */
7597 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7598 end - start);
7601 PyObject *PyUnicode_Split(PyObject *s,
7602 PyObject *sep,
7603 Py_ssize_t maxsplit)
7605 PyObject *result;
7607 s = PyUnicode_FromObject(s);
7608 if (s == NULL)
7609 return NULL;
7610 if (sep != NULL) {
7611 sep = PyUnicode_FromObject(sep);
7612 if (sep == NULL) {
7613 Py_DECREF(s);
7614 return NULL;
7618 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7620 Py_DECREF(s);
7621 Py_XDECREF(sep);
7622 return result;
/* Docstring text for the "split" entry of unicode_methods. */
7625 PyDoc_STRVAR(split__doc__,
7626 "S.split([sep [,maxsplit]]) -> list of strings\n\
7628 Return a list of the words in S, using sep as the\n\
7629 delimiter string. If maxsplit is given, at most maxsplit\n\
7630 splits are done. If sep is not specified or is None, any\n\
7631 whitespace string is a separator and empty strings are\n\
7632 removed from the result.");
7634 static PyObject*
7635 unicode_split(PyUnicodeObject *self, PyObject *args)
7637 PyObject *substring = Py_None;
7638 Py_ssize_t maxcount = -1;
7640 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7641 return NULL;
7643 if (substring == Py_None)
7644 return split(self, NULL, maxcount);
7645 else if (PyUnicode_Check(substring))
7646 return split(self, (PyUnicodeObject *)substring, maxcount);
7647 else
7648 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7651 PyObject *
7652 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7654 PyObject* str_obj;
7655 PyObject* sep_obj;
7656 PyObject* out;
7658 str_obj = PyUnicode_FromObject(str_in);
7659 if (!str_obj)
7660 return NULL;
7661 sep_obj = PyUnicode_FromObject(sep_in);
7662 if (!sep_obj) {
7663 Py_DECREF(str_obj);
7664 return NULL;
7667 out = stringlib_partition(
7668 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7669 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7672 Py_DECREF(sep_obj);
7673 Py_DECREF(str_obj);
7675 return out;
7679 PyObject *
7680 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7682 PyObject* str_obj;
7683 PyObject* sep_obj;
7684 PyObject* out;
7686 str_obj = PyUnicode_FromObject(str_in);
7687 if (!str_obj)
7688 return NULL;
7689 sep_obj = PyUnicode_FromObject(sep_in);
7690 if (!sep_obj) {
7691 Py_DECREF(str_obj);
7692 return NULL;
7695 out = stringlib_rpartition(
7696 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7697 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7700 Py_DECREF(sep_obj);
7701 Py_DECREF(str_obj);
7703 return out;
/* Docstring text for the "partition" entry of unicode_methods. */
7706 PyDoc_STRVAR(partition__doc__,
7707 "S.partition(sep) -> (head, sep, tail)\n\
7709 Search for the separator sep in S, and return the part before it,\n\
7710 the separator itself, and the part after it. If the separator is not\n\
7711 found, return S and two empty strings.");
7713 static PyObject*
7714 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7716 return PyUnicode_Partition((PyObject *)self, separator);
7719 PyDoc_STRVAR(rpartition__doc__,
7720 "S.rpartition(sep) -> (tail, sep, head)\n\
7722 Search for the separator sep in S, starting at the end of S, and return\n\
7723 the part before it, the separator itself, and the part after it. If the\n\
7724 separator is not found, return two empty strings and S.");
7726 static PyObject*
7727 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7729 return PyUnicode_RPartition((PyObject *)self, separator);
7732 PyObject *PyUnicode_RSplit(PyObject *s,
7733 PyObject *sep,
7734 Py_ssize_t maxsplit)
7736 PyObject *result;
7738 s = PyUnicode_FromObject(s);
7739 if (s == NULL)
7740 return NULL;
7741 if (sep != NULL) {
7742 sep = PyUnicode_FromObject(sep);
7743 if (sep == NULL) {
7744 Py_DECREF(s);
7745 return NULL;
7749 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7751 Py_DECREF(s);
7752 Py_XDECREF(sep);
7753 return result;
/* Docstring text for the "rsplit" entry of unicode_methods. */
7756 PyDoc_STRVAR(rsplit__doc__,
7757 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7759 Return a list of the words in S, using sep as the\n\
7760 delimiter string, starting at the end of the string and\n\
7761 working to the front. If maxsplit is given, at most maxsplit\n\
7762 splits are done. If sep is not specified, any whitespace string\n\
7763 is a separator.");
7765 static PyObject*
7766 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7768 PyObject *substring = Py_None;
7769 Py_ssize_t maxcount = -1;
7771 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7772 return NULL;
7774 if (substring == Py_None)
7775 return rsplit(self, NULL, maxcount);
7776 else if (PyUnicode_Check(substring))
7777 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7778 else
7779 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
/* Docstring text for the "splitlines" entry of unicode_methods. */
7782 PyDoc_STRVAR(splitlines__doc__,
7783 "S.splitlines([keepends]) -> list of strings\n\
7785 Return a list of the lines in S, breaking at line boundaries.\n\
7786 Line breaks are not included in the resulting list unless keepends\n\
7787 is given and true.");
7789 static PyObject*
7790 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7792 int keepends = 0;
7794 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7795 return NULL;
7797 return PyUnicode_Splitlines((PyObject *)self, keepends);
7800 static
7801 PyObject *unicode_str(PyUnicodeObject *self)
7803 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
/* Docstring text for the "swapcase" entry of unicode_methods. */
7806 PyDoc_STRVAR(swapcase__doc__,
7807 "S.swapcase() -> unicode\n\
7809 Return a copy of S with uppercase characters converted to lowercase\n\
7810 and vice versa.");
7812 static PyObject*
7813 unicode_swapcase(PyUnicodeObject *self)
7815 return fixup(self, fixswapcase);
/* Docstring text for the "translate" entry of unicode_methods. */
7818 PyDoc_STRVAR(translate__doc__,
7819 "S.translate(table) -> unicode\n\
7821 Return a copy of the string S, where all characters have been mapped\n\
7822 through the given translation table, which must be a mapping of\n\
7823 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7824 Unmapped characters are left untouched. Characters mapped to None\n\
7825 are deleted.");
7827 static PyObject*
7828 unicode_translate(PyUnicodeObject *self, PyObject *table)
7830 return PyUnicode_TranslateCharmap(self->str,
7831 self->length,
7832 table,
7833 "ignore");
/* Docstring text for the "upper" entry of unicode_methods. */
7836 PyDoc_STRVAR(upper__doc__,
7837 "S.upper() -> unicode\n\
7839 Return a copy of S converted to uppercase.");
7841 static PyObject*
7842 unicode_upper(PyUnicodeObject *self)
7844 return fixup(self, fixupper);
/* Docstring text for the "zfill" entry of unicode_methods. */
7847 PyDoc_STRVAR(zfill__doc__,
7848 "S.zfill(width) -> unicode\n\
7850 Pad a numeric string S with zeros on the left, to fill a field\n\
7851 of the specified width. The string S is never truncated.");
7853 static PyObject *
7854 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7856 Py_ssize_t fill;
7857 PyUnicodeObject *u;
7859 Py_ssize_t width;
7860 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7861 return NULL;
7863 if (self->length >= width) {
7864 if (PyUnicode_CheckExact(self)) {
7865 Py_INCREF(self);
7866 return (PyObject*) self;
7868 else
7869 return PyUnicode_FromUnicode(
7870 PyUnicode_AS_UNICODE(self),
7871 PyUnicode_GET_SIZE(self)
7875 fill = width - self->length;
7877 u = pad(self, fill, 0, '0');
7879 if (u == NULL)
7880 return NULL;
7882 if (u->str[fill] == '+' || u->str[fill] == '-') {
7883 /* move sign to beginning of string */
7884 u->str[0] = u->str[fill];
7885 u->str[fill] = '0';
7888 return (PyObject*) u;
/* Compiled out by "#if 0": returns the value of `numfree` as a Python int.
   The matching "freelistsize" method-table entry is likewise disabled and
   described there as a debugging aid. */
7891 #if 0
7892 static PyObject*
7893 free_listsize(PyUnicodeObject *self)
7895 return PyInt_FromLong(numfree);
7897 #endif
/* Docstring text for the "startswith" entry of unicode_methods. */
7899 PyDoc_STRVAR(startswith__doc__,
7900 "S.startswith(prefix[, start[, end]]) -> bool\n\
7902 Return True if S starts with the specified prefix, False otherwise.\n\
7903 With optional start, test S beginning at that position.\n\
7904 With optional end, stop comparing S at that position.\n\
7905 prefix can also be a tuple of strings to try.");
7907 static PyObject *
7908 unicode_startswith(PyUnicodeObject *self,
7909 PyObject *args)
7911 PyObject *subobj;
7912 PyUnicodeObject *substring;
7913 Py_ssize_t start = 0;
7914 Py_ssize_t end = PY_SSIZE_T_MAX;
7915 int result;
7917 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7918 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7919 return NULL;
7920 if (PyTuple_Check(subobj)) {
7921 Py_ssize_t i;
7922 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7923 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7924 PyTuple_GET_ITEM(subobj, i));
7925 if (substring == NULL)
7926 return NULL;
7927 result = tailmatch(self, substring, start, end, -1);
7928 Py_DECREF(substring);
7929 if (result) {
7930 Py_RETURN_TRUE;
7933 /* nothing matched */
7934 Py_RETURN_FALSE;
7936 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7937 if (substring == NULL)
7938 return NULL;
7939 result = tailmatch(self, substring, start, end, -1);
7940 Py_DECREF(substring);
7941 return PyBool_FromLong(result);
/* Docstring text for the "endswith" entry of unicode_methods. */
7945 PyDoc_STRVAR(endswith__doc__,
7946 "S.endswith(suffix[, start[, end]]) -> bool\n\
7948 Return True if S ends with the specified suffix, False otherwise.\n\
7949 With optional start, test S beginning at that position.\n\
7950 With optional end, stop comparing S at that position.\n\
7951 suffix can also be a tuple of strings to try.");
7953 static PyObject *
7954 unicode_endswith(PyUnicodeObject *self,
7955 PyObject *args)
7957 PyObject *subobj;
7958 PyUnicodeObject *substring;
7959 Py_ssize_t start = 0;
7960 Py_ssize_t end = PY_SSIZE_T_MAX;
7961 int result;
7963 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7964 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7965 return NULL;
7966 if (PyTuple_Check(subobj)) {
7967 Py_ssize_t i;
7968 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7970 PyTuple_GET_ITEM(subobj, i));
7971 if (substring == NULL)
7972 return NULL;
7973 result = tailmatch(self, substring, start, end, +1);
7974 Py_DECREF(substring);
7975 if (result) {
7976 Py_RETURN_TRUE;
7979 Py_RETURN_FALSE;
7981 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7982 if (substring == NULL)
7983 return NULL;
7985 result = tailmatch(self, substring, start, end, +1);
7986 Py_DECREF(substring);
7987 return PyBool_FromLong(result);
7991 /* Implements do_string_format, which is unicode because of stringlib */
7992 #include "stringlib/string_format.h"
/* Docstring text for the "format" entry of unicode_methods (that entry
   dispatches to do_string_format).
   NOTE(review): the closing lines of this literal are not present in this
   listing -- confirm against the complete source file. */
7994 PyDoc_STRVAR(format__doc__,
7995 "S.format(*args, **kwargs) -> unicode\n\
7999 static PyObject *
8000 unicode__format__(PyObject *self, PyObject *args)
8002 PyObject *format_spec;
8003 PyObject *result = NULL;
8004 PyObject *tmp = NULL;
8006 /* If 2.x, convert format_spec to the same type as value */
8007 /* This is to allow things like u''.format('') */
8008 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
8009 goto done;
8010 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
8011 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
8012 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
8013 goto done;
8015 tmp = PyObject_Unicode(format_spec);
8016 if (tmp == NULL)
8017 goto done;
8018 format_spec = tmp;
8020 result = _PyUnicode_FormatAdvanced(self,
8021 PyUnicode_AS_UNICODE(format_spec),
8022 PyUnicode_GET_SIZE(format_spec));
8023 done:
8024 Py_XDECREF(tmp);
8025 return result;
/* Docstring text for the "__format__" entry of unicode_methods.
   NOTE(review): the closing lines of this literal are not present in this
   listing -- confirm against the complete source file. */
8028 PyDoc_STRVAR(p_format__doc__,
8029 "S.__format__(format_spec) -> unicode\n\
8033 static PyObject *
8034 unicode__sizeof__(PyUnicodeObject *v)
8036 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
8037 sizeof(Py_UNICODE) * (v->length + 1));
/* Docstring text for the "__sizeof__" entry of unicode_methods.
   NOTE(review): the closing lines of this literal are not present in this
   listing -- confirm against the complete source file. */
8040 PyDoc_STRVAR(sizeof__doc__,
8041 "S.__sizeof__() -> size of S in memory, in bytes\n\
8045 static PyObject *
8046 unicode_getnewargs(PyUnicodeObject *v)
8048 return Py_BuildValue("(u#)", v->str, v->length);
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation, the calling convention flags and, where one
   exists, the docstring symbol.  The table is terminated by a {NULL, NULL}
   sentinel.  Entries wrapped in "#if 0" are compiled out. */
8052 static PyMethodDef unicode_methods[] = {
8054 /* Order is according to common usage: often used methods should
8055 appear first, since lookup is done sequentially. */
8057 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8058 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8059 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8060 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8061 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8062 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8063 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8064 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8065 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8066 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8067 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8068 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8069 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8070 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8071 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8072 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8073 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
8074 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8075 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8076 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8077 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8078 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8079 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8080 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8081 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8082 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8083 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8084 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8085 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8086 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8087 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8088 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8089 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8090 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8091 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8092 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8093 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8094 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8095 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8096 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8097 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8098 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8099 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8100 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8101 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8102 #if 0
8103 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8104 #endif
8106 #if 0
8107 /* This one is just used for debugging the implementation. */
8108 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8109 #endif
8111 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
8112 {NULL, NULL}
8115 static PyObject *
8116 unicode_mod(PyObject *v, PyObject *w)
8118 if (!PyUnicode_Check(v)) {
8119 Py_INCREF(Py_NotImplemented);
8120 return Py_NotImplemented;
8122 return PyUnicode_Format(v, w);
/* Number-protocol slot table for the unicode type.  Of the slots listed
   here only nb_remainder is filled in (with unicode_mod); the ones before
   it are left as 0. */
8125 static PyNumberMethods unicode_as_number = {
8126 0, /*nb_add*/
8127 0, /*nb_subtract*/
8128 0, /*nb_multiply*/
8129 0, /*nb_divide*/
8130 unicode_mod, /*nb_remainder*/
/* Sequence-protocol slot table for the unicode type.  The two assignment
   slots (sq_ass_item, sq_ass_slice) are 0, i.e. no item or slice
   assignment handler is installed. */
8133 static PySequenceMethods unicode_as_sequence = {
8134 (lenfunc) unicode_length, /* sq_length */
8135 PyUnicode_Concat, /* sq_concat */
8136 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8137 (ssizeargfunc) unicode_getitem, /* sq_item */
8138 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8139 0, /* sq_ass_item */
8140 0, /* sq_ass_slice */
8141 PyUnicode_Contains, /* sq_contains */
8144 static PyObject*
8145 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8147 if (PyIndex_Check(item)) {
8148 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8149 if (i == -1 && PyErr_Occurred())
8150 return NULL;
8151 if (i < 0)
8152 i += PyUnicode_GET_SIZE(self);
8153 return unicode_getitem(self, i);
8154 } else if (PySlice_Check(item)) {
8155 Py_ssize_t start, stop, step, slicelength, cur, i;
8156 Py_UNICODE* source_buf;
8157 Py_UNICODE* result_buf;
8158 PyObject* result;
8160 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8161 &start, &stop, &step, &slicelength) < 0) {
8162 return NULL;
8165 if (slicelength <= 0) {
8166 return PyUnicode_FromUnicode(NULL, 0);
8167 } else if (start == 0 && step == 1 && slicelength == self->length &&
8168 PyUnicode_CheckExact(self)) {
8169 Py_INCREF(self);
8170 return (PyObject *)self;
8171 } else if (step == 1) {
8172 return PyUnicode_FromUnicode(self->str + start, slicelength);
8173 } else {
8174 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8175 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8176 sizeof(Py_UNICODE));
8178 if (result_buf == NULL)
8179 return PyErr_NoMemory();
8181 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8182 result_buf[i] = source_buf[cur];
8185 result = PyUnicode_FromUnicode(result_buf, slicelength);
8186 PyObject_FREE(result_buf);
8187 return result;
8189 } else {
8190 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8191 return NULL;
/* Mapping-protocol slot table for the unicode type: length and subscript
   are provided, subscript assignment is not (slot is 0). */
8195 static PyMappingMethods unicode_as_mapping = {
8196 (lenfunc)unicode_length, /* mp_length */
8197 (binaryfunc)unicode_subscript, /* mp_subscript */
8198 (objobjargproc)0, /* mp_ass_subscript */
8201 static Py_ssize_t
8202 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8203 Py_ssize_t index,
8204 const void **ptr)
8206 if (index != 0) {
8207 PyErr_SetString(PyExc_SystemError,
8208 "accessing non-existent unicode segment");
8209 return -1;
8211 *ptr = (void *) self->str;
8212 return PyUnicode_GET_DATA_SIZE(self);
8215 static Py_ssize_t
8216 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8217 const void **ptr)
8219 PyErr_SetString(PyExc_TypeError,
8220 "cannot use unicode as modifiable buffer");
8221 return -1;
8224 static int
8225 unicode_buffer_getsegcount(PyUnicodeObject *self,
8226 Py_ssize_t *lenp)
8228 if (lenp)
8229 *lenp = PyUnicode_GET_DATA_SIZE(self);
8230 return 1;
8233 static Py_ssize_t
8234 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8235 Py_ssize_t index,
8236 const void **ptr)
8238 PyObject *str;
8240 if (index != 0) {
8241 PyErr_SetString(PyExc_SystemError,
8242 "accessing non-existent unicode segment");
8243 return -1;
8245 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8246 if (str == NULL)
8247 return -1;
8248 *ptr = (void *) PyString_AS_STRING(str);
8249 return PyString_GET_SIZE(str);
8252 /* Helpers for PyUnicode_Format() */
8254 static PyObject *
8255 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8257 Py_ssize_t argidx = *p_argidx;
8258 if (argidx < arglen) {
8259 (*p_argidx)++;
8260 if (arglen < 0)
8261 return args;
8262 else
8263 return PyTuple_GetItem(args, argidx);
8265 PyErr_SetString(PyExc_TypeError,
8266 "not enough arguments for format string");
8267 return NULL;
/* Bit flags recorded while PyUnicode_Format() parses a conversion
   specifier: '-' sets F_LJUST, '+' sets F_SIGN, ' ' sets F_BLANK,
   '#' sets F_ALT and '0' sets F_ZERO. */
8270 #define F_LJUST (1<<0)
8271 #define F_SIGN (1<<1)
8272 #define F_BLANK (1<<2)
8273 #define F_ALT (1<<3)
8274 #define F_ZERO (1<<4)
8276 static Py_ssize_t
8277 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8279 register Py_ssize_t i;
8280 Py_ssize_t len = strlen(charbuffer);
8281 for (i = len - 1; i >= 0; i--)
8282 buffer[i] = (Py_UNICODE) charbuffer[i];
8284 return len;
8287 static int
8288 doubletounicode(Py_UNICODE *buffer, size_t len, int format_code,
8289 int precision, int flags, double x)
8291 Py_ssize_t result;
8293 _PyOS_double_to_string((char *)buffer, len, x, format_code, precision,
8294 flags, NULL);
8295 result = strtounicode(buffer, (char *)buffer);
8296 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8299 static int
8300 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8302 Py_ssize_t result;
8304 PyOS_snprintf((char *)buffer, len, format, x);
8305 result = strtounicode(buffer, (char *)buffer);
8306 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8309 /* XXX To save some code duplication, formatfloat/long/int could have been
8310 shared with stringobject.c, converting from 8-bit to Unicode after the
8311 formatting is done. */
8313 static int
8314 formatfloat(Py_UNICODE *buf,
8315 size_t buflen,
8316 int flags,
8317 int prec,
8318 int type,
8319 PyObject *v)
8321 double x;
8323 x = PyFloat_AsDouble(v);
8324 if (x == -1.0 && PyErr_Occurred())
8325 return -1;
8326 if (prec < 0)
8327 prec = 6;
8328 #if SIZEOF_INT > 4
8329 /* make sure that the decimal representation of precision really does
8330 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8331 if (prec > 0x7fffffff) {
8332 PyErr_SetString(PyExc_OverflowError,
8333 "outrageously large precision "
8334 "for formatted float");
8335 return -1;
8337 #endif
8339 if (type == 'f' && fabs(x) >= 1e50)
8340 type = 'g';
8341 /* Worst case length calc to ensure no buffer overrun:
8343 'g' formats:
8344 fmt = %#.<prec>g
8345 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8346 for any double rep.)
8347 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8349 'f' formats:
8350 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8351 len = 1 + 50 + 1 + prec = 52 + prec
8353 If prec=0 the effective precision is 1 (the leading digit is
8354 always given), therefore increase the length by one.
8357 if (((type == 'g' || type == 'G') &&
8358 buflen <= (size_t)10 + (size_t)prec) ||
8359 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8360 PyErr_SetString(PyExc_OverflowError,
8361 "formatted float is too long (precision too large?)");
8362 return -1;
8364 return doubletounicode(buf, buflen, type, prec,
8365 (flags&F_ALT)?Py_DTSF_ALT:0, x);
8368 static PyObject*
8369 formatlong(PyObject *val, int flags, int prec, int type)
8371 char *buf;
8372 int i, len;
8373 PyObject *str; /* temporary string object. */
8374 PyUnicodeObject *result;
8376 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8377 if (!str)
8378 return NULL;
8379 result = _PyUnicode_New(len);
8380 if (!result) {
8381 Py_DECREF(str);
8382 return NULL;
8384 for (i = 0; i < len; i++)
8385 result->str[i] = buf[i];
8386 result->str[len] = 0;
8387 Py_DECREF(str);
8388 return (PyObject*)result;
8391 static int
8392 formatint(Py_UNICODE *buf,
8393 size_t buflen,
8394 int flags,
8395 int prec,
8396 int type,
8397 PyObject *v)
8399 /* fmt = '%#.' + `prec` + 'l' + `type`
8400 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8401 * + 1 + 1
8402 * = 24
8404 char fmt[64]; /* plenty big enough! */
8405 char *sign;
8406 long x;
8408 x = PyInt_AsLong(v);
8409 if (x == -1 && PyErr_Occurred())
8410 return -1;
8411 if (x < 0 && type == 'u') {
8412 type = 'd';
8414 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8415 sign = "-";
8416 else
8417 sign = "";
8418 if (prec < 0)
8419 prec = 1;
8421 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8422 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8424 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8425 PyErr_SetString(PyExc_OverflowError,
8426 "formatted integer is too long (precision too large?)");
8427 return -1;
8430 if ((flags & F_ALT) &&
8431 (type == 'x' || type == 'X')) {
8432 /* When converting under %#x or %#X, there are a number
8433 * of issues that cause pain:
8434 * - when 0 is being converted, the C standard leaves off
8435 * the '0x' or '0X', which is inconsistent with other
8436 * %#x/%#X conversions and inconsistent with Python's
8437 * hex() function
8438 * - there are platforms that violate the standard and
8439 * convert 0 with the '0x' or '0X'
8440 * (Metrowerks, Compaq Tru64)
8441 * - there are platforms that give '0x' when converting
8442 * under %#X, but convert 0 in accordance with the
8443 * standard (OS/2 EMX)
8445 * We can achieve the desired consistency by inserting our
8446 * own '0x' or '0X' prefix, and substituting %x/%X in place
8447 * of %#x/%#X.
8449 * Note that this is the same approach as used in
8450 * formatint() in stringobject.c
8452 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8453 sign, type, prec, type);
8455 else {
8456 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8457 sign, (flags&F_ALT) ? "#" : "",
8458 prec, type);
8460 if (sign[0])
8461 return longtounicode(buf, buflen, fmt, -x);
8462 else
8463 return longtounicode(buf, buflen, fmt, x);
8466 static int
8467 formatchar(Py_UNICODE *buf,
8468 size_t buflen,
8469 PyObject *v)
8471 /* presume that the buffer is at least 2 characters long */
8472 if (PyUnicode_Check(v)) {
8473 if (PyUnicode_GET_SIZE(v) != 1)
8474 goto onError;
8475 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8478 else if (PyString_Check(v)) {
8479 if (PyString_GET_SIZE(v) != 1)
8480 goto onError;
8481 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8484 else {
8485 /* Integer input truncated to a character */
8486 long x;
8487 x = PyInt_AsLong(v);
8488 if (x == -1 && PyErr_Occurred())
8489 goto onError;
8490 #ifdef Py_UNICODE_WIDE
8491 if (x < 0 || x > 0x10ffff) {
8492 PyErr_SetString(PyExc_OverflowError,
8493 "%c arg not in range(0x110000) "
8494 "(wide Python build)");
8495 return -1;
8497 #else
8498 if (x < 0 || x > 0xffff) {
8499 PyErr_SetString(PyExc_OverflowError,
8500 "%c arg not in range(0x10000) "
8501 "(narrow Python build)");
8502 return -1;
8504 #endif
8505 buf[0] = (Py_UNICODE) x;
8507 buf[1] = '\0';
8508 return 1;
8510 onError:
8511 PyErr_SetString(PyExc_TypeError,
8512 "%c requires int or char");
8513 return -1;
8516 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8518 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8519 chars are formatted. XXX This is a magic number. Each formatting
8520 routine does bounds checking to ensure no overflow, but a better
8521 solution may be to malloc a buffer of appropriate size for each
8522 format. For now, the current solution is sufficient.
8524 #define FORMATBUFLEN (size_t)120
8526 PyObject *PyUnicode_Format(PyObject *format,
8527 PyObject *args)
8529 Py_UNICODE *fmt, *res;
8530 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8531 int args_owned = 0;
8532 PyUnicodeObject *result = NULL;
8533 PyObject *dict = NULL;
8534 PyObject *uformat;
8536 if (format == NULL || args == NULL) {
8537 PyErr_BadInternalCall();
8538 return NULL;
8540 uformat = PyUnicode_FromObject(format);
8541 if (uformat == NULL)
8542 return NULL;
8543 fmt = PyUnicode_AS_UNICODE(uformat);
8544 fmtcnt = PyUnicode_GET_SIZE(uformat);
8546 reslen = rescnt = fmtcnt + 100;
8547 result = _PyUnicode_New(reslen);
8548 if (result == NULL)
8549 goto onError;
8550 res = PyUnicode_AS_UNICODE(result);
8552 if (PyTuple_Check(args)) {
8553 arglen = PyTuple_Size(args);
8554 argidx = 0;
8556 else {
8557 arglen = -1;
8558 argidx = -2;
8560 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8561 !PyObject_TypeCheck(args, &PyBaseString_Type))
8562 dict = args;
8564 while (--fmtcnt >= 0) {
8565 if (*fmt != '%') {
8566 if (--rescnt < 0) {
8567 rescnt = fmtcnt + 100;
8568 reslen += rescnt;
8569 if (_PyUnicode_Resize(&result, reslen) < 0)
8570 goto onError;
8571 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8572 --rescnt;
8574 *res++ = *fmt++;
8576 else {
8577 /* Got a format specifier */
8578 int flags = 0;
8579 Py_ssize_t width = -1;
8580 int prec = -1;
8581 Py_UNICODE c = '\0';
8582 Py_UNICODE fill;
8583 int isnumok;
8584 PyObject *v = NULL;
8585 PyObject *temp = NULL;
8586 Py_UNICODE *pbuf;
8587 Py_UNICODE sign;
8588 Py_ssize_t len;
8589 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8591 fmt++;
8592 if (*fmt == '(') {
8593 Py_UNICODE *keystart;
8594 Py_ssize_t keylen;
8595 PyObject *key;
8596 int pcount = 1;
8598 if (dict == NULL) {
8599 PyErr_SetString(PyExc_TypeError,
8600 "format requires a mapping");
8601 goto onError;
8603 ++fmt;
8604 --fmtcnt;
8605 keystart = fmt;
8606 /* Skip over balanced parentheses */
8607 while (pcount > 0 && --fmtcnt >= 0) {
8608 if (*fmt == ')')
8609 --pcount;
8610 else if (*fmt == '(')
8611 ++pcount;
8612 fmt++;
8614 keylen = fmt - keystart - 1;
8615 if (fmtcnt < 0 || pcount > 0) {
8616 PyErr_SetString(PyExc_ValueError,
8617 "incomplete format key");
8618 goto onError;
8620 #if 0
8621 /* keys are converted to strings using UTF-8 and
8622 then looked up since Python uses strings to hold
8623 variables names etc. in its namespaces and we
8624 wouldn't want to break common idioms. */
8625 key = PyUnicode_EncodeUTF8(keystart,
8626 keylen,
8627 NULL);
8628 #else
8629 key = PyUnicode_FromUnicode(keystart, keylen);
8630 #endif
8631 if (key == NULL)
8632 goto onError;
8633 if (args_owned) {
8634 Py_DECREF(args);
8635 args_owned = 0;
8637 args = PyObject_GetItem(dict, key);
8638 Py_DECREF(key);
8639 if (args == NULL) {
8640 goto onError;
8642 args_owned = 1;
8643 arglen = -1;
8644 argidx = -2;
8646 while (--fmtcnt >= 0) {
8647 switch (c = *fmt++) {
8648 case '-': flags |= F_LJUST; continue;
8649 case '+': flags |= F_SIGN; continue;
8650 case ' ': flags |= F_BLANK; continue;
8651 case '#': flags |= F_ALT; continue;
8652 case '0': flags |= F_ZERO; continue;
8654 break;
8656 if (c == '*') {
8657 v = getnextarg(args, arglen, &argidx);
8658 if (v == NULL)
8659 goto onError;
8660 if (!PyInt_Check(v)) {
8661 PyErr_SetString(PyExc_TypeError,
8662 "* wants int");
8663 goto onError;
8665 width = PyInt_AsLong(v);
8666 if (width < 0) {
8667 flags |= F_LJUST;
8668 width = -width;
8670 if (--fmtcnt >= 0)
8671 c = *fmt++;
8673 else if (c >= '0' && c <= '9') {
8674 width = c - '0';
8675 while (--fmtcnt >= 0) {
8676 c = *fmt++;
8677 if (c < '0' || c > '9')
8678 break;
8679 if ((width*10) / 10 != width) {
8680 PyErr_SetString(PyExc_ValueError,
8681 "width too big");
8682 goto onError;
8684 width = width*10 + (c - '0');
8687 if (c == '.') {
8688 prec = 0;
8689 if (--fmtcnt >= 0)
8690 c = *fmt++;
8691 if (c == '*') {
8692 v = getnextarg(args, arglen, &argidx);
8693 if (v == NULL)
8694 goto onError;
8695 if (!PyInt_Check(v)) {
8696 PyErr_SetString(PyExc_TypeError,
8697 "* wants int");
8698 goto onError;
8700 prec = PyInt_AsLong(v);
8701 if (prec < 0)
8702 prec = 0;
8703 if (--fmtcnt >= 0)
8704 c = *fmt++;
8706 else if (c >= '0' && c <= '9') {
8707 prec = c - '0';
8708 while (--fmtcnt >= 0) {
8709 c = Py_CHARMASK(*fmt++);
8710 if (c < '0' || c > '9')
8711 break;
8712 if ((prec*10) / 10 != prec) {
8713 PyErr_SetString(PyExc_ValueError,
8714 "prec too big");
8715 goto onError;
8717 prec = prec*10 + (c - '0');
8720 } /* prec */
8721 if (fmtcnt >= 0) {
8722 if (c == 'h' || c == 'l' || c == 'L') {
8723 if (--fmtcnt >= 0)
8724 c = *fmt++;
8727 if (fmtcnt < 0) {
8728 PyErr_SetString(PyExc_ValueError,
8729 "incomplete format");
8730 goto onError;
8732 if (c != '%') {
8733 v = getnextarg(args, arglen, &argidx);
8734 if (v == NULL)
8735 goto onError;
8737 sign = 0;
8738 fill = ' ';
8739 switch (c) {
8741 case '%':
8742 pbuf = formatbuf;
8743 /* presume that buffer length is at least 1 */
8744 pbuf[0] = '%';
8745 len = 1;
8746 break;
8748 case 's':
8749 case 'r':
8750 if (PyUnicode_Check(v) && c == 's') {
8751 temp = v;
8752 Py_INCREF(temp);
8754 else {
8755 PyObject *unicode;
8756 if (c == 's')
8757 temp = PyObject_Unicode(v);
8758 else
8759 temp = PyObject_Repr(v);
8760 if (temp == NULL)
8761 goto onError;
8762 if (PyUnicode_Check(temp))
8763 /* nothing to do */;
8764 else if (PyString_Check(temp)) {
8765 /* convert to string to Unicode */
8766 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8767 PyString_GET_SIZE(temp),
8768 NULL,
8769 "strict");
8770 Py_DECREF(temp);
8771 temp = unicode;
8772 if (temp == NULL)
8773 goto onError;
8775 else {
8776 Py_DECREF(temp);
8777 PyErr_SetString(PyExc_TypeError,
8778 "%s argument has non-string str()");
8779 goto onError;
8782 pbuf = PyUnicode_AS_UNICODE(temp);
8783 len = PyUnicode_GET_SIZE(temp);
8784 if (prec >= 0 && len > prec)
8785 len = prec;
8786 break;
8788 case 'i':
8789 case 'd':
8790 case 'u':
8791 case 'o':
8792 case 'x':
8793 case 'X':
8794 if (c == 'i')
8795 c = 'd';
8796 isnumok = 0;
8797 if (PyNumber_Check(v)) {
8798 PyObject *iobj=NULL;
8800 if (PyInt_Check(v) || (PyLong_Check(v))) {
8801 iobj = v;
8802 Py_INCREF(iobj);
8804 else {
8805 iobj = PyNumber_Int(v);
8806 if (iobj==NULL) iobj = PyNumber_Long(v);
8808 if (iobj!=NULL) {
8809 if (PyInt_Check(iobj)) {
8810 isnumok = 1;
8811 pbuf = formatbuf;
8812 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8813 flags, prec, c, iobj);
8814 Py_DECREF(iobj);
8815 if (len < 0)
8816 goto onError;
8817 sign = 1;
8819 else if (PyLong_Check(iobj)) {
8820 isnumok = 1;
8821 temp = formatlong(iobj, flags, prec, c);
8822 Py_DECREF(iobj);
8823 if (!temp)
8824 goto onError;
8825 pbuf = PyUnicode_AS_UNICODE(temp);
8826 len = PyUnicode_GET_SIZE(temp);
8827 sign = 1;
8829 else {
8830 Py_DECREF(iobj);
8834 if (!isnumok) {
8835 PyErr_Format(PyExc_TypeError,
8836 "%%%c format: a number is required, "
8837 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8838 goto onError;
8840 if (flags & F_ZERO)
8841 fill = '0';
8842 break;
8844 case 'e':
8845 case 'E':
8846 case 'f':
8847 case 'F':
8848 case 'g':
8849 case 'G':
8850 if (c == 'F')
8851 c = 'f';
8852 pbuf = formatbuf;
8853 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8854 flags, prec, c, v);
8855 if (len < 0)
8856 goto onError;
8857 sign = 1;
8858 if (flags & F_ZERO)
8859 fill = '0';
8860 break;
8862 case 'c':
8863 pbuf = formatbuf;
8864 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8865 if (len < 0)
8866 goto onError;
8867 break;
8869 default:
8870 PyErr_Format(PyExc_ValueError,
8871 "unsupported format character '%c' (0x%x) "
8872 "at index %zd",
8873 (31<=c && c<=126) ? (char)c : '?',
8874 (int)c,
8875 (Py_ssize_t)(fmt - 1 -
8876 PyUnicode_AS_UNICODE(uformat)));
8877 goto onError;
8879 if (sign) {
8880 if (*pbuf == '-' || *pbuf == '+') {
8881 sign = *pbuf++;
8882 len--;
8884 else if (flags & F_SIGN)
8885 sign = '+';
8886 else if (flags & F_BLANK)
8887 sign = ' ';
8888 else
8889 sign = 0;
8891 if (width < len)
8892 width = len;
8893 if (rescnt - (sign != 0) < width) {
8894 reslen -= rescnt;
8895 rescnt = width + fmtcnt + 100;
8896 reslen += rescnt;
8897 if (reslen < 0) {
8898 Py_XDECREF(temp);
8899 PyErr_NoMemory();
8900 goto onError;
8902 if (_PyUnicode_Resize(&result, reslen) < 0) {
8903 Py_XDECREF(temp);
8904 goto onError;
8906 res = PyUnicode_AS_UNICODE(result)
8907 + reslen - rescnt;
8909 if (sign) {
8910 if (fill != ' ')
8911 *res++ = sign;
8912 rescnt--;
8913 if (width > len)
8914 width--;
8916 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8917 assert(pbuf[0] == '0');
8918 assert(pbuf[1] == c);
8919 if (fill != ' ') {
8920 *res++ = *pbuf++;
8921 *res++ = *pbuf++;
8923 rescnt -= 2;
8924 width -= 2;
8925 if (width < 0)
8926 width = 0;
8927 len -= 2;
8929 if (width > len && !(flags & F_LJUST)) {
8930 do {
8931 --rescnt;
8932 *res++ = fill;
8933 } while (--width > len);
8935 if (fill == ' ') {
8936 if (sign)
8937 *res++ = sign;
8938 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8939 assert(pbuf[0] == '0');
8940 assert(pbuf[1] == c);
8941 *res++ = *pbuf++;
8942 *res++ = *pbuf++;
8945 Py_UNICODE_COPY(res, pbuf, len);
8946 res += len;
8947 rescnt -= len;
8948 while (--width >= len) {
8949 --rescnt;
8950 *res++ = ' ';
8952 if (dict && (argidx < arglen) && c != '%') {
8953 PyErr_SetString(PyExc_TypeError,
8954 "not all arguments converted during string formatting");
8955 Py_XDECREF(temp);
8956 goto onError;
8958 Py_XDECREF(temp);
8959 } /* '%' */
8960 } /* until end */
8961 if (argidx < arglen && !dict) {
8962 PyErr_SetString(PyExc_TypeError,
8963 "not all arguments converted during string formatting");
8964 goto onError;
8967 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8968 goto onError;
8969 if (args_owned) {
8970 Py_DECREF(args);
8972 Py_DECREF(uformat);
8973 return (PyObject *)result;
8975 onError:
8976 Py_XDECREF(result);
8977 Py_DECREF(uformat);
8978 if (args_owned) {
8979 Py_DECREF(args);
8981 return NULL;
8984 static PyBufferProcs unicode_as_buffer = {
8985 (readbufferproc) unicode_buffer_getreadbuf,
8986 (writebufferproc) unicode_buffer_getwritebuf,
8987 (segcountproc) unicode_buffer_getsegcount,
8988 (charbufferproc) unicode_buffer_getcharbuf,
8991 static PyObject *
8992 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8994 static PyObject *
8995 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8997 PyObject *x = NULL;
8998 static char *kwlist[] = {"string", "encoding", "errors", 0};
8999 char *encoding = NULL;
9000 char *errors = NULL;
9002 if (type != &PyUnicode_Type)
9003 return unicode_subtype_new(type, args, kwds);
9004 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
9005 kwlist, &x, &encoding, &errors))
9006 return NULL;
9007 if (x == NULL)
9008 return (PyObject *)_PyUnicode_New(0);
9009 if (encoding == NULL && errors == NULL)
9010 return PyObject_Unicode(x);
9011 else
9012 return PyUnicode_FromEncodedObject(x, encoding, errors);
9015 static PyObject *
9016 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9018 PyUnicodeObject *tmp, *pnew;
9019 Py_ssize_t n;
9021 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9022 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9023 if (tmp == NULL)
9024 return NULL;
9025 assert(PyUnicode_Check(tmp));
9026 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9027 if (pnew == NULL) {
9028 Py_DECREF(tmp);
9029 return NULL;
9031 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9032 if (pnew->str == NULL) {
9033 _Py_ForgetReference((PyObject *)pnew);
9034 PyObject_Del(pnew);
9035 Py_DECREF(tmp);
9036 return PyErr_NoMemory();
9038 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9039 pnew->length = n;
9040 pnew->hash = tmp->hash;
9041 Py_DECREF(tmp);
9042 return (PyObject *)pnew;
9045 PyDoc_STRVAR(unicode_doc,
9046 "unicode(string [, encoding[, errors]]) -> object\n\
9048 Create a new Unicode object from the given encoded string.\n\
9049 encoding defaults to the current default string encoding.\n\
9050 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9052 PyTypeObject PyUnicode_Type = {
9053 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9054 "unicode", /* tp_name */
9055 sizeof(PyUnicodeObject), /* tp_size */
9056 0, /* tp_itemsize */
9057 /* Slots */
9058 (destructor)unicode_dealloc, /* tp_dealloc */
9059 0, /* tp_print */
9060 0, /* tp_getattr */
9061 0, /* tp_setattr */
9062 0, /* tp_compare */
9063 unicode_repr, /* tp_repr */
9064 &unicode_as_number, /* tp_as_number */
9065 &unicode_as_sequence, /* tp_as_sequence */
9066 &unicode_as_mapping, /* tp_as_mapping */
9067 (hashfunc) unicode_hash, /* tp_hash*/
9068 0, /* tp_call*/
9069 (reprfunc) unicode_str, /* tp_str */
9070 PyObject_GenericGetAttr, /* tp_getattro */
9071 0, /* tp_setattro */
9072 &unicode_as_buffer, /* tp_as_buffer */
9073 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
9074 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
9075 unicode_doc, /* tp_doc */
9076 0, /* tp_traverse */
9077 0, /* tp_clear */
9078 PyUnicode_RichCompare, /* tp_richcompare */
9079 0, /* tp_weaklistoffset */
9080 0, /* tp_iter */
9081 0, /* tp_iternext */
9082 unicode_methods, /* tp_methods */
9083 0, /* tp_members */
9084 0, /* tp_getset */
9085 &PyBaseString_Type, /* tp_base */
9086 0, /* tp_dict */
9087 0, /* tp_descr_get */
9088 0, /* tp_descr_set */
9089 0, /* tp_dictoffset */
9090 0, /* tp_init */
9091 0, /* tp_alloc */
9092 unicode_new, /* tp_new */
9093 PyObject_Del, /* tp_free */
9096 /* Initialize the Unicode implementation */
9098 void _PyUnicode_Init(void)
9100 int i;
9102 /* XXX - move this array to unicodectype.c ? */
9103 Py_UNICODE linebreak[] = {
9104 0x000A, /* LINE FEED */
9105 0x000D, /* CARRIAGE RETURN */
9106 0x001C, /* FILE SEPARATOR */
9107 0x001D, /* GROUP SEPARATOR */
9108 0x001E, /* RECORD SEPARATOR */
9109 0x0085, /* NEXT LINE */
9110 0x2028, /* LINE SEPARATOR */
9111 0x2029, /* PARAGRAPH SEPARATOR */
9114 /* Init the implementation */
9115 free_list = NULL;
9116 numfree = 0;
9117 unicode_empty = _PyUnicode_New(0);
9118 if (!unicode_empty)
9119 return;
9121 strcpy(unicode_default_encoding, "ascii");
9122 for (i = 0; i < 256; i++)
9123 unicode_latin1[i] = NULL;
9124 if (PyType_Ready(&PyUnicode_Type) < 0)
9125 Py_FatalError("Can't initialize 'unicode'");
9127 /* initialize the linebreak bloom filter */
9128 bloom_linebreak = make_bloom_mask(
9129 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9132 PyType_Ready(&EncodingMapType);
9135 /* Finalize the Unicode implementation */
9138 PyUnicode_ClearFreeList(void)
9140 int freelist_size = numfree;
9141 PyUnicodeObject *u;
9143 for (u = free_list; u != NULL;) {
9144 PyUnicodeObject *v = u;
9145 u = *(PyUnicodeObject **)u;
9146 if (v->str)
9147 PyObject_DEL(v->str);
9148 Py_XDECREF(v->defenc);
9149 PyObject_Del(v);
9150 numfree--;
9152 free_list = NULL;
9153 assert(numfree == 0);
9154 return freelist_size;
9157 void
9158 _PyUnicode_Fini(void)
9160 int i;
9162 Py_XDECREF(unicode_empty);
9163 unicode_empty = NULL;
9165 for (i = 0; i < 256; i++) {
9166 if (unicode_latin1[i]) {
9167 Py_DECREF(unicode_latin1[i]);
9168 unicode_latin1[i] = NULL;
9171 (void)PyUnicode_ClearFreeList();
9174 #ifdef __cplusplus
9176 #endif
9180 Local variables:
9181 c-basic-offset: 4
9182 indent-tabs-mode: nil
9183 End: