Avoid signed overflow in some xrange calculations, and extend
[python.git] / Objects / unicodeobject.c
blob65c10b1cbd9b99bc0900c8419544ef0ddc3eae29
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
/* Maximum number of deallocated Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Keep-alive threshold for buffers of objects parked on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list, which saves a malloc() when
   the object is reused for another small string.

   Worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the optimization off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
/* Lookup table for the most frequent whitespace characters: entry i is 1
   when the 7-bit character i counts as whitespace, otherwise 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
                                   0x0B VERTICAL TABULATION, 0x0C FORM FEED,
                                   0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                                   0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 SPACE */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
/* Lookup table for line break characters: entry i is 1 when the 7-bit
   character i terminates a line, otherwise 0. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 0, 1, 0, 0, 1, 0, 0,     /* 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                                   0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of `ch` is set in
   `mask`.  The shifted constant is 1UL, not 1: shifting a signed int by
   31 is undefined behaviour and, where it "works", sign-extends into the
   upper half of a 64-bit mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: table lookup for 7-bit characters; otherwise the
   bloom mask is consulted first and Py_UNICODE_ISLINEBREAK() only when
   the mask does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
205 /* calculate simple bloom-style bitmask for a given unicode string */
207 long mask;
208 Py_ssize_t i;
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
214 return mask;
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
219 Py_ssize_t i;
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
225 return 0;
228 #define BLOOM_MEMBER(mask, chr, set, setlen) \
229 BLOOM(mask, chr) && unicode_member(chr, set, setlen)
231 /* --- Unicode Object ----------------------------------------------------- */
233 static
234 int unicode_resize(register PyUnicodeObject *unicode,
235 Py_ssize_t length)
237 void *oldstr;
239 /* Shortcut if there's nothing much to do. */
240 if (unicode->length == length)
241 goto reset;
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
251 PyErr_SetString(PyExc_SystemError,
252 "can't resize shared unicode objects");
253 return -1;
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
259 it contains). */
261 oldstr = unicode->str;
262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
264 if (!unicode->str) {
265 unicode->str = (Py_UNICODE *)oldstr;
266 PyErr_NoMemory();
267 return -1;
269 unicode->str[length] = 0;
270 unicode->length = length;
272 reset:
273 /* Reset the object caches */
274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
278 unicode->hash = -1;
280 return 0;
283 /* We allocate one more byte to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
291 static
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
294 register PyUnicodeObject *unicode;
296 /* Optimization for empty strings */
297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
316 unicode_resize(unicode, length) < 0) {
317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
321 else {
322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
325 PyObject_INIT(unicode, &PyUnicode_Type);
327 else {
328 size_t new_size;
329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
330 if (unicode == NULL)
331 return NULL;
332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 if (!unicode->str) {
337 PyErr_NoMemory();
338 goto onError;
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
347 unicode->str[0] = 0;
348 unicode->str[length] = 0;
349 unicode->length = length;
350 unicode->hash = -1;
351 unicode->defenc = NULL;
352 return unicode;
354 onError:
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
357 _Py_ForgetReference((PyObject *)unicode);
358 PyObject_Del(unicode);
359 return NULL;
362 static
363 void unicode_dealloc(register PyUnicodeObject *unicode)
365 if (PyUnicode_CheckExact(unicode) &&
366 numfree < PyUnicode_MAXFREELIST) {
367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
377 /* Add to free list */
378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
382 else {
383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
389 static
390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
392 register PyUnicodeObject *v;
394 /* Argument checks */
395 if (unicode == NULL) {
396 PyErr_BadInternalCall();
397 return -1;
399 v = *unicode;
400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
401 PyErr_BadInternalCall();
402 return -1;
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
408 if (v->length != length &&
409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
431 Py_ssize_t size)
433 PyUnicodeObject *unicode;
435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
467 Py_UNICODE_COPY(unicode->str, u, size);
469 return (PyObject *)unicode;
472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
474 PyUnicodeObject *unicode;
476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
478 "Negative size passed to PyUnicode_FromStringAndSize");
479 return NULL;
482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
509 return PyUnicode_DecodeUTF8(u, size, NULL);
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
516 return (PyObject *)unicode;
519 PyObject *PyUnicode_FromString(const char *u)
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
527 return PyUnicode_FromStringAndSize(u, size);
530 #ifdef HAVE_WCHAR_H
532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533 # define CONVERT_WCHAR_TO_SURROGATES
534 #endif
536 #ifdef CONVERT_WCHAR_TO_SURROGATES
538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
542 Py_ssize_t size)
544 PyUnicodeObject *unicode;
545 register Py_ssize_t i;
546 Py_ssize_t alloc;
547 const wchar_t *orig_w;
549 if (w == NULL) {
550 PyErr_BadInternalCall();
551 return NULL;
554 alloc = size;
555 orig_w = w;
556 for (i = size; i > 0; i--) {
557 if (*w > 0xFFFF)
558 alloc++;
559 w++;
561 w = orig_w;
562 unicode = _PyUnicode_New(alloc);
563 if (!unicode)
564 return NULL;
566 /* Copy the wchar_t data into the new object */
568 register Py_UNICODE *u;
569 u = PyUnicode_AS_UNICODE(unicode);
570 for (i = size; i > 0; i--) {
571 if (*w > 0xFFFF) {
572 wchar_t ordinal = *w++;
573 ordinal -= 0x10000;
574 *u++ = 0xD800 | (ordinal >> 10);
575 *u++ = 0xDC00 | (ordinal & 0x3FF);
577 else
578 *u++ = *w++;
581 return (PyObject *)unicode;
584 #else
586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
589 PyUnicodeObject *unicode;
591 if (w == NULL) {
592 PyErr_BadInternalCall();
593 return NULL;
596 unicode = _PyUnicode_New(size);
597 if (!unicode)
598 return NULL;
600 /* Copy the wchar_t data into the new object */
601 #ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode->str, w, size * sizeof(wchar_t));
603 #else
605 register Py_UNICODE *u;
606 register Py_ssize_t i;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--)
609 *u++ = *w++;
611 #endif
613 return (PyObject *)unicode;
616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
618 #undef CONVERT_WCHAR_TO_SURROGATES
620 static void
621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
623 *fmt++ = '%';
624 if (width) {
625 if (zeropad)
626 *fmt++ = '0';
627 fmt += sprintf(fmt, "%d", width);
629 if (precision)
630 fmt += sprintf(fmt, ".%d", precision);
631 if (longflag)
632 *fmt++ = 'l';
633 else if (size_tflag) {
634 char *f = PY_FORMAT_SIZE_T;
635 while (*f)
636 *fmt++ = *f++;
638 *fmt++ = c;
639 *fmt = '\0';
642 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
644 PyObject *
645 PyUnicode_FromFormatV(const char *format, va_list vargs)
647 va_list count;
648 Py_ssize_t callcount = 0;
649 PyObject **callresults = NULL;
650 PyObject **callresult = NULL;
651 Py_ssize_t n = 0;
652 int width = 0;
653 int precision = 0;
654 int zeropad;
655 const char* f;
656 Py_UNICODE *s;
657 PyObject *string;
658 /* used by sprintf */
659 char buffer[21];
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer = NULL;
663 char *realbuffer;
664 Py_ssize_t abuffersize = 0;
665 char fmt[60]; /* should be enough for %0width.precisionld */
666 const char *copy;
668 #ifdef VA_LIST_IS_ARRAY
669 Py_MEMCPY(count, vargs, sizeof(va_list));
670 #else
671 #ifdef __va_copy
672 __va_copy(count, vargs);
673 #else
674 count = vargs;
675 #endif
676 #endif
677 /* step 1: count the number of %S/%R/%s format specifications
678 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
679 * objects once during step 3 and put the result in an array) */
680 for (f = format; *f; f++) {
681 if (*f == '%') {
682 if (*(f+1)=='%')
683 continue;
684 if (*(f+1)=='S' || *(f+1)=='R')
685 ++callcount;
686 while (isdigit((unsigned)*f))
687 width = (width*10) + *f++ - '0';
688 while (*++f && *f != '%' && !isalpha((unsigned)*f))
690 if (*f == 's')
691 ++callcount;
694 /* step 2: allocate memory for the results of
695 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
696 if (callcount) {
697 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
698 if (!callresults) {
699 PyErr_NoMemory();
700 return NULL;
702 callresult = callresults;
704 /* step 3: figure out how large a buffer we need */
705 for (f = format; *f; f++) {
706 if (*f == '%') {
707 const char* p = f;
708 width = 0;
709 while (isdigit((unsigned)*f))
710 width = (width*10) + *f++ - '0';
711 while (*++f && *f != '%' && !isalpha((unsigned)*f))
714 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
715 * they don't affect the amount of space we reserve.
717 if ((*f == 'l' || *f == 'z') &&
718 (f[1] == 'd' || f[1] == 'u'))
719 ++f;
721 switch (*f) {
722 case 'c':
723 (void)va_arg(count, int);
724 /* fall through... */
725 case '%':
726 n++;
727 break;
728 case 'd': case 'u': case 'i': case 'x':
729 (void) va_arg(count, int);
730 /* 20 bytes is enough to hold a 64-bit
731 integer. Decimal takes the most space.
732 This isn't enough for octal.
733 If a width is specified we need more
734 (which we allocate later). */
735 if (width < 20)
736 width = 20;
737 n += width;
738 if (abuffersize < width)
739 abuffersize = width;
740 break;
741 case 's':
743 /* UTF-8 */
744 const char *s = va_arg(count, const char*);
745 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
746 if (!str)
747 goto fail;
748 n += PyUnicode_GET_SIZE(str);
749 /* Remember the str and switch to the next slot */
750 *callresult++ = str;
751 break;
753 case 'U':
755 PyObject *obj = va_arg(count, PyObject *);
756 assert(obj && PyUnicode_Check(obj));
757 n += PyUnicode_GET_SIZE(obj);
758 break;
760 case 'V':
762 PyObject *obj = va_arg(count, PyObject *);
763 const char *str = va_arg(count, const char *);
764 assert(obj || str);
765 assert(!obj || PyUnicode_Check(obj));
766 if (obj)
767 n += PyUnicode_GET_SIZE(obj);
768 else
769 n += strlen(str);
770 break;
772 case 'S':
774 PyObject *obj = va_arg(count, PyObject *);
775 PyObject *str;
776 assert(obj);
777 str = PyObject_Str(obj);
778 if (!str)
779 goto fail;
780 n += PyUnicode_GET_SIZE(str);
781 /* Remember the str and switch to the next slot */
782 *callresult++ = str;
783 break;
785 case 'R':
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *repr;
789 assert(obj);
790 repr = PyObject_Repr(obj);
791 if (!repr)
792 goto fail;
793 n += PyUnicode_GET_SIZE(repr);
794 /* Remember the repr and switch to the next slot */
795 *callresult++ = repr;
796 break;
798 case 'p':
799 (void) va_arg(count, int);
800 /* maximum 64-bit pointer representation:
801 * 0xffffffffffffffff
802 * so 19 characters is enough.
803 * XXX I count 18 -- what's the extra for?
805 n += 19;
806 break;
807 default:
808 /* if we stumble upon an unknown
809 formatting code, copy the rest of
810 the format string to the output
811 string. (we cannot just skip the
812 code, since there's no way to know
813 what's in the argument list) */
814 n += strlen(p);
815 goto expand;
817 } else
818 n++;
820 expand:
821 if (abuffersize > 20) {
822 abuffer = PyObject_Malloc(abuffersize);
823 if (!abuffer) {
824 PyErr_NoMemory();
825 goto fail;
827 realbuffer = abuffer;
829 else
830 realbuffer = buffer;
831 /* step 4: fill the buffer */
832 /* Since we've analyzed how much space we need for the worst case,
833 we don't have to resize the string.
834 There can be no errors beyond this point. */
835 string = PyUnicode_FromUnicode(NULL, n);
836 if (!string)
837 goto fail;
839 s = PyUnicode_AS_UNICODE(string);
840 callresult = callresults;
842 for (f = format; *f; f++) {
843 if (*f == '%') {
844 const char* p = f++;
845 int longflag = 0;
846 int size_tflag = 0;
847 zeropad = (*f == '0');
848 /* parse the width.precision part */
849 width = 0;
850 while (isdigit((unsigned)*f))
851 width = (width*10) + *f++ - '0';
852 precision = 0;
853 if (*f == '.') {
854 f++;
855 while (isdigit((unsigned)*f))
856 precision = (precision*10) + *f++ - '0';
858 /* handle the long flag, but only for %ld and %lu.
859 others can be added when necessary. */
860 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
861 longflag = 1;
862 ++f;
864 /* handle the size_t flag. */
865 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
866 size_tflag = 1;
867 ++f;
870 switch (*f) {
871 case 'c':
872 *s++ = va_arg(vargs, int);
873 break;
874 case 'd':
875 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
876 if (longflag)
877 sprintf(realbuffer, fmt, va_arg(vargs, long));
878 else if (size_tflag)
879 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
880 else
881 sprintf(realbuffer, fmt, va_arg(vargs, int));
882 appendstring(realbuffer);
883 break;
884 case 'u':
885 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
886 if (longflag)
887 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
888 else if (size_tflag)
889 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
890 else
891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
892 appendstring(realbuffer);
893 break;
894 case 'i':
895 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
896 sprintf(realbuffer, fmt, va_arg(vargs, int));
897 appendstring(realbuffer);
898 break;
899 case 'x':
900 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 's':
906 /* unused, since we already have the result */
907 (void) va_arg(vargs, char *);
908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
909 PyUnicode_GET_SIZE(*callresult));
910 s += PyUnicode_GET_SIZE(*callresult);
911 /* We're done with the unicode()/repr() => forget it */
912 Py_DECREF(*callresult);
913 /* switch to next unicode()/repr() result */
914 ++callresult;
915 break;
917 case 'U':
919 PyObject *obj = va_arg(vargs, PyObject *);
920 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
922 s += size;
923 break;
925 case 'V':
927 PyObject *obj = va_arg(vargs, PyObject *);
928 const char *str = va_arg(vargs, const char *);
929 if (obj) {
930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
933 } else {
934 appendstring(str);
936 break;
938 case 'S':
939 case 'R':
941 Py_UNICODE *ucopy;
942 Py_ssize_t usize;
943 Py_ssize_t upos;
944 /* unused, since we already have the result */
945 (void) va_arg(vargs, PyObject *);
946 ucopy = PyUnicode_AS_UNICODE(*callresult);
947 usize = PyUnicode_GET_SIZE(*callresult);
948 for (upos = 0; upos<usize;)
949 *s++ = ucopy[upos++];
950 /* We're done with the unicode()/repr() => forget it */
951 Py_DECREF(*callresult);
952 /* switch to next unicode()/repr() result */
953 ++callresult;
954 break;
956 case 'p':
957 sprintf(buffer, "%p", va_arg(vargs, void*));
958 /* %p is ill-defined: ensure leading 0x. */
959 if (buffer[1] == 'X')
960 buffer[1] = 'x';
961 else if (buffer[1] != 'x') {
962 memmove(buffer+2, buffer, strlen(buffer)+1);
963 buffer[0] = '0';
964 buffer[1] = 'x';
966 appendstring(buffer);
967 break;
968 case '%':
969 *s++ = '%';
970 break;
971 default:
972 appendstring(p);
973 goto end;
975 } else
976 *s++ = *f;
979 end:
980 if (callresults)
981 PyObject_Free(callresults);
982 if (abuffer)
983 PyObject_Free(abuffer);
984 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
985 return string;
986 fail:
987 if (callresults) {
988 PyObject **callresult2 = callresults;
989 while (callresult2 < callresult) {
990 Py_DECREF(*callresult2);
991 ++callresult2;
993 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 return NULL;
1000 #undef appendstring
1002 PyObject *
1003 PyUnicode_FromFormat(const char *format, ...)
1005 PyObject* ret;
1006 va_list vargs;
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009 va_start(vargs, format);
1010 #else
1011 va_start(vargs);
1012 #endif
1013 ret = PyUnicode_FromFormatV(format, vargs);
1014 va_end(vargs);
1015 return ret;
1018 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1019 wchar_t *w,
1020 Py_ssize_t size)
1022 if (unicode == NULL) {
1023 PyErr_BadInternalCall();
1024 return -1;
1027 /* If possible, try to copy the 0-termination as well */
1028 if (size > PyUnicode_GET_SIZE(unicode))
1029 size = PyUnicode_GET_SIZE(unicode) + 1;
1031 #ifdef HAVE_USABLE_WCHAR_T
1032 memcpy(w, unicode->str, size * sizeof(wchar_t));
1033 #else
1035 register Py_UNICODE *u;
1036 register Py_ssize_t i;
1037 u = PyUnicode_AS_UNICODE(unicode);
1038 for (i = size; i > 0; i--)
1039 *w++ = *u++;
1041 #endif
1043 if (size > PyUnicode_GET_SIZE(unicode))
1044 return PyUnicode_GET_SIZE(unicode);
1045 else
1046 return size;
1049 #endif
1051 PyObject *PyUnicode_FromOrdinal(int ordinal)
1053 Py_UNICODE s[1];
1055 #ifdef Py_UNICODE_WIDE
1056 if (ordinal < 0 || ordinal > 0x10ffff) {
1057 PyErr_SetString(PyExc_ValueError,
1058 "unichr() arg not in range(0x110000) "
1059 "(wide Python build)");
1060 return NULL;
1062 #else
1063 if (ordinal < 0 || ordinal > 0xffff) {
1064 PyErr_SetString(PyExc_ValueError,
1065 "unichr() arg not in range(0x10000) "
1066 "(narrow Python build)");
1067 return NULL;
1069 #endif
1071 s[0] = (Py_UNICODE)ordinal;
1072 return PyUnicode_FromUnicode(s, 1);
1075 PyObject *PyUnicode_FromObject(register PyObject *obj)
1077 /* XXX Perhaps we should make this API an alias of
1078 PyObject_Unicode() instead ?! */
1079 if (PyUnicode_CheckExact(obj)) {
1080 Py_INCREF(obj);
1081 return obj;
1083 if (PyUnicode_Check(obj)) {
1084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087 PyUnicode_GET_SIZE(obj));
1089 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1092 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1093 const char *encoding,
1094 const char *errors)
1096 const char *s = NULL;
1097 Py_ssize_t len;
1098 PyObject *v;
1100 if (obj == NULL) {
1101 PyErr_BadInternalCall();
1102 return NULL;
1105 #if 0
1106 /* For b/w compatibility we also accept Unicode objects provided
1107 that no encodings is given and then redirect to
1108 PyObject_Unicode() which then applies the additional logic for
1109 Unicode subclasses.
1111 NOTE: This API should really only be used for object which
1112 represent *encoded* Unicode !
1115 if (PyUnicode_Check(obj)) {
1116 if (encoding) {
1117 PyErr_SetString(PyExc_TypeError,
1118 "decoding Unicode is not supported");
1119 return NULL;
1121 return PyObject_Unicode(obj);
1123 #else
1124 if (PyUnicode_Check(obj)) {
1125 PyErr_SetString(PyExc_TypeError,
1126 "decoding Unicode is not supported");
1127 return NULL;
1129 #endif
1131 /* Coerce object */
1132 if (PyString_Check(obj)) {
1133 s = PyString_AS_STRING(obj);
1134 len = PyString_GET_SIZE(obj);
1136 else if (PyByteArray_Check(obj)) {
1137 /* Python 2.x specific */
1138 PyErr_Format(PyExc_TypeError,
1139 "decoding bytearray is not supported");
1140 return NULL;
1142 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1143 /* Overwrite the error message with something more useful in
1144 case of a TypeError. */
1145 if (PyErr_ExceptionMatches(PyExc_TypeError))
1146 PyErr_Format(PyExc_TypeError,
1147 "coercing to Unicode: need string or buffer, "
1148 "%.80s found",
1149 Py_TYPE(obj)->tp_name);
1150 goto onError;
1153 /* Convert to Unicode */
1154 if (len == 0) {
1155 Py_INCREF(unicode_empty);
1156 v = (PyObject *)unicode_empty;
1158 else
1159 v = PyUnicode_Decode(s, len, encoding, errors);
1161 return v;
1163 onError:
1164 return NULL;
1167 PyObject *PyUnicode_Decode(const char *s,
1168 Py_ssize_t size,
1169 const char *encoding,
1170 const char *errors)
1172 PyObject *buffer = NULL, *unicode;
1174 if (encoding == NULL)
1175 encoding = PyUnicode_GetDefaultEncoding();
1177 /* Shortcuts for common default encodings */
1178 if (strcmp(encoding, "utf-8") == 0)
1179 return PyUnicode_DecodeUTF8(s, size, errors);
1180 else if (strcmp(encoding, "latin-1") == 0)
1181 return PyUnicode_DecodeLatin1(s, size, errors);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183 else if (strcmp(encoding, "mbcs") == 0)
1184 return PyUnicode_DecodeMBCS(s, size, errors);
1185 #endif
1186 else if (strcmp(encoding, "ascii") == 0)
1187 return PyUnicode_DecodeASCII(s, size, errors);
1189 /* Decode via the codec registry */
1190 buffer = PyBuffer_FromMemory((void *)s, size);
1191 if (buffer == NULL)
1192 goto onError;
1193 unicode = PyCodec_Decode(buffer, encoding, errors);
1194 if (unicode == NULL)
1195 goto onError;
1196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_Format(PyExc_TypeError,
1198 "decoder did not return an unicode object (type=%.400s)",
1199 Py_TYPE(unicode)->tp_name);
1200 Py_DECREF(unicode);
1201 goto onError;
1203 Py_DECREF(buffer);
1204 return unicode;
1206 onError:
1207 Py_XDECREF(buffer);
1208 return NULL;
1211 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212 const char *encoding,
1213 const char *errors)
1215 PyObject *v;
1217 if (!PyUnicode_Check(unicode)) {
1218 PyErr_BadArgument();
1219 goto onError;
1222 if (encoding == NULL)
1223 encoding = PyUnicode_GetDefaultEncoding();
1225 /* Decode via the codec registry */
1226 v = PyCodec_Decode(unicode, encoding, errors);
1227 if (v == NULL)
1228 goto onError;
1229 return v;
1231 onError:
1232 return NULL;
1235 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
1240 PyObject *v, *unicode;
1242 unicode = PyUnicode_FromUnicode(s, size);
1243 if (unicode == NULL)
1244 return NULL;
1245 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246 Py_DECREF(unicode);
1247 return v;
1250 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251 const char *encoding,
1252 const char *errors)
1254 PyObject *v;
1256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 goto onError;
1261 if (encoding == NULL)
1262 encoding = PyUnicode_GetDefaultEncoding();
1264 /* Encode via the codec registry */
1265 v = PyCodec_Encode(unicode, encoding, errors);
1266 if (v == NULL)
1267 goto onError;
1268 return v;
1270 onError:
1271 return NULL;
1274 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275 const char *encoding,
1276 const char *errors)
1278 PyObject *v;
1280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_BadArgument();
1282 goto onError;
1285 if (encoding == NULL)
1286 encoding = PyUnicode_GetDefaultEncoding();
1288 /* Shortcuts for common default encodings */
1289 if (errors == NULL) {
1290 if (strcmp(encoding, "utf-8") == 0)
1291 return PyUnicode_AsUTF8String(unicode);
1292 else if (strcmp(encoding, "latin-1") == 0)
1293 return PyUnicode_AsLatin1String(unicode);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295 else if (strcmp(encoding, "mbcs") == 0)
1296 return PyUnicode_AsMBCSString(unicode);
1297 #endif
1298 else if (strcmp(encoding, "ascii") == 0)
1299 return PyUnicode_AsASCIIString(unicode);
1302 /* Encode via the codec registry */
1303 v = PyCodec_Encode(unicode, encoding, errors);
1304 if (v == NULL)
1305 goto onError;
1306 if (!PyString_Check(v)) {
1307 PyErr_Format(PyExc_TypeError,
1308 "encoder did not return a string object (type=%.400s)",
1309 Py_TYPE(v)->tp_name);
1310 Py_DECREF(v);
1311 goto onError;
1313 return v;
1315 onError:
1316 return NULL;
1319 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1320 const char *errors)
1322 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1324 if (v)
1325 return v;
1326 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327 if (v && errors == NULL)
1328 ((PyUnicodeObject *)unicode)->defenc = v;
1329 return v;
1332 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1334 if (!PyUnicode_Check(unicode)) {
1335 PyErr_BadArgument();
1336 goto onError;
1338 return PyUnicode_AS_UNICODE(unicode);
1340 onError:
1341 return NULL;
1344 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1346 if (!PyUnicode_Check(unicode)) {
1347 PyErr_BadArgument();
1348 goto onError;
1350 return PyUnicode_GET_SIZE(unicode);
1352 onError:
1353 return -1;
1356 const char *PyUnicode_GetDefaultEncoding(void)
1358 return unicode_default_encoding;
1361 int PyUnicode_SetDefaultEncoding(const char *encoding)
1363 PyObject *v;
1365 /* Make sure the encoding is valid. As side effect, this also
1366 loads the encoding into the codec registry cache. */
1367 v = _PyCodec_Lookup(encoding);
1368 if (v == NULL)
1369 goto onError;
1370 Py_DECREF(v);
1371 strncpy(unicode_default_encoding,
1372 encoding,
1373 sizeof(unicode_default_encoding));
1374 return 0;
1376 onError:
1377 return -1;
1380 /* error handling callback helper:
1381 build arguments, call the callback and check the arguments,
1382 if no exception occurred, copy the replacement to the output
1383 and adjust various state variables.
1384 return 0 on success, -1 on error
1387 static
1388 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1389 const char *encoding, const char *reason,
1390 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1394 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1396 PyObject *restuple = NULL;
1397 PyObject *repunicode = NULL;
1398 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399 Py_ssize_t requiredsize;
1400 Py_ssize_t newpos;
1401 Py_UNICODE *repptr;
1402 Py_ssize_t repsize;
1403 int res = -1;
1405 if (*errorHandler == NULL) {
1406 *errorHandler = PyCodec_LookupError(errors);
1407 if (*errorHandler == NULL)
1408 goto onError;
1411 if (*exceptionObject == NULL) {
1412 *exceptionObject = PyUnicodeDecodeError_Create(
1413 encoding, input, insize, *startinpos, *endinpos, reason);
1414 if (*exceptionObject == NULL)
1415 goto onError;
1417 else {
1418 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419 goto onError;
1420 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421 goto onError;
1422 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423 goto onError;
1426 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427 if (restuple == NULL)
1428 goto onError;
1429 if (!PyTuple_Check(restuple)) {
1430 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1431 goto onError;
1433 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1434 goto onError;
1435 if (newpos<0)
1436 newpos = insize+newpos;
1437 if (newpos<0 || newpos>insize) {
1438 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439 goto onError;
1442 /* need more space? (at least enough for what we
1443 have+the replacement+the rest of the string (starting
1444 at the new input position), so we won't have to check space
1445 when there are no errors in the rest of the string) */
1446 repptr = PyUnicode_AS_UNICODE(repunicode);
1447 repsize = PyUnicode_GET_SIZE(repunicode);
1448 requiredsize = *outpos + repsize + insize-newpos;
1449 if (requiredsize > outsize) {
1450 if (requiredsize<2*outsize)
1451 requiredsize = 2*outsize;
1452 if (_PyUnicode_Resize(output, requiredsize) < 0)
1453 goto onError;
1454 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1456 *endinpos = newpos;
1457 *inptr = input + newpos;
1458 Py_UNICODE_COPY(*outptr, repptr, repsize);
1459 *outptr += repsize;
1460 *outpos += repsize;
1461 /* we made it! */
1462 res = 0;
1464 onError:
1465 Py_XDECREF(restuple);
1466 return res;
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character?
   Spelled out as explicit ASCII ranges rather than isalnum(): isalnum() is
   locale dependent, so bytes in 0x80..0xFF could be accepted under some
   locales, and FROM_BASE64 would then silently map them to 63. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written,
 * so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed with
 * 1..127; NUL and everything >= 128 are never direct.  The two flag
 * arguments are parenthesised so that any expression may be passed. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
1547 PyObject *PyUnicode_DecodeUTF7(const char *s,
1548 Py_ssize_t size,
1549 const char *errors)
1551 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1554 /* The decoder. The only state we preserve is our read position,
1555 * i.e. how many characters we have consumed. So if we end in the
1556 * middle of a shift sequence we have to back off the read position
1557 * and the output to the beginning of the sequence, otherwise we lose
1558 * all the shift state (seen bits, number of bits seen, high
1559 * surrogate). */
/* Decode `size` bytes of UTF-7 at `s` into a unicode object.
 *
 * errors    error handler name, passed to unicode_decode_call_errorhandler
 * consumed  if non-NULL, receives the number of input bytes consumed.  A
 *           shift sequence still open at the end of the input is then not
 *           an error: the output written since its '+' is discarded and
 *           *consumed is set to the position of that '+'.
 *
 * The output is allocated with `size` units up front and cut back to the
 * number of units actually written at the end.  Returns the unicode object,
 * or NULL with an exception set.
 */
1561 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1562 Py_ssize_t size,
1563 const char *errors,
1564 Py_ssize_t *consumed)
1566 const char *starts = s;
1567 Py_ssize_t startinpos;
1568 Py_ssize_t endinpos;
1569 Py_ssize_t outpos;
1570 const char *e;
1571 PyUnicodeObject *unicode;
1572 Py_UNICODE *p;
1573 const char *errmsg = "";
1574 int inShift = 0;
1575 Py_UNICODE *shiftOutStart;
1576 unsigned int base64bits = 0;
1577 unsigned long base64buffer = 0;
1578 Py_UNICODE surrogate = 0;
1579 PyObject *errorHandler = NULL;
1580 PyObject *exc = NULL;
1582 unicode = _PyUnicode_New(size);
1583 if (!unicode)
1584 return NULL;
1585 if (size == 0) {
1586 if (consumed)
1587 *consumed = 0;
1588 return (PyObject *)unicode;
1591 p = unicode->str;
1592 shiftOutStart = p;
1593 e = s + size;
/* Main loop: one input byte per iteration.  Every error path sets errmsg
   (and, outside a shift sequence, startinpos) and jumps to utf7Error. */
1595 while (s < e) {
1596 Py_UNICODE ch = (unsigned char) *s;
1598 if (inShift) { /* in a base-64 section */
1599 if (IS_BASE64(ch)) { /* consume a base-64 character */
/* each base-64 character contributes 6 bits to the accumulator */
1600 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1601 base64bits += 6;
1602 s++;
1603 if (base64bits >= 16) {
1604 /* we have enough bits for a UTF-16 value */
1605 Py_UNICODE outCh = (Py_UNICODE)
1606 (base64buffer >> (base64bits-16));
1607 base64bits -= 16;
1608 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1609 if (surrogate) {
1610 /* expecting a second surrogate */
1611 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* wide builds store the combined code point, narrow builds store the pair */
1612 #ifdef Py_UNICODE_WIDE
1613 *p++ = (((surrogate & 0x3FF)<<10)
1614 | (outCh & 0x3FF)) + 0x10000;
1615 #else
1616 *p++ = surrogate;
1617 *p++ = outCh;
1618 #endif
1619 surrogate = 0;
1621 else {
1622 surrogate = 0;
1623 errmsg = "second surrogate missing";
1624 goto utf7Error;
1627 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1628 /* first surrogate */
1629 surrogate = outCh;
1631 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1632 errmsg = "unexpected second surrogate";
1633 goto utf7Error;
1635 else {
1636 *p++ = outCh;
1640 else { /* now leaving a base-64 section */
1641 inShift = 0;
1642 s++;
1643 if (surrogate) {
1644 errmsg = "second surrogate missing at end of shift sequence";
1645 goto utf7Error;
1647 if (base64bits > 0) { /* left-over bits */
1648 if (base64bits >= 6) {
1649 /* We've seen at least one base-64 character */
1650 errmsg = "partial character in shift sequence";
1651 goto utf7Error;
1653 else {
1654 /* Some bits remain; they should be zero */
1655 if (base64buffer != 0) {
1656 errmsg = "non-zero padding bits in shift sequence";
1657 goto utf7Error;
1661 if (ch != '-') {
1662 /* '-' is absorbed; other terminating
1663 characters are preserved */
1664 *p++ = ch;
1668 else if ( ch == '+' ) {
/* remember where the shift sequence starts: used for error reporting and
   for the stateful back-off at the end of the function */
1669 startinpos = s-starts;
1670 s++; /* consume '+' */
1671 if (s < e && *s == '-') { /* '+-' encodes '+' */
1672 s++;
1673 *p++ = '+';
1675 else { /* begin base64-encoded section */
1676 inShift = 1;
1677 shiftOutStart = p;
1678 base64bits = 0;
1681 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1682 *p++ = ch;
1683 s++;
1685 else {
1686 startinpos = s-starts;
1687 s++;
1688 errmsg = "unexpected special character";
1689 goto utf7Error;
1691 continue;
/* Error path: the handler receives the addresses of s, unicode and p and
   may change all three; the loop then continues with the updated values. */
1692 utf7Error:
1693 outpos = p-PyUnicode_AS_UNICODE(unicode);
1694 endinpos = s-starts;
1695 if (unicode_decode_call_errorhandler(
1696 errors, &errorHandler,
1697 "utf7", errmsg,
1698 starts, size, &startinpos, &endinpos, &exc, &s,
1699 &unicode, &outpos, &p))
1700 goto onError;
1703 /* end of string */
1705 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1706 /* if we're in an inconsistent state, that's an error */
1707 if (surrogate ||
1708 (base64bits >= 6) ||
1709 (base64bits > 0 && base64buffer != 0)) {
1710 outpos = p-PyUnicode_AS_UNICODE(unicode);
1711 endinpos = size;
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "utf7", "unterminated shift sequence",
1715 starts, size, &startinpos, &endinpos, &exc, &s,
1716 &unicode, &outpos, &p))
1717 goto onError;
1721 /* return state */
1722 if (consumed) {
1723 if (inShift) {
1724 p = shiftOutStart; /* back off output */
1725 *consumed = startinpos;
1727 else {
1728 *consumed = s-starts;
/* cut the over-allocated output back to what was actually written */
1732 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1733 goto onError;
1735 Py_XDECREF(errorHandler);
1736 Py_XDECREF(exc);
1737 return (PyObject *)unicode;
1739 onError:
1740 Py_XDECREF(errorHandler);
1741 Py_XDECREF(exc);
1742 Py_DECREF(unicode);
1743 return NULL;
1747 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1748 Py_ssize_t size,
1749 int base64SetO,
1750 int base64WhiteSpace,
1751 const char *errors)
1753 PyObject *v;
1754 /* It might be possible to tighten this worst case */
1755 Py_ssize_t allocated = 8 * size;
1756 int inShift = 0;
1757 Py_ssize_t i = 0;
1758 unsigned int base64bits = 0;
1759 unsigned long base64buffer = 0;
1760 char * out;
1761 char * start;
1763 if (allocated / 8 != size)
1764 return PyErr_NoMemory();
1766 if (size == 0)
1767 return PyString_FromStringAndSize(NULL, 0);
1769 v = PyString_FromStringAndSize(NULL, allocated);
1770 if (v == NULL)
1771 return NULL;
1773 start = out = PyString_AS_STRING(v);
1774 for (;i < size; ++i) {
1775 Py_UNICODE ch = s[i];
1777 if (inShift) {
1778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1779 /* shifting out */
1780 if (base64bits) { /* output remaining bits */
1781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1782 base64buffer = 0;
1783 base64bits = 0;
1785 inShift = 0;
1786 /* Characters not in the BASE64 set implicitly unshift the sequence
1787 so no '-' is required, except if the character is itself a '-' */
1788 if (IS_BASE64(ch) || ch == '-') {
1789 *out++ = '-';
1791 *out++ = (char) ch;
1793 else {
1794 goto encode_char;
1797 else { /* not in a shift sequence */
1798 if (ch == '+') {
1799 *out++ = '+';
1800 *out++ = '-';
1802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1803 *out++ = (char) ch;
1805 else {
1806 *out++ = '+';
1807 inShift = 1;
1808 goto encode_char;
1811 continue;
1812 encode_char:
1813 #ifdef Py_UNICODE_WIDE
1814 if (ch >= 0x10000) {
1815 /* code first surrogate */
1816 base64bits += 16;
1817 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1818 while (base64bits >= 6) {
1819 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1820 base64bits -= 6;
1822 /* prepare second surrogate */
1823 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1825 #endif
1826 base64bits += 16;
1827 base64buffer = (base64buffer << 16) | ch;
1828 while (base64bits >= 6) {
1829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1830 base64bits -= 6;
1833 if (base64bits)
1834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1835 if (inShift)
1836 *out++ = '-';
1838 _PyString_Resize(&v, out - start);
1839 return v;
1842 #undef IS_BASE64
1843 #undef FROM_BASE64
1844 #undef TO_BASE64
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Lookup table, indexed by the first byte of a UTF-8 sequence, giving the
   total length of that sequence in bytes.  The table is never written and
   is therefore const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five, six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1872 PyObject *PyUnicode_DecodeUTF8(const char *s,
1873 Py_ssize_t size,
1874 const char *errors)
1876 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
/* Decode `size` bytes of UTF-8 at `s` into a unicode object.
 *
 * errors    error handler name, passed to unicode_decode_call_errorhandler
 * consumed  if non-NULL, receives the number of input bytes consumed; a
 *           sequence that is cut off by the end of the input is then left
 *           unconsumed instead of being reported as an error
 *
 * The output is allocated with `size` units up front and cut back to the
 * number of units actually written at the end.  Returns the unicode object,
 * or NULL with an exception set.
 */
1879 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1880 Py_ssize_t size,
1881 const char *errors,
1882 Py_ssize_t *consumed)
1884 const char *starts = s;
1885 int n;
1886 Py_ssize_t startinpos;
1887 Py_ssize_t endinpos;
1888 Py_ssize_t outpos;
1889 const char *e;
1890 PyUnicodeObject *unicode;
1891 Py_UNICODE *p;
1892 const char *errmsg = "";
1893 PyObject *errorHandler = NULL;
1894 PyObject *exc = NULL;
1896 /* Note: size will always be longer than the resulting Unicode
1897 character count */
1898 unicode = _PyUnicode_New(size);
1899 if (!unicode)
1900 return NULL;
1901 if (size == 0) {
1902 if (consumed)
1903 *consumed = 0;
1904 return (PyObject *)unicode;
1907 /* Unpack UTF-8 encoded data */
1908 p = unicode->str;
1909 e = s + size;
1911 while (s < e) {
1912 Py_UCS4 ch = (unsigned char)*s;
/* fast path: bytes below 0x80 are copied through unchanged */
1914 if (ch < 0x80) {
1915 *p++ = (Py_UNICODE)ch;
1916 s++;
1917 continue;
/* total length of the sequence introduced by this lead byte (0 = illegal) */
1920 n = utf8_code_length[ch];
/* the sequence would run past the end of the input */
1922 if (s + n > e) {
1923 if (consumed)
1924 break;
1925 else {
1926 errmsg = "unexpected end of data";
1927 startinpos = s-starts;
1928 endinpos = size;
1929 goto utf8Error;
/* Per length: check that the trailing bytes have the form 10xxxxxx,
   assemble the code point and reject values that a shorter sequence could
   have encoded. */
1933 switch (n) {
1935 case 0:
1936 errmsg = "unexpected code byte";
1937 startinpos = s-starts;
1938 endinpos = startinpos+1;
1939 goto utf8Error;
1941 case 1:
1942 errmsg = "internal error";
1943 startinpos = s-starts;
1944 endinpos = startinpos+1;
1945 goto utf8Error;
1947 case 2:
1948 if ((s[1] & 0xc0) != 0x80) {
1949 errmsg = "invalid data";
1950 startinpos = s-starts;
1951 endinpos = startinpos+2;
1952 goto utf8Error;
1954 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1955 if (ch < 0x80) {
1956 startinpos = s-starts;
1957 endinpos = startinpos+2;
1958 errmsg = "illegal encoding";
1959 goto utf8Error;
1961 else
1962 *p++ = (Py_UNICODE)ch;
1963 break;
1965 case 3:
1966 if ((s[1] & 0xc0) != 0x80 ||
1967 (s[2] & 0xc0) != 0x80) {
1968 errmsg = "invalid data";
1969 startinpos = s-starts;
1970 endinpos = startinpos+3;
1971 goto utf8Error;
1973 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1974 if (ch < 0x0800) {
1975 /* Note: UTF-8 encodings of surrogates are considered
1976 legal UTF-8 sequences;
1978 XXX For wide builds (UCS-4) we should probably try
1979 to recombine the surrogates into a single code
1980 unit.
1982 errmsg = "illegal encoding";
1983 startinpos = s-starts;
1984 endinpos = startinpos+3;
1985 goto utf8Error;
1987 else
1988 *p++ = (Py_UNICODE)ch;
1989 break;
1991 case 4:
1992 if ((s[1] & 0xc0) != 0x80 ||
1993 (s[2] & 0xc0) != 0x80 ||
1994 (s[3] & 0xc0) != 0x80) {
1995 errmsg = "invalid data";
1996 startinpos = s-starts;
1997 endinpos = startinpos+4;
1998 goto utf8Error;
2000 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2001 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2002 /* validate and convert to UTF-16 */
2003 if ((ch < 0x10000) /* minimum value allowed for 4
2004 byte encoding */
2005 || (ch > 0x10ffff)) /* maximum value allowed for
2006 UTF-16 */
2008 errmsg = "illegal encoding";
2009 startinpos = s-starts;
2010 endinpos = startinpos+4;
2011 goto utf8Error;
2013 #ifdef Py_UNICODE_WIDE
2014 *p++ = (Py_UNICODE)ch;
2015 #else
2016 /* compute and append the two surrogates: */
2018 /* translate from 10000..10FFFF to 0..FFFF */
2019 ch -= 0x10000;
2021 /* high surrogate = top 10 bits added to D800 */
2022 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2024 /* low surrogate = bottom 10 bits added to DC00 */
2025 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2026 #endif
2027 break;
2029 default:
2030 /* Other sizes are only needed for UCS-4 */
2031 errmsg = "unsupported Unicode code range";
2032 startinpos = s-starts;
2033 endinpos = startinpos+n;
2034 goto utf8Error;
2036 s += n;
2037 continue;
/* Error path: the handler receives the addresses of s, unicode and p and
   may change all three; the loop then continues with the updated values. */
2039 utf8Error:
2040 outpos = p-PyUnicode_AS_UNICODE(unicode);
2041 if (unicode_decode_call_errorhandler(
2042 errors, &errorHandler,
2043 "utf8", errmsg,
2044 starts, size, &startinpos, &endinpos, &exc, &s,
2045 &unicode, &outpos, &p))
2046 goto onError;
/* report how far the input was read */
2048 if (consumed)
2049 *consumed = s-starts;
2051 /* Adjust length */
2052 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2053 goto onError;
2055 Py_XDECREF(errorHandler);
2056 Py_XDECREF(exc);
2057 return (PyObject *)unicode;
2059 onError:
2060 Py_XDECREF(errorHandler);
2061 Py_XDECREF(exc);
2062 Py_DECREF(unicode);
2063 return NULL;
2066 /* Allocation strategy: if the string is short, convert into a stack buffer
2067 and allocate exactly as much space needed at the end. Else allocate the
2068 maximum possible needed (4 result bytes per Unicode character), and return
2069 the excess memory at the end.
2071 PyObject *
2072 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2073 Py_ssize_t size,
2074 const char *errors)
2076 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2078 Py_ssize_t i; /* index into s of next input byte */
2079 PyObject *v; /* result string object */
2080 char *p; /* next free byte in output buffer */
2081 Py_ssize_t nallocated; /* number of result bytes allocated */
2082 Py_ssize_t nneeded; /* number of result bytes needed */
2083 char stackbuf[MAX_SHORT_UNICHARS * 4];
2085 assert(s != NULL);
2086 assert(size >= 0);
2088 if (size <= MAX_SHORT_UNICHARS) {
2089 /* Write into the stack buffer; nallocated can't overflow.
2090 * At the end, we'll allocate exactly as much heap space as it
2091 * turns out we need.
2093 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2094 v = NULL; /* will allocate after we're done */
2095 p = stackbuf;
2097 else {
2098 /* Overallocate on the heap, and give the excess back at the end. */
2099 nallocated = size * 4;
2100 if (nallocated / 4 != size) /* overflow! */
2101 return PyErr_NoMemory();
2102 v = PyString_FromStringAndSize(NULL, nallocated);
2103 if (v == NULL)
2104 return NULL;
2105 p = PyString_AS_STRING(v);
2108 for (i = 0; i < size;) {
2109 Py_UCS4 ch = s[i++];
2111 if (ch < 0x80)
2112 /* Encode ASCII */
2113 *p++ = (char) ch;
2115 else if (ch < 0x0800) {
2116 /* Encode Latin-1 */
2117 *p++ = (char)(0xc0 | (ch >> 6));
2118 *p++ = (char)(0x80 | (ch & 0x3f));
2120 else {
2121 /* Encode UCS2 Unicode ordinals */
2122 if (ch < 0x10000) {
2123 /* Special case: check for high surrogate */
2124 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2125 Py_UCS4 ch2 = s[i];
2126 /* Check for low surrogate and combine the two to
2127 form a UCS4 value */
2128 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2129 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2130 i++;
2131 goto encodeUCS4;
2133 /* Fall through: handles isolated high surrogates */
2135 *p++ = (char)(0xe0 | (ch >> 12));
2136 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2137 *p++ = (char)(0x80 | (ch & 0x3f));
2138 continue;
2140 encodeUCS4:
2141 /* Encode UCS4 Unicode ordinals */
2142 *p++ = (char)(0xf0 | (ch >> 18));
2143 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2144 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2145 *p++ = (char)(0x80 | (ch & 0x3f));
2149 if (v == NULL) {
2150 /* This was stack allocated. */
2151 nneeded = p - stackbuf;
2152 assert(nneeded <= nallocated);
2153 v = PyString_FromStringAndSize(stackbuf, nneeded);
2155 else {
2156 /* Cut back to size actually needed. */
2157 nneeded = p - PyString_AS_STRING(v);
2158 assert(nneeded <= nallocated);
2159 _PyString_Resize(&v, nneeded);
2161 return v;
2163 #undef MAX_SHORT_UNICHARS
2166 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2172 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode),
2174 NULL);
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
2179 PyObject *
2180 PyUnicode_DecodeUTF32(const char *s,
2181 Py_ssize_t size,
2182 const char *errors,
2183 int *byteorder)
2185 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188 PyObject *
2189 PyUnicode_DecodeUTF32Stateful(const char *s,
2190 Py_ssize_t size,
2191 const char *errors,
2192 int *byteorder,
2193 Py_ssize_t *consumed)
2195 const char *starts = s;
2196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
2199 PyUnicodeObject *unicode;
2200 Py_UNICODE *p;
2201 #ifndef Py_UNICODE_WIDE
2202 int i, pairs;
2203 #else
2204 const int pairs = 0;
2205 #endif
2206 const unsigned char *q, *e;
2207 int bo = 0; /* assume native ordering by default */
2208 const char *errmsg = "";
2209 /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211 int iorder[] = {0, 1, 2, 3};
2212 #else
2213 int iorder[] = {3, 2, 1, 0};
2214 #endif
2215 PyObject *errorHandler = NULL;
2216 PyObject *exc = NULL;
2217 /* On narrow builds we split characters outside the BMP into two
2218 codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220 for (i = pairs = 0; i < size/4; i++)
2221 if (((Py_UCS4 *)s)[i] >= 0x10000)
2222 pairs++;
2223 #endif
2225 /* This might be one to much, because of a BOM */
2226 unicode = _PyUnicode_New((size+3)/4+pairs);
2227 if (!unicode)
2228 return NULL;
2229 if (size == 0)
2230 return (PyObject *)unicode;
2232 /* Unpack UTF-32 encoded data */
2233 p = unicode->str;
2234 q = (unsigned char *)s;
2235 e = q + size;
2237 if (byteorder)
2238 bo = *byteorder;
2240 /* Check for BOM marks (U+FEFF) in the input and adjust current
2241 byte order setting accordingly. In native mode, the leading BOM
2242 mark is skipped, in all other modes, it is copied to the output
2243 stream as-is (giving a ZWNBSP character). */
2244 if (bo == 0) {
2245 if (size >= 4) {
2246 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2247 (q[iorder[1]] << 8) | q[iorder[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249 if (bom == 0x0000FEFF) {
2250 q += 4;
2251 bo = -1;
2253 else if (bom == 0xFFFE0000) {
2254 q += 4;
2255 bo = 1;
2257 #else
2258 if (bom == 0x0000FEFF) {
2259 q += 4;
2260 bo = 1;
2262 else if (bom == 0xFFFE0000) {
2263 q += 4;
2264 bo = -1;
2266 #endif
2270 if (bo == -1) {
2271 /* force LE */
2272 iorder[0] = 0;
2273 iorder[1] = 1;
2274 iorder[2] = 2;
2275 iorder[3] = 3;
2277 else if (bo == 1) {
2278 /* force BE */
2279 iorder[0] = 3;
2280 iorder[1] = 2;
2281 iorder[2] = 1;
2282 iorder[3] = 0;
2285 while (q < e) {
2286 Py_UCS4 ch;
2287 /* remaining bytes at the end? (size should be divisible by 4) */
2288 if (e-q<4) {
2289 if (consumed)
2290 break;
2291 errmsg = "truncated data";
2292 startinpos = ((const char *)q)-starts;
2293 endinpos = ((const char *)e)-starts;
2294 goto utf32Error;
2295 /* The remaining input chars are ignored if the callback
2296 chooses to skip the input */
2298 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2299 (q[iorder[1]] << 8) | q[iorder[0]];
2301 if (ch >= 0x110000)
2303 errmsg = "codepoint not in range(0x110000)";
2304 startinpos = ((const char *)q)-starts;
2305 endinpos = startinpos+4;
2306 goto utf32Error;
2308 #ifndef Py_UNICODE_WIDE
2309 if (ch >= 0x10000)
2311 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2312 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2314 else
2315 #endif
2316 *p++ = ch;
2317 q += 4;
2318 continue;
2319 utf32Error:
2320 outpos = p-PyUnicode_AS_UNICODE(unicode);
2321 if (unicode_decode_call_errorhandler(
2322 errors, &errorHandler,
2323 "utf32", errmsg,
2324 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2325 &unicode, &outpos, &p))
2326 goto onError;
2329 if (byteorder)
2330 *byteorder = bo;
2332 if (consumed)
2333 *consumed = (const char *)q-starts;
2335 /* Adjust length */
2336 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2337 goto onError;
2339 Py_XDECREF(errorHandler);
2340 Py_XDECREF(exc);
2341 return (PyObject *)unicode;
2343 onError:
2344 Py_DECREF(unicode);
2345 Py_XDECREF(errorHandler);
2346 Py_XDECREF(exc);
2347 return NULL;
2350 PyObject *
2351 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2352 Py_ssize_t size,
2353 const char *errors,
2354 int byteorder)
2356 PyObject *v;
2357 unsigned char *p;
2358 Py_ssize_t nsize, bytesize;
2359 #ifndef Py_UNICODE_WIDE
2360 Py_ssize_t i, pairs;
2361 #else
2362 const int pairs = 0;
2363 #endif
2364 /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder[] = {0, 1, 2, 3};
2367 #else
2368 int iorder[] = {3, 2, 1, 0};
2369 #endif
2371 #define STORECHAR(CH) \
2372 do { \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2377 p += 4; \
2378 } while(0)
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383 for (i = pairs = 0; i < size-1; i++)
2384 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386 pairs++;
2387 #endif
2388 nsize = (size - pairs + (byteorder == 0));
2389 bytesize = nsize * 4;
2390 if (bytesize / 4 != nsize)
2391 return PyErr_NoMemory();
2392 v = PyString_FromStringAndSize(NULL, bytesize);
2393 if (v == NULL)
2394 return NULL;
2396 p = (unsigned char *)PyString_AS_STRING(v);
2397 if (byteorder == 0)
2398 STORECHAR(0xFEFF);
2399 if (size == 0)
2400 return v;
2402 if (byteorder == -1) {
2403 /* force LE */
2404 iorder[0] = 0;
2405 iorder[1] = 1;
2406 iorder[2] = 2;
2407 iorder[3] = 3;
2409 else if (byteorder == 1) {
2410 /* force BE */
2411 iorder[0] = 3;
2412 iorder[1] = 2;
2413 iorder[2] = 1;
2414 iorder[3] = 0;
2417 while (size-- > 0) {
2418 Py_UCS4 ch = *s++;
2419 #ifndef Py_UNICODE_WIDE
2420 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2421 Py_UCS4 ch2 = *s;
2422 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2423 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2424 s++;
2425 size--;
2428 #endif
2429 STORECHAR(ch);
2431 return v;
2432 #undef STORECHAR
2435 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2437 if (!PyUnicode_Check(unicode)) {
2438 PyErr_BadArgument();
2439 return NULL;
2441 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2442 PyUnicode_GET_SIZE(unicode),
2443 NULL,
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
2449 PyObject *
2450 PyUnicode_DecodeUTF16(const char *s,
2451 Py_ssize_t size,
2452 const char *errors,
2453 int *byteorder)
2455 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
/* Decode `size` bytes of UTF-16 data at `s` into a new Unicode object.

   byteorder (may be NULL): on entry *byteorder selects the byte order,
   0 = native order with BOM detection, -1 = little endian, 1 = big
   endian.  On success the order actually used is written back to it.

   consumed (may be NULL): when non-NULL, a single left-over byte at the
   end of the input stops decoding without an error, and *consumed
   receives the number of input bytes that were decoded.  When NULL the
   left-over byte is reported as "truncated data".

   Malformed input is routed through unicode_decode_call_errorhandler()
   using the handler named by `errors`.

   Returns the new object, or NULL with an exception set. */
2458 PyObject *
2459 PyUnicode_DecodeUTF16Stateful(const char *s,
2460 Py_ssize_t size,
2461 const char *errors,
2462 int *byteorder,
2463 Py_ssize_t *consumed)
2465 const char *starts = s;
2466 Py_ssize_t startinpos;
2467 Py_ssize_t endinpos;
2468 Py_ssize_t outpos;
2469 PyUnicodeObject *unicode;
2470 Py_UNICODE *p;
2471 const unsigned char *q, *e;
2472 int bo = 0; /* assume native ordering by default */
2473 const char *errmsg = "";
2474 /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476 int ihi = 1, ilo = 0;
2477 #else
2478 int ihi = 0, ilo = 1;
2479 #endif
2480 PyObject *errorHandler = NULL;
2481 PyObject *exc = NULL;
2483 /* Note: size will always be longer than the resulting Unicode
2484 character count */
2485 unicode = _PyUnicode_New(size);
2486 if (!unicode)
2487 return NULL;
2488 if (size == 0)
2489 return (PyObject *)unicode;
2491 /* Unpack UTF-16 encoded data */
/* p is the output cursor; q walks the input bytes up to e (one past the end). */
2492 p = unicode->str;
2493 q = (unsigned char *)s;
2494 e = q + size;
2496 if (byteorder)
2497 bo = *byteorder;
2499 /* Check for BOM marks (U+FEFF) in the input and adjust current
2500 byte order setting accordingly. In native mode, the leading BOM
2501 mark is skipped, in all other modes, it is copied to the output
2502 stream as-is (giving a ZWNBSP character). */
2503 if (bo == 0) {
2504 if (size >= 2) {
2505 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507 if (bom == 0xFEFF) {
2508 q += 2;
2509 bo = -1;
2511 else if (bom == 0xFFFE) {
2512 q += 2;
2513 bo = 1;
2515 #else
2516 if (bom == 0xFEFF) {
2517 q += 2;
2518 bo = 1;
2520 else if (bom == 0xFFFE) {
2521 q += 2;
2522 bo = -1;
2524 #endif
/* An explicit or detected byte order overrides the native ihi/ilo offsets. */
2528 if (bo == -1) {
2529 /* force LE */
2530 ihi = 1;
2531 ilo = 0;
2533 else if (bo == 1) {
2534 /* force BE */
2535 ihi = 0;
2536 ilo = 1;
/* Main loop: consume one 16-bit unit per iteration, or two for a
   surrogate pair.  Every error path sets errmsg/startinpos/endinpos
   and jumps to utf16Error. */
2539 while (q < e) {
2540 Py_UNICODE ch;
2541 /* remaining bytes at the end? (size should be even) */
2542 if (e-q<2) {
2543 if (consumed)
2544 break;
2545 errmsg = "truncated data";
2546 startinpos = ((const char *)q)-starts;
2547 endinpos = ((const char *)e)-starts;
2548 goto utf16Error;
2549 /* The remaining input chars are ignored if the callback
2550 chooses to skip the input */
2552 ch = (q[ihi] << 8) | q[ilo];
2554 q += 2;
/* Anything outside the surrogate range is copied straight through. */
2556 if (ch < 0xD800 || ch > 0xDFFF) {
2557 *p++ = ch;
2558 continue;
2561 /* UTF-16 code pair: */
2562 if (q >= e) {
2563 errmsg = "unexpected end of data";
2564 startinpos = (((const char *)q)-2)-starts;
2565 endinpos = ((const char *)e)-starts;
2566 goto utf16Error;
/* High surrogate: the next unit must be a low surrogate.  Narrow builds
   store the pair as two units, wide builds combine it into one code point. */
2568 if (0xD800 <= ch && ch <= 0xDBFF) {
2569 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2570 q += 2;
2571 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2573 *p++ = ch;
2574 *p++ = ch2;
2575 #else
2576 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2577 #endif
2578 continue;
2580 else {
2581 errmsg = "illegal UTF-16 surrogate";
2582 startinpos = (((const char *)q)-4)-starts;
2583 endinpos = startinpos+2;
2584 goto utf16Error;
/* Reached only for a low surrogate (0xDC00..0xDFFF) with no preceding high one. */
2588 errmsg = "illegal encoding";
2589 startinpos = (((const char *)q)-2)-starts;
2590 endinpos = startinpos+2;
2591 /* Fall through to report the error */
/* The handler receives the addresses of q, unicode and p, so it can
   reposition the input and output cursors and replace the result object. */
2593 utf16Error:
2594 outpos = p-PyUnicode_AS_UNICODE(unicode);
2595 if (unicode_decode_call_errorhandler(
2596 errors, &errorHandler,
2597 "utf16", errmsg,
2598 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2599 &unicode, &outpos, &p))
2600 goto onError;
2603 if (byteorder)
2604 *byteorder = bo;
2606 if (consumed)
2607 *consumed = (const char *)q-starts;
2609 /* Adjust length */
2610 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2611 goto onError;
2613 Py_XDECREF(errorHandler);
2614 Py_XDECREF(exc);
2615 return (PyObject *)unicode;
/* Failure exit: drop the partial result and the cached handler/exception. */
2617 onError:
2618 Py_DECREF(unicode);
2619 Py_XDECREF(errorHandler);
2620 Py_XDECREF(exc);
2621 return NULL;
2624 PyObject *
2625 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2626 Py_ssize_t size,
2627 const char *errors,
2628 int byteorder)
2630 PyObject *v;
2631 unsigned char *p;
2632 Py_ssize_t nsize, bytesize;
2633 #ifdef Py_UNICODE_WIDE
2634 Py_ssize_t i, pairs;
2635 #else
2636 const int pairs = 0;
2637 #endif
2638 /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi = 1, ilo = 0;
2641 #else
2642 int ihi = 0, ilo = 1;
2643 #endif
2645 #define STORECHAR(CH) \
2646 do { \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2649 p += 2; \
2650 } while(0)
2652 #ifdef Py_UNICODE_WIDE
2653 for (i = pairs = 0; i < size; i++)
2654 if (s[i] >= 0x10000)
2655 pairs++;
2656 #endif
2657 /* 2 * (size + pairs + (byteorder == 0)) */
2658 if (size > PY_SSIZE_T_MAX ||
2659 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2660 return PyErr_NoMemory();
2661 nsize = size + pairs + (byteorder == 0);
2662 bytesize = nsize * 2;
2663 if (bytesize / 2 != nsize)
2664 return PyErr_NoMemory();
2665 v = PyString_FromStringAndSize(NULL, bytesize);
2666 if (v == NULL)
2667 return NULL;
2669 p = (unsigned char *)PyString_AS_STRING(v);
2670 if (byteorder == 0)
2671 STORECHAR(0xFEFF);
2672 if (size == 0)
2673 return v;
2675 if (byteorder == -1) {
2676 /* force LE */
2677 ihi = 1;
2678 ilo = 0;
2680 else if (byteorder == 1) {
2681 /* force BE */
2682 ihi = 0;
2683 ilo = 1;
2686 while (size-- > 0) {
2687 Py_UNICODE ch = *s++;
2688 Py_UNICODE ch2 = 0;
2689 #ifdef Py_UNICODE_WIDE
2690 if (ch >= 0x10000) {
2691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692 ch = 0xD800 | ((ch-0x10000) >> 10);
2694 #endif
2695 STORECHAR(ch);
2696 if (ch2)
2697 STORECHAR(ch2);
2699 return v;
2700 #undef STORECHAR
2703 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2705 if (!PyUnicode_Check(unicode)) {
2706 PyErr_BadArgument();
2707 return NULL;
2709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2710 PyUnicode_GET_SIZE(unicode),
2711 NULL,
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
/* Cached pointer to the name-lookup C API exported by the unicodedata
   module.  It stays NULL until the first \N{...} escape is decoded, at
   which point PyUnicode_DecodeUnicodeEscape() imports the module and
   fills it in. */
2717 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
/* Decode `size` bytes at `s` that use Python's unicode-escape syntax
   into a new Unicode object.

   Bytes other than a backslash are taken as Unicode ordinals.
   Recognised escapes: backslash-newline (dropped), \\ \' \" \b \f \t \n
   \r \v \a, up to three octal digits, \xXX, \uXXXX, \UXXXXXXXX and
   \N{name}.  An unrecognised escape is copied through as a backslash
   followed by the character.

   Malformed escapes are routed through
   unicode_decode_call_errorhandler() using the handler named by
   `errors`.

   Returns the new object, or NULL with an exception set. */
2719 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2720 Py_ssize_t size,
2721 const char *errors)
2723 const char *starts = s;
2724 Py_ssize_t startinpos;
2725 Py_ssize_t endinpos;
2726 Py_ssize_t outpos;
2727 int i;
2728 PyUnicodeObject *v;
2729 Py_UNICODE *p;
2730 const char *end;
2731 char* message;
2732 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2733 PyObject *errorHandler = NULL;
2734 PyObject *exc = NULL;
2736 /* Escaped strings will always be longer than the resulting
2737 Unicode string, so we start with size here and then reduce the
2738 length after conversion to the true value.
2739 (but if the error callback returns a long replacement string
2740 we'll have to allocate more space) */
2741 v = _PyUnicode_New(size);
2742 if (v == NULL)
2743 goto onError;
2744 if (size == 0)
2745 return (PyObject *)v;
2747 p = PyUnicode_AS_UNICODE(v);
2748 end = s + size;
2750 while (s < end) {
2751 unsigned char c;
2752 Py_UNICODE x;
2753 int digits;
2755 /* Non-escape characters are interpreted as Unicode ordinals */
2756 if (*s != '\\') {
2757 *p++ = (unsigned char) *s++;
2758 continue;
2761 startinpos = s-starts;
2762 /* \ - Escapes */
/* NOTE(review): when the backslash is the last input byte, `c = *s++`
   reads the byte at `end` before the `s > end` test discards it.  This
   appears to rely on the buffer being readable one byte past `size`
   (e.g. NUL-terminated) - confirm against callers. */
2763 s++;
2764 c = *s++;
2765 if (s > end)
2766 c = '\0'; /* Invalid after \ */
2767 switch (c) {
2769 /* \x escapes */
2770 case '\n': break;
2771 case '\\': *p++ = '\\'; break;
2772 case '\'': *p++ = '\''; break;
2773 case '\"': *p++ = '\"'; break;
2774 case 'b': *p++ = '\b'; break;
2775 case 'f': *p++ = '\014'; break; /* FF */
2776 case 't': *p++ = '\t'; break;
2777 case 'n': *p++ = '\n'; break;
2778 case 'r': *p++ = '\r'; break;
2779 case 'v': *p++ = '\013'; break; /* VT */
2780 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2782 /* \OOO (octal) escapes */
2783 case '0': case '1': case '2': case '3':
2784 case '4': case '5': case '6': case '7':
/* s[-1] is the first octal digit; at most two more digits are consumed. */
2785 x = s[-1] - '0';
2786 if (s < end && '0' <= *s && *s <= '7') {
2787 x = (x<<3) + *s++ - '0';
2788 if (s < end && '0' <= *s && *s <= '7')
2789 x = (x<<3) + *s++ - '0';
2791 *p++ = x;
2792 break;
2794 /* hex escapes */
2795 /* \xXX */
2796 case 'x':
2797 digits = 2;
2798 message = "truncated \\xXX escape";
2799 goto hexescape;
2801 /* \uXXXX */
2802 case 'u':
2803 digits = 4;
2804 message = "truncated \\uXXXX escape";
2805 goto hexescape;
2807 /* \UXXXXXXXX */
2808 case 'U':
2809 digits = 8;
2810 message = "truncated \\UXXXXXXXX escape";
/* Shared tail for \x, \u and \U: parse exactly `digits` hex digits into chr. */
2811 hexescape:
2812 chr = 0;
2813 outpos = p-PyUnicode_AS_UNICODE(v);
2814 if (s+digits>end) {
2815 endinpos = size;
2816 if (unicode_decode_call_errorhandler(
2817 errors, &errorHandler,
2818 "unicodeescape", "end of string in escape sequence",
2819 starts, size, &startinpos, &endinpos, &exc, &s,
2820 &v, &outpos, &p))
2821 goto onError;
2822 goto nextByte;
2824 for (i = 0; i < digits; ++i) {
2825 c = (unsigned char) s[i];
2826 if (!isxdigit(c)) {
2827 endinpos = (s+i+1)-starts;
2828 if (unicode_decode_call_errorhandler(
2829 errors, &errorHandler,
2830 "unicodeescape", message,
2831 starts, size, &startinpos, &endinpos, &exc, &s,
2832 &v, &outpos, &p))
2833 goto onError;
2834 goto nextByte;
2836 chr = (chr<<4) & ~0xF;
2837 if (c >= '0' && c <= '9')
2838 chr += c - '0';
2839 else if (c >= 'a' && c <= 'f')
2840 chr += 10 + c - 'a';
2841 else
2842 chr += 10 + c - 'A';
2844 s += i;
2845 if (chr == 0xffffffff && PyErr_Occurred())
2846 /* _decoding_error will have already written into the
2847 target buffer. */
2848 break;
/* Also the jump target of the \N{name} branch once a name has been resolved. */
2849 store:
2850 /* when we get here, chr is a 32-bit unicode character */
2851 if (chr <= 0xffff)
2852 /* UCS-2 character */
2853 *p++ = (Py_UNICODE) chr;
2854 else if (chr <= 0x10ffff) {
2855 /* UCS-4 character. Either store directly, or as
2856 surrogate pair. */
2857 #ifdef Py_UNICODE_WIDE
2858 *p++ = chr;
2859 #else
2860 chr -= 0x10000L;
2861 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2862 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2863 #endif
2864 } else {
2865 endinpos = s-starts;
2866 outpos = p-PyUnicode_AS_UNICODE(v);
2867 if (unicode_decode_call_errorhandler(
2868 errors, &errorHandler,
2869 "unicodeescape", "illegal Unicode character",
2870 starts, size, &startinpos, &endinpos, &exc, &s,
2871 &v, &outpos, &p))
2872 goto onError;
2874 break;
2876 /* \N{name} */
2877 case 'N':
2878 message = "malformed \\N character escape";
2879 if (ucnhash_CAPI == NULL) {
2880 /* load the unicode data module */
2881 PyObject *m, *api;
2882 m = PyImport_ImportModuleNoBlock("unicodedata");
2883 if (m == NULL)
2884 goto ucnhashError;
2885 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2886 Py_DECREF(m);
2887 if (api == NULL)
2888 goto ucnhashError;
2889 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2890 Py_DECREF(api);
2891 if (ucnhash_CAPI == NULL)
2892 goto ucnhashError;
/* NOTE(review): `*s == '{'` and the loop condition `*s != '}' && s < end`
   both dereference s before comparing it with `end`, so the byte at
   `end` can be read when the name is unterminated or \N is the last
   escape.  Testing `s < end` first would avoid that read. */
2894 if (*s == '{') {
2895 const char *start = s+1;
2896 /* look for the closing brace */
2897 while (*s != '}' && s < end)
2898 s++;
2899 if (s > start && s < end && *s == '}') {
2900 /* found a name. look it up in the unicode database */
2901 message = "unknown Unicode character name";
2902 s++;
2903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904 goto store;
2907 endinpos = s-starts;
2908 outpos = p-PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler,
2911 "unicodeescape", message,
2912 starts, size, &startinpos, &endinpos, &exc, &s,
2913 &v, &outpos, &p))
2914 goto onError;
2915 break;
2917 default:
/* s > end here means the backslash was the last input byte. */
2918 if (s > end) {
2919 message = "\\ at end of string";
2920 s--;
2921 endinpos = s-starts;
2922 outpos = p-PyUnicode_AS_UNICODE(v);
2923 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler,
2925 "unicodeescape", message,
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 &v, &outpos, &p))
2928 goto onError;
2930 else {
2931 *p++ = '\\';
2932 *p++ = (unsigned char)s[-1];
2934 break;
/* Continuation point for the hex-escape error paths that leave the switch early. */
2936 nextByte:
/* Shrink the over-allocated result to the number of units actually written. */
2939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940 goto onError;
2941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
2943 return (PyObject *)v;
/* Reached when the unicodedata C API could not be loaded for a \N escape. */
2945 ucnhashError:
2946 PyErr_SetString(
2947 PyExc_UnicodeError,
2948 "\\N escapes not supported (can't load unicodedata module)"
2950 Py_XDECREF(v);
2951 Py_XDECREF(errorHandler);
2952 Py_XDECREF(exc);
2953 return NULL;
2955 onError:
2956 Py_XDECREF(v);
2957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
2959 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate. */
2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970 Py_ssize_t size,
2971 Py_UNICODE ch)
2973 /* like wcschr, but doesn't stop at NULL characters */
2975 while (size-- > 0) {
2976 if (*s == ch)
2977 return s;
2978 s++;
2981 return NULL;
2984 static
2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986 Py_ssize_t size,
2987 int quotes)
2989 PyObject *repr;
2990 char *p;
2992 static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10;
2995 #else
2996 const Py_ssize_t expandsize = 6;
2997 #endif
2999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3003 /* Initial allocation is based on the longest-possible unichr
3004 escape.
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape.
3017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018 return PyErr_NoMemory();
3020 repr = PyString_FromStringAndSize(NULL,
3022 + expandsize*size
3023 + 1);
3024 if (repr == NULL)
3025 return NULL;
3027 p = PyString_AS_STRING(repr);
3029 if (quotes) {
3030 *p++ = 'u';
3031 *p++ = (findchar(s, size, '\'') &&
3032 !findchar(s, size, '"')) ? '"' : '\'';
3034 while (size-- > 0) {
3035 Py_UNICODE ch = *s++;
3037 /* Escape quotes and backslashes */
3038 if ((quotes &&
3039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040 *p++ = '\\';
3041 *p++ = (char) ch;
3042 continue;
3045 #ifdef Py_UNICODE_WIDE
3046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) {
3048 *p++ = '\\';
3049 *p++ = 'U';
3050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057 *p++ = hexdigit[ch & 0x0000000F];
3058 continue;
3060 #else
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2;
3064 Py_UCS4 ucs;
3066 ch2 = *s++;
3067 size--;
3068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\';
3071 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F];
3080 continue;
3082 /* Fall through: isolated surrogates are copied as-is */
3083 s--;
3084 size++;
3086 #endif
3088 /* Map 16-bit characters to '\uxxxx' */
3089 if (ch >= 256) {
3090 *p++ = '\\';
3091 *p++ = 'u';
3092 *p++ = hexdigit[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F];
3098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') {
3100 *p++ = '\\';
3101 *p++ = 't';
3103 else if (ch == '\n') {
3104 *p++ = '\\';
3105 *p++ = 'n';
3107 else if (ch == '\r') {
3108 *p++ = '\\';
3109 *p++ = 'r';
3112 /* Map non-printable US ASCII to '\xhh' */
3113 else if (ch < ' ' || ch >= 0x7F) {
3114 *p++ = '\\';
3115 *p++ = 'x';
3116 *p++ = hexdigit[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F];
3120 /* Copy everything else as-is */
3121 else
3122 *p++ = (char) ch;
3124 if (quotes)
3125 *p++ = PyString_AS_STRING(repr)[1];
3127 *p = '\0';
3128 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3129 return repr;
3132 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3133 Py_ssize_t size)
3135 return unicodeescape_string(s, size, 0);
3138 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 return NULL;
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3145 PyUnicode_GET_SIZE(unicode));
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
/* Decode `size` bytes at `s` that use the raw-unicode-escape syntax
   into a new Unicode object.

   Every byte is taken as a Unicode ordinal, except that \uXXXX and
   \UXXXXXXXX are interpreted when they are preceded by an odd number
   of backslashes.  No other escape is recognised.

   Malformed or out-of-range escapes are routed through
   unicode_decode_call_errorhandler() using the handler named by
   `errors`.

   Returns the new object, or NULL with an exception set. */
3150 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3151 Py_ssize_t size,
3152 const char *errors)
3154 const char *starts = s;
3155 Py_ssize_t startinpos;
3156 Py_ssize_t endinpos;
3157 Py_ssize_t outpos;
3158 PyUnicodeObject *v;
3159 Py_UNICODE *p;
3160 const char *end;
3161 const char *bs;
3162 PyObject *errorHandler = NULL;
3163 PyObject *exc = NULL;
3165 /* Escaped strings will always be longer than the resulting
3166 Unicode string, so we start with size here and then reduce the
3167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
3169 v = _PyUnicode_New(size);
3170 if (v == NULL)
3171 goto onError;
3172 if (size == 0)
3173 return (PyObject *)v;
3174 p = PyUnicode_AS_UNICODE(v);
3175 end = s + size;
3176 while (s < end) {
3177 unsigned char c;
3178 Py_UCS4 x;
3179 int i;
3180 int count;
3182 /* Non-escape characters are interpreted as Unicode ordinals */
3183 if (*s != '\\') {
3184 *p++ = (unsigned char)*s++;
3185 continue;
3187 startinpos = s-starts;
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes is odd */
/* The whole run of backslashes is copied to the output first; bs marks
   where the run started so its length can be tested afterwards. */
3191 bs = s;
3192 for (;s < end;) {
3193 if (*s != '\\')
3194 break;
3195 *p++ = (unsigned char)*s++;
3197 if (((s - bs) & 1) == 0 ||
3198 s >= end ||
3199 (*s != 'u' && *s != 'U')) {
3200 continue;
/* A real escape: take back the last backslash that was just copied. */
3202 p--;
3203 count = *s=='u' ? 4 : 8;
3204 s++;
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
/* NOTE(review): this loop reads *s without comparing s against `end`,
   so an escape cut off by the end of the input reads the byte(s) at and
   after `end`.  It appears to rely on a non-hex terminator (e.g. NUL)
   following the buffer - confirm against callers.  The same message
   "truncated \\uXXXX" is used for both the 4- and 8-digit forms. */
3207 outpos = p-PyUnicode_AS_UNICODE(v);
3208 for (x = 0, i = 0; i < count; ++i, ++s) {
3209 c = (unsigned char)*s;
3210 if (!isxdigit(c)) {
3211 endinpos = s-starts;
3212 if (unicode_decode_call_errorhandler(
3213 errors, &errorHandler,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts, size, &startinpos, &endinpos, &exc, &s,
3216 &v, &outpos, &p))
3217 goto onError;
3218 goto nextByte;
3220 x = (x<<4) & ~0xF;
3221 if (c >= '0' && c <= '9')
3222 x += c - '0';
3223 else if (c >= 'a' && c <= 'f')
3224 x += 10 + c - 'a';
3225 else
3226 x += 10 + c - 'A';
3228 if (x <= 0xffff)
3229 /* UCS-2 character */
3230 *p++ = (Py_UNICODE) x;
3231 else if (x <= 0x10ffff) {
3232 /* UCS-4 character. Either store directly, or as
3233 surrogate pair. */
3234 #ifdef Py_UNICODE_WIDE
3235 *p++ = (Py_UNICODE) x;
3236 #else
3237 x -= 0x10000L;
3238 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3240 #endif
3241 } else {
3242 endinpos = s-starts;
3243 outpos = p-PyUnicode_AS_UNICODE(v);
3244 if (unicode_decode_call_errorhandler(
3245 errors, &errorHandler,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247 starts, size, &startinpos, &endinpos, &exc, &s,
3248 &v, &outpos, &p))
3249 goto onError;
/* Continuation point after a handled hex-digit error. */
3251 nextByte:
/* Shrink the over-allocated result to the number of units actually written. */
3254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3255 goto onError;
3256 Py_XDECREF(errorHandler);
3257 Py_XDECREF(exc);
3258 return (PyObject *)v;
3260 onError:
3261 Py_XDECREF(v);
3262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
3264 return NULL;
3267 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3268 Py_ssize_t size)
3270 PyObject *repr;
3271 char *p;
3272 char *q;
3274 static const char *hexdigit = "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276 const Py_ssize_t expandsize = 10;
3277 #else
3278 const Py_ssize_t expandsize = 6;
3279 #endif
3281 if (size > PY_SSIZE_T_MAX / expandsize)
3282 return PyErr_NoMemory();
3284 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3285 if (repr == NULL)
3286 return NULL;
3287 if (size == 0)
3288 return repr;
3290 p = q = PyString_AS_STRING(repr);
3291 while (size-- > 0) {
3292 Py_UNICODE ch = *s++;
3293 #ifdef Py_UNICODE_WIDE
3294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch >= 0x10000) {
3296 *p++ = '\\';
3297 *p++ = 'U';
3298 *p++ = hexdigit[(ch >> 28) & 0xf];
3299 *p++ = hexdigit[(ch >> 24) & 0xf];
3300 *p++ = hexdigit[(ch >> 20) & 0xf];
3301 *p++ = hexdigit[(ch >> 16) & 0xf];
3302 *p++ = hexdigit[(ch >> 12) & 0xf];
3303 *p++ = hexdigit[(ch >> 8) & 0xf];
3304 *p++ = hexdigit[(ch >> 4) & 0xf];
3305 *p++ = hexdigit[ch & 15];
3307 else
3308 #else
3309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch >= 0xD800 && ch < 0xDC00) {
3311 Py_UNICODE ch2;
3312 Py_UCS4 ucs;
3314 ch2 = *s++;
3315 size--;
3316 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318 *p++ = '\\';
3319 *p++ = 'U';
3320 *p++ = hexdigit[(ucs >> 28) & 0xf];
3321 *p++ = hexdigit[(ucs >> 24) & 0xf];
3322 *p++ = hexdigit[(ucs >> 20) & 0xf];
3323 *p++ = hexdigit[(ucs >> 16) & 0xf];
3324 *p++ = hexdigit[(ucs >> 12) & 0xf];
3325 *p++ = hexdigit[(ucs >> 8) & 0xf];
3326 *p++ = hexdigit[(ucs >> 4) & 0xf];
3327 *p++ = hexdigit[ucs & 0xf];
3328 continue;
3330 /* Fall through: isolated surrogates are copied as-is */
3331 s--;
3332 size++;
3334 #endif
3335 /* Map 16-bit characters to '\uxxxx' */
3336 if (ch >= 256) {
3337 *p++ = '\\';
3338 *p++ = 'u';
3339 *p++ = hexdigit[(ch >> 12) & 0xf];
3340 *p++ = hexdigit[(ch >> 8) & 0xf];
3341 *p++ = hexdigit[(ch >> 4) & 0xf];
3342 *p++ = hexdigit[ch & 15];
3344 /* Copy everything else as-is */
3345 else
3346 *p++ = (char) ch;
3348 *p = '\0';
3349 _PyString_Resize(&repr, p - q);
3350 return repr;
3353 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3355 if (!PyUnicode_Check(unicode)) {
3356 PyErr_BadArgument();
3357 return NULL;
3359 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3360 PyUnicode_GET_SIZE(unicode));
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3365 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3366 Py_ssize_t size,
3367 const char *errors)
3369 const char *starts = s;
3370 Py_ssize_t startinpos;
3371 Py_ssize_t endinpos;
3372 Py_ssize_t outpos;
3373 PyUnicodeObject *v;
3374 Py_UNICODE *p;
3375 const char *end;
3376 const char *reason;
3377 PyObject *errorHandler = NULL;
3378 PyObject *exc = NULL;
3380 #ifdef Py_UNICODE_WIDE
3381 Py_UNICODE unimax = PyUnicode_GetMax();
3382 #endif
3384 /* XXX overflow detection missing */
3385 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3386 if (v == NULL)
3387 goto onError;
3388 if (PyUnicode_GetSize((PyObject *)v) == 0)
3389 return (PyObject *)v;
3390 p = PyUnicode_AS_UNICODE(v);
3391 end = s + size;
3393 while (s < end) {
3394 memcpy(p, s, sizeof(Py_UNICODE));
3395 /* We have to sanity check the raw data, otherwise doom looms for
3396 some malformed UCS-4 data. */
3397 if (
3398 #ifdef Py_UNICODE_WIDE
3399 *p > unimax || *p < 0 ||
3400 #endif
3401 end-s < Py_UNICODE_SIZE
3404 startinpos = s - starts;
3405 if (end-s < Py_UNICODE_SIZE) {
3406 endinpos = end-starts;
3407 reason = "truncated input";
3409 else {
3410 endinpos = s - starts + Py_UNICODE_SIZE;
3411 reason = "illegal code point (> 0x10FFFF)";
3413 outpos = p - PyUnicode_AS_UNICODE(v);
3414 if (unicode_decode_call_errorhandler(
3415 errors, &errorHandler,
3416 "unicode_internal", reason,
3417 starts, size, &startinpos, &endinpos, &exc, &s,
3418 &v, &outpos, &p)) {
3419 goto onError;
3422 else {
3423 p++;
3424 s += Py_UNICODE_SIZE;
3428 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3429 goto onError;
3430 Py_XDECREF(errorHandler);
3431 Py_XDECREF(exc);
3432 return (PyObject *)v;
3434 onError:
3435 Py_XDECREF(v);
3436 Py_XDECREF(errorHandler);
3437 Py_XDECREF(exc);
3438 return NULL;
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3443 PyObject *PyUnicode_DecodeLatin1(const char *s,
3444 Py_ssize_t size,
3445 const char *errors)
3447 PyUnicodeObject *v;
3448 Py_UNICODE *p;
3450 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3451 if (size == 1) {
3452 Py_UNICODE r = *(unsigned char*)s;
3453 return PyUnicode_FromUnicode(&r, 1);
3456 v = _PyUnicode_New(size);
3457 if (v == NULL)
3458 goto onError;
3459 if (size == 0)
3460 return (PyObject *)v;
3461 p = PyUnicode_AS_UNICODE(v);
3462 while (size-- > 0)
3463 *p++ = (unsigned char)*s++;
3464 return (PyObject *)v;
3466 onError:
3467 Py_XDECREF(v);
3468 return NULL;
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject **exceptionObject,
3473 const char *encoding,
3474 const Py_UNICODE *unicode, Py_ssize_t size,
3475 Py_ssize_t startpos, Py_ssize_t endpos,
3476 const char *reason)
3478 if (*exceptionObject == NULL) {
3479 *exceptionObject = PyUnicodeEncodeError_Create(
3480 encoding, unicode, size, startpos, endpos, reason);
3482 else {
3483 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3484 goto onError;
3485 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3486 goto onError;
3487 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3488 goto onError;
3489 return;
3490 onError:
3491 Py_DECREF(*exceptionObject);
3492 *exceptionObject = NULL;
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject **exceptionObject,
3498 const char *encoding,
3499 const Py_UNICODE *unicode, Py_ssize_t size,
3500 Py_ssize_t startpos, Py_ssize_t endpos,
3501 const char *reason)
3503 make_encode_exception(exceptionObject,
3504 encoding, unicode, size, startpos, endpos, reason);
3505 if (*exceptionObject != NULL)
3506 PyCodec_StrictErrors(*exceptionObject);
3509 /* error handling callback helper:
3510 build arguments, call the callback and check the arguments,
3511 put the result into newpos and return the replacement string, which
3512 has to be freed by the caller */
3513 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3514 PyObject **errorHandler,
3515 const char *encoding, const char *reason,
3516 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3517 Py_ssize_t startpos, Py_ssize_t endpos,
3518 Py_ssize_t *newpos)
3520 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3522 PyObject *restuple;
3523 PyObject *resunicode;
3525 if (*errorHandler == NULL) {
3526 *errorHandler = PyCodec_LookupError(errors);
3527 if (*errorHandler == NULL)
3528 return NULL;
3531 make_encode_exception(exceptionObject,
3532 encoding, unicode, size, startpos, endpos, reason);
3533 if (*exceptionObject == NULL)
3534 return NULL;
3536 restuple = PyObject_CallFunctionObjArgs(
3537 *errorHandler, *exceptionObject, NULL);
3538 if (restuple == NULL)
3539 return NULL;
3540 if (!PyTuple_Check(restuple)) {
3541 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3542 Py_DECREF(restuple);
3543 return NULL;
3545 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3546 &resunicode, newpos)) {
3547 Py_DECREF(restuple);
3548 return NULL;
3550 if (*newpos<0)
3551 *newpos = size+*newpos;
3552 if (*newpos<0 || *newpos>size) {
3553 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3554 Py_DECREF(restuple);
3555 return NULL;
3557 Py_INCREF(resunicode);
3558 Py_DECREF(restuple);
3559 return resunicode;
/* Shared encoder for the single-byte codecs whose code points equal
   the Unicode ordinals.

   Encodes the `size` code units at `p`; every unit below `limit` is
   written as one byte.  A `limit` of 256 selects the "latin-1" naming
   and error reason, any other value selects the "ascii" ones.

   Runs of units >= limit are handled according to `errors`: NULL or
   "strict" raises, "replace" writes one '?' per unit, "ignore" drops
   them, "xmlcharrefreplace" writes "&#N;" references, and any other
   name is resolved through unicode_encode_call_errorhandler().

   Returns a new string object, or NULL with an exception set. */
3562 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3563 Py_ssize_t size,
3564 const char *errors,
3565 int limit)
3567 /* output object */
3568 PyObject *res;
3569 /* pointers to the beginning and end+1 of input */
3570 const Py_UNICODE *startp = p;
3571 const Py_UNICODE *endp = p + size;
3572 /* pointer to the beginning of the unencodable characters */
3573 /* const Py_UNICODE *badp = NULL; */
3574 /* pointer into the output */
3575 char *str;
3576 /* current output position */
3577 Py_ssize_t respos = 0;
3578 Py_ssize_t ressize;
3579 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3580 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581 PyObject *errorHandler = NULL;
3582 PyObject *exc = NULL;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585 int known_errorHandler = -1;
3587 /* allocate enough for a simple encoding without
3588 replacements, if we need more, we'll resize */
3589 res = PyString_FromStringAndSize(NULL, size);
3590 if (res == NULL)
3591 goto onError;
3592 if (size == 0)
3593 return res;
3594 str = PyString_AS_STRING(res);
3595 ressize = size;
3597 while (p<endp) {
3598 Py_UNICODE c = *p;
3600 /* can we encode this? */
3601 if (c<limit) {
3602 /* no overflow check, because we know that the space is enough */
3603 *str++ = (char)c;
3604 ++p;
3606 else {
3607 Py_ssize_t unicodepos = p-startp;
3608 Py_ssize_t requiredsize;
3609 PyObject *repunicode;
3610 Py_ssize_t repsize;
3611 Py_ssize_t newpos;
/* NOTE(review): this inner respos shadows the function-level respos
   declared above; the outer one is only assigned after the loop. */
3612 Py_ssize_t respos;
3613 Py_UNICODE *uni2;
3614 /* startpos for collecting unencodable chars */
3615 const Py_UNICODE *collstart = p;
3616 const Py_UNICODE *collend = p;
3617 /* find all unencodable characters */
3618 while ((collend < endp) && ((*collend)>=limit))
3619 ++collend;
3620 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621 if (known_errorHandler==-1) {
3622 if ((errors==NULL) || (!strcmp(errors, "strict")))
3623 known_errorHandler = 1;
3624 else if (!strcmp(errors, "replace"))
3625 known_errorHandler = 2;
3626 else if (!strcmp(errors, "ignore"))
3627 known_errorHandler = 3;
3628 else if (!strcmp(errors, "xmlcharrefreplace"))
3629 known_errorHandler = 4;
3630 else
3631 known_errorHandler = 0;
3633 switch (known_errorHandler) {
3634 case 1: /* strict */
3635 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3636 goto onError;
3637 case 2: /* replace */
/* One '?' per unencodable unit, so the initial allocation still suffices. */
3638 while (collstart++<collend)
3639 *str++ = '?'; /* fall through */
3640 case 3: /* ignore */
3641 p = collend;
3642 break;
3643 case 4: /* xmlcharrefreplace */
3644 respos = str-PyString_AS_STRING(res);
3645 /* determine replacement size (temporarily (mis)uses p) */
/* Each reference is "&#" + decimal digits + ";", hence 2 + digits + 1. */
3646 for (p = collstart, repsize = 0; p < collend; ++p) {
3647 if (*p<10)
3648 repsize += 2+1+1;
3649 else if (*p<100)
3650 repsize += 2+2+1;
3651 else if (*p<1000)
3652 repsize += 2+3+1;
3653 else if (*p<10000)
3654 repsize += 2+4+1;
3655 #ifndef Py_UNICODE_WIDE
3656 else
3657 repsize += 2+5+1;
3658 #else
3659 else if (*p<100000)
3660 repsize += 2+5+1;
3661 else if (*p<1000000)
3662 repsize += 2+6+1;
3663 else
3664 repsize += 2+7+1;
3665 #endif
/* Room needed: what is already written + the references + one byte for
   every input unit after the run.  Growth is at least a doubling. */
3667 requiredsize = respos+repsize+(endp-collend);
3668 if (requiredsize > ressize) {
3669 if (requiredsize<2*ressize)
3670 requiredsize = 2*ressize;
3671 if (_PyString_Resize(&res, requiredsize))
3672 goto onError;
3673 str = PyString_AS_STRING(res) + respos;
3674 ressize = requiredsize;
3676 /* generate replacement (temporarily (mis)uses p) */
3677 for (p = collstart; p < collend; ++p) {
3678 str += sprintf(str, "&#%d;", (int)*p);
3680 p = collend;
3681 break;
3682 default:
3683 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3684 encoding, reason, startp, size, &exc,
3685 collstart-startp, collend-startp, &newpos);
3686 if (repunicode == NULL)
3687 goto onError;
3688 /* need more space? (at least enough for what we have+the
3689 replacement+the rest of the string, so we won't have to
3690 check space for encodable characters) */
3691 respos = str-PyString_AS_STRING(res);
3692 repsize = PyUnicode_GET_SIZE(repunicode);
3693 requiredsize = respos+repsize+(endp-collend);
3694 if (requiredsize > ressize) {
3695 if (requiredsize<2*ressize)
3696 requiredsize = 2*ressize;
3697 if (_PyString_Resize(&res, requiredsize)) {
3698 Py_DECREF(repunicode);
3699 goto onError;
3701 str = PyString_AS_STRING(res) + respos;
3702 ressize = requiredsize;
3704 /* check if there is anything unencodable in the replacement
3705 and copy it to the output */
3706 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3707 c = *uni2;
3708 if (c >= limit) {
3709 raise_encode_exception(&exc, encoding, startp, size,
3710 unicodepos, unicodepos+1, reason);
3711 Py_DECREF(repunicode);
3712 goto onError;
3714 *str = (char)c;
/* Resume at the input position chosen by the error handler.
   NOTE(review): the size estimate above assumes input resumes at
   collend; what happens when newpos points before collend is not
   checked here - confirm. */
3716 p = startp + newpos;
3717 Py_DECREF(repunicode);
3721 /* Resize if we allocated too much */
3722 respos = str-PyString_AS_STRING(res);
3723 if (respos<ressize)
3724 /* If this fails res will be NULL */
3725 _PyString_Resize(&res, respos);
3726 Py_XDECREF(errorHandler);
3727 Py_XDECREF(exc);
3728 return res;
3730 onError:
3731 Py_XDECREF(res);
3732 Py_XDECREF(errorHandler);
3733 Py_XDECREF(exc);
3734 return NULL;
3737 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3738 Py_ssize_t size,
3739 const char *errors)
3741 return unicode_encode_ucs1(p, size, errors, 256);
3744 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3746 if (!PyUnicode_Check(unicode)) {
3747 PyErr_BadArgument();
3748 return NULL;
3750 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3751 PyUnicode_GET_SIZE(unicode),
3752 NULL);
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3757 PyObject *PyUnicode_DecodeASCII(const char *s,
3758 Py_ssize_t size,
3759 const char *errors)
3761 const char *starts = s;
3762 PyUnicodeObject *v;
3763 Py_UNICODE *p;
3764 Py_ssize_t startinpos;
3765 Py_ssize_t endinpos;
3766 Py_ssize_t outpos;
3767 const char *e;
3768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
3771 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772 if (size == 1 && *(unsigned char*)s < 128) {
3773 Py_UNICODE r = *(unsigned char*)s;
3774 return PyUnicode_FromUnicode(&r, 1);
3777 v = _PyUnicode_New(size);
3778 if (v == NULL)
3779 goto onError;
3780 if (size == 0)
3781 return (PyObject *)v;
3782 p = PyUnicode_AS_UNICODE(v);
3783 e = s + size;
3784 while (s < e) {
3785 register unsigned char c = (unsigned char)*s;
3786 if (c < 128) {
3787 *p++ = c;
3788 ++s;
3790 else {
3791 startinpos = s-starts;
3792 endinpos = startinpos + 1;
3793 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3794 if (unicode_decode_call_errorhandler(
3795 errors, &errorHandler,
3796 "ascii", "ordinal not in range(128)",
3797 starts, size, &startinpos, &endinpos, &exc, &s,
3798 &v, &outpos, &p))
3799 goto onError;
3802 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3803 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3804 goto onError;
3805 Py_XDECREF(errorHandler);
3806 Py_XDECREF(exc);
3807 return (PyObject *)v;
3809 onError:
3810 Py_XDECREF(v);
3811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
3813 return NULL;
3816 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3817 Py_ssize_t size,
3818 const char *errors)
3820 return unicode_encode_ucs1(p, size, errors, 128);
3823 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3825 if (!PyUnicode_Check(unicode)) {
3826 PyErr_BadArgument();
3827 return NULL;
3829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3830 PyUnicode_GET_SIZE(unicode),
3831 NULL);
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
/* The MBCS helpers below take `int` lengths.  When int is narrower than
   size_t, the public entry points process their input in INT_MAX-sized
   pieces; NEED_RETRY enables that chunking code. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843 a) it assumes an incomplete character consists of a single byte, and
3844 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845 encodings, see IsDBCSLeadByteEx documentation. */
/* Report whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise
   returns 1 if CharPrev() cannot step back, if the previous character
   does not start with a lead byte, or if the previous character is two
   bytes long; and 0 in the remaining case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3859 * Decode MBCS string into unicode object. If 'final' is set, converts
3860 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3862 static int decode_mbcs(PyUnicodeObject **v,
3863 const char *s, /* MBCS string */
3864 int size, /* sizeof MBCS string */
3865 int final)
3867 Py_UNICODE *p;
3868 Py_ssize_t n = 0;
3869 int usize = 0;
3871 assert(size >= 0);
3873 /* Skip trailing lead-byte unless 'final' is set */
3874 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3875 --size;
3877 /* First get the size of the result */
3878 if (size > 0) {
3879 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3880 if (usize == 0) {
3881 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882 return -1;
3886 if (*v == NULL) {
3887 /* Create unicode object */
3888 *v = _PyUnicode_New(usize);
3889 if (*v == NULL)
3890 return -1;
3892 else {
3893 /* Extend unicode object */
3894 n = PyUnicode_GET_SIZE(*v);
3895 if (_PyUnicode_Resize(v, n + usize) < 0)
3896 return -1;
3899 /* Do the conversion */
3900 if (size > 0) {
3901 p = PyUnicode_AS_UNICODE(*v) + n;
3902 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3903 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3904 return -1;
3908 return size;
3911 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3912 Py_ssize_t size,
3913 const char *errors,
3914 Py_ssize_t *consumed)
3916 PyUnicodeObject *v = NULL;
3917 int done;
3919 if (consumed)
3920 *consumed = 0;
3922 #ifdef NEED_RETRY
3923 retry:
3924 if (size > INT_MAX)
3925 done = decode_mbcs(&v, s, INT_MAX, 0);
3926 else
3927 #endif
3928 done = decode_mbcs(&v, s, (int)size, !consumed);
3930 if (done < 0) {
3931 Py_XDECREF(v);
3932 return NULL;
3935 if (consumed)
3936 *consumed += done;
3938 #ifdef NEED_RETRY
3939 if (size > INT_MAX) {
3940 s += done;
3941 size -= done;
3942 goto retry;
3944 #endif
3946 return (PyObject *)v;
3949 PyObject *PyUnicode_DecodeMBCS(const char *s,
3950 Py_ssize_t size,
3951 const char *errors)
3953 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3957 * Convert unicode into string object (MBCS).
3958 * Returns 0 if succeed, -1 otherwise.
3960 static int encode_mbcs(PyObject **repr,
3961 const Py_UNICODE *p, /* unicode */
3962 int size) /* size of unicode */
3964 int mbcssize = 0;
3965 Py_ssize_t n = 0;
3967 assert(size >= 0);
3969 /* First get the size of the result */
3970 if (size > 0) {
3971 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3972 if (mbcssize == 0) {
3973 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3974 return -1;
3978 if (*repr == NULL) {
3979 /* Create string object */
3980 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3981 if (*repr == NULL)
3982 return -1;
3984 else {
3985 /* Extend string object */
3986 n = PyString_Size(*repr);
3987 if (_PyString_Resize(repr, n + mbcssize) < 0)
3988 return -1;
3991 /* Do the conversion */
3992 if (size > 0) {
3993 char *s = PyString_AS_STRING(*repr) + n;
3994 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3995 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3996 return -1;
4000 return 0;
4003 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4004 Py_ssize_t size,
4005 const char *errors)
4007 PyObject *repr = NULL;
4008 int ret;
4010 #ifdef NEED_RETRY
4011 retry:
4012 if (size > INT_MAX)
4013 ret = encode_mbcs(&repr, p, INT_MAX);
4014 else
4015 #endif
4016 ret = encode_mbcs(&repr, p, (int)size);
4018 if (ret < 0) {
4019 Py_XDECREF(repr);
4020 return NULL;
4023 #ifdef NEED_RETRY
4024 if (size > INT_MAX) {
4025 p += INT_MAX;
4026 size -= INT_MAX;
4027 goto retry;
4029 #endif
4031 return repr;
4034 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4036 if (!PyUnicode_Check(unicode)) {
4037 PyErr_BadArgument();
4038 return NULL;
4040 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4041 PyUnicode_GET_SIZE(unicode),
4042 NULL);
4045 #undef NEED_RETRY
4047 #endif /* MS_WINDOWS */
4049 /* --- Character Mapping Codec -------------------------------------------- */
4051 PyObject *PyUnicode_DecodeCharmap(const char *s,
4052 Py_ssize_t size,
4053 PyObject *mapping,
4054 const char *errors)
4056 const char *starts = s;
4057 Py_ssize_t startinpos;
4058 Py_ssize_t endinpos;
4059 Py_ssize_t outpos;
4060 const char *e;
4061 PyUnicodeObject *v;
4062 Py_UNICODE *p;
4063 Py_ssize_t extrachars = 0;
4064 PyObject *errorHandler = NULL;
4065 PyObject *exc = NULL;
4066 Py_UNICODE *mapstring = NULL;
4067 Py_ssize_t maplen = 0;
4069 /* Default to Latin-1 */
4070 if (mapping == NULL)
4071 return PyUnicode_DecodeLatin1(s, size, errors);
4073 v = _PyUnicode_New(size);
4074 if (v == NULL)
4075 goto onError;
4076 if (size == 0)
4077 return (PyObject *)v;
4078 p = PyUnicode_AS_UNICODE(v);
4079 e = s + size;
4080 if (PyUnicode_CheckExact(mapping)) {
4081 mapstring = PyUnicode_AS_UNICODE(mapping);
4082 maplen = PyUnicode_GET_SIZE(mapping);
4083 while (s < e) {
4084 unsigned char ch = *s;
4085 Py_UNICODE x = 0xfffe; /* illegal value */
4087 if (ch < maplen)
4088 x = mapstring[ch];
4090 if (x == 0xfffe) {
4091 /* undefined mapping */
4092 outpos = p-PyUnicode_AS_UNICODE(v);
4093 startinpos = s-starts;
4094 endinpos = startinpos+1;
4095 if (unicode_decode_call_errorhandler(
4096 errors, &errorHandler,
4097 "charmap", "character maps to <undefined>",
4098 starts, size, &startinpos, &endinpos, &exc, &s,
4099 &v, &outpos, &p)) {
4100 goto onError;
4102 continue;
4104 *p++ = x;
4105 ++s;
4108 else {
4109 while (s < e) {
4110 unsigned char ch = *s;
4111 PyObject *w, *x;
4113 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114 w = PyInt_FromLong((long)ch);
4115 if (w == NULL)
4116 goto onError;
4117 x = PyObject_GetItem(mapping, w);
4118 Py_DECREF(w);
4119 if (x == NULL) {
4120 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4121 /* No mapping found means: mapping is undefined. */
4122 PyErr_Clear();
4123 x = Py_None;
4124 Py_INCREF(x);
4125 } else
4126 goto onError;
4129 /* Apply mapping */
4130 if (PyInt_Check(x)) {
4131 long value = PyInt_AS_LONG(x);
4132 if (value < 0 || value > 65535) {
4133 PyErr_SetString(PyExc_TypeError,
4134 "character mapping must be in range(65536)");
4135 Py_DECREF(x);
4136 goto onError;
4138 *p++ = (Py_UNICODE)value;
4140 else if (x == Py_None) {
4141 /* undefined mapping */
4142 outpos = p-PyUnicode_AS_UNICODE(v);
4143 startinpos = s-starts;
4144 endinpos = startinpos+1;
4145 if (unicode_decode_call_errorhandler(
4146 errors, &errorHandler,
4147 "charmap", "character maps to <undefined>",
4148 starts, size, &startinpos, &endinpos, &exc, &s,
4149 &v, &outpos, &p)) {
4150 Py_DECREF(x);
4151 goto onError;
4153 Py_DECREF(x);
4154 continue;
4156 else if (PyUnicode_Check(x)) {
4157 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4159 if (targetsize == 1)
4160 /* 1-1 mapping */
4161 *p++ = *PyUnicode_AS_UNICODE(x);
4163 else if (targetsize > 1) {
4164 /* 1-n mapping */
4165 if (targetsize > extrachars) {
4166 /* resize first */
4167 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4168 Py_ssize_t needed = (targetsize - extrachars) + \
4169 (targetsize << 2);
4170 extrachars += needed;
4171 /* XXX overflow detection missing */
4172 if (_PyUnicode_Resize(&v,
4173 PyUnicode_GET_SIZE(v) + needed) < 0) {
4174 Py_DECREF(x);
4175 goto onError;
4177 p = PyUnicode_AS_UNICODE(v) + oldpos;
4179 Py_UNICODE_COPY(p,
4180 PyUnicode_AS_UNICODE(x),
4181 targetsize);
4182 p += targetsize;
4183 extrachars -= targetsize;
4185 /* 1-0 mapping: skip the character */
4187 else {
4188 /* wrong return value */
4189 PyErr_SetString(PyExc_TypeError,
4190 "character mapping must return integer, None or unicode");
4191 Py_DECREF(x);
4192 goto onError;
4194 Py_DECREF(x);
4195 ++s;
4198 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4199 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4200 goto onError;
4201 Py_XDECREF(errorHandler);
4202 Py_XDECREF(exc);
4203 return (PyObject *)v;
4205 onError:
4206 Py_XDECREF(errorHandler);
4207 Py_XDECREF(exc);
4208 Py_XDECREF(v);
4209 return NULL;
4212 /* Charmap encoding: the lookup table */
4214 struct encoding_map{
4215 PyObject_HEAD
4216 unsigned char level1[32];
4217 int count2, count3;
4218 unsigned char level23[1];
4221 static PyObject*
4222 encoding_map_size(PyObject *obj, PyObject* args)
4224 struct encoding_map *map = (struct encoding_map*)obj;
4225 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4226 128*map->count3);
4229 static PyMethodDef encoding_map_methods[] = {
4230 {"size", encoding_map_size, METH_NOARGS,
4231 PyDoc_STR("Return the size (in bytes) of this object") },
4232 { 0 }
4235 static void
4236 encoding_map_dealloc(PyObject* o)
4238 PyObject_FREE(o);
4241 static PyTypeObject EncodingMapType = {
4242 PyVarObject_HEAD_INIT(NULL, 0)
4243 "EncodingMap", /*tp_name*/
4244 sizeof(struct encoding_map), /*tp_basicsize*/
4245 0, /*tp_itemsize*/
4246 /* methods */
4247 encoding_map_dealloc, /*tp_dealloc*/
4248 0, /*tp_print*/
4249 0, /*tp_getattr*/
4250 0, /*tp_setattr*/
4251 0, /*tp_compare*/
4252 0, /*tp_repr*/
4253 0, /*tp_as_number*/
4254 0, /*tp_as_sequence*/
4255 0, /*tp_as_mapping*/
4256 0, /*tp_hash*/
4257 0, /*tp_call*/
4258 0, /*tp_str*/
4259 0, /*tp_getattro*/
4260 0, /*tp_setattro*/
4261 0, /*tp_as_buffer*/
4262 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4263 0, /*tp_doc*/
4264 0, /*tp_traverse*/
4265 0, /*tp_clear*/
4266 0, /*tp_richcompare*/
4267 0, /*tp_weaklistoffset*/
4268 0, /*tp_iter*/
4269 0, /*tp_iternext*/
4270 encoding_map_methods, /*tp_methods*/
4271 0, /*tp_members*/
4272 0, /*tp_getset*/
4273 0, /*tp_base*/
4274 0, /*tp_dict*/
4275 0, /*tp_descr_get*/
4276 0, /*tp_descr_set*/
4277 0, /*tp_dictoffset*/
4278 0, /*tp_init*/
4279 0, /*tp_alloc*/
4280 0, /*tp_new*/
4281 0, /*tp_free*/
4282 0, /*tp_is_gc*/
4285 PyObject*
4286 PyUnicode_BuildEncodingMap(PyObject* string)
4288 Py_UNICODE *decode;
4289 PyObject *result;
4290 struct encoding_map *mresult;
4291 int i;
4292 int need_dict = 0;
4293 unsigned char level1[32];
4294 unsigned char level2[512];
4295 unsigned char *mlevel1, *mlevel2, *mlevel3;
4296 int count2 = 0, count3 = 0;
4298 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4299 PyErr_BadArgument();
4300 return NULL;
4302 decode = PyUnicode_AS_UNICODE(string);
4303 memset(level1, 0xFF, sizeof level1);
4304 memset(level2, 0xFF, sizeof level2);
4306 /* If there isn't a one-to-one mapping of NULL to \0,
4307 or if there are non-BMP characters, we need to use
4308 a mapping dictionary. */
4309 if (decode[0] != 0)
4310 need_dict = 1;
4311 for (i = 1; i < 256; i++) {
4312 int l1, l2;
4313 if (decode[i] == 0
4314 #ifdef Py_UNICODE_WIDE
4315 || decode[i] > 0xFFFF
4316 #endif
4318 need_dict = 1;
4319 break;
4321 if (decode[i] == 0xFFFE)
4322 /* unmapped character */
4323 continue;
4324 l1 = decode[i] >> 11;
4325 l2 = decode[i] >> 7;
4326 if (level1[l1] == 0xFF)
4327 level1[l1] = count2++;
4328 if (level2[l2] == 0xFF)
4329 level2[l2] = count3++;
4332 if (count2 >= 0xFF || count3 >= 0xFF)
4333 need_dict = 1;
4335 if (need_dict) {
4336 PyObject *result = PyDict_New();
4337 PyObject *key, *value;
4338 if (!result)
4339 return NULL;
4340 for (i = 0; i < 256; i++) {
4341 key = value = NULL;
4342 key = PyInt_FromLong(decode[i]);
4343 value = PyInt_FromLong(i);
4344 if (!key || !value)
4345 goto failed1;
4346 if (PyDict_SetItem(result, key, value) == -1)
4347 goto failed1;
4348 Py_DECREF(key);
4349 Py_DECREF(value);
4351 return result;
4352 failed1:
4353 Py_XDECREF(key);
4354 Py_XDECREF(value);
4355 Py_DECREF(result);
4356 return NULL;
4359 /* Create a three-level trie */
4360 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4361 16*count2 + 128*count3 - 1);
4362 if (!result)
4363 return PyErr_NoMemory();
4364 PyObject_Init(result, &EncodingMapType);
4365 mresult = (struct encoding_map*)result;
4366 mresult->count2 = count2;
4367 mresult->count3 = count3;
4368 mlevel1 = mresult->level1;
4369 mlevel2 = mresult->level23;
4370 mlevel3 = mresult->level23 + 16*count2;
4371 memcpy(mlevel1, level1, 32);
4372 memset(mlevel2, 0xFF, 16*count2);
4373 memset(mlevel3, 0, 128*count3);
4374 count3 = 0;
4375 for (i = 1; i < 256; i++) {
4376 int o1, o2, o3, i2, i3;
4377 if (decode[i] == 0xFFFE)
4378 /* unmapped character */
4379 continue;
4380 o1 = decode[i]>>11;
4381 o2 = (decode[i]>>7) & 0xF;
4382 i2 = 16*mlevel1[o1] + o2;
4383 if (mlevel2[i2] == 0xFF)
4384 mlevel2[i2] = count3++;
4385 o3 = decode[i] & 0x7F;
4386 i3 = 128*mlevel2[i2] + o3;
4387 mlevel3[i3] = i;
4389 return result;
4392 static int
4393 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4395 struct encoding_map *map = (struct encoding_map*)mapping;
4396 int l1 = c>>11;
4397 int l2 = (c>>7) & 0xF;
4398 int l3 = c & 0x7F;
4399 int i;
4401 #ifdef Py_UNICODE_WIDE
4402 if (c > 0xFFFF) {
4403 return -1;
4405 #endif
4406 if (c == 0)
4407 return 0;
4408 /* level 1*/
4409 i = map->level1[l1];
4410 if (i == 0xFF) {
4411 return -1;
4413 /* level 2*/
4414 i = map->level23[16*i+l2];
4415 if (i == 0xFF) {
4416 return -1;
4418 /* level 3 */
4419 i = map->level23[16*map->count2 + 128*i + l3];
4420 if (i == 0) {
4421 return -1;
4423 return i;
4426 /* Lookup the character ch in the mapping. If the character
4427 can't be found, Py_None is returned (or NULL, if another
4428 error occurred). */
4429 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4431 PyObject *w = PyInt_FromLong((long)c);
4432 PyObject *x;
4434 if (w == NULL)
4435 return NULL;
4436 x = PyObject_GetItem(mapping, w);
4437 Py_DECREF(w);
4438 if (x == NULL) {
4439 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4440 /* No mapping found means: mapping is undefined. */
4441 PyErr_Clear();
4442 x = Py_None;
4443 Py_INCREF(x);
4444 return x;
4445 } else
4446 return NULL;
4448 else if (x == Py_None)
4449 return x;
4450 else if (PyInt_Check(x)) {
4451 long value = PyInt_AS_LONG(x);
4452 if (value < 0 || value > 255) {
4453 PyErr_SetString(PyExc_TypeError,
4454 "character mapping must be in range(256)");
4455 Py_DECREF(x);
4456 return NULL;
4458 return x;
4460 else if (PyString_Check(x))
4461 return x;
4462 else {
4463 /* wrong return value */
4464 PyErr_SetString(PyExc_TypeError,
4465 "character mapping must return integer, None or str");
4466 Py_DECREF(x);
4467 return NULL;
4471 static int
4472 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4474 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4475 /* exponentially overallocate to minimize reallocations */
4476 if (requiredsize < 2*outsize)
4477 requiredsize = 2*outsize;
4478 if (_PyString_Resize(outobj, requiredsize)) {
4479 return 0;
4481 return 1;
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for this character */
    enc_EXCEPTION   /* lookup or reallocation raised an exception */
} charmapencode_result;
4487 /* lookup the character, put the result in the output string and adjust
4488 various state variables. Reallocate the output string if not enough
4489 space is available. Return a new reference to the object that
4490 was put in the output buffer, or Py_None, if the mapping was undefined
4491 (in which case no character was written) or NULL, if a
4492 reallocation error occurred. The caller must decref the result */
4493 static
4494 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4495 PyObject **outobj, Py_ssize_t *outpos)
4497 PyObject *rep;
4498 char *outstart;
4499 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4501 if (Py_TYPE(mapping) == &EncodingMapType) {
4502 int res = encoding_map_lookup(c, mapping);
4503 Py_ssize_t requiredsize = *outpos+1;
4504 if (res == -1)
4505 return enc_FAILED;
4506 if (outsize<requiredsize)
4507 if (!charmapencode_resize(outobj, outpos, requiredsize))
4508 return enc_EXCEPTION;
4509 outstart = PyString_AS_STRING(*outobj);
4510 outstart[(*outpos)++] = (char)res;
4511 return enc_SUCCESS;
4514 rep = charmapencode_lookup(c, mapping);
4515 if (rep==NULL)
4516 return enc_EXCEPTION;
4517 else if (rep==Py_None) {
4518 Py_DECREF(rep);
4519 return enc_FAILED;
4520 } else {
4521 if (PyInt_Check(rep)) {
4522 Py_ssize_t requiredsize = *outpos+1;
4523 if (outsize<requiredsize)
4524 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4525 Py_DECREF(rep);
4526 return enc_EXCEPTION;
4528 outstart = PyString_AS_STRING(*outobj);
4529 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4531 else {
4532 const char *repchars = PyString_AS_STRING(rep);
4533 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4534 Py_ssize_t requiredsize = *outpos+repsize;
4535 if (outsize<requiredsize)
4536 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4537 Py_DECREF(rep);
4538 return enc_EXCEPTION;
4540 outstart = PyString_AS_STRING(*outobj);
4541 memcpy(outstart + *outpos, repchars, repsize);
4542 *outpos += repsize;
4545 Py_DECREF(rep);
4546 return enc_SUCCESS;
4549 /* handle an error in PyUnicode_EncodeCharmap
4550 Return 0 on success, -1 on error */
4551 static
4552 int charmap_encoding_error(
4553 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4554 PyObject **exceptionObject,
4555 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4556 PyObject **res, Py_ssize_t *respos)
4558 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4559 Py_ssize_t repsize;
4560 Py_ssize_t newpos;
4561 Py_UNICODE *uni2;
4562 /* startpos for collecting unencodable chars */
4563 Py_ssize_t collstartpos = *inpos;
4564 Py_ssize_t collendpos = *inpos+1;
4565 Py_ssize_t collpos;
4566 char *encoding = "charmap";
4567 char *reason = "character maps to <undefined>";
4568 charmapencode_result x;
4570 /* find all unencodable characters */
4571 while (collendpos < size) {
4572 PyObject *rep;
4573 if (Py_TYPE(mapping) == &EncodingMapType) {
4574 int res = encoding_map_lookup(p[collendpos], mapping);
4575 if (res != -1)
4576 break;
4577 ++collendpos;
4578 continue;
4581 rep = charmapencode_lookup(p[collendpos], mapping);
4582 if (rep==NULL)
4583 return -1;
4584 else if (rep!=Py_None) {
4585 Py_DECREF(rep);
4586 break;
4588 Py_DECREF(rep);
4589 ++collendpos;
4591 /* cache callback name lookup
4592 * (if not done yet, i.e. it's the first error) */
4593 if (*known_errorHandler==-1) {
4594 if ((errors==NULL) || (!strcmp(errors, "strict")))
4595 *known_errorHandler = 1;
4596 else if (!strcmp(errors, "replace"))
4597 *known_errorHandler = 2;
4598 else if (!strcmp(errors, "ignore"))
4599 *known_errorHandler = 3;
4600 else if (!strcmp(errors, "xmlcharrefreplace"))
4601 *known_errorHandler = 4;
4602 else
4603 *known_errorHandler = 0;
4605 switch (*known_errorHandler) {
4606 case 1: /* strict */
4607 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4608 return -1;
4609 case 2: /* replace */
4610 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4611 x = charmapencode_output('?', mapping, res, respos);
4612 if (x==enc_EXCEPTION) {
4613 return -1;
4615 else if (x==enc_FAILED) {
4616 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617 return -1;
4620 /* fall through */
4621 case 3: /* ignore */
4622 *inpos = collendpos;
4623 break;
4624 case 4: /* xmlcharrefreplace */
4625 /* generate replacement (temporarily (mis)uses p) */
4626 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4627 char buffer[2+29+1+1];
4628 char *cp;
4629 sprintf(buffer, "&#%d;", (int)p[collpos]);
4630 for (cp = buffer; *cp; ++cp) {
4631 x = charmapencode_output(*cp, mapping, res, respos);
4632 if (x==enc_EXCEPTION)
4633 return -1;
4634 else if (x==enc_FAILED) {
4635 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4636 return -1;
4640 *inpos = collendpos;
4641 break;
4642 default:
4643 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4644 encoding, reason, p, size, exceptionObject,
4645 collstartpos, collendpos, &newpos);
4646 if (repunicode == NULL)
4647 return -1;
4648 /* generate replacement */
4649 repsize = PyUnicode_GET_SIZE(repunicode);
4650 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4651 x = charmapencode_output(*uni2, mapping, res, respos);
4652 if (x==enc_EXCEPTION) {
4653 return -1;
4655 else if (x==enc_FAILED) {
4656 Py_DECREF(repunicode);
4657 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658 return -1;
4661 *inpos = newpos;
4662 Py_DECREF(repunicode);
4664 return 0;
4667 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4668 Py_ssize_t size,
4669 PyObject *mapping,
4670 const char *errors)
4672 /* output object */
4673 PyObject *res = NULL;
4674 /* current input position */
4675 Py_ssize_t inpos = 0;
4676 /* current output position */
4677 Py_ssize_t respos = 0;
4678 PyObject *errorHandler = NULL;
4679 PyObject *exc = NULL;
4680 /* the following variable is used for caching string comparisons
4681 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682 * 3=ignore, 4=xmlcharrefreplace */
4683 int known_errorHandler = -1;
4685 /* Default to Latin-1 */
4686 if (mapping == NULL)
4687 return PyUnicode_EncodeLatin1(p, size, errors);
4689 /* allocate enough for a simple encoding without
4690 replacements, if we need more, we'll resize */
4691 res = PyString_FromStringAndSize(NULL, size);
4692 if (res == NULL)
4693 goto onError;
4694 if (size == 0)
4695 return res;
4697 while (inpos<size) {
4698 /* try to encode it */
4699 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4700 if (x==enc_EXCEPTION) /* error */
4701 goto onError;
4702 if (x==enc_FAILED) { /* unencodable character */
4703 if (charmap_encoding_error(p, size, &inpos, mapping,
4704 &exc,
4705 &known_errorHandler, &errorHandler, errors,
4706 &res, &respos)) {
4707 goto onError;
4710 else
4711 /* done with this character => adjust input position */
4712 ++inpos;
4715 /* Resize if we allocated to much */
4716 if (respos<PyString_GET_SIZE(res)) {
4717 if (_PyString_Resize(&res, respos))
4718 goto onError;
4720 Py_XDECREF(exc);
4721 Py_XDECREF(errorHandler);
4722 return res;
4724 onError:
4725 Py_XDECREF(res);
4726 Py_XDECREF(exc);
4727 Py_XDECREF(errorHandler);
4728 return NULL;
4731 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4732 PyObject *mapping)
4734 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4735 PyErr_BadArgument();
4736 return NULL;
4738 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4739 PyUnicode_GET_SIZE(unicode),
4740 mapping,
4741 NULL);
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject **exceptionObject,
4746 const Py_UNICODE *unicode, Py_ssize_t size,
4747 Py_ssize_t startpos, Py_ssize_t endpos,
4748 const char *reason)
4750 if (*exceptionObject == NULL) {
4751 *exceptionObject = PyUnicodeTranslateError_Create(
4752 unicode, size, startpos, endpos, reason);
4754 else {
4755 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4756 goto onError;
4757 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4758 goto onError;
4759 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4760 goto onError;
4761 return;
4762 onError:
4763 Py_DECREF(*exceptionObject);
4764 *exceptionObject = NULL;
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject **exceptionObject,
4770 const Py_UNICODE *unicode, Py_ssize_t size,
4771 Py_ssize_t startpos, Py_ssize_t endpos,
4772 const char *reason)
4774 make_translate_exception(exceptionObject,
4775 unicode, size, startpos, endpos, reason);
4776 if (*exceptionObject != NULL)
4777 PyCodec_StrictErrors(*exceptionObject);
4780 /* error handling callback helper:
4781 build arguments, call the callback and check the arguments,
4782 put the result into newpos and return the replacement string, which
4783 has to be freed by the caller */
4784 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4785 PyObject **errorHandler,
4786 const char *reason,
4787 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4788 Py_ssize_t startpos, Py_ssize_t endpos,
4789 Py_ssize_t *newpos)
4791 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4793 Py_ssize_t i_newpos;
4794 PyObject *restuple;
4795 PyObject *resunicode;
4797 if (*errorHandler == NULL) {
4798 *errorHandler = PyCodec_LookupError(errors);
4799 if (*errorHandler == NULL)
4800 return NULL;
4803 make_translate_exception(exceptionObject,
4804 unicode, size, startpos, endpos, reason);
4805 if (*exceptionObject == NULL)
4806 return NULL;
4808 restuple = PyObject_CallFunctionObjArgs(
4809 *errorHandler, *exceptionObject, NULL);
4810 if (restuple == NULL)
4811 return NULL;
4812 if (!PyTuple_Check(restuple)) {
4813 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4814 Py_DECREF(restuple);
4815 return NULL;
4817 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4818 &resunicode, &i_newpos)) {
4819 Py_DECREF(restuple);
4820 return NULL;
4822 if (i_newpos<0)
4823 *newpos = size+i_newpos;
4824 else
4825 *newpos = i_newpos;
4826 if (*newpos<0 || *newpos>size) {
4827 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4828 Py_DECREF(restuple);
4829 return NULL;
4831 Py_INCREF(resunicode);
4832 Py_DECREF(restuple);
4833 return resunicode;
4836 /* Lookup the character ch in the mapping and put the result in result,
4837 which must be decrefed by the caller.
4838 Return 0 on success, -1 on error */
4839 static
4840 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4842 PyObject *w = PyInt_FromLong((long)c);
4843 PyObject *x;
4845 if (w == NULL)
4846 return -1;
4847 x = PyObject_GetItem(mapping, w);
4848 Py_DECREF(w);
4849 if (x == NULL) {
4850 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4851 /* No mapping found means: use 1:1 mapping. */
4852 PyErr_Clear();
4853 *result = NULL;
4854 return 0;
4855 } else
4856 return -1;
4858 else if (x == Py_None) {
4859 *result = x;
4860 return 0;
4862 else if (PyInt_Check(x)) {
4863 long value = PyInt_AS_LONG(x);
4864 long max = PyUnicode_GetMax();
4865 if (value < 0 || value > max) {
4866 PyErr_Format(PyExc_TypeError,
4867 "character mapping must be in range(0x%lx)", max+1);
4868 Py_DECREF(x);
4869 return -1;
4871 *result = x;
4872 return 0;
4874 else if (PyUnicode_Check(x)) {
4875 *result = x;
4876 return 0;
4878 else {
4879 /* wrong return value */
4880 PyErr_SetString(PyExc_TypeError,
4881 "character mapping must return integer, None or unicode");
4882 Py_DECREF(x);
4883 return -1;
4886 /* ensure that *outobj is at least requiredsize characters long,
4887 if not reallocate and adjust various state variables.
4888 Return 0 on success, -1 on error */
4889 static
4890 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4891 Py_ssize_t requiredsize)
4893 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4894 if (requiredsize > oldsize) {
4895 /* remember old output position */
4896 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4897 /* exponentially overallocate to minimize reallocations */
4898 if (requiredsize < 2 * oldsize)
4899 requiredsize = 2 * oldsize;
4900 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4901 return -1;
4902 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4904 return 0;
4906 /* lookup the character, put the result in the output string and adjust
4907 various state variables. Return a new reference to the object that
4908 was put in the output buffer in *result, or Py_None, if the mapping was
4909 undefined (in which case no character was written).
4910 The called must decref result.
4911 Return 0 on success, -1 on error. */
4912 static
4913 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4914 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4915 PyObject **res)
4917 if (charmaptranslate_lookup(*curinp, mapping, res))
4918 return -1;
4919 if (*res==NULL) {
4920 /* not found => default to 1:1 mapping */
4921 *(*outp)++ = *curinp;
4923 else if (*res==Py_None)
4925 else if (PyInt_Check(*res)) {
4926 /* no overflow check, because we know that the space is enough */
4927 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4929 else if (PyUnicode_Check(*res)) {
4930 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4931 if (repsize==1) {
4932 /* no overflow check, because we know that the space is enough */
4933 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4935 else if (repsize!=0) {
4936 /* more than one character */
4937 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4938 (insize - (curinp-startinp)) +
4939 repsize - 1;
4940 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4941 return -1;
4942 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4943 *outp += repsize;
4946 else
4947 return -1;
4948 return 0;
4951 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4952 Py_ssize_t size,
4953 PyObject *mapping,
4954 const char *errors)
4956 /* output object */
4957 PyObject *res = NULL;
4958 /* pointers to the beginning and end+1 of input */
4959 const Py_UNICODE *startp = p;
4960 const Py_UNICODE *endp = p + size;
4961 /* pointer into the output */
4962 Py_UNICODE *str;
4963 /* current output position */
4964 Py_ssize_t respos = 0;
4965 char *reason = "character maps to <undefined>";
4966 PyObject *errorHandler = NULL;
4967 PyObject *exc = NULL;
4968 /* the following variable is used for caching string comparisons
4969 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970 * 3=ignore, 4=xmlcharrefreplace */
4971 int known_errorHandler = -1;
4973 if (mapping == NULL) {
4974 PyErr_BadArgument();
4975 return NULL;
4978 /* allocate enough for a simple 1:1 translation without
4979 replacements, if we need more, we'll resize */
4980 res = PyUnicode_FromUnicode(NULL, size);
4981 if (res == NULL)
4982 goto onError;
4983 if (size == 0)
4984 return res;
4985 str = PyUnicode_AS_UNICODE(res);
4987 while (p<endp) {
4988 /* try to encode it */
4989 PyObject *x = NULL;
4990 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4991 Py_XDECREF(x);
4992 goto onError;
4994 Py_XDECREF(x);
4995 if (x!=Py_None) /* it worked => adjust input pointer */
4996 ++p;
4997 else { /* untranslatable character */
4998 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4999 Py_ssize_t repsize;
5000 Py_ssize_t newpos;
5001 Py_UNICODE *uni2;
5002 /* startpos for collecting untranslatable chars */
5003 const Py_UNICODE *collstart = p;
5004 const Py_UNICODE *collend = p+1;
5005 const Py_UNICODE *coll;
5007 /* find all untranslatable characters */
5008 while (collend < endp) {
5009 if (charmaptranslate_lookup(*collend, mapping, &x))
5010 goto onError;
5011 Py_XDECREF(x);
5012 if (x!=Py_None)
5013 break;
5014 ++collend;
5016 /* cache callback name lookup
5017 * (if not done yet, i.e. it's the first error) */
5018 if (known_errorHandler==-1) {
5019 if ((errors==NULL) || (!strcmp(errors, "strict")))
5020 known_errorHandler = 1;
5021 else if (!strcmp(errors, "replace"))
5022 known_errorHandler = 2;
5023 else if (!strcmp(errors, "ignore"))
5024 known_errorHandler = 3;
5025 else if (!strcmp(errors, "xmlcharrefreplace"))
5026 known_errorHandler = 4;
5027 else
5028 known_errorHandler = 0;
5030 switch (known_errorHandler) {
5031 case 1: /* strict */
5032 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5033 goto onError;
5034 case 2: /* replace */
5035 /* No need to check for space, this is a 1:1 replacement */
5036 for (coll = collstart; coll<collend; ++coll)
5037 *str++ = '?';
5038 /* fall through */
5039 case 3: /* ignore */
5040 p = collend;
5041 break;
5042 case 4: /* xmlcharrefreplace */
5043 /* generate replacement (temporarily (mis)uses p) */
5044 for (p = collstart; p < collend; ++p) {
5045 char buffer[2+29+1+1];
5046 char *cp;
5047 sprintf(buffer, "&#%d;", (int)*p);
5048 if (charmaptranslate_makespace(&res, &str,
5049 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5050 goto onError;
5051 for (cp = buffer; *cp; ++cp)
5052 *str++ = *cp;
5054 p = collend;
5055 break;
5056 default:
5057 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5058 reason, startp, size, &exc,
5059 collstart-startp, collend-startp, &newpos);
5060 if (repunicode == NULL)
5061 goto onError;
5062 /* generate replacement */
5063 repsize = PyUnicode_GET_SIZE(repunicode);
5064 if (charmaptranslate_makespace(&res, &str,
5065 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5066 Py_DECREF(repunicode);
5067 goto onError;
5069 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5070 *str++ = *uni2;
5071 p = startp + newpos;
5072 Py_DECREF(repunicode);
5076 /* Resize if we allocated to much */
5077 respos = str-PyUnicode_AS_UNICODE(res);
5078 if (respos<PyUnicode_GET_SIZE(res)) {
5079 if (PyUnicode_Resize(&res, respos) < 0)
5080 goto onError;
5082 Py_XDECREF(exc);
5083 Py_XDECREF(errorHandler);
5084 return res;
5086 onError:
5087 Py_XDECREF(res);
5088 Py_XDECREF(exc);
5089 Py_XDECREF(errorHandler);
5090 return NULL;
5093 PyObject *PyUnicode_Translate(PyObject *str,
5094 PyObject *mapping,
5095 const char *errors)
5097 PyObject *result;
5099 str = PyUnicode_FromObject(str);
5100 if (str == NULL)
5101 goto onError;
5102 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5103 PyUnicode_GET_SIZE(str),
5104 mapping,
5105 errors);
5106 Py_DECREF(str);
5107 return result;
5109 onError:
5110 Py_XDECREF(str);
5111 return NULL;
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5116 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5117 Py_ssize_t length,
5118 char *output,
5119 const char *errors)
5121 Py_UNICODE *p, *end;
5122 PyObject *errorHandler = NULL;
5123 PyObject *exc = NULL;
5124 const char *encoding = "decimal";
5125 const char *reason = "invalid decimal Unicode string";
5126 /* the following variable is used for caching string comparisons
5127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128 int known_errorHandler = -1;
5130 if (output == NULL) {
5131 PyErr_BadArgument();
5132 return -1;
5135 p = s;
5136 end = s + length;
5137 while (p < end) {
5138 register Py_UNICODE ch = *p;
5139 int decimal;
5140 PyObject *repunicode;
5141 Py_ssize_t repsize;
5142 Py_ssize_t newpos;
5143 Py_UNICODE *uni2;
5144 Py_UNICODE *collstart;
5145 Py_UNICODE *collend;
5147 if (Py_UNICODE_ISSPACE(ch)) {
5148 *output++ = ' ';
5149 ++p;
5150 continue;
5152 decimal = Py_UNICODE_TODECIMAL(ch);
5153 if (decimal >= 0) {
5154 *output++ = '0' + decimal;
5155 ++p;
5156 continue;
5158 if (0 < ch && ch < 256) {
5159 *output++ = (char)ch;
5160 ++p;
5161 continue;
5163 /* All other characters are considered unencodable */
5164 collstart = p;
5165 collend = p+1;
5166 while (collend < end) {
5167 if ((0 < *collend && *collend < 256) ||
5168 !Py_UNICODE_ISSPACE(*collend) ||
5169 Py_UNICODE_TODECIMAL(*collend))
5170 break;
5172 /* cache callback name lookup
5173 * (if not done yet, i.e. it's the first error) */
5174 if (known_errorHandler==-1) {
5175 if ((errors==NULL) || (!strcmp(errors, "strict")))
5176 known_errorHandler = 1;
5177 else if (!strcmp(errors, "replace"))
5178 known_errorHandler = 2;
5179 else if (!strcmp(errors, "ignore"))
5180 known_errorHandler = 3;
5181 else if (!strcmp(errors, "xmlcharrefreplace"))
5182 known_errorHandler = 4;
5183 else
5184 known_errorHandler = 0;
5186 switch (known_errorHandler) {
5187 case 1: /* strict */
5188 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5189 goto onError;
5190 case 2: /* replace */
5191 for (p = collstart; p < collend; ++p)
5192 *output++ = '?';
5193 /* fall through */
5194 case 3: /* ignore */
5195 p = collend;
5196 break;
5197 case 4: /* xmlcharrefreplace */
5198 /* generate replacement (temporarily (mis)uses p) */
5199 for (p = collstart; p < collend; ++p)
5200 output += sprintf(output, "&#%d;", (int)*p);
5201 p = collend;
5202 break;
5203 default:
5204 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5205 encoding, reason, s, length, &exc,
5206 collstart-s, collend-s, &newpos);
5207 if (repunicode == NULL)
5208 goto onError;
5209 /* generate replacement */
5210 repsize = PyUnicode_GET_SIZE(repunicode);
5211 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5212 Py_UNICODE ch = *uni2;
5213 if (Py_UNICODE_ISSPACE(ch))
5214 *output++ = ' ';
5215 else {
5216 decimal = Py_UNICODE_TODECIMAL(ch);
5217 if (decimal >= 0)
5218 *output++ = '0' + decimal;
5219 else if (0 < ch && ch < 256)
5220 *output++ = (char)ch;
5221 else {
5222 Py_DECREF(repunicode);
5223 raise_encode_exception(&exc, encoding,
5224 s, length, collstart-s, collend-s, reason);
5225 goto onError;
5229 p = s + newpos;
5230 Py_DECREF(repunicode);
5233 /* 0-terminate the output string */
5234 *output++ = '\0';
5235 Py_XDECREF(exc);
5236 Py_XDECREF(errorHandler);
5237 return 0;
5239 onError:
5240 Py_XDECREF(exc);
5241 Py_XDECREF(errorHandler);
5242 return -1;
5245 /* --- Helpers ------------------------------------------------------------ */
5247 #include "stringlib/unicodedefs.h"
5249 #define FROM_UNICODE
5251 #include "stringlib/fastsearch.h"
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
/* Helper macro to fix up start/end slice values against (obj)->length.
   It reads and assigns variables named `start` and `end` that must exist
   in the expanding scope: a negative start is counted from the end and
   then clamped to 0; end is clamped to the length, and a negative end is
   counted from the end and then clamped to 0.
   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct under an unbraced if/else. Use it followed by ';'. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5270 Py_ssize_t PyUnicode_Count(PyObject *str,
5271 PyObject *substr,
5272 Py_ssize_t start,
5273 Py_ssize_t end)
5275 Py_ssize_t result;
5276 PyUnicodeObject* str_obj;
5277 PyUnicodeObject* sub_obj;
5279 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5280 if (!str_obj)
5281 return -1;
5282 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5283 if (!sub_obj) {
5284 Py_DECREF(str_obj);
5285 return -1;
5288 FIX_START_END(str_obj);
5290 result = stringlib_count(
5291 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5294 Py_DECREF(sub_obj);
5295 Py_DECREF(str_obj);
5297 return result;
5300 Py_ssize_t PyUnicode_Find(PyObject *str,
5301 PyObject *sub,
5302 Py_ssize_t start,
5303 Py_ssize_t end,
5304 int direction)
5306 Py_ssize_t result;
5308 str = PyUnicode_FromObject(str);
5309 if (!str)
5310 return -2;
5311 sub = PyUnicode_FromObject(sub);
5312 if (!sub) {
5313 Py_DECREF(str);
5314 return -2;
5317 if (direction > 0)
5318 result = stringlib_find_slice(
5319 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5320 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5321 start, end
5323 else
5324 result = stringlib_rfind_slice(
5325 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5326 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5327 start, end
5330 Py_DECREF(str);
5331 Py_DECREF(sub);
5333 return result;
5336 static
5337 int tailmatch(PyUnicodeObject *self,
5338 PyUnicodeObject *substring,
5339 Py_ssize_t start,
5340 Py_ssize_t end,
5341 int direction)
5343 if (substring->length == 0)
5344 return 1;
5346 FIX_START_END(self);
5348 end -= substring->length;
5349 if (end < start)
5350 return 0;
5352 if (direction > 0) {
5353 if (Py_UNICODE_MATCH(self, end, substring))
5354 return 1;
5355 } else {
5356 if (Py_UNICODE_MATCH(self, start, substring))
5357 return 1;
5360 return 0;
5363 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5364 PyObject *substr,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
5369 Py_ssize_t result;
5371 str = PyUnicode_FromObject(str);
5372 if (str == NULL)
5373 return -1;
5374 substr = PyUnicode_FromObject(substr);
5375 if (substr == NULL) {
5376 Py_DECREF(str);
5377 return -1;
5380 result = tailmatch((PyUnicodeObject *)str,
5381 (PyUnicodeObject *)substr,
5382 start, end, direction);
5383 Py_DECREF(str);
5384 Py_DECREF(substr);
5385 return result;
5388 /* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5391 static
5392 PyObject *fixup(PyUnicodeObject *self,
5393 int (*fixfct)(PyUnicodeObject *s))
5396 PyUnicodeObject *u;
5398 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5399 if (u == NULL)
5400 return NULL;
5402 Py_UNICODE_COPY(u->str, self->str, self->length);
5404 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5408 Py_INCREF(self);
5409 Py_DECREF(u);
5410 return (PyObject*) self;
5412 return (PyObject*) u;
5415 static
5416 int fixupper(PyUnicodeObject *self)
5418 Py_ssize_t len = self->length;
5419 Py_UNICODE *s = self->str;
5420 int status = 0;
5422 while (len-- > 0) {
5423 register Py_UNICODE ch;
5425 ch = Py_UNICODE_TOUPPER(*s);
5426 if (ch != *s) {
5427 status = 1;
5428 *s = ch;
5430 s++;
5433 return status;
5436 static
5437 int fixlower(PyUnicodeObject *self)
5439 Py_ssize_t len = self->length;
5440 Py_UNICODE *s = self->str;
5441 int status = 0;
5443 while (len-- > 0) {
5444 register Py_UNICODE ch;
5446 ch = Py_UNICODE_TOLOWER(*s);
5447 if (ch != *s) {
5448 status = 1;
5449 *s = ch;
5451 s++;
5454 return status;
5457 static
5458 int fixswapcase(PyUnicodeObject *self)
5460 Py_ssize_t len = self->length;
5461 Py_UNICODE *s = self->str;
5462 int status = 0;
5464 while (len-- > 0) {
5465 if (Py_UNICODE_ISUPPER(*s)) {
5466 *s = Py_UNICODE_TOLOWER(*s);
5467 status = 1;
5468 } else if (Py_UNICODE_ISLOWER(*s)) {
5469 *s = Py_UNICODE_TOUPPER(*s);
5470 status = 1;
5472 s++;
5475 return status;
5478 static
5479 int fixcapitalize(PyUnicodeObject *self)
5481 Py_ssize_t len = self->length;
5482 Py_UNICODE *s = self->str;
5483 int status = 0;
5485 if (len == 0)
5486 return 0;
5487 if (Py_UNICODE_ISLOWER(*s)) {
5488 *s = Py_UNICODE_TOUPPER(*s);
5489 status = 1;
5491 s++;
5492 while (--len > 0) {
5493 if (Py_UNICODE_ISUPPER(*s)) {
5494 *s = Py_UNICODE_TOLOWER(*s);
5495 status = 1;
5497 s++;
5499 return status;
5502 static
5503 int fixtitle(PyUnicodeObject *self)
5505 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506 register Py_UNICODE *e;
5507 int previous_is_cased;
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self) == 1) {
5511 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512 if (*p != ch) {
5513 *p = ch;
5514 return 1;
5516 else
5517 return 0;
5520 e = p + PyUnicode_GET_SIZE(self);
5521 previous_is_cased = 0;
5522 for (; p < e; p++) {
5523 register const Py_UNICODE ch = *p;
5525 if (previous_is_cased)
5526 *p = Py_UNICODE_TOLOWER(ch);
5527 else
5528 *p = Py_UNICODE_TOTITLE(ch);
5530 if (Py_UNICODE_ISLOWER(ch) ||
5531 Py_UNICODE_ISUPPER(ch) ||
5532 Py_UNICODE_ISTITLE(ch))
5533 previous_is_cased = 1;
5534 else
5535 previous_is_cased = 0;
5537 return 1;
5540 PyObject *
5541 PyUnicode_Join(PyObject *separator, PyObject *seq)
5543 PyObject *internal_separator = NULL;
5544 const Py_UNICODE blank = ' ';
5545 const Py_UNICODE *sep = &blank;
5546 Py_ssize_t seplen = 1;
5547 PyUnicodeObject *res = NULL; /* the result */
5548 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used; /* # used bytes */
5550 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5551 PyObject *fseq; /* PySequence_Fast(seq) */
5552 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5553 PyObject *item;
5554 Py_ssize_t i;
5556 fseq = PySequence_Fast(seq, "");
5557 if (fseq == NULL) {
5558 return NULL;
5561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5566 * is invariant.
5568 seqlen = PySequence_Fast_GET_SIZE(fseq);
5569 /* If empty sequence, return u"". */
5570 if (seqlen == 0) {
5571 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5572 goto Done;
5574 /* If singleton sequence with an exact Unicode, return that. */
5575 if (seqlen == 1) {
5576 item = PySequence_Fast_GET_ITEM(fseq, 0);
5577 if (PyUnicode_CheckExact(item)) {
5578 Py_INCREF(item);
5579 res = (PyUnicodeObject *)item;
5580 goto Done;
5584 /* At least two items to join, or one that isn't exact Unicode. */
5585 if (seqlen > 1) {
5586 /* Set up sep and seplen -- they're needed. */
5587 if (separator == NULL) {
5588 sep = &blank;
5589 seplen = 1;
5591 else {
5592 internal_separator = PyUnicode_FromObject(separator);
5593 if (internal_separator == NULL)
5594 goto onError;
5595 sep = PyUnicode_AS_UNICODE(internal_separator);
5596 seplen = PyUnicode_GET_SIZE(internal_separator);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen = PySequence_Fast_GET_SIZE(fseq);
5602 /* Get space. */
5603 res = _PyUnicode_New(res_alloc);
5604 if (res == NULL)
5605 goto onError;
5606 res_p = PyUnicode_AS_UNICODE(res);
5607 res_used = 0;
5609 for (i = 0; i < seqlen; ++i) {
5610 Py_ssize_t itemlen;
5611 Py_ssize_t new_res_used;
5613 item = PySequence_Fast_GET_ITEM(fseq, i);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5616 PyErr_Format(PyExc_TypeError,
5617 "sequence item %zd: expected string or Unicode,"
5618 " %.80s found",
5619 i, Py_TYPE(item)->tp_name);
5620 goto onError;
5622 item = PyUnicode_FromObject(item);
5623 if (item == NULL)
5624 goto onError;
5625 /* We own a reference to item from here on. */
5627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen = PySequence_Fast_GET_SIZE(fseq);
5630 /* Make sure we have enough space for the separator and the item. */
5631 itemlen = PyUnicode_GET_SIZE(item);
5632 new_res_used = res_used + itemlen;
5633 if (new_res_used < 0)
5634 goto Overflow;
5635 if (i < seqlen - 1) {
5636 new_res_used += seplen;
5637 if (new_res_used < 0)
5638 goto Overflow;
5640 if (new_res_used > res_alloc) {
5641 /* double allocated size until it's big enough */
5642 do {
5643 res_alloc += res_alloc;
5644 if (res_alloc <= 0)
5645 goto Overflow;
5646 } while (new_res_used > res_alloc);
5647 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5648 Py_DECREF(item);
5649 goto onError;
5651 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5656 res_p += itemlen;
5657 if (i < seqlen - 1) {
5658 Py_UNICODE_COPY(res_p, sep, seplen);
5659 res_p += seplen;
5661 Py_DECREF(item);
5662 res_used = new_res_used;
5665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5668 if (_PyUnicode_Resize(&res, res_used) < 0)
5669 goto onError;
5671 Done:
5672 Py_XDECREF(internal_separator);
5673 Py_DECREF(fseq);
5674 return (PyObject *)res;
5676 Overflow:
5677 PyErr_SetString(PyExc_OverflowError,
5678 "join() result is too long for a Python string");
5679 Py_DECREF(item);
5680 /* fall through */
5682 onError:
5683 Py_XDECREF(internal_separator);
5684 Py_DECREF(fseq);
5685 Py_XDECREF(res);
5686 return NULL;
5689 static
5690 PyUnicodeObject *pad(PyUnicodeObject *self,
5691 Py_ssize_t left,
5692 Py_ssize_t right,
5693 Py_UNICODE fill)
5695 PyUnicodeObject *u;
5697 if (left < 0)
5698 left = 0;
5699 if (right < 0)
5700 right = 0;
5702 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5703 Py_INCREF(self);
5704 return self;
5707 if (left > PY_SSIZE_T_MAX - self->length ||
5708 right > PY_SSIZE_T_MAX - (left + self->length)) {
5709 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710 return NULL;
5712 u = _PyUnicode_New(left + self->length + right);
5713 if (u) {
5714 if (left)
5715 Py_UNICODE_FILL(u->str, fill, left);
5716 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717 if (right)
5718 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5721 return u;
/* Append the slice data[left:right] as a new unicode object to `list`.
   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; control jumps to
   onError if creating the slice or appending it fails. The reference to
   the slice object is dropped in either case after the append attempt.
   Wrapped in do { } while (0) so that the expansion is a single
   statement; use it followed by ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
5735 static
5736 PyObject *split_whitespace(PyUnicodeObject *self,
5737 PyObject *list,
5738 Py_ssize_t maxcount)
5740 register Py_ssize_t i;
5741 register Py_ssize_t j;
5742 Py_ssize_t len = self->length;
5743 PyObject *str;
5744 register const Py_UNICODE *buf = self->str;
5746 for (i = j = 0; i < len; ) {
5747 /* find a token */
5748 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5749 i++;
5750 j = i;
5751 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5752 i++;
5753 if (j < i) {
5754 if (maxcount-- <= 0)
5755 break;
5756 SPLIT_APPEND(buf, j, i);
5757 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5758 i++;
5759 j = i;
5762 if (j < len) {
5763 SPLIT_APPEND(buf, j, len);
5765 return list;
5767 onError:
5768 Py_DECREF(list);
5769 return NULL;
5772 PyObject *PyUnicode_Splitlines(PyObject *string,
5773 int keepends)
5775 register Py_ssize_t i;
5776 register Py_ssize_t j;
5777 Py_ssize_t len;
5778 PyObject *list;
5779 PyObject *str;
5780 Py_UNICODE *data;
5782 string = PyUnicode_FromObject(string);
5783 if (string == NULL)
5784 return NULL;
5785 data = PyUnicode_AS_UNICODE(string);
5786 len = PyUnicode_GET_SIZE(string);
5788 list = PyList_New(0);
5789 if (!list)
5790 goto onError;
5792 for (i = j = 0; i < len; ) {
5793 Py_ssize_t eol;
5795 /* Find a line and append it */
5796 while (i < len && !BLOOM_LINEBREAK(data[i]))
5797 i++;
5799 /* Skip the line break reading CRLF as one line break */
5800 eol = i;
5801 if (i < len) {
5802 if (data[i] == '\r' && i + 1 < len &&
5803 data[i+1] == '\n')
5804 i += 2;
5805 else
5806 i++;
5807 if (keepends)
5808 eol = i;
5810 SPLIT_APPEND(data, j, eol);
5811 j = i;
5813 if (j < len) {
5814 SPLIT_APPEND(data, j, len);
5817 Py_DECREF(string);
5818 return list;
5820 onError:
5821 Py_XDECREF(list);
5822 Py_DECREF(string);
5823 return NULL;
5826 static
5827 PyObject *split_char(PyUnicodeObject *self,
5828 PyObject *list,
5829 Py_UNICODE ch,
5830 Py_ssize_t maxcount)
5832 register Py_ssize_t i;
5833 register Py_ssize_t j;
5834 Py_ssize_t len = self->length;
5835 PyObject *str;
5836 register const Py_UNICODE *buf = self->str;
5838 for (i = j = 0; i < len; ) {
5839 if (buf[i] == ch) {
5840 if (maxcount-- <= 0)
5841 break;
5842 SPLIT_APPEND(buf, j, i);
5843 i = j = i + 1;
5844 } else
5845 i++;
5847 if (j <= len) {
5848 SPLIT_APPEND(buf, j, len);
5850 return list;
5852 onError:
5853 Py_DECREF(list);
5854 return NULL;
5857 static
5858 PyObject *split_substring(PyUnicodeObject *self,
5859 PyObject *list,
5860 PyUnicodeObject *substring,
5861 Py_ssize_t maxcount)
5863 register Py_ssize_t i;
5864 register Py_ssize_t j;
5865 Py_ssize_t len = self->length;
5866 Py_ssize_t sublen = substring->length;
5867 PyObject *str;
5869 for (i = j = 0; i <= len - sublen; ) {
5870 if (Py_UNICODE_MATCH(self, i, substring)) {
5871 if (maxcount-- <= 0)
5872 break;
5873 SPLIT_APPEND(self->str, j, i);
5874 i = j = i + sublen;
5875 } else
5876 i++;
5878 if (j <= len) {
5879 SPLIT_APPEND(self->str, j, len);
5881 return list;
5883 onError:
5884 Py_DECREF(list);
5885 return NULL;
5888 static
5889 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5890 PyObject *list,
5891 Py_ssize_t maxcount)
5893 register Py_ssize_t i;
5894 register Py_ssize_t j;
5895 Py_ssize_t len = self->length;
5896 PyObject *str;
5897 register const Py_UNICODE *buf = self->str;
5899 for (i = j = len - 1; i >= 0; ) {
5900 /* find a token */
5901 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5902 i--;
5903 j = i;
5904 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5905 i--;
5906 if (j > i) {
5907 if (maxcount-- <= 0)
5908 break;
5909 SPLIT_APPEND(buf, i + 1, j + 1);
5910 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5911 i--;
5912 j = i;
5915 if (j >= 0) {
5916 SPLIT_APPEND(buf, 0, j + 1);
5918 if (PyList_Reverse(list) < 0)
5919 goto onError;
5920 return list;
5922 onError:
5923 Py_DECREF(list);
5924 return NULL;
5927 static
5928 PyObject *rsplit_char(PyUnicodeObject *self,
5929 PyObject *list,
5930 Py_UNICODE ch,
5931 Py_ssize_t maxcount)
5933 register Py_ssize_t i;
5934 register Py_ssize_t j;
5935 Py_ssize_t len = self->length;
5936 PyObject *str;
5937 register const Py_UNICODE *buf = self->str;
5939 for (i = j = len - 1; i >= 0; ) {
5940 if (buf[i] == ch) {
5941 if (maxcount-- <= 0)
5942 break;
5943 SPLIT_APPEND(buf, i + 1, j + 1);
5944 j = i = i - 1;
5945 } else
5946 i--;
5948 if (j >= -1) {
5949 SPLIT_APPEND(buf, 0, j + 1);
5951 if (PyList_Reverse(list) < 0)
5952 goto onError;
5953 return list;
5955 onError:
5956 Py_DECREF(list);
5957 return NULL;
5960 static
5961 PyObject *rsplit_substring(PyUnicodeObject *self,
5962 PyObject *list,
5963 PyUnicodeObject *substring,
5964 Py_ssize_t maxcount)
5966 register Py_ssize_t i;
5967 register Py_ssize_t j;
5968 Py_ssize_t len = self->length;
5969 Py_ssize_t sublen = substring->length;
5970 PyObject *str;
5972 for (i = len - sublen, j = len; i >= 0; ) {
5973 if (Py_UNICODE_MATCH(self, i, substring)) {
5974 if (maxcount-- <= 0)
5975 break;
5976 SPLIT_APPEND(self->str, i + sublen, j);
5977 j = i;
5978 i -= sublen;
5979 } else
5980 i--;
5982 if (j >= 0) {
5983 SPLIT_APPEND(self->str, 0, j);
5985 if (PyList_Reverse(list) < 0)
5986 goto onError;
5987 return list;
5989 onError:
5990 Py_DECREF(list);
5991 return NULL;
5994 #undef SPLIT_APPEND
5996 static
5997 PyObject *split(PyUnicodeObject *self,
5998 PyUnicodeObject *substring,
5999 Py_ssize_t maxcount)
6001 PyObject *list;
6003 if (maxcount < 0)
6004 maxcount = PY_SSIZE_T_MAX;
6006 list = PyList_New(0);
6007 if (!list)
6008 return NULL;
6010 if (substring == NULL)
6011 return split_whitespace(self,list,maxcount);
6013 else if (substring->length == 1)
6014 return split_char(self,list,substring->str[0],maxcount);
6016 else if (substring->length == 0) {
6017 Py_DECREF(list);
6018 PyErr_SetString(PyExc_ValueError, "empty separator");
6019 return NULL;
6021 else
6022 return split_substring(self,list,substring,maxcount);
6025 static
6026 PyObject *rsplit(PyUnicodeObject *self,
6027 PyUnicodeObject *substring,
6028 Py_ssize_t maxcount)
6030 PyObject *list;
6032 if (maxcount < 0)
6033 maxcount = PY_SSIZE_T_MAX;
6035 list = PyList_New(0);
6036 if (!list)
6037 return NULL;
6039 if (substring == NULL)
6040 return rsplit_whitespace(self,list,maxcount);
6042 else if (substring->length == 1)
6043 return rsplit_char(self,list,substring->str[0],maxcount);
6045 else if (substring->length == 0) {
6046 Py_DECREF(list);
6047 PyErr_SetString(PyExc_ValueError, "empty separator");
6048 return NULL;
6050 else
6051 return rsplit_substring(self,list,substring,maxcount);
6054 static
6055 PyObject *replace(PyUnicodeObject *self,
6056 PyUnicodeObject *str1,
6057 PyUnicodeObject *str2,
6058 Py_ssize_t maxcount)
6060 PyUnicodeObject *u;
6062 if (maxcount < 0)
6063 maxcount = PY_SSIZE_T_MAX;
6065 if (str1->length == str2->length) {
6066 /* same length */
6067 Py_ssize_t i;
6068 if (str1->length == 1) {
6069 /* replace characters */
6070 Py_UNICODE u1, u2;
6071 if (!findchar(self->str, self->length, str1->str[0]))
6072 goto nothing;
6073 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6074 if (!u)
6075 return NULL;
6076 Py_UNICODE_COPY(u->str, self->str, self->length);
6077 u1 = str1->str[0];
6078 u2 = str2->str[0];
6079 for (i = 0; i < u->length; i++)
6080 if (u->str[i] == u1) {
6081 if (--maxcount < 0)
6082 break;
6083 u->str[i] = u2;
6085 } else {
6086 i = fastsearch(
6087 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6089 if (i < 0)
6090 goto nothing;
6091 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6092 if (!u)
6093 return NULL;
6094 Py_UNICODE_COPY(u->str, self->str, self->length);
6095 while (i <= self->length - str1->length)
6096 if (Py_UNICODE_MATCH(self, i, str1)) {
6097 if (--maxcount < 0)
6098 break;
6099 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6100 i += str1->length;
6101 } else
6102 i++;
6104 } else {
6106 Py_ssize_t n, i, j, e;
6107 Py_ssize_t product, new_size, delta;
6108 Py_UNICODE *p;
6110 /* replace strings */
6111 n = stringlib_count(self->str, self->length, str1->str, str1->length);
6112 if (n > maxcount)
6113 n = maxcount;
6114 if (n == 0)
6115 goto nothing;
6116 /* new_size = self->length + n * (str2->length - str1->length)); */
6117 delta = (str2->length - str1->length);
6118 if (delta == 0) {
6119 new_size = self->length;
6120 } else {
6121 product = n * (str2->length - str1->length);
6122 if ((product / (str2->length - str1->length)) != n) {
6123 PyErr_SetString(PyExc_OverflowError,
6124 "replace string is too long");
6125 return NULL;
6127 new_size = self->length + product;
6128 if (new_size < 0) {
6129 PyErr_SetString(PyExc_OverflowError,
6130 "replace string is too long");
6131 return NULL;
6134 u = _PyUnicode_New(new_size);
6135 if (!u)
6136 return NULL;
6137 i = 0;
6138 p = u->str;
6139 e = self->length - str1->length;
6140 if (str1->length > 0) {
6141 while (n-- > 0) {
6142 /* look for next match */
6143 j = i;
6144 while (j <= e) {
6145 if (Py_UNICODE_MATCH(self, j, str1))
6146 break;
6147 j++;
6149 if (j > i) {
6150 if (j > e)
6151 break;
6152 /* copy unchanged part [i:j] */
6153 Py_UNICODE_COPY(p, self->str+i, j-i);
6154 p += j - i;
6156 /* copy substitution string */
6157 if (str2->length > 0) {
6158 Py_UNICODE_COPY(p, str2->str, str2->length);
6159 p += str2->length;
6161 i = j + str1->length;
6163 if (i < self->length)
6164 /* copy tail [i:] */
6165 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6166 } else {
6167 /* interleave */
6168 while (n > 0) {
6169 Py_UNICODE_COPY(p, str2->str, str2->length);
6170 p += str2->length;
6171 if (--n <= 0)
6172 break;
6173 *p++ = self->str[i++];
6175 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6178 return (PyObject *) u;
6180 nothing:
6181 /* nothing to replace; return original string (when possible) */
6182 if (PyUnicode_CheckExact(self)) {
6183 Py_INCREF(self);
6184 return (PyObject *) self;
6186 return PyUnicode_FromUnicode(self->str, self->length);
6189 /* --- Unicode Object Methods --------------------------------------------- */
6191 PyDoc_STRVAR(title__doc__,
6192 "S.title() -> unicode\n\
6194 Return a titlecased version of S, i.e. words start with title case\n\
6195 characters, all remaining cased characters have lower case.");
6197 static PyObject*
6198 unicode_title(PyUnicodeObject *self)
6200 return fixup(self, fixtitle);
6203 PyDoc_STRVAR(capitalize__doc__,
6204 "S.capitalize() -> unicode\n\
6206 Return a capitalized version of S, i.e. make the first character\n\
6207 have upper case.");
6209 static PyObject*
6210 unicode_capitalize(PyUnicodeObject *self)
6212 return fixup(self, fixcapitalize);
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code (compiled out by the surrounding #if 0): split self at
   whitespace, run fixcapitalize over every word and join the words with
   the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    i = 0;
    while (i < PyList_GET_SIZE(list)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;   /* item is NULL, so NULL is returned below */
        /* swap the list entry for its capitalized version */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
        i++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6253 /* Argument converter. Coerces to a single unicode character */
6255 static int
6256 convert_uc(PyObject *obj, void *addr)
6258 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6259 PyObject *uniobj;
6260 Py_UNICODE *unistr;
6262 uniobj = PyUnicode_FromObject(obj);
6263 if (uniobj == NULL) {
6264 PyErr_SetString(PyExc_TypeError,
6265 "The fill character cannot be converted to Unicode");
6266 return 0;
6268 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6269 PyErr_SetString(PyExc_TypeError,
6270 "The fill character must be exactly one character long");
6271 Py_DECREF(uniobj);
6272 return 0;
6274 unistr = PyUnicode_AS_UNICODE(uniobj);
6275 *fillcharloc = unistr[0];
6276 Py_DECREF(uniobj);
6277 return 1;
6280 PyDoc_STRVAR(center__doc__,
6281 "S.center(width[, fillchar]) -> unicode\n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6286 static PyObject *
6287 unicode_center(PyUnicodeObject *self, PyObject *args)
6289 Py_ssize_t marg, left;
6290 Py_ssize_t width;
6291 Py_UNICODE fillchar = ' ';
6293 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6294 return NULL;
6296 if (self->length >= width && PyUnicode_CheckExact(self)) {
6297 Py_INCREF(self);
6298 return (PyObject*) self;
6301 marg = width - self->length;
6302 left = marg / 2 + (marg & width & 1);
6304 return (PyObject*) pad(self, left, marg - left, fillchar);
6307 #if 0
6309 /* This code should go into some future Unicode collation support
6310 module. The basic comparison should compare ordinals on a naive
6311 basis (this is what Java does and thus Jython too). */
6313 /* speedy UTF-16 code point order comparison */
6314 /* gleaned from: */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6317 static short utf16Fixup[32] =
6319 0, 0, 0, 0, 0, 0, 0, 0,
6320 0, 0, 0, 0, 0, 0, 0, 0,
6321 0, 0, 0, 0, 0, 0, 0, 0,
6322 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6325 static int
6326 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6328 Py_ssize_t len1, len2;
6330 Py_UNICODE *s1 = str1->str;
6331 Py_UNICODE *s2 = str2->str;
6333 len1 = str1->length;
6334 len2 = str2->length;
6336 while (len1 > 0 && len2 > 0) {
6337 Py_UNICODE c1, c2;
6339 c1 = *s1++;
6340 c2 = *s2++;
6342 if (c1 > (1<<11) * 26)
6343 c1 += utf16Fixup[c1>>11];
6344 if (c2 > (1<<11) * 26)
6345 c2 += utf16Fixup[c2>>11];
6346 /* now c1 and c2 are in UTF-32-compatible order */
6348 if (c1 != c2)
6349 return (c1 < c2) ? -1 : 1;
6351 len1--; len2--;
6354 return (len1 < len2) ? -1 : (len1 != len2);
6357 #else
6359 static int
6360 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6362 register Py_ssize_t len1, len2;
6364 Py_UNICODE *s1 = str1->str;
6365 Py_UNICODE *s2 = str2->str;
6367 len1 = str1->length;
6368 len2 = str2->length;
6370 while (len1 > 0 && len2 > 0) {
6371 Py_UNICODE c1, c2;
6373 c1 = *s1++;
6374 c2 = *s2++;
6376 if (c1 != c2)
6377 return (c1 < c2) ? -1 : 1;
6379 len1--; len2--;
6382 return (len1 < len2) ? -1 : (len1 != len2);
6385 #endif
6387 int PyUnicode_Compare(PyObject *left,
6388 PyObject *right)
6390 PyUnicodeObject *u = NULL, *v = NULL;
6391 int result;
6393 /* Coerce the two arguments */
6394 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6395 if (u == NULL)
6396 goto onError;
6397 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6398 if (v == NULL)
6399 goto onError;
6401 /* Shortcut for empty or interned objects */
6402 if (v == u) {
6403 Py_DECREF(u);
6404 Py_DECREF(v);
6405 return 0;
6408 result = unicode_compare(u, v);
6410 Py_DECREF(u);
6411 Py_DECREF(v);
6412 return result;
6414 onError:
6415 Py_XDECREF(u);
6416 Py_XDECREF(v);
6417 return -1;
6420 PyObject *PyUnicode_RichCompare(PyObject *left,
6421 PyObject *right,
6422 int op)
6424 int result;
6426 result = PyUnicode_Compare(left, right);
6427 if (result == -1 && PyErr_Occurred())
6428 goto onError;
6430 /* Convert the return value to a Boolean */
6431 switch (op) {
6432 case Py_EQ:
6433 result = (result == 0);
6434 break;
6435 case Py_NE:
6436 result = (result != 0);
6437 break;
6438 case Py_LE:
6439 result = (result <= 0);
6440 break;
6441 case Py_GE:
6442 result = (result >= 0);
6443 break;
6444 case Py_LT:
6445 result = (result == -1);
6446 break;
6447 case Py_GT:
6448 result = (result == 1);
6449 break;
6451 return PyBool_FromLong(result);
6453 onError:
6455 /* Standard case
6457 Type errors mean that PyUnicode_FromObject() could not convert
6458 one of the arguments (usually the right hand side) to Unicode,
6459 ie. we can't handle the comparison request. However, it is
6460 possible that the other object knows a comparison method, which
6461 is why we return Py_NotImplemented to give the other object a
6462 chance.
6465 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6466 PyErr_Clear();
6467 Py_INCREF(Py_NotImplemented);
6468 return Py_NotImplemented;
6470 if (op != Py_EQ && op != Py_NE)
6471 return NULL;
6473 /* Equality comparison.
6475 This is a special case: we silence any PyExc_UnicodeDecodeError
6476 and instead turn it into a PyErr_UnicodeWarning.
6479 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6480 return NULL;
6481 PyErr_Clear();
6482 if (PyErr_Warn(PyExc_UnicodeWarning,
6483 (op == Py_EQ) ?
6484 "Unicode equal comparison "
6485 "failed to convert both arguments to Unicode - "
6486 "interpreting them as being unequal" :
6487 "Unicode unequal comparison "
6488 "failed to convert both arguments to Unicode - "
6489 "interpreting them as being unequal"
6490 ) < 0)
6491 return NULL;
6492 result = (op == Py_NE);
6493 return PyBool_FromLong(result);
6496 int PyUnicode_Contains(PyObject *container,
6497 PyObject *element)
6499 PyObject *str, *sub;
6500 int result;
6502 /* Coerce the two arguments */
6503 sub = PyUnicode_FromObject(element);
6504 if (!sub) {
6505 PyErr_SetString(PyExc_TypeError,
6506 "'in <string>' requires string as left operand");
6507 return -1;
6510 str = PyUnicode_FromObject(container);
6511 if (!str) {
6512 Py_DECREF(sub);
6513 return -1;
6516 result = stringlib_contains_obj(str, sub);
6518 Py_DECREF(str);
6519 Py_DECREF(sub);
6521 return result;
6524 /* Concat to string or Unicode object giving a new Unicode object. */
6526 PyObject *PyUnicode_Concat(PyObject *left,
6527 PyObject *right)
6529 PyUnicodeObject *u = NULL, *v = NULL, *w;
6531 /* Coerce the two arguments */
6532 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6533 if (u == NULL)
6534 goto onError;
6535 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6536 if (v == NULL)
6537 goto onError;
6539 /* Shortcuts */
6540 if (v == unicode_empty) {
6541 Py_DECREF(v);
6542 return (PyObject *)u;
6544 if (u == unicode_empty) {
6545 Py_DECREF(u);
6546 return (PyObject *)v;
6549 /* Concat the two Unicode strings */
6550 w = _PyUnicode_New(u->length + v->length);
6551 if (w == NULL)
6552 goto onError;
6553 Py_UNICODE_COPY(w->str, u->str, u->length);
6554 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6556 Py_DECREF(u);
6557 Py_DECREF(v);
6558 return (PyObject *)w;
6560 onError:
6561 Py_XDECREF(u);
6562 Py_XDECREF(v);
6563 return NULL;
6566 PyDoc_STRVAR(count__doc__,
6567 "S.count(sub[, start[, end]]) -> int\n\
6569 Return the number of non-overlapping occurrences of substring sub in\n\
6570 Unicode string S[start:end]. Optional arguments start and end are\n\
6571 interpreted as in slice notation.");
6573 static PyObject *
6574 unicode_count(PyUnicodeObject *self, PyObject *args)
6576 PyUnicodeObject *substring;
6577 Py_ssize_t start = 0;
6578 Py_ssize_t end = PY_SSIZE_T_MAX;
6579 PyObject *result;
6581 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6582 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6583 return NULL;
6585 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6586 (PyObject *)substring);
6587 if (substring == NULL)
6588 return NULL;
6590 FIX_START_END(self);
6592 result = PyInt_FromSsize_t(
6593 stringlib_count(self->str + start, end - start,
6594 substring->str, substring->length)
6597 Py_DECREF(substring);
6599 return result;
6602 PyDoc_STRVAR(encode__doc__,
6603 "S.encode([encoding[,errors]]) -> string or unicode\n\
6605 Encodes S using the codec registered for encoding. encoding defaults\n\
6606 to the default encoding. errors may be given to set a different error\n\
6607 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6608 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6609 'xmlcharrefreplace' as well as any other name registered with\n\
6610 codecs.register_error that can handle UnicodeEncodeErrors.");
6612 static PyObject *
6613 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6615 static char *kwlist[] = {"encoding", "errors", 0};
6616 char *encoding = NULL;
6617 char *errors = NULL;
6618 PyObject *v;
6620 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6621 kwlist, &encoding, &errors))
6622 return NULL;
6623 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6624 if (v == NULL)
6625 goto onError;
6626 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6627 PyErr_Format(PyExc_TypeError,
6628 "encoder did not return a string/unicode object "
6629 "(type=%.400s)",
6630 Py_TYPE(v)->tp_name);
6631 Py_DECREF(v);
6632 return NULL;
6634 return v;
6636 onError:
6637 return NULL;
6640 PyDoc_STRVAR(decode__doc__,
6641 "S.decode([encoding[,errors]]) -> string or unicode\n\
6643 Decodes S using the codec registered for encoding. encoding defaults\n\
6644 to the default encoding. errors may be given to set a different error\n\
6645 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6646 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6647 as well as any other name registerd with codecs.register_error that is\n\
6648 able to handle UnicodeDecodeErrors.");
6650 static PyObject *
6651 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6653 static char *kwlist[] = {"encoding", "errors", 0};
6654 char *encoding = NULL;
6655 char *errors = NULL;
6656 PyObject *v;
6658 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6659 kwlist, &encoding, &errors))
6660 return NULL;
6661 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6662 if (v == NULL)
6663 goto onError;
6664 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6665 PyErr_Format(PyExc_TypeError,
6666 "decoder did not return a string/unicode object "
6667 "(type=%.400s)",
6668 Py_TYPE(v)->tp_name);
6669 Py_DECREF(v);
6670 return NULL;
6672 return v;
6674 onError:
6675 return NULL;
6678 PyDoc_STRVAR(expandtabs__doc__,
6679 "S.expandtabs([tabsize]) -> unicode\n\
6681 Return a copy of S where all tab characters are expanded using spaces.\n\
6682 If tabsize is not given, a tab size of 8 characters is assumed.");
6684 static PyObject*
6685 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6687 Py_UNICODE *e;
6688 Py_UNICODE *p;
6689 Py_UNICODE *q;
6690 Py_UNICODE *qe;
6691 Py_ssize_t i, j, incr;
6692 PyUnicodeObject *u;
6693 int tabsize = 8;
6695 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6696 return NULL;
6698 /* First pass: determine size of output string */
6699 i = 0; /* chars up to and including most recent \n or \r */
6700 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6701 e = self->str + self->length; /* end of input */
6702 for (p = self->str; p < e; p++)
6703 if (*p == '\t') {
6704 if (tabsize > 0) {
6705 incr = tabsize - (j % tabsize); /* cannot overflow */
6706 if (j > PY_SSIZE_T_MAX - incr)
6707 goto overflow1;
6708 j += incr;
6711 else {
6712 if (j > PY_SSIZE_T_MAX - 1)
6713 goto overflow1;
6714 j++;
6715 if (*p == '\n' || *p == '\r') {
6716 if (i > PY_SSIZE_T_MAX - j)
6717 goto overflow1;
6718 i += j;
6719 j = 0;
6723 if (i > PY_SSIZE_T_MAX - j)
6724 goto overflow1;
6726 /* Second pass: create output string and fill it */
6727 u = _PyUnicode_New(i + j);
6728 if (!u)
6729 return NULL;
6731 j = 0; /* same as in first pass */
6732 q = u->str; /* next output char */
6733 qe = u->str + u->length; /* end of output */
6735 for (p = self->str; p < e; p++)
6736 if (*p == '\t') {
6737 if (tabsize > 0) {
6738 i = tabsize - (j % tabsize);
6739 j += i;
6740 while (i--) {
6741 if (q >= qe)
6742 goto overflow2;
6743 *q++ = ' ';
6747 else {
6748 if (q >= qe)
6749 goto overflow2;
6750 *q++ = *p;
6751 j++;
6752 if (*p == '\n' || *p == '\r')
6753 j = 0;
6756 return (PyObject*) u;
6758 overflow2:
6759 Py_DECREF(u);
6760 overflow1:
6761 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6762 return NULL;
6765 PyDoc_STRVAR(find__doc__,
6766 "S.find(sub [,start [,end]]) -> int\n\
6768 Return the lowest index in S where substring sub is found,\n\
6769 such that sub is contained within s[start:end]. Optional\n\
6770 arguments start and end are interpreted as in slice notation.\n\
6772 Return -1 on failure.");
6774 static PyObject *
6775 unicode_find(PyUnicodeObject *self, PyObject *args)
6777 PyObject *substring;
6778 Py_ssize_t start;
6779 Py_ssize_t end;
6780 Py_ssize_t result;
6782 if (!_ParseTupleFinds(args, &substring, &start, &end))
6783 return NULL;
6785 result = stringlib_find_slice(
6786 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6787 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6788 start, end
6791 Py_DECREF(substring);
6793 return PyInt_FromSsize_t(result);
6796 static PyObject *
6797 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6799 if (index < 0 || index >= self->length) {
6800 PyErr_SetString(PyExc_IndexError, "string index out of range");
6801 return NULL;
6804 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6807 static long
6808 unicode_hash(PyUnicodeObject *self)
6810 /* Since Unicode objects compare equal to their ASCII string
6811 counterparts, they should use the individual character values
6812 as basis for their hash value. This is needed to assure that
6813 strings and Unicode objects behave in the same way as
6814 dictionary keys. */
6816 register Py_ssize_t len;
6817 register Py_UNICODE *p;
6818 register long x;
6820 if (self->hash != -1)
6821 return self->hash;
6822 len = PyUnicode_GET_SIZE(self);
6823 p = PyUnicode_AS_UNICODE(self);
6824 x = *p << 7;
6825 while (--len >= 0)
6826 x = (1000003*x) ^ *p++;
6827 x ^= PyUnicode_GET_SIZE(self);
6828 if (x == -1)
6829 x = -2;
6830 self->hash = x;
6831 return x;
6834 PyDoc_STRVAR(index__doc__,
6835 "S.index(sub [,start [,end]]) -> int\n\
6837 Like S.find() but raise ValueError when the substring is not found.");
6839 static PyObject *
6840 unicode_index(PyUnicodeObject *self, PyObject *args)
6842 Py_ssize_t result;
6843 PyObject *substring;
6844 Py_ssize_t start;
6845 Py_ssize_t end;
6847 if (!_ParseTupleFinds(args, &substring, &start, &end))
6848 return NULL;
6850 result = stringlib_find_slice(
6851 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6852 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6853 start, end
6856 Py_DECREF(substring);
6858 if (result < 0) {
6859 PyErr_SetString(PyExc_ValueError, "substring not found");
6860 return NULL;
6863 return PyInt_FromSsize_t(result);
6866 PyDoc_STRVAR(islower__doc__,
6867 "S.islower() -> bool\n\
6869 Return True if all cased characters in S are lowercase and there is\n\
6870 at least one cased character in S, False otherwise.");
6872 static PyObject*
6873 unicode_islower(PyUnicodeObject *self)
6875 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6876 register const Py_UNICODE *e;
6877 int cased;
6879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self) == 1)
6881 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6883 /* Special case for empty strings */
6884 if (PyUnicode_GET_SIZE(self) == 0)
6885 return PyBool_FromLong(0);
6887 e = p + PyUnicode_GET_SIZE(self);
6888 cased = 0;
6889 for (; p < e; p++) {
6890 register const Py_UNICODE ch = *p;
6892 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6893 return PyBool_FromLong(0);
6894 else if (!cased && Py_UNICODE_ISLOWER(ch))
6895 cased = 1;
6897 return PyBool_FromLong(cased);
6900 PyDoc_STRVAR(isupper__doc__,
6901 "S.isupper() -> bool\n\
6903 Return True if all cased characters in S are uppercase and there is\n\
6904 at least one cased character in S, False otherwise.");
6906 static PyObject*
6907 unicode_isupper(PyUnicodeObject *self)
6909 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6910 register const Py_UNICODE *e;
6911 int cased;
6913 /* Shortcut for single character strings */
6914 if (PyUnicode_GET_SIZE(self) == 1)
6915 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6917 /* Special case for empty strings */
6918 if (PyUnicode_GET_SIZE(self) == 0)
6919 return PyBool_FromLong(0);
6921 e = p + PyUnicode_GET_SIZE(self);
6922 cased = 0;
6923 for (; p < e; p++) {
6924 register const Py_UNICODE ch = *p;
6926 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6927 return PyBool_FromLong(0);
6928 else if (!cased && Py_UNICODE_ISUPPER(ch))
6929 cased = 1;
6931 return PyBool_FromLong(cased);
6934 PyDoc_STRVAR(istitle__doc__,
6935 "S.istitle() -> bool\n\
6937 Return True if S is a titlecased string and there is at least one\n\
6938 character in S, i.e. upper- and titlecase characters may only\n\
6939 follow uncased characters and lowercase characters only cased ones.\n\
6940 Return False otherwise.");
6942 static PyObject*
6943 unicode_istitle(PyUnicodeObject *self)
6945 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6946 register const Py_UNICODE *e;
6947 int cased, previous_is_cased;
6949 /* Shortcut for single character strings */
6950 if (PyUnicode_GET_SIZE(self) == 1)
6951 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6952 (Py_UNICODE_ISUPPER(*p) != 0));
6954 /* Special case for empty strings */
6955 if (PyUnicode_GET_SIZE(self) == 0)
6956 return PyBool_FromLong(0);
6958 e = p + PyUnicode_GET_SIZE(self);
6959 cased = 0;
6960 previous_is_cased = 0;
6961 for (; p < e; p++) {
6962 register const Py_UNICODE ch = *p;
6964 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6965 if (previous_is_cased)
6966 return PyBool_FromLong(0);
6967 previous_is_cased = 1;
6968 cased = 1;
6970 else if (Py_UNICODE_ISLOWER(ch)) {
6971 if (!previous_is_cased)
6972 return PyBool_FromLong(0);
6973 previous_is_cased = 1;
6974 cased = 1;
6976 else
6977 previous_is_cased = 0;
6979 return PyBool_FromLong(cased);
6982 PyDoc_STRVAR(isspace__doc__,
6983 "S.isspace() -> bool\n\
6985 Return True if all characters in S are whitespace\n\
6986 and there is at least one character in S, False otherwise.");
6988 static PyObject*
6989 unicode_isspace(PyUnicodeObject *self)
6991 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6992 register const Py_UNICODE *e;
6994 /* Shortcut for single character strings */
6995 if (PyUnicode_GET_SIZE(self) == 1 &&
6996 Py_UNICODE_ISSPACE(*p))
6997 return PyBool_FromLong(1);
6999 /* Special case for empty strings */
7000 if (PyUnicode_GET_SIZE(self) == 0)
7001 return PyBool_FromLong(0);
7003 e = p + PyUnicode_GET_SIZE(self);
7004 for (; p < e; p++) {
7005 if (!Py_UNICODE_ISSPACE(*p))
7006 return PyBool_FromLong(0);
7008 return PyBool_FromLong(1);
7011 PyDoc_STRVAR(isalpha__doc__,
7012 "S.isalpha() -> bool\n\
7014 Return True if all characters in S are alphabetic\n\
7015 and there is at least one character in S, False otherwise.");
7017 static PyObject*
7018 unicode_isalpha(PyUnicodeObject *self)
7020 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7021 register const Py_UNICODE *e;
7023 /* Shortcut for single character strings */
7024 if (PyUnicode_GET_SIZE(self) == 1 &&
7025 Py_UNICODE_ISALPHA(*p))
7026 return PyBool_FromLong(1);
7028 /* Special case for empty strings */
7029 if (PyUnicode_GET_SIZE(self) == 0)
7030 return PyBool_FromLong(0);
7032 e = p + PyUnicode_GET_SIZE(self);
7033 for (; p < e; p++) {
7034 if (!Py_UNICODE_ISALPHA(*p))
7035 return PyBool_FromLong(0);
7037 return PyBool_FromLong(1);
7040 PyDoc_STRVAR(isalnum__doc__,
7041 "S.isalnum() -> bool\n\
7043 Return True if all characters in S are alphanumeric\n\
7044 and there is at least one character in S, False otherwise.");
7046 static PyObject*
7047 unicode_isalnum(PyUnicodeObject *self)
7049 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7050 register const Py_UNICODE *e;
7052 /* Shortcut for single character strings */
7053 if (PyUnicode_GET_SIZE(self) == 1 &&
7054 Py_UNICODE_ISALNUM(*p))
7055 return PyBool_FromLong(1);
7057 /* Special case for empty strings */
7058 if (PyUnicode_GET_SIZE(self) == 0)
7059 return PyBool_FromLong(0);
7061 e = p + PyUnicode_GET_SIZE(self);
7062 for (; p < e; p++) {
7063 if (!Py_UNICODE_ISALNUM(*p))
7064 return PyBool_FromLong(0);
7066 return PyBool_FromLong(1);
7069 PyDoc_STRVAR(isdecimal__doc__,
7070 "S.isdecimal() -> bool\n\
7072 Return True if there are only decimal characters in S,\n\
7073 False otherwise.");
7075 static PyObject*
7076 unicode_isdecimal(PyUnicodeObject *self)
7078 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7079 register const Py_UNICODE *e;
7081 /* Shortcut for single character strings */
7082 if (PyUnicode_GET_SIZE(self) == 1 &&
7083 Py_UNICODE_ISDECIMAL(*p))
7084 return PyBool_FromLong(1);
7086 /* Special case for empty strings */
7087 if (PyUnicode_GET_SIZE(self) == 0)
7088 return PyBool_FromLong(0);
7090 e = p + PyUnicode_GET_SIZE(self);
7091 for (; p < e; p++) {
7092 if (!Py_UNICODE_ISDECIMAL(*p))
7093 return PyBool_FromLong(0);
7095 return PyBool_FromLong(1);
7098 PyDoc_STRVAR(isdigit__doc__,
7099 "S.isdigit() -> bool\n\
7101 Return True if all characters in S are digits\n\
7102 and there is at least one character in S, False otherwise.");
7104 static PyObject*
7105 unicode_isdigit(PyUnicodeObject *self)
7107 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7108 register const Py_UNICODE *e;
7110 /* Shortcut for single character strings */
7111 if (PyUnicode_GET_SIZE(self) == 1 &&
7112 Py_UNICODE_ISDIGIT(*p))
7113 return PyBool_FromLong(1);
7115 /* Special case for empty strings */
7116 if (PyUnicode_GET_SIZE(self) == 0)
7117 return PyBool_FromLong(0);
7119 e = p + PyUnicode_GET_SIZE(self);
7120 for (; p < e; p++) {
7121 if (!Py_UNICODE_ISDIGIT(*p))
7122 return PyBool_FromLong(0);
7124 return PyBool_FromLong(1);
7127 PyDoc_STRVAR(isnumeric__doc__,
7128 "S.isnumeric() -> bool\n\
7130 Return True if there are only numeric characters in S,\n\
7131 False otherwise.");
7133 static PyObject*
7134 unicode_isnumeric(PyUnicodeObject *self)
7136 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7137 register const Py_UNICODE *e;
7139 /* Shortcut for single character strings */
7140 if (PyUnicode_GET_SIZE(self) == 1 &&
7141 Py_UNICODE_ISNUMERIC(*p))
7142 return PyBool_FromLong(1);
7144 /* Special case for empty strings */
7145 if (PyUnicode_GET_SIZE(self) == 0)
7146 return PyBool_FromLong(0);
7148 e = p + PyUnicode_GET_SIZE(self);
7149 for (; p < e; p++) {
7150 if (!Py_UNICODE_ISNUMERIC(*p))
7151 return PyBool_FromLong(0);
7153 return PyBool_FromLong(1);
7156 PyDoc_STRVAR(join__doc__,
7157 "S.join(iterable) -> unicode\n\
7159 Return a string which is the concatenation of the strings in the\n\
7160 iterable. The separator between elements is S.");
7162 static PyObject*
7163 unicode_join(PyObject *self, PyObject *data)
7165 return PyUnicode_Join(self, data);
7168 static Py_ssize_t
7169 unicode_length(PyUnicodeObject *self)
7171 return self->length;
7174 PyDoc_STRVAR(ljust__doc__,
7175 "S.ljust(width[, fillchar]) -> int\n\
7177 Return S left-justified in a Unicode string of length width. Padding is\n\
7178 done using the specified fill character (default is a space).");
7180 static PyObject *
7181 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7183 Py_ssize_t width;
7184 Py_UNICODE fillchar = ' ';
7186 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7187 return NULL;
7189 if (self->length >= width && PyUnicode_CheckExact(self)) {
7190 Py_INCREF(self);
7191 return (PyObject*) self;
7194 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7197 PyDoc_STRVAR(lower__doc__,
7198 "S.lower() -> unicode\n\
7200 Return a copy of the string S converted to lowercase.");
7202 static PyObject*
7203 unicode_lower(PyUnicodeObject *self)
7205 return fixup(self, fixlower);
/* Selector values passed as `striptype` to the strip helpers below: they
   choose which end(s) of the string are trimmed. */
7208 #define LEFTSTRIP 0
7209 #define RIGHTSTRIP 1
7210 #define BOTHSTRIP 2
/* PyArg_ParseTuple format strings, one per selector value, in the same
   order as the selectors above. */
7212 /* Arrays indexed by above */
7213 static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
/* Skips the leading three characters ("|O:") of the format string, leaving
   the method name for use in error messages. */
7215 #define STRIPNAME(i) (stripformat[i]+3)
7217 /* externally visible for str.strip(unicode) */
7218 PyObject *
7219 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7221 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7222 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7223 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7224 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7225 Py_ssize_t i, j;
7227 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7229 i = 0;
7230 if (striptype != RIGHTSTRIP) {
7231 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7232 i++;
7236 j = len;
7237 if (striptype != LEFTSTRIP) {
7238 do {
7239 j--;
7240 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7241 j++;
7244 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7245 Py_INCREF(self);
7246 return (PyObject*)self;
7248 else
7249 return PyUnicode_FromUnicode(s+i, j-i);
7253 static PyObject *
7254 do_strip(PyUnicodeObject *self, int striptype)
7256 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7257 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7259 i = 0;
7260 if (striptype != RIGHTSTRIP) {
7261 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7262 i++;
7266 j = len;
7267 if (striptype != LEFTSTRIP) {
7268 do {
7269 j--;
7270 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7271 j++;
7274 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7275 Py_INCREF(self);
7276 return (PyObject*)self;
7278 else
7279 return PyUnicode_FromUnicode(s+i, j-i);
7283 static PyObject *
7284 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7286 PyObject *sep = NULL;
7288 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7289 return NULL;
7291 if (sep != NULL && sep != Py_None) {
7292 if (PyUnicode_Check(sep))
7293 return _PyUnicode_XStrip(self, striptype, sep);
7294 else if (PyString_Check(sep)) {
7295 PyObject *res;
7296 sep = PyUnicode_FromObject(sep);
7297 if (sep==NULL)
7298 return NULL;
7299 res = _PyUnicode_XStrip(self, striptype, sep);
7300 Py_DECREF(sep);
7301 return res;
7303 else {
7304 PyErr_Format(PyExc_TypeError,
7305 "%s arg must be None, unicode or str",
7306 STRIPNAME(striptype));
7307 return NULL;
7311 return do_strip(self, striptype);
7315 PyDoc_STRVAR(strip__doc__,
7316 "S.strip([chars]) -> unicode\n\
7318 Return a copy of the string S with leading and trailing\n\
7319 whitespace removed.\n\
7320 If chars is given and not None, remove characters in chars instead.\n\
7321 If chars is a str, it will be converted to unicode before stripping");
7323 static PyObject *
7324 unicode_strip(PyUnicodeObject *self, PyObject *args)
7326 if (PyTuple_GET_SIZE(args) == 0)
7327 return do_strip(self, BOTHSTRIP); /* Common case */
7328 else
7329 return do_argstrip(self, BOTHSTRIP, args);
7333 PyDoc_STRVAR(lstrip__doc__,
7334 "S.lstrip([chars]) -> unicode\n\
7336 Return a copy of the string S with leading whitespace removed.\n\
7337 If chars is given and not None, remove characters in chars instead.\n\
7338 If chars is a str, it will be converted to unicode before stripping");
7340 static PyObject *
7341 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7343 if (PyTuple_GET_SIZE(args) == 0)
7344 return do_strip(self, LEFTSTRIP); /* Common case */
7345 else
7346 return do_argstrip(self, LEFTSTRIP, args);
7350 PyDoc_STRVAR(rstrip__doc__,
7351 "S.rstrip([chars]) -> unicode\n\
7353 Return a copy of the string S with trailing whitespace removed.\n\
7354 If chars is given and not None, remove characters in chars instead.\n\
7355 If chars is a str, it will be converted to unicode before stripping");
7357 static PyObject *
7358 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7360 if (PyTuple_GET_SIZE(args) == 0)
7361 return do_strip(self, RIGHTSTRIP); /* Common case */
7362 else
7363 return do_argstrip(self, RIGHTSTRIP, args);
7367 static PyObject*
7368 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7370 PyUnicodeObject *u;
7371 Py_UNICODE *p;
7372 Py_ssize_t nchars;
7373 size_t nbytes;
7375 if (len < 0)
7376 len = 0;
7378 if (len == 1 && PyUnicode_CheckExact(str)) {
7379 /* no repeat, return original string */
7380 Py_INCREF(str);
7381 return (PyObject*) str;
7384 /* ensure # of chars needed doesn't overflow int and # of bytes
7385 * needed doesn't overflow size_t
7387 nchars = len * str->length;
7388 if (len && nchars / len != str->length) {
7389 PyErr_SetString(PyExc_OverflowError,
7390 "repeated string is too long");
7391 return NULL;
7393 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7394 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7395 PyErr_SetString(PyExc_OverflowError,
7396 "repeated string is too long");
7397 return NULL;
7399 u = _PyUnicode_New(nchars);
7400 if (!u)
7401 return NULL;
7403 p = u->str;
7405 if (str->length == 1 && len > 0) {
7406 Py_UNICODE_FILL(p, str->str[0], len);
7407 } else {
7408 Py_ssize_t done = 0; /* number of characters copied this far */
7409 if (done < nchars) {
7410 Py_UNICODE_COPY(p, str->str, str->length);
7411 done = str->length;
7413 while (done < nchars) {
7414 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7415 Py_UNICODE_COPY(p+done, p, n);
7416 done += n;
7420 return (PyObject*) u;
7423 PyObject *PyUnicode_Replace(PyObject *obj,
7424 PyObject *subobj,
7425 PyObject *replobj,
7426 Py_ssize_t maxcount)
7428 PyObject *self;
7429 PyObject *str1;
7430 PyObject *str2;
7431 PyObject *result;
7433 self = PyUnicode_FromObject(obj);
7434 if (self == NULL)
7435 return NULL;
7436 str1 = PyUnicode_FromObject(subobj);
7437 if (str1 == NULL) {
7438 Py_DECREF(self);
7439 return NULL;
7441 str2 = PyUnicode_FromObject(replobj);
7442 if (str2 == NULL) {
7443 Py_DECREF(self);
7444 Py_DECREF(str1);
7445 return NULL;
7447 result = replace((PyUnicodeObject *)self,
7448 (PyUnicodeObject *)str1,
7449 (PyUnicodeObject *)str2,
7450 maxcount);
7451 Py_DECREF(self);
7452 Py_DECREF(str1);
7453 Py_DECREF(str2);
7454 return result;
7457 PyDoc_STRVAR(replace__doc__,
7458 "S.replace (old, new[, count]) -> unicode\n\
7460 Return a copy of S with all occurrences of substring\n\
7461 old replaced by new. If the optional argument count is\n\
7462 given, only the first count occurrences are replaced.");
7464 static PyObject*
7465 unicode_replace(PyUnicodeObject *self, PyObject *args)
7467 PyUnicodeObject *str1;
7468 PyUnicodeObject *str2;
7469 Py_ssize_t maxcount = -1;
7470 PyObject *result;
7472 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7473 return NULL;
7474 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7475 if (str1 == NULL)
7476 return NULL;
7477 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7478 if (str2 == NULL) {
7479 Py_DECREF(str1);
7480 return NULL;
7483 result = replace(self, str1, str2, maxcount);
7485 Py_DECREF(str1);
7486 Py_DECREF(str2);
7487 return result;
/* unicode_repr: hands the object's character buffer and its length to
   unicodeescape_string() and returns that function's result.
   NOTE(review): the final argument of the call and its closing
   parenthesis are not present in this listing -- confirm them against the
   upstream file before relying on this block. */
7490 static
7491 PyObject *unicode_repr(PyObject *unicode)
7493 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7494 PyUnicode_GET_SIZE(unicode),
7498 PyDoc_STRVAR(rfind__doc__,
7499 "S.rfind(sub [,start [,end]]) -> int\n\
7501 Return the highest index in S where substring sub is found,\n\
7502 such that sub is contained within s[start:end]. Optional\n\
7503 arguments start and end are interpreted as in slice notation.\n\
7505 Return -1 on failure.");
7507 static PyObject *
7508 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7510 PyObject *substring;
7511 Py_ssize_t start;
7512 Py_ssize_t end;
7513 Py_ssize_t result;
7515 if (!_ParseTupleFinds(args, &substring, &start, &end))
7516 return NULL;
7518 result = stringlib_rfind_slice(
7519 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7520 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7521 start, end
7524 Py_DECREF(substring);
7526 return PyInt_FromSsize_t(result);
7529 PyDoc_STRVAR(rindex__doc__,
7530 "S.rindex(sub [,start [,end]]) -> int\n\
7532 Like S.rfind() but raise ValueError when the substring is not found.");
7534 static PyObject *
7535 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7537 PyObject *substring;
7538 Py_ssize_t start;
7539 Py_ssize_t end;
7540 Py_ssize_t result;
7542 if (!_ParseTupleFinds(args, &substring, &start, &end))
7543 return NULL;
7545 result = stringlib_rfind_slice(
7546 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7547 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7548 start, end
7551 Py_DECREF(substring);
7553 if (result < 0) {
7554 PyErr_SetString(PyExc_ValueError, "substring not found");
7555 return NULL;
7557 return PyInt_FromSsize_t(result);
7560 PyDoc_STRVAR(rjust__doc__,
7561 "S.rjust(width[, fillchar]) -> unicode\n\
7563 Return S right-justified in a Unicode string of length width. Padding is\n\
7564 done using the specified fill character (default is a space).");
7566 static PyObject *
7567 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7569 Py_ssize_t width;
7570 Py_UNICODE fillchar = ' ';
7572 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7573 return NULL;
7575 if (self->length >= width && PyUnicode_CheckExact(self)) {
7576 Py_INCREF(self);
7577 return (PyObject*) self;
7580 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7583 static PyObject*
7584 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7586 /* standard clamping */
7587 if (start < 0)
7588 start = 0;
7589 if (end < 0)
7590 end = 0;
7591 if (end > self->length)
7592 end = self->length;
7593 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7594 /* full slice, return original string */
7595 Py_INCREF(self);
7596 return (PyObject*) self;
7598 if (start > end)
7599 start = end;
7600 /* copy slice */
7601 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7602 end - start);
7605 PyObject *PyUnicode_Split(PyObject *s,
7606 PyObject *sep,
7607 Py_ssize_t maxsplit)
7609 PyObject *result;
7611 s = PyUnicode_FromObject(s);
7612 if (s == NULL)
7613 return NULL;
7614 if (sep != NULL) {
7615 sep = PyUnicode_FromObject(sep);
7616 if (sep == NULL) {
7617 Py_DECREF(s);
7618 return NULL;
7622 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7624 Py_DECREF(s);
7625 Py_XDECREF(sep);
7626 return result;
7629 PyDoc_STRVAR(split__doc__,
7630 "S.split([sep [,maxsplit]]) -> list of strings\n\
7632 Return a list of the words in S, using sep as the\n\
7633 delimiter string. If maxsplit is given, at most maxsplit\n\
7634 splits are done. If sep is not specified or is None, any\n\
7635 whitespace string is a separator and empty strings are\n\
7636 removed from the result.");
7638 static PyObject*
7639 unicode_split(PyUnicodeObject *self, PyObject *args)
7641 PyObject *substring = Py_None;
7642 Py_ssize_t maxcount = -1;
7644 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7645 return NULL;
7647 if (substring == Py_None)
7648 return split(self, NULL, maxcount);
7649 else if (PyUnicode_Check(substring))
7650 return split(self, (PyUnicodeObject *)substring, maxcount);
7651 else
7652 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7655 PyObject *
7656 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7658 PyObject* str_obj;
7659 PyObject* sep_obj;
7660 PyObject* out;
7662 str_obj = PyUnicode_FromObject(str_in);
7663 if (!str_obj)
7664 return NULL;
7665 sep_obj = PyUnicode_FromObject(sep_in);
7666 if (!sep_obj) {
7667 Py_DECREF(str_obj);
7668 return NULL;
7671 out = stringlib_partition(
7672 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7673 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7676 Py_DECREF(sep_obj);
7677 Py_DECREF(str_obj);
7679 return out;
7683 PyObject *
7684 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7686 PyObject* str_obj;
7687 PyObject* sep_obj;
7688 PyObject* out;
7690 str_obj = PyUnicode_FromObject(str_in);
7691 if (!str_obj)
7692 return NULL;
7693 sep_obj = PyUnicode_FromObject(sep_in);
7694 if (!sep_obj) {
7695 Py_DECREF(str_obj);
7696 return NULL;
7699 out = stringlib_rpartition(
7700 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7701 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7704 Py_DECREF(sep_obj);
7705 Py_DECREF(str_obj);
7707 return out;
7710 PyDoc_STRVAR(partition__doc__,
7711 "S.partition(sep) -> (head, sep, tail)\n\
7713 Search for the separator sep in S, and return the part before it,\n\
7714 the separator itself, and the part after it. If the separator is not\n\
7715 found, return S and two empty strings.");
7717 static PyObject*
7718 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7720 return PyUnicode_Partition((PyObject *)self, separator);
7723 PyDoc_STRVAR(rpartition__doc__,
7724 "S.rpartition(sep) -> (tail, sep, head)\n\
7726 Search for the separator sep in S, starting at the end of S, and return\n\
7727 the part before it, the separator itself, and the part after it. If the\n\
7728 separator is not found, return two empty strings and S.");
7730 static PyObject*
7731 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7733 return PyUnicode_RPartition((PyObject *)self, separator);
7736 PyObject *PyUnicode_RSplit(PyObject *s,
7737 PyObject *sep,
7738 Py_ssize_t maxsplit)
7740 PyObject *result;
7742 s = PyUnicode_FromObject(s);
7743 if (s == NULL)
7744 return NULL;
7745 if (sep != NULL) {
7746 sep = PyUnicode_FromObject(sep);
7747 if (sep == NULL) {
7748 Py_DECREF(s);
7749 return NULL;
7753 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7755 Py_DECREF(s);
7756 Py_XDECREF(sep);
7757 return result;
7760 PyDoc_STRVAR(rsplit__doc__,
7761 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7763 Return a list of the words in S, using sep as the\n\
7764 delimiter string, starting at the end of the string and\n\
7765 working to the front. If maxsplit is given, at most maxsplit\n\
7766 splits are done. If sep is not specified, any whitespace string\n\
7767 is a separator.");
7769 static PyObject*
7770 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7772 PyObject *substring = Py_None;
7773 Py_ssize_t maxcount = -1;
7775 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7776 return NULL;
7778 if (substring == Py_None)
7779 return rsplit(self, NULL, maxcount);
7780 else if (PyUnicode_Check(substring))
7781 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7782 else
7783 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7786 PyDoc_STRVAR(splitlines__doc__,
7787 "S.splitlines([keepends]) -> list of strings\n\
7789 Return a list of the lines in S, breaking at line boundaries.\n\
7790 Line breaks are not included in the resulting list unless keepends\n\
7791 is given and true.");
7793 static PyObject*
7794 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7796 int keepends = 0;
7798 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7799 return NULL;
7801 return PyUnicode_Splitlines((PyObject *)self, keepends);
7804 static
7805 PyObject *unicode_str(PyUnicodeObject *self)
7807 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7810 PyDoc_STRVAR(swapcase__doc__,
7811 "S.swapcase() -> unicode\n\
7813 Return a copy of S with uppercase characters converted to lowercase\n\
7814 and vice versa.");
7816 static PyObject*
7817 unicode_swapcase(PyUnicodeObject *self)
7819 return fixup(self, fixswapcase);
7822 PyDoc_STRVAR(translate__doc__,
7823 "S.translate(table) -> unicode\n\
7825 Return a copy of the string S, where all characters have been mapped\n\
7826 through the given translation table, which must be a mapping of\n\
7827 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7828 Unmapped characters are left untouched. Characters mapped to None\n\
7829 are deleted.");
7831 static PyObject*
7832 unicode_translate(PyUnicodeObject *self, PyObject *table)
7834 return PyUnicode_TranslateCharmap(self->str,
7835 self->length,
7836 table,
7837 "ignore");
7840 PyDoc_STRVAR(upper__doc__,
7841 "S.upper() -> unicode\n\
7843 Return a copy of S converted to uppercase.");
7845 static PyObject*
7846 unicode_upper(PyUnicodeObject *self)
7848 return fixup(self, fixupper);
7851 PyDoc_STRVAR(zfill__doc__,
7852 "S.zfill(width) -> unicode\n\
7854 Pad a numeric string S with zeros on the left, to fill a field\n\
7855 of the specified width. The string S is never truncated.");
7857 static PyObject *
7858 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7860 Py_ssize_t fill;
7861 PyUnicodeObject *u;
7863 Py_ssize_t width;
7864 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7865 return NULL;
7867 if (self->length >= width) {
7868 if (PyUnicode_CheckExact(self)) {
7869 Py_INCREF(self);
7870 return (PyObject*) self;
7872 else
7873 return PyUnicode_FromUnicode(
7874 PyUnicode_AS_UNICODE(self),
7875 PyUnicode_GET_SIZE(self)
7879 fill = width - self->length;
7881 u = pad(self, fill, 0, '0');
7883 if (u == NULL)
7884 return NULL;
7886 if (u->str[fill] == '+' || u->str[fill] == '-') {
7887 /* move sign to beginning of string */
7888 u->str[0] = u->str[fill];
7889 u->str[fill] = '0';
7892 return (PyObject*) u;
7895 #if 0
7896 static PyObject*
7897 free_listsize(PyUnicodeObject *self)
7899 return PyInt_FromLong(numfree);
7901 #endif
7903 PyDoc_STRVAR(startswith__doc__,
7904 "S.startswith(prefix[, start[, end]]) -> bool\n\
7906 Return True if S starts with the specified prefix, False otherwise.\n\
7907 With optional start, test S beginning at that position.\n\
7908 With optional end, stop comparing S at that position.\n\
7909 prefix can also be a tuple of strings to try.");
7911 static PyObject *
7912 unicode_startswith(PyUnicodeObject *self,
7913 PyObject *args)
7915 PyObject *subobj;
7916 PyUnicodeObject *substring;
7917 Py_ssize_t start = 0;
7918 Py_ssize_t end = PY_SSIZE_T_MAX;
7919 int result;
7921 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7922 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7923 return NULL;
7924 if (PyTuple_Check(subobj)) {
7925 Py_ssize_t i;
7926 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7927 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7928 PyTuple_GET_ITEM(subobj, i));
7929 if (substring == NULL)
7930 return NULL;
7931 result = tailmatch(self, substring, start, end, -1);
7932 Py_DECREF(substring);
7933 if (result) {
7934 Py_RETURN_TRUE;
7937 /* nothing matched */
7938 Py_RETURN_FALSE;
7940 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7941 if (substring == NULL)
7942 return NULL;
7943 result = tailmatch(self, substring, start, end, -1);
7944 Py_DECREF(substring);
7945 return PyBool_FromLong(result);
7949 PyDoc_STRVAR(endswith__doc__,
7950 "S.endswith(suffix[, start[, end]]) -> bool\n\
7952 Return True if S ends with the specified suffix, False otherwise.\n\
7953 With optional start, test S beginning at that position.\n\
7954 With optional end, stop comparing S at that position.\n\
7955 suffix can also be a tuple of strings to try.");
7957 static PyObject *
7958 unicode_endswith(PyUnicodeObject *self,
7959 PyObject *args)
7961 PyObject *subobj;
7962 PyUnicodeObject *substring;
7963 Py_ssize_t start = 0;
7964 Py_ssize_t end = PY_SSIZE_T_MAX;
7965 int result;
7967 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7968 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7969 return NULL;
7970 if (PyTuple_Check(subobj)) {
7971 Py_ssize_t i;
7972 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7973 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7974 PyTuple_GET_ITEM(subobj, i));
7975 if (substring == NULL)
7976 return NULL;
7977 result = tailmatch(self, substring, start, end, +1);
7978 Py_DECREF(substring);
7979 if (result) {
7980 Py_RETURN_TRUE;
7983 Py_RETURN_FALSE;
7985 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7986 if (substring == NULL)
7987 return NULL;
7989 result = tailmatch(self, substring, start, end, +1);
7990 Py_DECREF(substring);
7991 return PyBool_FromLong(result);
7995 /* Implements do_string_format, which is unicode because of stringlib */
7996 #include "stringlib/string_format.h"
7998 PyDoc_STRVAR(format__doc__,
7999 "S.format(*args, **kwargs) -> unicode\n\
8003 static PyObject *
8004 unicode__format__(PyObject *self, PyObject *args)
8006 PyObject *format_spec;
8007 PyObject *result = NULL;
8008 PyObject *tmp = NULL;
8010 /* If 2.x, convert format_spec to the same type as value */
8011 /* This is to allow things like u''.format('') */
8012 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
8013 goto done;
8014 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
8015 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
8016 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
8017 goto done;
8019 tmp = PyObject_Unicode(format_spec);
8020 if (tmp == NULL)
8021 goto done;
8022 format_spec = tmp;
8024 result = _PyUnicode_FormatAdvanced(self,
8025 PyUnicode_AS_UNICODE(format_spec),
8026 PyUnicode_GET_SIZE(format_spec));
8027 done:
8028 Py_XDECREF(tmp);
8029 return result;
8032 PyDoc_STRVAR(p_format__doc__,
8033 "S.__format__(format_spec) -> unicode\n\
8037 static PyObject *
8038 unicode__sizeof__(PyUnicodeObject *v)
8040 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
8041 sizeof(Py_UNICODE) * (v->length + 1));
8044 PyDoc_STRVAR(sizeof__doc__,
8045 "S.__sizeof__() -> size of S in memory, in bytes\n\
8049 static PyObject *
8050 unicode_getnewargs(PyUnicodeObject *v)
8052 return Py_BuildValue("(u#)", v->str, v->length);
8056 static PyMethodDef unicode_methods[] = {
8058 /* Order is according to common usage: often used methods should
8059 appear first, since lookup is done sequentially. */
8061 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
8062 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8063 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8064 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8065 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8066 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8067 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8068 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8069 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8070 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8071 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8072 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8073 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8074 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8075 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8076 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8077 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
8078 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8079 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8080 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8081 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8082 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8083 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8084 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8085 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8086 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8087 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8088 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8089 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8090 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8091 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8092 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8093 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8094 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8095 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8096 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8097 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8098 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8099 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8100 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8101 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8102 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8103 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8104 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8105 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8106 #if 0
8107 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8108 #endif
8110 #if 0
8111 /* This one is just used for debugging the implementation. */
8112 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8113 #endif
8115 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
8116 {NULL, NULL}
8119 static PyObject *
8120 unicode_mod(PyObject *v, PyObject *w)
8122 if (!PyUnicode_Check(v)) {
8123 Py_INCREF(Py_NotImplemented);
8124 return Py_NotImplemented;
8126 return PyUnicode_Format(v, w);
8129 static PyNumberMethods unicode_as_number = {
8130 0, /*nb_add*/
8131 0, /*nb_subtract*/
8132 0, /*nb_multiply*/
8133 0, /*nb_divide*/
8134 unicode_mod, /*nb_remainder*/
8137 static PySequenceMethods unicode_as_sequence = {
8138 (lenfunc) unicode_length, /* sq_length */
8139 PyUnicode_Concat, /* sq_concat */
8140 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8141 (ssizeargfunc) unicode_getitem, /* sq_item */
8142 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8143 0, /* sq_ass_item */
8144 0, /* sq_ass_slice */
8145 PyUnicode_Contains, /* sq_contains */
8148 static PyObject*
8149 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8151 if (PyIndex_Check(item)) {
8152 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8153 if (i == -1 && PyErr_Occurred())
8154 return NULL;
8155 if (i < 0)
8156 i += PyUnicode_GET_SIZE(self);
8157 return unicode_getitem(self, i);
8158 } else if (PySlice_Check(item)) {
8159 Py_ssize_t start, stop, step, slicelength, cur, i;
8160 Py_UNICODE* source_buf;
8161 Py_UNICODE* result_buf;
8162 PyObject* result;
8164 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8165 &start, &stop, &step, &slicelength) < 0) {
8166 return NULL;
8169 if (slicelength <= 0) {
8170 return PyUnicode_FromUnicode(NULL, 0);
8171 } else if (start == 0 && step == 1 && slicelength == self->length &&
8172 PyUnicode_CheckExact(self)) {
8173 Py_INCREF(self);
8174 return (PyObject *)self;
8175 } else if (step == 1) {
8176 return PyUnicode_FromUnicode(self->str + start, slicelength);
8177 } else {
8178 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8179 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8180 sizeof(Py_UNICODE));
8182 if (result_buf == NULL)
8183 return PyErr_NoMemory();
8185 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8186 result_buf[i] = source_buf[cur];
8189 result = PyUnicode_FromUnicode(result_buf, slicelength);
8190 PyObject_FREE(result_buf);
8191 return result;
8193 } else {
8194 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8195 return NULL;
8199 static PyMappingMethods unicode_as_mapping = {
8200 (lenfunc)unicode_length, /* mp_length */
8201 (binaryfunc)unicode_subscript, /* mp_subscript */
8202 (objobjargproc)0, /* mp_ass_subscript */
8205 static Py_ssize_t
8206 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8207 Py_ssize_t index,
8208 const void **ptr)
8210 if (index != 0) {
8211 PyErr_SetString(PyExc_SystemError,
8212 "accessing non-existent unicode segment");
8213 return -1;
8215 *ptr = (void *) self->str;
8216 return PyUnicode_GET_DATA_SIZE(self);
8219 static Py_ssize_t
8220 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8221 const void **ptr)
8223 PyErr_SetString(PyExc_TypeError,
8224 "cannot use unicode as modifiable buffer");
8225 return -1;
8228 static int
8229 unicode_buffer_getsegcount(PyUnicodeObject *self,
8230 Py_ssize_t *lenp)
8232 if (lenp)
8233 *lenp = PyUnicode_GET_DATA_SIZE(self);
8234 return 1;
8237 static Py_ssize_t
8238 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8239 Py_ssize_t index,
8240 const void **ptr)
8242 PyObject *str;
8244 if (index != 0) {
8245 PyErr_SetString(PyExc_SystemError,
8246 "accessing non-existent unicode segment");
8247 return -1;
8249 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8250 if (str == NULL)
8251 return -1;
8252 *ptr = (void *) PyString_AS_STRING(str);
8253 return PyString_GET_SIZE(str);
8256 /* Helpers for PyUnicode_Format() */
8258 static PyObject *
8259 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8261 Py_ssize_t argidx = *p_argidx;
8262 if (argidx < arglen) {
8263 (*p_argidx)++;
8264 if (arglen < 0)
8265 return args;
8266 else
8267 return PyTuple_GetItem(args, argidx);
8269 PyErr_SetString(PyExc_TypeError,
8270 "not enough arguments for format string");
8271 return NULL;
8274 #define F_LJUST (1<<0)
8275 #define F_SIGN (1<<1)
8276 #define F_BLANK (1<<2)
8277 #define F_ALT (1<<3)
8278 #define F_ZERO (1<<4)
8280 static Py_ssize_t
8281 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8283 register Py_ssize_t i;
8284 Py_ssize_t len = strlen(charbuffer);
8285 for (i = len - 1; i >= 0; i--)
8286 buffer[i] = (Py_UNICODE) charbuffer[i];
8288 return len;
8291 static int
8292 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8294 Py_ssize_t result;
8296 PyOS_snprintf((char *)buffer, len, format, x);
8297 result = strtounicode(buffer, (char *)buffer);
8298 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8301 /* XXX To save some code duplication, formatfloat/long/int could have been
8302 shared with stringobject.c, converting from 8-bit to Unicode after the
8303 formatting is done. */
8305 static int
8306 formatfloat(Py_UNICODE *buf,
8307 size_t buflen,
8308 int flags,
8309 int prec,
8310 int type,
8311 PyObject *v)
8313 double x;
8314 Py_ssize_t result;
8315 char *tmp;
8317 x = PyFloat_AsDouble(v);
8318 if (x == -1.0 && PyErr_Occurred())
8319 return -1;
8320 if (prec < 0)
8321 prec = 6;
8322 #if SIZEOF_INT > 4
8323 /* make sure that the decimal representation of precision really does
8324 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8325 if (prec > 0x7fffffff) {
8326 PyErr_SetString(PyExc_OverflowError,
8327 "outrageously large precision "
8328 "for formatted float");
8329 return -1;
8331 #endif
8333 if (type == 'f' && fabs(x) >= 1e50)
8334 type = 'g';
8335 /* Worst case length calc to ensure no buffer overrun:
8337 'g' formats:
8338 fmt = %#.<prec>g
8339 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8340 for any double rep.)
8341 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8343 'f' formats:
8344 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8345 len = 1 + 50 + 1 + prec = 52 + prec
8347 If prec=0 the effective precision is 1 (the leading digit is
8348 always given), therefore increase the length by one.
8351 if (((type == 'g' || type == 'G') &&
8352 buflen <= (size_t)10 + (size_t)prec) ||
8353 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8354 PyErr_SetString(PyExc_OverflowError,
8355 "formatted float is too long (precision too large?)");
8356 return -1;
8359 tmp = PyOS_double_to_string(x, type, prec,
8360 (flags&F_ALT)?Py_DTSF_ALT:0, NULL);
8361 if (!tmp)
8362 return -1;
8364 result = strtounicode(buf, tmp);
8365 PyMem_Free(tmp);
8366 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8369 static PyObject*
8370 formatlong(PyObject *val, int flags, int prec, int type)
8372 char *buf;
8373 int i, len;
8374 PyObject *str; /* temporary string object. */
8375 PyUnicodeObject *result;
8377 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8378 if (!str)
8379 return NULL;
8380 result = _PyUnicode_New(len);
8381 if (!result) {
8382 Py_DECREF(str);
8383 return NULL;
8385 for (i = 0; i < len; i++)
8386 result->str[i] = buf[i];
8387 result->str[len] = 0;
8388 Py_DECREF(str);
8389 return (PyObject*)result;
8392 static int
8393 formatint(Py_UNICODE *buf,
8394 size_t buflen,
8395 int flags,
8396 int prec,
8397 int type,
8398 PyObject *v)
8400 /* fmt = '%#.' + `prec` + 'l' + `type`
8401 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8402 * + 1 + 1
8403 * = 24
8405 char fmt[64]; /* plenty big enough! */
8406 char *sign;
8407 long x;
8409 x = PyInt_AsLong(v);
8410 if (x == -1 && PyErr_Occurred())
8411 return -1;
8412 if (x < 0 && type == 'u') {
8413 type = 'd';
8415 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8416 sign = "-";
8417 else
8418 sign = "";
8419 if (prec < 0)
8420 prec = 1;
8422 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8423 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8425 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8426 PyErr_SetString(PyExc_OverflowError,
8427 "formatted integer is too long (precision too large?)");
8428 return -1;
8431 if ((flags & F_ALT) &&
8432 (type == 'x' || type == 'X')) {
8433 /* When converting under %#x or %#X, there are a number
8434 * of issues that cause pain:
8435 * - when 0 is being converted, the C standard leaves off
8436 * the '0x' or '0X', which is inconsistent with other
8437 * %#x/%#X conversions and inconsistent with Python's
8438 * hex() function
8439 * - there are platforms that violate the standard and
8440 * convert 0 with the '0x' or '0X'
8441 * (Metrowerks, Compaq Tru64)
8442 * - there are platforms that give '0x' when converting
8443 * under %#X, but convert 0 in accordance with the
8444 * standard (OS/2 EMX)
8446 * We can achieve the desired consistency by inserting our
8447 * own '0x' or '0X' prefix, and substituting %x/%X in place
8448 * of %#x/%#X.
8450 * Note that this is the same approach as used in
8451 * formatint() in stringobject.c
8453 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8454 sign, type, prec, type);
8456 else {
8457 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8458 sign, (flags&F_ALT) ? "#" : "",
8459 prec, type);
8461 if (sign[0])
8462 return longtounicode(buf, buflen, fmt, -x);
8463 else
8464 return longtounicode(buf, buflen, fmt, x);
8467 static int
8468 formatchar(Py_UNICODE *buf,
8469 size_t buflen,
8470 PyObject *v)
8472 /* presume that the buffer is at least 2 characters long */
8473 if (PyUnicode_Check(v)) {
8474 if (PyUnicode_GET_SIZE(v) != 1)
8475 goto onError;
8476 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8479 else if (PyString_Check(v)) {
8480 if (PyString_GET_SIZE(v) != 1)
8481 goto onError;
8482 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8485 else {
8486 /* Integer input truncated to a character */
8487 long x;
8488 x = PyInt_AsLong(v);
8489 if (x == -1 && PyErr_Occurred())
8490 goto onError;
8491 #ifdef Py_UNICODE_WIDE
8492 if (x < 0 || x > 0x10ffff) {
8493 PyErr_SetString(PyExc_OverflowError,
8494 "%c arg not in range(0x110000) "
8495 "(wide Python build)");
8496 return -1;
8498 #else
8499 if (x < 0 || x > 0xffff) {
8500 PyErr_SetString(PyExc_OverflowError,
8501 "%c arg not in range(0x10000) "
8502 "(narrow Python build)");
8503 return -1;
8505 #endif
8506 buf[0] = (Py_UNICODE) x;
8508 buf[1] = '\0';
8509 return 1;
8511 onError:
8512 PyErr_SetString(PyExc_TypeError,
8513 "%c requires int or char");
8514 return -1;
8517 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8519 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8520 chars are formatted. XXX This is a magic number. Each formatting
8521 routine does bounds checking to ensure no overflow, but a better
8522 solution may be to malloc a buffer of appropriate size for each
8523 format. For now, the current solution is sufficient.
8525 #define FORMATBUFLEN (size_t)120
8527 PyObject *PyUnicode_Format(PyObject *format,
8528 PyObject *args)
8530 Py_UNICODE *fmt, *res;
8531 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8532 int args_owned = 0;
8533 PyUnicodeObject *result = NULL;
8534 PyObject *dict = NULL;
8535 PyObject *uformat;
8537 if (format == NULL || args == NULL) {
8538 PyErr_BadInternalCall();
8539 return NULL;
8541 uformat = PyUnicode_FromObject(format);
8542 if (uformat == NULL)
8543 return NULL;
8544 fmt = PyUnicode_AS_UNICODE(uformat);
8545 fmtcnt = PyUnicode_GET_SIZE(uformat);
8547 reslen = rescnt = fmtcnt + 100;
8548 result = _PyUnicode_New(reslen);
8549 if (result == NULL)
8550 goto onError;
8551 res = PyUnicode_AS_UNICODE(result);
8553 if (PyTuple_Check(args)) {
8554 arglen = PyTuple_Size(args);
8555 argidx = 0;
8557 else {
8558 arglen = -1;
8559 argidx = -2;
8561 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8562 !PyObject_TypeCheck(args, &PyBaseString_Type))
8563 dict = args;
8565 while (--fmtcnt >= 0) {
8566 if (*fmt != '%') {
8567 if (--rescnt < 0) {
8568 rescnt = fmtcnt + 100;
8569 reslen += rescnt;
8570 if (_PyUnicode_Resize(&result, reslen) < 0)
8571 goto onError;
8572 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8573 --rescnt;
8575 *res++ = *fmt++;
8577 else {
8578 /* Got a format specifier */
8579 int flags = 0;
8580 Py_ssize_t width = -1;
8581 int prec = -1;
8582 Py_UNICODE c = '\0';
8583 Py_UNICODE fill;
8584 int isnumok;
8585 PyObject *v = NULL;
8586 PyObject *temp = NULL;
8587 Py_UNICODE *pbuf;
8588 Py_UNICODE sign;
8589 Py_ssize_t len;
8590 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8592 fmt++;
8593 if (*fmt == '(') {
8594 Py_UNICODE *keystart;
8595 Py_ssize_t keylen;
8596 PyObject *key;
8597 int pcount = 1;
8599 if (dict == NULL) {
8600 PyErr_SetString(PyExc_TypeError,
8601 "format requires a mapping");
8602 goto onError;
8604 ++fmt;
8605 --fmtcnt;
8606 keystart = fmt;
8607 /* Skip over balanced parentheses */
8608 while (pcount > 0 && --fmtcnt >= 0) {
8609 if (*fmt == ')')
8610 --pcount;
8611 else if (*fmt == '(')
8612 ++pcount;
8613 fmt++;
8615 keylen = fmt - keystart - 1;
8616 if (fmtcnt < 0 || pcount > 0) {
8617 PyErr_SetString(PyExc_ValueError,
8618 "incomplete format key");
8619 goto onError;
8621 #if 0
8622 /* keys are converted to strings using UTF-8 and
8623 then looked up since Python uses strings to hold
8624 variables names etc. in its namespaces and we
8625 wouldn't want to break common idioms. */
8626 key = PyUnicode_EncodeUTF8(keystart,
8627 keylen,
8628 NULL);
8629 #else
8630 key = PyUnicode_FromUnicode(keystart, keylen);
8631 #endif
8632 if (key == NULL)
8633 goto onError;
8634 if (args_owned) {
8635 Py_DECREF(args);
8636 args_owned = 0;
8638 args = PyObject_GetItem(dict, key);
8639 Py_DECREF(key);
8640 if (args == NULL) {
8641 goto onError;
8643 args_owned = 1;
8644 arglen = -1;
8645 argidx = -2;
8647 while (--fmtcnt >= 0) {
8648 switch (c = *fmt++) {
8649 case '-': flags |= F_LJUST; continue;
8650 case '+': flags |= F_SIGN; continue;
8651 case ' ': flags |= F_BLANK; continue;
8652 case '#': flags |= F_ALT; continue;
8653 case '0': flags |= F_ZERO; continue;
8655 break;
8657 if (c == '*') {
8658 v = getnextarg(args, arglen, &argidx);
8659 if (v == NULL)
8660 goto onError;
8661 if (!PyInt_Check(v)) {
8662 PyErr_SetString(PyExc_TypeError,
8663 "* wants int");
8664 goto onError;
8666 width = PyInt_AsLong(v);
8667 if (width < 0) {
8668 flags |= F_LJUST;
8669 width = -width;
8671 if (--fmtcnt >= 0)
8672 c = *fmt++;
8674 else if (c >= '0' && c <= '9') {
8675 width = c - '0';
8676 while (--fmtcnt >= 0) {
8677 c = *fmt++;
8678 if (c < '0' || c > '9')
8679 break;
8680 if ((width*10) / 10 != width) {
8681 PyErr_SetString(PyExc_ValueError,
8682 "width too big");
8683 goto onError;
8685 width = width*10 + (c - '0');
8688 if (c == '.') {
8689 prec = 0;
8690 if (--fmtcnt >= 0)
8691 c = *fmt++;
8692 if (c == '*') {
8693 v = getnextarg(args, arglen, &argidx);
8694 if (v == NULL)
8695 goto onError;
8696 if (!PyInt_Check(v)) {
8697 PyErr_SetString(PyExc_TypeError,
8698 "* wants int");
8699 goto onError;
8701 prec = PyInt_AsLong(v);
8702 if (prec < 0)
8703 prec = 0;
8704 if (--fmtcnt >= 0)
8705 c = *fmt++;
8707 else if (c >= '0' && c <= '9') {
8708 prec = c - '0';
8709 while (--fmtcnt >= 0) {
8710 c = Py_CHARMASK(*fmt++);
8711 if (c < '0' || c > '9')
8712 break;
8713 if ((prec*10) / 10 != prec) {
8714 PyErr_SetString(PyExc_ValueError,
8715 "prec too big");
8716 goto onError;
8718 prec = prec*10 + (c - '0');
8721 } /* prec */
8722 if (fmtcnt >= 0) {
8723 if (c == 'h' || c == 'l' || c == 'L') {
8724 if (--fmtcnt >= 0)
8725 c = *fmt++;
8728 if (fmtcnt < 0) {
8729 PyErr_SetString(PyExc_ValueError,
8730 "incomplete format");
8731 goto onError;
8733 if (c != '%') {
8734 v = getnextarg(args, arglen, &argidx);
8735 if (v == NULL)
8736 goto onError;
8738 sign = 0;
8739 fill = ' ';
8740 switch (c) {
8742 case '%':
8743 pbuf = formatbuf;
8744 /* presume that buffer length is at least 1 */
8745 pbuf[0] = '%';
8746 len = 1;
8747 break;
8749 case 's':
8750 case 'r':
8751 if (PyUnicode_Check(v) && c == 's') {
8752 temp = v;
8753 Py_INCREF(temp);
8755 else {
8756 PyObject *unicode;
8757 if (c == 's')
8758 temp = PyObject_Unicode(v);
8759 else
8760 temp = PyObject_Repr(v);
8761 if (temp == NULL)
8762 goto onError;
8763 if (PyUnicode_Check(temp))
8764 /* nothing to do */;
8765 else if (PyString_Check(temp)) {
8766 /* convert to string to Unicode */
8767 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8768 PyString_GET_SIZE(temp),
8769 NULL,
8770 "strict");
8771 Py_DECREF(temp);
8772 temp = unicode;
8773 if (temp == NULL)
8774 goto onError;
8776 else {
8777 Py_DECREF(temp);
8778 PyErr_SetString(PyExc_TypeError,
8779 "%s argument has non-string str()");
8780 goto onError;
8783 pbuf = PyUnicode_AS_UNICODE(temp);
8784 len = PyUnicode_GET_SIZE(temp);
8785 if (prec >= 0 && len > prec)
8786 len = prec;
8787 break;
8789 case 'i':
8790 case 'd':
8791 case 'u':
8792 case 'o':
8793 case 'x':
8794 case 'X':
8795 if (c == 'i')
8796 c = 'd';
8797 isnumok = 0;
8798 if (PyNumber_Check(v)) {
8799 PyObject *iobj=NULL;
8801 if (PyInt_Check(v) || (PyLong_Check(v))) {
8802 iobj = v;
8803 Py_INCREF(iobj);
8805 else {
8806 iobj = PyNumber_Int(v);
8807 if (iobj==NULL) iobj = PyNumber_Long(v);
8809 if (iobj!=NULL) {
8810 if (PyInt_Check(iobj)) {
8811 isnumok = 1;
8812 pbuf = formatbuf;
8813 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8814 flags, prec, c, iobj);
8815 Py_DECREF(iobj);
8816 if (len < 0)
8817 goto onError;
8818 sign = 1;
8820 else if (PyLong_Check(iobj)) {
8821 isnumok = 1;
8822 temp = formatlong(iobj, flags, prec, c);
8823 Py_DECREF(iobj);
8824 if (!temp)
8825 goto onError;
8826 pbuf = PyUnicode_AS_UNICODE(temp);
8827 len = PyUnicode_GET_SIZE(temp);
8828 sign = 1;
8830 else {
8831 Py_DECREF(iobj);
8835 if (!isnumok) {
8836 PyErr_Format(PyExc_TypeError,
8837 "%%%c format: a number is required, "
8838 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8839 goto onError;
8841 if (flags & F_ZERO)
8842 fill = '0';
8843 break;
8845 case 'e':
8846 case 'E':
8847 case 'f':
8848 case 'F':
8849 case 'g':
8850 case 'G':
8851 if (c == 'F')
8852 c = 'f';
8853 pbuf = formatbuf;
8854 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8855 flags, prec, c, v);
8856 if (len < 0)
8857 goto onError;
8858 sign = 1;
8859 if (flags & F_ZERO)
8860 fill = '0';
8861 break;
8863 case 'c':
8864 pbuf = formatbuf;
8865 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8866 if (len < 0)
8867 goto onError;
8868 break;
8870 default:
8871 PyErr_Format(PyExc_ValueError,
8872 "unsupported format character '%c' (0x%x) "
8873 "at index %zd",
8874 (31<=c && c<=126) ? (char)c : '?',
8875 (int)c,
8876 (Py_ssize_t)(fmt - 1 -
8877 PyUnicode_AS_UNICODE(uformat)));
8878 goto onError;
8880 if (sign) {
8881 if (*pbuf == '-' || *pbuf == '+') {
8882 sign = *pbuf++;
8883 len--;
8885 else if (flags & F_SIGN)
8886 sign = '+';
8887 else if (flags & F_BLANK)
8888 sign = ' ';
8889 else
8890 sign = 0;
8892 if (width < len)
8893 width = len;
8894 if (rescnt - (sign != 0) < width) {
8895 reslen -= rescnt;
8896 rescnt = width + fmtcnt + 100;
8897 reslen += rescnt;
8898 if (reslen < 0) {
8899 Py_XDECREF(temp);
8900 PyErr_NoMemory();
8901 goto onError;
8903 if (_PyUnicode_Resize(&result, reslen) < 0) {
8904 Py_XDECREF(temp);
8905 goto onError;
8907 res = PyUnicode_AS_UNICODE(result)
8908 + reslen - rescnt;
8910 if (sign) {
8911 if (fill != ' ')
8912 *res++ = sign;
8913 rescnt--;
8914 if (width > len)
8915 width--;
8917 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8918 assert(pbuf[0] == '0');
8919 assert(pbuf[1] == c);
8920 if (fill != ' ') {
8921 *res++ = *pbuf++;
8922 *res++ = *pbuf++;
8924 rescnt -= 2;
8925 width -= 2;
8926 if (width < 0)
8927 width = 0;
8928 len -= 2;
8930 if (width > len && !(flags & F_LJUST)) {
8931 do {
8932 --rescnt;
8933 *res++ = fill;
8934 } while (--width > len);
8936 if (fill == ' ') {
8937 if (sign)
8938 *res++ = sign;
8939 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8940 assert(pbuf[0] == '0');
8941 assert(pbuf[1] == c);
8942 *res++ = *pbuf++;
8943 *res++ = *pbuf++;
8946 Py_UNICODE_COPY(res, pbuf, len);
8947 res += len;
8948 rescnt -= len;
8949 while (--width >= len) {
8950 --rescnt;
8951 *res++ = ' ';
8953 if (dict && (argidx < arglen) && c != '%') {
8954 PyErr_SetString(PyExc_TypeError,
8955 "not all arguments converted during string formatting");
8956 Py_XDECREF(temp);
8957 goto onError;
8959 Py_XDECREF(temp);
8960 } /* '%' */
8961 } /* until end */
8962 if (argidx < arglen && !dict) {
8963 PyErr_SetString(PyExc_TypeError,
8964 "not all arguments converted during string formatting");
8965 goto onError;
8968 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8969 goto onError;
8970 if (args_owned) {
8971 Py_DECREF(args);
8973 Py_DECREF(uformat);
8974 return (PyObject *)result;
8976 onError:
8977 Py_XDECREF(result);
8978 Py_DECREF(uformat);
8979 if (args_owned) {
8980 Py_DECREF(args);
8982 return NULL;
/* Buffer-interface slot table for unicode objects.  PyUnicode_Type refers
   to it through its tp_as_buffer slot.  The initializer is positional, so
   the order of the four entries below is significant. */
8985 static PyBufferProcs unicode_as_buffer = {
8986 (readbufferproc) unicode_buffer_getreadbuf, /* read-buffer slot */
8987 (writebufferproc) unicode_buffer_getwritebuf, /* write-buffer slot */
8988 (segcountproc) unicode_buffer_getsegcount, /* segment-count slot */
8989 (charbufferproc) unicode_buffer_getcharbuf, /* char-buffer slot */
8992 static PyObject *
8993 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8995 static PyObject *
8996 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8998 PyObject *x = NULL;
8999 static char *kwlist[] = {"string", "encoding", "errors", 0};
9000 char *encoding = NULL;
9001 char *errors = NULL;
9003 if (type != &PyUnicode_Type)
9004 return unicode_subtype_new(type, args, kwds);
9005 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
9006 kwlist, &x, &encoding, &errors))
9007 return NULL;
9008 if (x == NULL)
9009 return (PyObject *)_PyUnicode_New(0);
9010 if (encoding == NULL && errors == NULL)
9011 return PyObject_Unicode(x);
9012 else
9013 return PyUnicode_FromEncodedObject(x, encoding, errors);
9016 static PyObject *
9017 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9019 PyUnicodeObject *tmp, *pnew;
9020 Py_ssize_t n;
9022 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9023 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9024 if (tmp == NULL)
9025 return NULL;
9026 assert(PyUnicode_Check(tmp));
9027 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9028 if (pnew == NULL) {
9029 Py_DECREF(tmp);
9030 return NULL;
9032 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9033 if (pnew->str == NULL) {
9034 _Py_ForgetReference((PyObject *)pnew);
9035 PyObject_Del(pnew);
9036 Py_DECREF(tmp);
9037 return PyErr_NoMemory();
9039 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9040 pnew->length = n;
9041 pnew->hash = tmp->hash;
9042 Py_DECREF(tmp);
9043 return (PyObject *)pnew;
/* Docstring for the unicode type; PyUnicode_Type uses it as its tp_doc. */
9046 PyDoc_STRVAR(unicode_doc,
9047 "unicode(string [, encoding[, errors]]) -> object\n\
9049 Create a new Unicode object from the given encoded string.\n\
9050 encoding defaults to the current default string encoding.\n\
9051 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for the built-in "unicode" type.

   The initializer is positional: each entry fills the slot named in the
   comment beside it, so entries must not be reordered.  The type derives
   from PyBaseString_Type (tp_base), can be subclassed
   (Py_TPFLAGS_BASETYPE), creates instances through unicode_new (tp_new)
   and frees them through PyObject_Del (tp_free).  Entries written as 0
   are not set here. */
9053 PyTypeObject PyUnicode_Type = {
9054 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9055 "unicode", /* tp_name */
9056 sizeof(PyUnicodeObject), /* tp_basicsize */
9057 0, /* tp_itemsize */
9058 /* Slots */
9059 (destructor)unicode_dealloc, /* tp_dealloc */
9060 0, /* tp_print */
9061 0, /* tp_getattr */
9062 0, /* tp_setattr */
9063 0, /* tp_compare */
9064 unicode_repr, /* tp_repr */
9065 &unicode_as_number, /* tp_as_number */
9066 &unicode_as_sequence, /* tp_as_sequence */
9067 &unicode_as_mapping, /* tp_as_mapping */
9068 (hashfunc) unicode_hash, /* tp_hash*/
9069 0, /* tp_call*/
9070 (reprfunc) unicode_str, /* tp_str */
9071 PyObject_GenericGetAttr, /* tp_getattro */
9072 0, /* tp_setattro */
9073 &unicode_as_buffer, /* tp_as_buffer */
9074 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
9075 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
9076 unicode_doc, /* tp_doc */
9077 0, /* tp_traverse */
9078 0, /* tp_clear */
9079 PyUnicode_RichCompare, /* tp_richcompare */
9080 0, /* tp_weaklistoffset */
9081 0, /* tp_iter */
9082 0, /* tp_iternext */
9083 unicode_methods, /* tp_methods */
9084 0, /* tp_members */
9085 0, /* tp_getset */
9086 &PyBaseString_Type, /* tp_base */
9087 0, /* tp_dict */
9088 0, /* tp_descr_get */
9089 0, /* tp_descr_set */
9090 0, /* tp_dictoffset */
9091 0, /* tp_init */
9092 0, /* tp_alloc */
9093 unicode_new, /* tp_new */
9094 PyObject_Del, /* tp_free */
9097 /* Initialize the Unicode implementation */
9099 void _PyUnicode_Init(void)
9101 int i;
9103 /* XXX - move this array to unicodectype.c ? */
9104 Py_UNICODE linebreak[] = {
9105 0x000A, /* LINE FEED */
9106 0x000D, /* CARRIAGE RETURN */
9107 0x001C, /* FILE SEPARATOR */
9108 0x001D, /* GROUP SEPARATOR */
9109 0x001E, /* RECORD SEPARATOR */
9110 0x0085, /* NEXT LINE */
9111 0x2028, /* LINE SEPARATOR */
9112 0x2029, /* PARAGRAPH SEPARATOR */
9115 /* Init the implementation */
9116 free_list = NULL;
9117 numfree = 0;
9118 unicode_empty = _PyUnicode_New(0);
9119 if (!unicode_empty)
9120 return;
9122 strcpy(unicode_default_encoding, "ascii");
9123 for (i = 0; i < 256; i++)
9124 unicode_latin1[i] = NULL;
9125 if (PyType_Ready(&PyUnicode_Type) < 0)
9126 Py_FatalError("Can't initialize 'unicode'");
9128 /* initialize the linebreak bloom filter */
9129 bloom_linebreak = make_bloom_mask(
9130 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9133 PyType_Ready(&EncodingMapType);
9136 /* Finalize the Unicode implementation */
9139 PyUnicode_ClearFreeList(void)
9141 int freelist_size = numfree;
9142 PyUnicodeObject *u;
9144 for (u = free_list; u != NULL;) {
9145 PyUnicodeObject *v = u;
9146 u = *(PyUnicodeObject **)u;
9147 if (v->str)
9148 PyObject_DEL(v->str);
9149 Py_XDECREF(v->defenc);
9150 PyObject_Del(v);
9151 numfree--;
9153 free_list = NULL;
9154 assert(numfree == 0);
9155 return freelist_size;
9158 void
9159 _PyUnicode_Fini(void)
9161 int i;
9163 Py_XDECREF(unicode_empty);
9164 unicode_empty = NULL;
9166 for (i = 0; i < 256; i++) {
9167 if (unicode_latin1[i]) {
9168 Py_DECREF(unicode_latin1[i]);
9169 unicode_latin1[i] = NULL;
9172 (void)PyUnicode_ClearFreeList();
9175 #ifdef __cplusplus
9177 #endif
9181 Local variables:
9182 c-basic-offset: 4
9183 indent-tabs-mode: nil
9184 End: