Kill a couple of "<>"
[python.git] / Objects / unicodeobject.c
blob79e824ee7641dcd0106f81952e6a6110a0fe70d6
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
115 /* Fast detection of the most frequent whitespace characters */
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   code point counts as whitespace and 0 otherwise.  Each row covers
   eight consecutive code points, starting at the offset noted on it. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
             0x0B VERTICAL TABULATION, 0x0C FORM FEED,
             0x0D CARRIAGE RETURN */
    /* 0x08 */ 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
             0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    /* 0x18 */ 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20: 0x20 SPACE */
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 0, 0, 0, 0, 0
};
146 /* Same for linebreaks */
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   code point is treated as a line break and 0 otherwise.  Each row
   covers eight consecutive code points, starting at the noted offset. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    /* 0x08 */ 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
             0x1E RECORD SEPARATOR */
    /* 0x18 */ 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 0, 0, 0, 0, 0
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
185 /* --- Bloom Filters ----------------------------------------------------- */
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the lowest 5
   bits of each Unicode character serving as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
/* Type of a bloom bitmask; only bits 0..31 are ever set or tested. */
#define BLOOM_MASK unsigned long

/* Mask covering the line break characters (set up during Unicode
   initialization, see the comment above). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL, not 1: shifting a plain int by 31 moves a
   bit into the sign position (undefined behaviour) and, where it yields
   INT_MIN, sign-extends into the upper bits of a wider unsigned long. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line break test: table lookup for ASCII, otherwise the bloom mask acts
   as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
205 /* calculate simple bloom-style bitmask for a given unicode string */
207 long mask;
208 Py_ssize_t i;
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
214 return mask;
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
219 Py_ssize_t i;
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
225 return 0;
/* Membership test for chr in set[0..setlen-1]: the bloom mask is checked
   first and the exact search only runs when the mask bit is set.
   The expansion is wrapped in parentheses so the macro behaves as a
   single operand (e.g. under '!' or next to '||'). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
233 static
234 int unicode_resize(register PyUnicodeObject *unicode,
235 Py_ssize_t length)
237 void *oldstr;
239 /* Shortcut if there's nothing much to do. */
240 if (unicode->length == length)
241 goto reset;
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
251 PyErr_SetString(PyExc_SystemError,
252 "can't resize shared unicode objects");
253 return -1;
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
259 it contains). */
261 oldstr = unicode->str;
262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
264 if (!unicode->str) {
265 unicode->str = (Py_UNICODE *)oldstr;
266 PyErr_NoMemory();
267 return -1;
269 unicode->str[length] = 0;
270 unicode->length = length;
272 reset:
273 /* Reset the object caches */
274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
278 unicode->hash = -1;
280 return 0;
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
291 static
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
294 register PyUnicodeObject *unicode;
296 /* Optimization for empty strings */
297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
316 unicode_resize(unicode, length) < 0) {
317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
321 else {
322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
325 PyObject_INIT(unicode, &PyUnicode_Type);
327 else {
328 size_t new_size;
329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
330 if (unicode == NULL)
331 return NULL;
332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 if (!unicode->str) {
337 PyErr_NoMemory();
338 goto onError;
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
347 unicode->str[0] = 0;
348 unicode->str[length] = 0;
349 unicode->length = length;
350 unicode->hash = -1;
351 unicode->defenc = NULL;
352 return unicode;
354 onError:
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
357 _Py_ForgetReference((PyObject *)unicode);
358 PyObject_Del(unicode);
359 return NULL;
362 static
363 void unicode_dealloc(register PyUnicodeObject *unicode)
365 if (PyUnicode_CheckExact(unicode) &&
366 numfree < PyUnicode_MAXFREELIST) {
367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
377 /* Add to free list */
378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
382 else {
383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
389 static
390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
392 register PyUnicodeObject *v;
394 /* Argument checks */
395 if (unicode == NULL) {
396 PyErr_BadInternalCall();
397 return -1;
399 v = *unicode;
400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
401 PyErr_BadInternalCall();
402 return -1;
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
408 if (v->length != length &&
409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
431 Py_ssize_t size)
433 PyUnicodeObject *unicode;
435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
467 Py_UNICODE_COPY(unicode->str, u, size);
469 return (PyObject *)unicode;
472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
474 PyUnicodeObject *unicode;
476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
478 "Negative size passed to PyUnicode_FromStringAndSize");
479 return NULL;
482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
509 return PyUnicode_DecodeUTF8(u, size, NULL);
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
516 return (PyObject *)unicode;
519 PyObject *PyUnicode_FromString(const char *u)
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
527 return PyUnicode_FromStringAndSize(u, size);
530 #ifdef HAVE_WCHAR_H
532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533 # define CONVERT_WCHAR_TO_SURROGATES
534 #endif
536 #ifdef CONVERT_WCHAR_TO_SURROGATES
538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
541 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
542 Py_ssize_t size)
544 PyUnicodeObject *unicode;
545 register Py_ssize_t i;
546 Py_ssize_t alloc;
547 const wchar_t *orig_w;
549 if (w == NULL) {
550 PyErr_BadInternalCall();
551 return NULL;
554 alloc = size;
555 orig_w = w;
556 for (i = size; i > 0; i--) {
557 if (*w > 0xFFFF)
558 alloc++;
559 w++;
561 w = orig_w;
562 unicode = _PyUnicode_New(alloc);
563 if (!unicode)
564 return NULL;
566 /* Copy the wchar_t data into the new object */
568 register Py_UNICODE *u;
569 u = PyUnicode_AS_UNICODE(unicode);
570 for (i = size; i > 0; i--) {
571 if (*w > 0xFFFF) {
572 wchar_t ordinal = *w++;
573 ordinal -= 0x10000;
574 *u++ = 0xD800 | (ordinal >> 10);
575 *u++ = 0xDC00 | (ordinal & 0x3FF);
577 else
578 *u++ = *w++;
581 return (PyObject *)unicode;
584 #else
586 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
589 PyUnicodeObject *unicode;
591 if (w == NULL) {
592 PyErr_BadInternalCall();
593 return NULL;
596 unicode = _PyUnicode_New(size);
597 if (!unicode)
598 return NULL;
600 /* Copy the wchar_t data into the new object */
601 #ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode->str, w, size * sizeof(wchar_t));
603 #else
605 register Py_UNICODE *u;
606 register Py_ssize_t i;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--)
609 *u++ = *w++;
611 #endif
613 return (PyObject *)unicode;
616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
618 #undef CONVERT_WCHAR_TO_SURROGATES
620 static void
621 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
623 *fmt++ = '%';
624 if (width) {
625 if (zeropad)
626 *fmt++ = '0';
627 fmt += sprintf(fmt, "%d", width);
629 if (precision)
630 fmt += sprintf(fmt, ".%d", precision);
631 if (longflag)
632 *fmt++ = 'l';
633 else if (size_tflag) {
634 char *f = PY_FORMAT_SIZE_T;
635 while (*f)
636 *fmt++ = *f++;
638 *fmt++ = c;
639 *fmt = '\0';
642 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
644 PyObject *
645 PyUnicode_FromFormatV(const char *format, va_list vargs)
647 va_list count;
648 Py_ssize_t callcount = 0;
649 PyObject **callresults = NULL;
650 PyObject **callresult = NULL;
651 Py_ssize_t n = 0;
652 int width = 0;
653 int precision = 0;
654 int zeropad;
655 const char* f;
656 Py_UNICODE *s;
657 PyObject *string;
658 /* used by sprintf */
659 char buffer[21];
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer = NULL;
663 char *realbuffer;
664 Py_ssize_t abuffersize = 0;
665 char fmt[60]; /* should be enough for %0width.precisionld */
666 const char *copy;
668 #ifdef VA_LIST_IS_ARRAY
669 Py_MEMCPY(count, vargs, sizeof(va_list));
670 #else
671 #ifdef __va_copy
672 __va_copy(count, vargs);
673 #else
674 count = vargs;
675 #endif
676 #endif
677 /* step 1: count the number of %S/%R/%s format specifications
678 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
679 * objects once during step 3 and put the result in an array) */
680 for (f = format; *f; f++) {
681 if (*f == '%') {
682 if (*(f+1)=='%')
683 continue;
684 if (*(f+1)=='S' || *(f+1)=='R')
685 ++callcount;
686 while (isdigit((unsigned)*f))
687 width = (width*10) + *f++ - '0';
688 while (*++f && *f != '%' && !isalpha((unsigned)*f))
690 if (*f == 's')
691 ++callcount;
694 /* step 2: allocate memory for the results of
695 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
696 if (callcount) {
697 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
698 if (!callresults) {
699 PyErr_NoMemory();
700 return NULL;
702 callresult = callresults;
704 /* step 3: figure out how large a buffer we need */
705 for (f = format; *f; f++) {
706 if (*f == '%') {
707 const char* p = f;
708 width = 0;
709 while (isdigit((unsigned)*f))
710 width = (width*10) + *f++ - '0';
711 while (*++f && *f != '%' && !isalpha((unsigned)*f))
714 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
715 * they don't affect the amount of space we reserve.
717 if ((*f == 'l' || *f == 'z') &&
718 (f[1] == 'd' || f[1] == 'u'))
719 ++f;
721 switch (*f) {
722 case 'c':
723 (void)va_arg(count, int);
724 /* fall through... */
725 case '%':
726 n++;
727 break;
728 case 'd': case 'u': case 'i': case 'x':
729 (void) va_arg(count, int);
730 /* 20 bytes is enough to hold a 64-bit
731 integer. Decimal takes the most space.
732 This isn't enough for octal.
733 If a width is specified we need more
734 (which we allocate later). */
735 if (width < 20)
736 width = 20;
737 n += width;
738 if (abuffersize < width)
739 abuffersize = width;
740 break;
741 case 's':
743 /* UTF-8 */
744 const char *s = va_arg(count, const char*);
745 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
746 if (!str)
747 goto fail;
748 n += PyUnicode_GET_SIZE(str);
749 /* Remember the str and switch to the next slot */
750 *callresult++ = str;
751 break;
753 case 'U':
755 PyObject *obj = va_arg(count, PyObject *);
756 assert(obj && PyUnicode_Check(obj));
757 n += PyUnicode_GET_SIZE(obj);
758 break;
760 case 'V':
762 PyObject *obj = va_arg(count, PyObject *);
763 const char *str = va_arg(count, const char *);
764 assert(obj || str);
765 assert(!obj || PyUnicode_Check(obj));
766 if (obj)
767 n += PyUnicode_GET_SIZE(obj);
768 else
769 n += strlen(str);
770 break;
772 case 'S':
774 PyObject *obj = va_arg(count, PyObject *);
775 PyObject *str;
776 assert(obj);
777 str = PyObject_Str(obj);
778 if (!str)
779 goto fail;
780 n += PyUnicode_GET_SIZE(str);
781 /* Remember the str and switch to the next slot */
782 *callresult++ = str;
783 break;
785 case 'R':
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *repr;
789 assert(obj);
790 repr = PyObject_Repr(obj);
791 if (!repr)
792 goto fail;
793 n += PyUnicode_GET_SIZE(repr);
794 /* Remember the repr and switch to the next slot */
795 *callresult++ = repr;
796 break;
798 case 'p':
799 (void) va_arg(count, int);
800 /* maximum 64-bit pointer representation:
801 * 0xffffffffffffffff
802 * so 19 characters is enough.
803 * XXX I count 18 -- what's the extra for?
805 n += 19;
806 break;
807 default:
808 /* if we stumble upon an unknown
809 formatting code, copy the rest of
810 the format string to the output
811 string. (we cannot just skip the
812 code, since there's no way to know
813 what's in the argument list) */
814 n += strlen(p);
815 goto expand;
817 } else
818 n++;
820 expand:
821 if (abuffersize > 20) {
822 abuffer = PyObject_Malloc(abuffersize);
823 if (!abuffer) {
824 PyErr_NoMemory();
825 goto fail;
827 realbuffer = abuffer;
829 else
830 realbuffer = buffer;
831 /* step 4: fill the buffer */
832 /* Since we've analyzed how much space we need for the worst case,
833 we don't have to resize the string.
834 There can be no errors beyond this point. */
835 string = PyUnicode_FromUnicode(NULL, n);
836 if (!string)
837 goto fail;
839 s = PyUnicode_AS_UNICODE(string);
840 callresult = callresults;
842 for (f = format; *f; f++) {
843 if (*f == '%') {
844 const char* p = f++;
845 int longflag = 0;
846 int size_tflag = 0;
847 zeropad = (*f == '0');
848 /* parse the width.precision part */
849 width = 0;
850 while (isdigit((unsigned)*f))
851 width = (width*10) + *f++ - '0';
852 precision = 0;
853 if (*f == '.') {
854 f++;
855 while (isdigit((unsigned)*f))
856 precision = (precision*10) + *f++ - '0';
858 /* handle the long flag, but only for %ld and %lu.
859 others can be added when necessary. */
860 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
861 longflag = 1;
862 ++f;
864 /* handle the size_t flag. */
865 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
866 size_tflag = 1;
867 ++f;
870 switch (*f) {
871 case 'c':
872 *s++ = va_arg(vargs, int);
873 break;
874 case 'd':
875 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
876 if (longflag)
877 sprintf(realbuffer, fmt, va_arg(vargs, long));
878 else if (size_tflag)
879 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
880 else
881 sprintf(realbuffer, fmt, va_arg(vargs, int));
882 appendstring(realbuffer);
883 break;
884 case 'u':
885 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
886 if (longflag)
887 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
888 else if (size_tflag)
889 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
890 else
891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
892 appendstring(realbuffer);
893 break;
894 case 'i':
895 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
896 sprintf(realbuffer, fmt, va_arg(vargs, int));
897 appendstring(realbuffer);
898 break;
899 case 'x':
900 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 's':
906 /* unused, since we already have the result */
907 (void) va_arg(vargs, char *);
908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
909 PyUnicode_GET_SIZE(*callresult));
910 s += PyUnicode_GET_SIZE(*callresult);
911 /* We're done with the unicode()/repr() => forget it */
912 Py_DECREF(*callresult);
913 /* switch to next unicode()/repr() result */
914 ++callresult;
915 break;
917 case 'U':
919 PyObject *obj = va_arg(vargs, PyObject *);
920 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
922 s += size;
923 break;
925 case 'V':
927 PyObject *obj = va_arg(vargs, PyObject *);
928 const char *str = va_arg(vargs, const char *);
929 if (obj) {
930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
933 } else {
934 appendstring(str);
936 break;
938 case 'S':
939 case 'R':
941 Py_UNICODE *ucopy;
942 Py_ssize_t usize;
943 Py_ssize_t upos;
944 /* unused, since we already have the result */
945 (void) va_arg(vargs, PyObject *);
946 ucopy = PyUnicode_AS_UNICODE(*callresult);
947 usize = PyUnicode_GET_SIZE(*callresult);
948 for (upos = 0; upos<usize;)
949 *s++ = ucopy[upos++];
950 /* We're done with the unicode()/repr() => forget it */
951 Py_DECREF(*callresult);
952 /* switch to next unicode()/repr() result */
953 ++callresult;
954 break;
956 case 'p':
957 sprintf(buffer, "%p", va_arg(vargs, void*));
958 /* %p is ill-defined: ensure leading 0x. */
959 if (buffer[1] == 'X')
960 buffer[1] = 'x';
961 else if (buffer[1] != 'x') {
962 memmove(buffer+2, buffer, strlen(buffer)+1);
963 buffer[0] = '0';
964 buffer[1] = 'x';
966 appendstring(buffer);
967 break;
968 case '%':
969 *s++ = '%';
970 break;
971 default:
972 appendstring(p);
973 goto end;
975 } else
976 *s++ = *f;
979 end:
980 if (callresults)
981 PyObject_Free(callresults);
982 if (abuffer)
983 PyObject_Free(abuffer);
984 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
985 return string;
986 fail:
987 if (callresults) {
988 PyObject **callresult2 = callresults;
989 while (callresult2 < callresult) {
990 Py_DECREF(*callresult2);
991 ++callresult2;
993 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 return NULL;
1000 #undef appendstring
1002 PyObject *
1003 PyUnicode_FromFormat(const char *format, ...)
1005 PyObject* ret;
1006 va_list vargs;
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009 va_start(vargs, format);
1010 #else
1011 va_start(vargs);
1012 #endif
1013 ret = PyUnicode_FromFormatV(format, vargs);
1014 va_end(vargs);
1015 return ret;
1018 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1019 wchar_t *w,
1020 Py_ssize_t size)
1022 if (unicode == NULL) {
1023 PyErr_BadInternalCall();
1024 return -1;
1027 /* If possible, try to copy the 0-termination as well */
1028 if (size > PyUnicode_GET_SIZE(unicode))
1029 size = PyUnicode_GET_SIZE(unicode) + 1;
1031 #ifdef HAVE_USABLE_WCHAR_T
1032 memcpy(w, unicode->str, size * sizeof(wchar_t));
1033 #else
1035 register Py_UNICODE *u;
1036 register Py_ssize_t i;
1037 u = PyUnicode_AS_UNICODE(unicode);
1038 for (i = size; i > 0; i--)
1039 *w++ = *u++;
1041 #endif
1043 if (size > PyUnicode_GET_SIZE(unicode))
1044 return PyUnicode_GET_SIZE(unicode);
1045 else
1046 return size;
1049 #endif
1051 PyObject *PyUnicode_FromOrdinal(int ordinal)
1053 Py_UNICODE s[1];
1055 #ifdef Py_UNICODE_WIDE
1056 if (ordinal < 0 || ordinal > 0x10ffff) {
1057 PyErr_SetString(PyExc_ValueError,
1058 "unichr() arg not in range(0x110000) "
1059 "(wide Python build)");
1060 return NULL;
1062 #else
1063 if (ordinal < 0 || ordinal > 0xffff) {
1064 PyErr_SetString(PyExc_ValueError,
1065 "unichr() arg not in range(0x10000) "
1066 "(narrow Python build)");
1067 return NULL;
1069 #endif
1071 s[0] = (Py_UNICODE)ordinal;
1072 return PyUnicode_FromUnicode(s, 1);
1075 PyObject *PyUnicode_FromObject(register PyObject *obj)
1077 /* XXX Perhaps we should make this API an alias of
1078 PyObject_Unicode() instead ?! */
1079 if (PyUnicode_CheckExact(obj)) {
1080 Py_INCREF(obj);
1081 return obj;
1083 if (PyUnicode_Check(obj)) {
1084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087 PyUnicode_GET_SIZE(obj));
1089 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1092 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1093 const char *encoding,
1094 const char *errors)
1096 const char *s = NULL;
1097 Py_ssize_t len;
1098 PyObject *v;
1100 if (obj == NULL) {
1101 PyErr_BadInternalCall();
1102 return NULL;
1105 #if 0
1106 /* For b/w compatibility we also accept Unicode objects provided
1107 that no encodings is given and then redirect to
1108 PyObject_Unicode() which then applies the additional logic for
1109 Unicode subclasses.
1111 NOTE: This API should really only be used for object which
1112 represent *encoded* Unicode !
1115 if (PyUnicode_Check(obj)) {
1116 if (encoding) {
1117 PyErr_SetString(PyExc_TypeError,
1118 "decoding Unicode is not supported");
1119 return NULL;
1121 return PyObject_Unicode(obj);
1123 #else
1124 if (PyUnicode_Check(obj)) {
1125 PyErr_SetString(PyExc_TypeError,
1126 "decoding Unicode is not supported");
1127 return NULL;
1129 #endif
1131 /* Coerce object */
1132 if (PyString_Check(obj)) {
1133 s = PyString_AS_STRING(obj);
1134 len = PyString_GET_SIZE(obj);
1136 else if (PyByteArray_Check(obj)) {
1137 /* Python 2.x specific */
1138 PyErr_Format(PyExc_TypeError,
1139 "decoding bytearray is not supported");
1140 return NULL;
1142 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1143 /* Overwrite the error message with something more useful in
1144 case of a TypeError. */
1145 if (PyErr_ExceptionMatches(PyExc_TypeError))
1146 PyErr_Format(PyExc_TypeError,
1147 "coercing to Unicode: need string or buffer, "
1148 "%.80s found",
1149 Py_TYPE(obj)->tp_name);
1150 goto onError;
1153 /* Convert to Unicode */
1154 if (len == 0) {
1155 Py_INCREF(unicode_empty);
1156 v = (PyObject *)unicode_empty;
1158 else
1159 v = PyUnicode_Decode(s, len, encoding, errors);
1161 return v;
1163 onError:
1164 return NULL;
1167 PyObject *PyUnicode_Decode(const char *s,
1168 Py_ssize_t size,
1169 const char *encoding,
1170 const char *errors)
1172 PyObject *buffer = NULL, *unicode;
1174 if (encoding == NULL)
1175 encoding = PyUnicode_GetDefaultEncoding();
1177 /* Shortcuts for common default encodings */
1178 if (strcmp(encoding, "utf-8") == 0)
1179 return PyUnicode_DecodeUTF8(s, size, errors);
1180 else if (strcmp(encoding, "latin-1") == 0)
1181 return PyUnicode_DecodeLatin1(s, size, errors);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183 else if (strcmp(encoding, "mbcs") == 0)
1184 return PyUnicode_DecodeMBCS(s, size, errors);
1185 #endif
1186 else if (strcmp(encoding, "ascii") == 0)
1187 return PyUnicode_DecodeASCII(s, size, errors);
1189 /* Decode via the codec registry */
1190 buffer = PyBuffer_FromMemory((void *)s, size);
1191 if (buffer == NULL)
1192 goto onError;
1193 unicode = PyCodec_Decode(buffer, encoding, errors);
1194 if (unicode == NULL)
1195 goto onError;
1196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_Format(PyExc_TypeError,
1198 "decoder did not return an unicode object (type=%.400s)",
1199 Py_TYPE(unicode)->tp_name);
1200 Py_DECREF(unicode);
1201 goto onError;
1203 Py_DECREF(buffer);
1204 return unicode;
1206 onError:
1207 Py_XDECREF(buffer);
1208 return NULL;
1211 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212 const char *encoding,
1213 const char *errors)
1215 PyObject *v;
1217 if (!PyUnicode_Check(unicode)) {
1218 PyErr_BadArgument();
1219 goto onError;
1222 if (encoding == NULL)
1223 encoding = PyUnicode_GetDefaultEncoding();
1225 /* Decode via the codec registry */
1226 v = PyCodec_Decode(unicode, encoding, errors);
1227 if (v == NULL)
1228 goto onError;
1229 return v;
1231 onError:
1232 return NULL;
1235 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
1240 PyObject *v, *unicode;
1242 unicode = PyUnicode_FromUnicode(s, size);
1243 if (unicode == NULL)
1244 return NULL;
1245 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246 Py_DECREF(unicode);
1247 return v;
1250 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251 const char *encoding,
1252 const char *errors)
1254 PyObject *v;
1256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 goto onError;
1261 if (encoding == NULL)
1262 encoding = PyUnicode_GetDefaultEncoding();
1264 /* Encode via the codec registry */
1265 v = PyCodec_Encode(unicode, encoding, errors);
1266 if (v == NULL)
1267 goto onError;
1268 return v;
1270 onError:
1271 return NULL;
1274 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275 const char *encoding,
1276 const char *errors)
1278 PyObject *v;
1280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_BadArgument();
1282 goto onError;
1285 if (encoding == NULL)
1286 encoding = PyUnicode_GetDefaultEncoding();
1288 /* Shortcuts for common default encodings */
1289 if (errors == NULL) {
1290 if (strcmp(encoding, "utf-8") == 0)
1291 return PyUnicode_AsUTF8String(unicode);
1292 else if (strcmp(encoding, "latin-1") == 0)
1293 return PyUnicode_AsLatin1String(unicode);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295 else if (strcmp(encoding, "mbcs") == 0)
1296 return PyUnicode_AsMBCSString(unicode);
1297 #endif
1298 else if (strcmp(encoding, "ascii") == 0)
1299 return PyUnicode_AsASCIIString(unicode);
1302 /* Encode via the codec registry */
1303 v = PyCodec_Encode(unicode, encoding, errors);
1304 if (v == NULL)
1305 goto onError;
1306 if (!PyString_Check(v)) {
1307 PyErr_Format(PyExc_TypeError,
1308 "encoder did not return a string object (type=%.400s)",
1309 Py_TYPE(v)->tp_name);
1310 Py_DECREF(v);
1311 goto onError;
1313 return v;
1315 onError:
1316 return NULL;
1319 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1320 const char *errors)
1322 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1324 if (v)
1325 return v;
1326 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327 if (v && errors == NULL)
1328 ((PyUnicodeObject *)unicode)->defenc = v;
1329 return v;
1332 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1334 if (!PyUnicode_Check(unicode)) {
1335 PyErr_BadArgument();
1336 goto onError;
1338 return PyUnicode_AS_UNICODE(unicode);
1340 onError:
1341 return NULL;
1344 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1346 if (!PyUnicode_Check(unicode)) {
1347 PyErr_BadArgument();
1348 goto onError;
1350 return PyUnicode_GET_SIZE(unicode);
1352 onError:
1353 return -1;
1356 const char *PyUnicode_GetDefaultEncoding(void)
1358 return unicode_default_encoding;
1361 int PyUnicode_SetDefaultEncoding(const char *encoding)
1363 PyObject *v;
1365 /* Make sure the encoding is valid. As side effect, this also
1366 loads the encoding into the codec registry cache. */
1367 v = _PyCodec_Lookup(encoding);
1368 if (v == NULL)
1369 goto onError;
1370 Py_DECREF(v);
1371 strncpy(unicode_default_encoding,
1372 encoding,
1373 sizeof(unicode_default_encoding));
1374 return 0;
1376 onError:
1377 return -1;
1380 /* error handling callback helper:
1381 build arguments, call the callback and check the arguments,
1382 if no exception occurred, copy the replacement to the output
1383 and adjust various state variables.
1384 return 0 on success, -1 on error
1387 static
1388 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1389 const char *encoding, const char *reason,
1390 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1394 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1396 PyObject *restuple = NULL;
1397 PyObject *repunicode = NULL;
1398 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399 Py_ssize_t requiredsize;
1400 Py_ssize_t newpos;
1401 Py_UNICODE *repptr;
1402 Py_ssize_t repsize;
1403 int res = -1;
1405 if (*errorHandler == NULL) {
1406 *errorHandler = PyCodec_LookupError(errors);
1407 if (*errorHandler == NULL)
1408 goto onError;
1411 if (*exceptionObject == NULL) {
1412 *exceptionObject = PyUnicodeDecodeError_Create(
1413 encoding, input, insize, *startinpos, *endinpos, reason);
1414 if (*exceptionObject == NULL)
1415 goto onError;
1417 else {
1418 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419 goto onError;
1420 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421 goto onError;
1422 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423 goto onError;
1426 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427 if (restuple == NULL)
1428 goto onError;
1429 if (!PyTuple_Check(restuple)) {
1430 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1431 goto onError;
1433 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1434 goto onError;
1435 if (newpos<0)
1436 newpos = insize+newpos;
1437 if (newpos<0 || newpos>insize) {
1438 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439 goto onError;
1442 /* need more space? (at least enough for what we
1443 have+the replacement+the rest of the string (starting
1444 at the new input position), so we won't have to check space
1445 when there are no errors in the rest of the string) */
1446 repptr = PyUnicode_AS_UNICODE(repunicode);
1447 repsize = PyUnicode_GET_SIZE(repunicode);
1448 requiredsize = *outpos + repsize + insize-newpos;
1449 if (requiredsize > outsize) {
1450 if (requiredsize<2*outsize)
1451 requiredsize = 2*outsize;
1452 if (_PyUnicode_Resize(output, requiredsize) < 0)
1453 goto onError;
1454 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1456 *endinpos = newpos;
1457 *inptr = input + newpos;
1458 Py_UNICODE_COPY(*outptr, repptr, repsize);
1459 *outptr += repsize;
1460 *outpos += repsize;
1461 /* we made it! */
1462 res = 0;
1464 onError:
1465 Py_XDECREF(restuple);
1466 return res;
1469 /* --- UTF-7 Codec -------------------------------------------------------- */
1471 /* See RFC2152 for details. We encode conservatively and decode liberally. */
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * Written with explicit ASCII ranges instead of isalnum(): isalnum() is
 * locale-dependent (a byte in 128..255 may count as alphanumeric) and is
 * undefined for arguments outside the unsigned char range. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is const-qualified.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized so that an expression such as
 * `a || b` can be passed without changing how the macro parses. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1547 PyObject *PyUnicode_DecodeUTF7(const char *s,
1548 Py_ssize_t size,
1549 const char *errors)
1551 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1554 /* The decoder. The only state we preserve is our read position,
1555 * i.e. how many characters we have consumed. So if we end in the
1556 * middle of a shift sequence we have to back off the read position
1557 * and the output to the beginning of the sequence, otherwise we lose
1558 * all the shift state (seen bits, number of bits seen, high
1559 * surrogate). */
1561 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1562 Py_ssize_t size,
1563 const char *errors,
1564 Py_ssize_t *consumed)
1566 const char *starts = s;
1567 Py_ssize_t startinpos;
1568 Py_ssize_t endinpos;
1569 Py_ssize_t outpos;
1570 const char *e;
1571 PyUnicodeObject *unicode;
1572 Py_UNICODE *p;
1573 const char *errmsg = "";
1574 int inShift = 0;
1575 Py_UNICODE *shiftOutStart;
1576 unsigned int base64bits = 0;
1577 unsigned long base64buffer = 0;
1578 Py_UNICODE surrogate = 0;
1579 PyObject *errorHandler = NULL;
1580 PyObject *exc = NULL;
1582 unicode = _PyUnicode_New(size);
1583 if (!unicode)
1584 return NULL;
1585 if (size == 0) {
1586 if (consumed)
1587 *consumed = 0;
1588 return (PyObject *)unicode;
1591 p = unicode->str;
1592 shiftOutStart = p;
1593 e = s + size;
1595 while (s < e) {
1596 Py_UNICODE ch = (unsigned char) *s;
1598 if (inShift) { /* in a base-64 section */
1599 if (IS_BASE64(ch)) { /* consume a base-64 character */
1600 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1601 base64bits += 6;
1602 s++;
1603 if (base64bits >= 16) {
1604 /* we have enough bits for a UTF-16 value */
1605 Py_UNICODE outCh = (Py_UNICODE)
1606 (base64buffer >> (base64bits-16));
1607 base64bits -= 16;
1608 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1609 if (surrogate) {
1610 /* expecting a second surrogate */
1611 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1612 #ifdef Py_UNICODE_WIDE
1613 *p++ = (((surrogate & 0x3FF)<<10)
1614 | (outCh & 0x3FF)) + 0x10000;
1615 #else
1616 *p++ = surrogate;
1617 *p++ = outCh;
1618 #endif
1619 surrogate = 0;
1621 else {
1622 surrogate = 0;
1623 errmsg = "second surrogate missing";
1624 goto utf7Error;
1627 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1628 /* first surrogate */
1629 surrogate = outCh;
1631 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1632 errmsg = "unexpected second surrogate";
1633 goto utf7Error;
1635 else {
1636 *p++ = outCh;
1640 else { /* now leaving a base-64 section */
1641 inShift = 0;
1642 s++;
1643 if (surrogate) {
1644 errmsg = "second surrogate missing at end of shift sequence";
1645 goto utf7Error;
1647 if (base64bits > 0) { /* left-over bits */
1648 if (base64bits >= 6) {
1649 /* We've seen at least one base-64 character */
1650 errmsg = "partial character in shift sequence";
1651 goto utf7Error;
1653 else {
1654 /* Some bits remain; they should be zero */
1655 if (base64buffer != 0) {
1656 errmsg = "non-zero padding bits in shift sequence";
1657 goto utf7Error;
1661 if (ch != '-') {
1662 /* '-' is absorbed; other terminating
1663 characters are preserved */
1664 *p++ = ch;
1668 else if ( ch == '+' ) {
1669 startinpos = s-starts;
1670 s++; /* consume '+' */
1671 if (s < e && *s == '-') { /* '+-' encodes '+' */
1672 s++;
1673 *p++ = '+';
1675 else { /* begin base64-encoded section */
1676 inShift = 1;
1677 shiftOutStart = p;
1678 base64bits = 0;
1681 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1682 *p++ = ch;
1683 s++;
1685 else {
1686 startinpos = s-starts;
1687 s++;
1688 errmsg = "unexpected special character";
1689 goto utf7Error;
1691 continue;
1692 utf7Error:
1693 outpos = p-PyUnicode_AS_UNICODE(unicode);
1694 endinpos = s-starts;
1695 if (unicode_decode_call_errorhandler(
1696 errors, &errorHandler,
1697 "utf7", errmsg,
1698 starts, size, &startinpos, &endinpos, &exc, &s,
1699 &unicode, &outpos, &p))
1700 goto onError;
1703 /* end of string */
1705 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1706 /* if we're in an inconsistent state, that's an error */
1707 if (surrogate ||
1708 (base64bits >= 6) ||
1709 (base64bits > 0 && base64buffer != 0)) {
1710 outpos = p-PyUnicode_AS_UNICODE(unicode);
1711 endinpos = size;
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "utf7", "unterminated shift sequence",
1715 starts, size, &startinpos, &endinpos, &exc, &s,
1716 &unicode, &outpos, &p))
1717 goto onError;
1721 /* return state */
1722 if (consumed) {
1723 if (inShift) {
1724 p = shiftOutStart; /* back off output */
1725 *consumed = startinpos;
1727 else {
1728 *consumed = s-starts;
1732 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1733 goto onError;
1735 Py_XDECREF(errorHandler);
1736 Py_XDECREF(exc);
1737 return (PyObject *)unicode;
1739 onError:
1740 Py_XDECREF(errorHandler);
1741 Py_XDECREF(exc);
1742 Py_DECREF(unicode);
1743 return NULL;
1747 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1748 Py_ssize_t size,
1749 int base64SetO,
1750 int base64WhiteSpace,
1751 const char *errors)
1753 PyObject *v;
1754 /* It might be possible to tighten this worst case */
1755 Py_ssize_t allocated = 8 * size;
1756 int inShift = 0;
1757 Py_ssize_t i = 0;
1758 unsigned int base64bits = 0;
1759 unsigned long base64buffer = 0;
1760 char * out;
1761 char * start;
1763 if (allocated / 8 != size)
1764 return PyErr_NoMemory();
1766 if (size == 0)
1767 return PyString_FromStringAndSize(NULL, 0);
1769 v = PyString_FromStringAndSize(NULL, allocated);
1770 if (v == NULL)
1771 return NULL;
1773 start = out = PyString_AS_STRING(v);
1774 for (;i < size; ++i) {
1775 Py_UNICODE ch = s[i];
1777 if (inShift) {
1778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1779 /* shifting out */
1780 if (base64bits) { /* output remaining bits */
1781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1782 base64buffer = 0;
1783 base64bits = 0;
1785 inShift = 0;
1786 /* Characters not in the BASE64 set implicitly unshift the sequence
1787 so no '-' is required, except if the character is itself a '-' */
1788 if (IS_BASE64(ch) || ch == '-') {
1789 *out++ = '-';
1791 *out++ = (char) ch;
1793 else {
1794 goto encode_char;
1797 else { /* not in a shift sequence */
1798 if (ch == '+') {
1799 *out++ = '+';
1800 *out++ = '-';
1802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1803 *out++ = (char) ch;
1805 else {
1806 *out++ = '+';
1807 inShift = 1;
1808 goto encode_char;
1811 continue;
1812 encode_char:
1813 #ifdef Py_UNICODE_WIDE
1814 if (ch >= 0x10000) {
1815 /* code first surrogate */
1816 base64bits += 16;
1817 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1818 while (base64bits >= 6) {
1819 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1820 base64bits -= 6;
1822 /* prepare second surrogate */
1823 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1825 #endif
1826 base64bits += 16;
1827 base64buffer = (base64buffer << 16) | ch;
1828 while (base64bits >= 6) {
1829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1830 base64bits -= 6;
1833 if (base64bits)
1834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1835 if (inShift)
1836 *out++ = '-';
1838 _PyString_Resize(&v, out - start);
1839 return v;
1842 #undef IS_BASE64
1843 #undef FROM_BASE64
1844 #undef TO_BASE64
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.

   The table is only ever read, so it is const-qualified. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: never valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte lead */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte lead */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four-, five-, six-byte leads, then two illegal bytes */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1872 PyObject *PyUnicode_DecodeUTF8(const char *s,
1873 Py_ssize_t size,
1874 const char *errors)
1876 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1879 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1880 Py_ssize_t size,
1881 const char *errors,
1882 Py_ssize_t *consumed)
1884 const char *starts = s;
1885 int n;
1886 Py_ssize_t startinpos;
1887 Py_ssize_t endinpos;
1888 Py_ssize_t outpos;
1889 const char *e;
1890 PyUnicodeObject *unicode;
1891 Py_UNICODE *p;
1892 const char *errmsg = "";
1893 PyObject *errorHandler = NULL;
1894 PyObject *exc = NULL;
1896 /* Note: size will always be longer than the resulting Unicode
1897 character count */
1898 unicode = _PyUnicode_New(size);
1899 if (!unicode)
1900 return NULL;
1901 if (size == 0) {
1902 if (consumed)
1903 *consumed = 0;
1904 return (PyObject *)unicode;
1907 /* Unpack UTF-8 encoded data */
1908 p = unicode->str;
1909 e = s + size;
1911 while (s < e) {
1912 Py_UCS4 ch = (unsigned char)*s;
1914 if (ch < 0x80) {
1915 *p++ = (Py_UNICODE)ch;
1916 s++;
1917 continue;
1920 n = utf8_code_length[ch];
1922 if (s + n > e) {
1923 if (consumed)
1924 break;
1925 else {
1926 errmsg = "unexpected end of data";
1927 startinpos = s-starts;
1928 endinpos = size;
1929 goto utf8Error;
1933 switch (n) {
1935 case 0:
1936 errmsg = "unexpected code byte";
1937 startinpos = s-starts;
1938 endinpos = startinpos+1;
1939 goto utf8Error;
1941 case 1:
1942 errmsg = "internal error";
1943 startinpos = s-starts;
1944 endinpos = startinpos+1;
1945 goto utf8Error;
1947 case 2:
1948 if ((s[1] & 0xc0) != 0x80) {
1949 errmsg = "invalid data";
1950 startinpos = s-starts;
1951 endinpos = startinpos+2;
1952 goto utf8Error;
1954 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1955 if (ch < 0x80) {
1956 startinpos = s-starts;
1957 endinpos = startinpos+2;
1958 errmsg = "illegal encoding";
1959 goto utf8Error;
1961 else
1962 *p++ = (Py_UNICODE)ch;
1963 break;
1965 case 3:
1966 if ((s[1] & 0xc0) != 0x80 ||
1967 (s[2] & 0xc0) != 0x80) {
1968 errmsg = "invalid data";
1969 startinpos = s-starts;
1970 endinpos = startinpos+3;
1971 goto utf8Error;
1973 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1974 if (ch < 0x0800) {
1975 /* Note: UTF-8 encodings of surrogates are considered
1976 legal UTF-8 sequences;
1978 XXX For wide builds (UCS-4) we should probably try
1979 to recombine the surrogates into a single code
1980 unit.
1982 errmsg = "illegal encoding";
1983 startinpos = s-starts;
1984 endinpos = startinpos+3;
1985 goto utf8Error;
1987 else
1988 *p++ = (Py_UNICODE)ch;
1989 break;
1991 case 4:
1992 if ((s[1] & 0xc0) != 0x80 ||
1993 (s[2] & 0xc0) != 0x80 ||
1994 (s[3] & 0xc0) != 0x80) {
1995 errmsg = "invalid data";
1996 startinpos = s-starts;
1997 endinpos = startinpos+4;
1998 goto utf8Error;
2000 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2001 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2002 /* validate and convert to UTF-16 */
2003 if ((ch < 0x10000) /* minimum value allowed for 4
2004 byte encoding */
2005 || (ch > 0x10ffff)) /* maximum value allowed for
2006 UTF-16 */
2008 errmsg = "illegal encoding";
2009 startinpos = s-starts;
2010 endinpos = startinpos+4;
2011 goto utf8Error;
2013 #ifdef Py_UNICODE_WIDE
2014 *p++ = (Py_UNICODE)ch;
2015 #else
2016 /* compute and append the two surrogates: */
2018 /* translate from 10000..10FFFF to 0..FFFF */
2019 ch -= 0x10000;
2021 /* high surrogate = top 10 bits added to D800 */
2022 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2024 /* low surrogate = bottom 10 bits added to DC00 */
2025 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2026 #endif
2027 break;
2029 default:
2030 /* Other sizes are only needed for UCS-4 */
2031 errmsg = "unsupported Unicode code range";
2032 startinpos = s-starts;
2033 endinpos = startinpos+n;
2034 goto utf8Error;
2036 s += n;
2037 continue;
2039 utf8Error:
2040 outpos = p-PyUnicode_AS_UNICODE(unicode);
2041 if (unicode_decode_call_errorhandler(
2042 errors, &errorHandler,
2043 "utf8", errmsg,
2044 starts, size, &startinpos, &endinpos, &exc, &s,
2045 &unicode, &outpos, &p))
2046 goto onError;
2048 if (consumed)
2049 *consumed = s-starts;
2051 /* Adjust length */
2052 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2053 goto onError;
2055 Py_XDECREF(errorHandler);
2056 Py_XDECREF(exc);
2057 return (PyObject *)unicode;
2059 onError:
2060 Py_XDECREF(errorHandler);
2061 Py_XDECREF(exc);
2062 Py_DECREF(unicode);
2063 return NULL;
2066 /* Allocation strategy: if the string is short, convert into a stack buffer
2067 and allocate exactly as much space needed at the end. Else allocate the
2068 maximum possible needed (4 result bytes per Unicode character), and return
2069 the excess memory at the end.
2071 PyObject *
2072 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2073 Py_ssize_t size,
2074 const char *errors)
2076 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2078 Py_ssize_t i; /* index into s of next input byte */
2079 PyObject *v; /* result string object */
2080 char *p; /* next free byte in output buffer */
2081 Py_ssize_t nallocated; /* number of result bytes allocated */
2082 Py_ssize_t nneeded; /* number of result bytes needed */
2083 char stackbuf[MAX_SHORT_UNICHARS * 4];
2085 assert(s != NULL);
2086 assert(size >= 0);
2088 if (size <= MAX_SHORT_UNICHARS) {
2089 /* Write into the stack buffer; nallocated can't overflow.
2090 * At the end, we'll allocate exactly as much heap space as it
2091 * turns out we need.
2093 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2094 v = NULL; /* will allocate after we're done */
2095 p = stackbuf;
2097 else {
2098 /* Overallocate on the heap, and give the excess back at the end. */
2099 nallocated = size * 4;
2100 if (nallocated / 4 != size) /* overflow! */
2101 return PyErr_NoMemory();
2102 v = PyString_FromStringAndSize(NULL, nallocated);
2103 if (v == NULL)
2104 return NULL;
2105 p = PyString_AS_STRING(v);
2108 for (i = 0; i < size;) {
2109 Py_UCS4 ch = s[i++];
2111 if (ch < 0x80)
2112 /* Encode ASCII */
2113 *p++ = (char) ch;
2115 else if (ch < 0x0800) {
2116 /* Encode Latin-1 */
2117 *p++ = (char)(0xc0 | (ch >> 6));
2118 *p++ = (char)(0x80 | (ch & 0x3f));
2120 else {
2121 /* Encode UCS2 Unicode ordinals */
2122 if (ch < 0x10000) {
2123 /* Special case: check for high surrogate */
2124 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2125 Py_UCS4 ch2 = s[i];
2126 /* Check for low surrogate and combine the two to
2127 form a UCS4 value */
2128 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2129 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2130 i++;
2131 goto encodeUCS4;
2133 /* Fall through: handles isolated high surrogates */
2135 *p++ = (char)(0xe0 | (ch >> 12));
2136 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2137 *p++ = (char)(0x80 | (ch & 0x3f));
2138 continue;
2140 encodeUCS4:
2141 /* Encode UCS4 Unicode ordinals */
2142 *p++ = (char)(0xf0 | (ch >> 18));
2143 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2144 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2145 *p++ = (char)(0x80 | (ch & 0x3f));
2149 if (v == NULL) {
2150 /* This was stack allocated. */
2151 nneeded = p - stackbuf;
2152 assert(nneeded <= nallocated);
2153 v = PyString_FromStringAndSize(stackbuf, nneeded);
2155 else {
2156 /* Cut back to size actually needed. */
2157 nneeded = p - PyString_AS_STRING(v);
2158 assert(nneeded <= nallocated);
2159 _PyString_Resize(&v, nneeded);
2161 return v;
2163 #undef MAX_SHORT_UNICHARS
2166 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2172 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode),
2174 NULL);
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
2179 PyObject *
2180 PyUnicode_DecodeUTF32(const char *s,
2181 Py_ssize_t size,
2182 const char *errors,
2183 int *byteorder)
2185 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188 PyObject *
2189 PyUnicode_DecodeUTF32Stateful(const char *s,
2190 Py_ssize_t size,
2191 const char *errors,
2192 int *byteorder,
2193 Py_ssize_t *consumed)
2195 const char *starts = s;
2196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
2199 PyUnicodeObject *unicode;
2200 Py_UNICODE *p;
2201 #ifndef Py_UNICODE_WIDE
2202 int i, pairs;
2203 #else
2204 const int pairs = 0;
2205 #endif
2206 const unsigned char *q, *e;
2207 int bo = 0; /* assume native ordering by default */
2208 const char *errmsg = "";
2209 /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211 int iorder[] = {0, 1, 2, 3};
2212 #else
2213 int iorder[] = {3, 2, 1, 0};
2214 #endif
2215 PyObject *errorHandler = NULL;
2216 PyObject *exc = NULL;
2217 /* On narrow builds we split characters outside the BMP into two
2218 codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220 for (i = pairs = 0; i < size/4; i++)
2221 if (((Py_UCS4 *)s)[i] >= 0x10000)
2222 pairs++;
2223 #endif
2225 /* This might be one to much, because of a BOM */
2226 unicode = _PyUnicode_New((size+3)/4+pairs);
2227 if (!unicode)
2228 return NULL;
2229 if (size == 0)
2230 return (PyObject *)unicode;
2232 /* Unpack UTF-32 encoded data */
2233 p = unicode->str;
2234 q = (unsigned char *)s;
2235 e = q + size;
2237 if (byteorder)
2238 bo = *byteorder;
2240 /* Check for BOM marks (U+FEFF) in the input and adjust current
2241 byte order setting accordingly. In native mode, the leading BOM
2242 mark is skipped, in all other modes, it is copied to the output
2243 stream as-is (giving a ZWNBSP character). */
2244 if (bo == 0) {
2245 if (size >= 4) {
2246 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2247 (q[iorder[1]] << 8) | q[iorder[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249 if (bom == 0x0000FEFF) {
2250 q += 4;
2251 bo = -1;
2253 else if (bom == 0xFFFE0000) {
2254 q += 4;
2255 bo = 1;
2257 #else
2258 if (bom == 0x0000FEFF) {
2259 q += 4;
2260 bo = 1;
2262 else if (bom == 0xFFFE0000) {
2263 q += 4;
2264 bo = -1;
2266 #endif
2270 if (bo == -1) {
2271 /* force LE */
2272 iorder[0] = 0;
2273 iorder[1] = 1;
2274 iorder[2] = 2;
2275 iorder[3] = 3;
2277 else if (bo == 1) {
2278 /* force BE */
2279 iorder[0] = 3;
2280 iorder[1] = 2;
2281 iorder[2] = 1;
2282 iorder[3] = 0;
2285 while (q < e) {
2286 Py_UCS4 ch;
2287 /* remaining bytes at the end? (size should be divisible by 4) */
2288 if (e-q<4) {
2289 if (consumed)
2290 break;
2291 errmsg = "truncated data";
2292 startinpos = ((const char *)q)-starts;
2293 endinpos = ((const char *)e)-starts;
2294 goto utf32Error;
2295 /* The remaining input chars are ignored if the callback
2296 chooses to skip the input */
2298 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2299 (q[iorder[1]] << 8) | q[iorder[0]];
2301 if (ch >= 0x110000)
2303 errmsg = "codepoint not in range(0x110000)";
2304 startinpos = ((const char *)q)-starts;
2305 endinpos = startinpos+4;
2306 goto utf32Error;
2308 #ifndef Py_UNICODE_WIDE
2309 if (ch >= 0x10000)
2311 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2312 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2314 else
2315 #endif
2316 *p++ = ch;
2317 q += 4;
2318 continue;
2319 utf32Error:
2320 outpos = p-PyUnicode_AS_UNICODE(unicode);
2321 if (unicode_decode_call_errorhandler(
2322 errors, &errorHandler,
2323 "utf32", errmsg,
2324 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2325 &unicode, &outpos, &p))
2326 goto onError;
2329 if (byteorder)
2330 *byteorder = bo;
2332 if (consumed)
2333 *consumed = (const char *)q-starts;
2335 /* Adjust length */
2336 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2337 goto onError;
2339 Py_XDECREF(errorHandler);
2340 Py_XDECREF(exc);
2341 return (PyObject *)unicode;
2343 onError:
2344 Py_DECREF(unicode);
2345 Py_XDECREF(errorHandler);
2346 Py_XDECREF(exc);
2347 return NULL;
2350 PyObject *
2351 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2352 Py_ssize_t size,
2353 const char *errors,
2354 int byteorder)
2356 PyObject *v;
2357 unsigned char *p;
2358 Py_ssize_t nsize, bytesize;
2359 #ifndef Py_UNICODE_WIDE
2360 Py_ssize_t i, pairs;
2361 #else
2362 const int pairs = 0;
2363 #endif
2364 /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder[] = {0, 1, 2, 3};
2367 #else
2368 int iorder[] = {3, 2, 1, 0};
2369 #endif
2371 #define STORECHAR(CH) \
2372 do { \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2377 p += 4; \
2378 } while(0)
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383 for (i = pairs = 0; i < size-1; i++)
2384 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386 pairs++;
2387 #endif
2388 nsize = (size - pairs + (byteorder == 0));
2389 bytesize = nsize * 4;
2390 if (bytesize / 4 != nsize)
2391 return PyErr_NoMemory();
2392 v = PyString_FromStringAndSize(NULL, bytesize);
2393 if (v == NULL)
2394 return NULL;
2396 p = (unsigned char *)PyString_AS_STRING(v);
2397 if (byteorder == 0)
2398 STORECHAR(0xFEFF);
2399 if (size == 0)
2400 return v;
2402 if (byteorder == -1) {
2403 /* force LE */
2404 iorder[0] = 0;
2405 iorder[1] = 1;
2406 iorder[2] = 2;
2407 iorder[3] = 3;
2409 else if (byteorder == 1) {
2410 /* force BE */
2411 iorder[0] = 3;
2412 iorder[1] = 2;
2413 iorder[2] = 1;
2414 iorder[3] = 0;
2417 while (size-- > 0) {
2418 Py_UCS4 ch = *s++;
2419 #ifndef Py_UNICODE_WIDE
2420 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2421 Py_UCS4 ch2 = *s;
2422 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2423 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2424 s++;
2425 size--;
2428 #endif
2429 STORECHAR(ch);
2431 return v;
2432 #undef STORECHAR
2435 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2437 if (!PyUnicode_Check(unicode)) {
2438 PyErr_BadArgument();
2439 return NULL;
2441 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2442 PyUnicode_GET_SIZE(unicode),
2443 NULL,
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
2449 PyObject *
2450 PyUnicode_DecodeUTF16(const char *s,
2451 Py_ssize_t size,
2452 const char *errors,
2453 int *byteorder)
2455 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2458 PyObject *
2459 PyUnicode_DecodeUTF16Stateful(const char *s,
2460 Py_ssize_t size,
2461 const char *errors,
2462 int *byteorder,
2463 Py_ssize_t *consumed)
2465 const char *starts = s;
2466 Py_ssize_t startinpos;
2467 Py_ssize_t endinpos;
2468 Py_ssize_t outpos;
2469 PyUnicodeObject *unicode;
2470 Py_UNICODE *p;
2471 const unsigned char *q, *e;
2472 int bo = 0; /* assume native ordering by default */
2473 const char *errmsg = "";
2474 /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476 int ihi = 1, ilo = 0;
2477 #else
2478 int ihi = 0, ilo = 1;
2479 #endif
2480 PyObject *errorHandler = NULL;
2481 PyObject *exc = NULL;
2483 /* Note: size will always be longer than the resulting Unicode
2484 character count */
2485 unicode = _PyUnicode_New(size);
2486 if (!unicode)
2487 return NULL;
2488 if (size == 0)
2489 return (PyObject *)unicode;
2491 /* Unpack UTF-16 encoded data */
2492 p = unicode->str;
2493 q = (unsigned char *)s;
2494 e = q + size;
2496 if (byteorder)
2497 bo = *byteorder;
2499 /* Check for BOM marks (U+FEFF) in the input and adjust current
2500 byte order setting accordingly. In native mode, the leading BOM
2501 mark is skipped, in all other modes, it is copied to the output
2502 stream as-is (giving a ZWNBSP character). */
2503 if (bo == 0) {
2504 if (size >= 2) {
2505 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507 if (bom == 0xFEFF) {
2508 q += 2;
2509 bo = -1;
2511 else if (bom == 0xFFFE) {
2512 q += 2;
2513 bo = 1;
2515 #else
2516 if (bom == 0xFEFF) {
2517 q += 2;
2518 bo = 1;
2520 else if (bom == 0xFFFE) {
2521 q += 2;
2522 bo = -1;
2524 #endif
2528 if (bo == -1) {
2529 /* force LE */
2530 ihi = 1;
2531 ilo = 0;
2533 else if (bo == 1) {
2534 /* force BE */
2535 ihi = 0;
2536 ilo = 1;
2539 while (q < e) {
2540 Py_UNICODE ch;
2541 /* remaining bytes at the end? (size should be even) */
2542 if (e-q<2) {
2543 if (consumed)
2544 break;
2545 errmsg = "truncated data";
2546 startinpos = ((const char *)q)-starts;
2547 endinpos = ((const char *)e)-starts;
2548 goto utf16Error;
2549 /* The remaining input chars are ignored if the callback
2550 chooses to skip the input */
2552 ch = (q[ihi] << 8) | q[ilo];
2554 q += 2;
2556 if (ch < 0xD800 || ch > 0xDFFF) {
2557 *p++ = ch;
2558 continue;
2561 /* UTF-16 code pair: */
2562 if (q >= e) {
2563 errmsg = "unexpected end of data";
2564 startinpos = (((const char *)q)-2)-starts;
2565 endinpos = ((const char *)e)-starts;
2566 goto utf16Error;
2568 if (0xD800 <= ch && ch <= 0xDBFF) {
2569 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2570 q += 2;
2571 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2573 *p++ = ch;
2574 *p++ = ch2;
2575 #else
2576 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2577 #endif
2578 continue;
2580 else {
2581 errmsg = "illegal UTF-16 surrogate";
2582 startinpos = (((const char *)q)-4)-starts;
2583 endinpos = startinpos+2;
2584 goto utf16Error;
2588 errmsg = "illegal encoding";
2589 startinpos = (((const char *)q)-2)-starts;
2590 endinpos = startinpos+2;
2591 /* Fall through to report the error */
2593 utf16Error:
2594 outpos = p-PyUnicode_AS_UNICODE(unicode);
2595 if (unicode_decode_call_errorhandler(
2596 errors, &errorHandler,
2597 "utf16", errmsg,
2598 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2599 &unicode, &outpos, &p))
2600 goto onError;
2603 if (byteorder)
2604 *byteorder = bo;
2606 if (consumed)
2607 *consumed = (const char *)q-starts;
2609 /* Adjust length */
2610 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2611 goto onError;
2613 Py_XDECREF(errorHandler);
2614 Py_XDECREF(exc);
2615 return (PyObject *)unicode;
2617 onError:
2618 Py_DECREF(unicode);
2619 Py_XDECREF(errorHandler);
2620 Py_XDECREF(exc);
2621 return NULL;
2624 PyObject *
2625 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2626 Py_ssize_t size,
2627 const char *errors,
2628 int byteorder)
2630 PyObject *v;
2631 unsigned char *p;
2632 Py_ssize_t nsize, bytesize;
2633 #ifdef Py_UNICODE_WIDE
2634 Py_ssize_t i, pairs;
2635 #else
2636 const int pairs = 0;
2637 #endif
2638 /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi = 1, ilo = 0;
2641 #else
2642 int ihi = 0, ilo = 1;
2643 #endif
2645 #define STORECHAR(CH) \
2646 do { \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2649 p += 2; \
2650 } while(0)
2652 #ifdef Py_UNICODE_WIDE
2653 for (i = pairs = 0; i < size; i++)
2654 if (s[i] >= 0x10000)
2655 pairs++;
2656 #endif
2657 /* 2 * (size + pairs + (byteorder == 0)) */
2658 if (size > PY_SSIZE_T_MAX ||
2659 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2660 return PyErr_NoMemory();
2661 nsize = size + pairs + (byteorder == 0);
2662 bytesize = nsize * 2;
2663 if (bytesize / 2 != nsize)
2664 return PyErr_NoMemory();
2665 v = PyString_FromStringAndSize(NULL, bytesize);
2666 if (v == NULL)
2667 return NULL;
2669 p = (unsigned char *)PyString_AS_STRING(v);
2670 if (byteorder == 0)
2671 STORECHAR(0xFEFF);
2672 if (size == 0)
2673 return v;
2675 if (byteorder == -1) {
2676 /* force LE */
2677 ihi = 1;
2678 ilo = 0;
2680 else if (byteorder == 1) {
2681 /* force BE */
2682 ihi = 0;
2683 ilo = 1;
2686 while (size-- > 0) {
2687 Py_UNICODE ch = *s++;
2688 Py_UNICODE ch2 = 0;
2689 #ifdef Py_UNICODE_WIDE
2690 if (ch >= 0x10000) {
2691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692 ch = 0xD800 | ((ch-0x10000) >> 10);
2694 #endif
2695 STORECHAR(ch);
2696 if (ch2)
2697 STORECHAR(ch2);
2699 return v;
2700 #undef STORECHAR
2703 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2705 if (!PyUnicode_Check(unicode)) {
2706 PyErr_BadArgument();
2707 return NULL;
2709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2710 PyUnicode_GET_SIZE(unicode),
2711 NULL,
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
2717 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2719 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2720 Py_ssize_t size,
2721 const char *errors)
2723 const char *starts = s;
2724 Py_ssize_t startinpos;
2725 Py_ssize_t endinpos;
2726 Py_ssize_t outpos;
2727 int i;
2728 PyUnicodeObject *v;
2729 Py_UNICODE *p;
2730 const char *end;
2731 char* message;
2732 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2733 PyObject *errorHandler = NULL;
2734 PyObject *exc = NULL;
2736 /* Escaped strings will always be longer than the resulting
2737 Unicode string, so we start with size here and then reduce the
2738 length after conversion to the true value.
2739 (but if the error callback returns a long replacement string
2740 we'll have to allocate more space) */
2741 v = _PyUnicode_New(size);
2742 if (v == NULL)
2743 goto onError;
2744 if (size == 0)
2745 return (PyObject *)v;
2747 p = PyUnicode_AS_UNICODE(v);
2748 end = s + size;
2750 while (s < end) {
2751 unsigned char c;
2752 Py_UNICODE x;
2753 int digits;
2755 /* Non-escape characters are interpreted as Unicode ordinals */
2756 if (*s != '\\') {
2757 *p++ = (unsigned char) *s++;
2758 continue;
2761 startinpos = s-starts;
2762 /* \ - Escapes */
2763 s++;
2764 c = *s++;
2765 if (s > end)
2766 c = '\0'; /* Invalid after \ */
2767 switch (c) {
2769 /* \x escapes */
2770 case '\n': break;
2771 case '\\': *p++ = '\\'; break;
2772 case '\'': *p++ = '\''; break;
2773 case '\"': *p++ = '\"'; break;
2774 case 'b': *p++ = '\b'; break;
2775 case 'f': *p++ = '\014'; break; /* FF */
2776 case 't': *p++ = '\t'; break;
2777 case 'n': *p++ = '\n'; break;
2778 case 'r': *p++ = '\r'; break;
2779 case 'v': *p++ = '\013'; break; /* VT */
2780 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2782 /* \OOO (octal) escapes */
2783 case '0': case '1': case '2': case '3':
2784 case '4': case '5': case '6': case '7':
2785 x = s[-1] - '0';
2786 if (s < end && '0' <= *s && *s <= '7') {
2787 x = (x<<3) + *s++ - '0';
2788 if (s < end && '0' <= *s && *s <= '7')
2789 x = (x<<3) + *s++ - '0';
2791 *p++ = x;
2792 break;
2794 /* hex escapes */
2795 /* \xXX */
2796 case 'x':
2797 digits = 2;
2798 message = "truncated \\xXX escape";
2799 goto hexescape;
2801 /* \uXXXX */
2802 case 'u':
2803 digits = 4;
2804 message = "truncated \\uXXXX escape";
2805 goto hexescape;
2807 /* \UXXXXXXXX */
2808 case 'U':
2809 digits = 8;
2810 message = "truncated \\UXXXXXXXX escape";
2811 hexescape:
2812 chr = 0;
2813 outpos = p-PyUnicode_AS_UNICODE(v);
2814 if (s+digits>end) {
2815 endinpos = size;
2816 if (unicode_decode_call_errorhandler(
2817 errors, &errorHandler,
2818 "unicodeescape", "end of string in escape sequence",
2819 starts, size, &startinpos, &endinpos, &exc, &s,
2820 &v, &outpos, &p))
2821 goto onError;
2822 goto nextByte;
2824 for (i = 0; i < digits; ++i) {
2825 c = (unsigned char) s[i];
2826 if (!isxdigit(c)) {
2827 endinpos = (s+i+1)-starts;
2828 if (unicode_decode_call_errorhandler(
2829 errors, &errorHandler,
2830 "unicodeescape", message,
2831 starts, size, &startinpos, &endinpos, &exc, &s,
2832 &v, &outpos, &p))
2833 goto onError;
2834 goto nextByte;
2836 chr = (chr<<4) & ~0xF;
2837 if (c >= '0' && c <= '9')
2838 chr += c - '0';
2839 else if (c >= 'a' && c <= 'f')
2840 chr += 10 + c - 'a';
2841 else
2842 chr += 10 + c - 'A';
2844 s += i;
2845 if (chr == 0xffffffff && PyErr_Occurred())
2846 /* _decoding_error will have already written into the
2847 target buffer. */
2848 break;
2849 store:
2850 /* when we get here, chr is a 32-bit unicode character */
2851 if (chr <= 0xffff)
2852 /* UCS-2 character */
2853 *p++ = (Py_UNICODE) chr;
2854 else if (chr <= 0x10ffff) {
2855 /* UCS-4 character. Either store directly, or as
2856 surrogate pair. */
2857 #ifdef Py_UNICODE_WIDE
2858 *p++ = chr;
2859 #else
2860 chr -= 0x10000L;
2861 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2862 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2863 #endif
2864 } else {
2865 endinpos = s-starts;
2866 outpos = p-PyUnicode_AS_UNICODE(v);
2867 if (unicode_decode_call_errorhandler(
2868 errors, &errorHandler,
2869 "unicodeescape", "illegal Unicode character",
2870 starts, size, &startinpos, &endinpos, &exc, &s,
2871 &v, &outpos, &p))
2872 goto onError;
2874 break;
2876 /* \N{name} */
2877 case 'N':
2878 message = "malformed \\N character escape";
2879 if (ucnhash_CAPI == NULL) {
2880 /* load the unicode data module */
2881 PyObject *m, *api;
2882 m = PyImport_ImportModuleNoBlock("unicodedata");
2883 if (m == NULL)
2884 goto ucnhashError;
2885 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2886 Py_DECREF(m);
2887 if (api == NULL)
2888 goto ucnhashError;
2889 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2890 Py_DECREF(api);
2891 if (ucnhash_CAPI == NULL)
2892 goto ucnhashError;
2894 if (*s == '{') {
2895 const char *start = s+1;
2896 /* look for the closing brace */
2897 while (*s != '}' && s < end)
2898 s++;
2899 if (s > start && s < end && *s == '}') {
2900 /* found a name. look it up in the unicode database */
2901 message = "unknown Unicode character name";
2902 s++;
2903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904 goto store;
2907 endinpos = s-starts;
2908 outpos = p-PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler,
2911 "unicodeescape", message,
2912 starts, size, &startinpos, &endinpos, &exc, &s,
2913 &v, &outpos, &p))
2914 goto onError;
2915 break;
2917 default:
2918 if (s > end) {
2919 message = "\\ at end of string";
2920 s--;
2921 endinpos = s-starts;
2922 outpos = p-PyUnicode_AS_UNICODE(v);
2923 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler,
2925 "unicodeescape", message,
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 &v, &outpos, &p))
2928 goto onError;
2930 else {
2931 *p++ = '\\';
2932 *p++ = (unsigned char)s[-1];
2934 break;
2936 nextByte:
2939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940 goto onError;
2941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
2943 return (PyObject *)v;
2945 ucnhashError:
2946 PyErr_SetString(
2947 PyExc_UnicodeError,
2948 "\\N escapes not supported (can't load unicodedata module)"
2950 Py_XDECREF(v);
2951 Py_XDECREF(errorHandler);
2952 Py_XDECREF(exc);
2953 return NULL;
2955 onError:
2956 Py_XDECREF(v);
2957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
2959 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970 Py_ssize_t size,
2971 Py_UNICODE ch)
2973 /* like wcschr, but doesn't stop at NULL characters */
2975 while (size-- > 0) {
2976 if (*s == ch)
2977 return s;
2978 s++;
2981 return NULL;
2984 static
2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986 Py_ssize_t size,
2987 int quotes)
2989 PyObject *repr;
2990 char *p;
2992 static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10;
2995 #else
2996 const Py_ssize_t expandsize = 6;
2997 #endif
2999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3003 /* Initial allocation is based on the longest-possible unichr
3004 escape.
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape.
3017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018 return PyErr_NoMemory();
3020 repr = PyString_FromStringAndSize(NULL,
3022 + expandsize*size
3023 + 1);
3024 if (repr == NULL)
3025 return NULL;
3027 p = PyString_AS_STRING(repr);
3029 if (quotes) {
3030 *p++ = 'u';
3031 *p++ = (findchar(s, size, '\'') &&
3032 !findchar(s, size, '"')) ? '"' : '\'';
3034 while (size-- > 0) {
3035 Py_UNICODE ch = *s++;
3037 /* Escape quotes and backslashes */
3038 if ((quotes &&
3039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040 *p++ = '\\';
3041 *p++ = (char) ch;
3042 continue;
3045 #ifdef Py_UNICODE_WIDE
3046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) {
3048 *p++ = '\\';
3049 *p++ = 'U';
3050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057 *p++ = hexdigit[ch & 0x0000000F];
3058 continue;
3060 #else
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2;
3064 Py_UCS4 ucs;
3066 ch2 = *s++;
3067 size--;
3068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\';
3071 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F];
3080 continue;
3082 /* Fall through: isolated surrogates are copied as-is */
3083 s--;
3084 size++;
3086 #endif
3088 /* Map 16-bit characters to '\uxxxx' */
3089 if (ch >= 256) {
3090 *p++ = '\\';
3091 *p++ = 'u';
3092 *p++ = hexdigit[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F];
3098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') {
3100 *p++ = '\\';
3101 *p++ = 't';
3103 else if (ch == '\n') {
3104 *p++ = '\\';
3105 *p++ = 'n';
3107 else if (ch == '\r') {
3108 *p++ = '\\';
3109 *p++ = 'r';
3112 /* Map non-printable US ASCII to '\xhh' */
3113 else if (ch < ' ' || ch >= 0x7F) {
3114 *p++ = '\\';
3115 *p++ = 'x';
3116 *p++ = hexdigit[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F];
3120 /* Copy everything else as-is */
3121 else
3122 *p++ = (char) ch;
3124 if (quotes)
3125 *p++ = PyString_AS_STRING(repr)[1];
3127 *p = '\0';
3128 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3129 return repr;
3132 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3133 Py_ssize_t size)
3135 return unicodeescape_string(s, size, 0);
3138 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 return NULL;
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3145 PyUnicode_GET_SIZE(unicode));
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3150 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3151 Py_ssize_t size,
3152 const char *errors)
3154 const char *starts = s;
3155 Py_ssize_t startinpos;
3156 Py_ssize_t endinpos;
3157 Py_ssize_t outpos;
3158 PyUnicodeObject *v;
3159 Py_UNICODE *p;
3160 const char *end;
3161 const char *bs;
3162 PyObject *errorHandler = NULL;
3163 PyObject *exc = NULL;
3165 /* Escaped strings will always be longer than the resulting
3166 Unicode string, so we start with size here and then reduce the
3167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
3169 v = _PyUnicode_New(size);
3170 if (v == NULL)
3171 goto onError;
3172 if (size == 0)
3173 return (PyObject *)v;
3174 p = PyUnicode_AS_UNICODE(v);
3175 end = s + size;
3176 while (s < end) {
3177 unsigned char c;
3178 Py_UCS4 x;
3179 int i;
3180 int count;
3182 /* Non-escape characters are interpreted as Unicode ordinals */
3183 if (*s != '\\') {
3184 *p++ = (unsigned char)*s++;
3185 continue;
3187 startinpos = s-starts;
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes if odd */
3191 bs = s;
3192 for (;s < end;) {
3193 if (*s != '\\')
3194 break;
3195 *p++ = (unsigned char)*s++;
3197 if (((s - bs) & 1) == 0 ||
3198 s >= end ||
3199 (*s != 'u' && *s != 'U')) {
3200 continue;
3202 p--;
3203 count = *s=='u' ? 4 : 8;
3204 s++;
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207 outpos = p-PyUnicode_AS_UNICODE(v);
3208 for (x = 0, i = 0; i < count; ++i, ++s) {
3209 c = (unsigned char)*s;
3210 if (!isxdigit(c)) {
3211 endinpos = s-starts;
3212 if (unicode_decode_call_errorhandler(
3213 errors, &errorHandler,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts, size, &startinpos, &endinpos, &exc, &s,
3216 &v, &outpos, &p))
3217 goto onError;
3218 goto nextByte;
3220 x = (x<<4) & ~0xF;
3221 if (c >= '0' && c <= '9')
3222 x += c - '0';
3223 else if (c >= 'a' && c <= 'f')
3224 x += 10 + c - 'a';
3225 else
3226 x += 10 + c - 'A';
3228 if (x <= 0xffff)
3229 /* UCS-2 character */
3230 *p++ = (Py_UNICODE) x;
3231 else if (x <= 0x10ffff) {
3232 /* UCS-4 character. Either store directly, or as
3233 surrogate pair. */
3234 #ifdef Py_UNICODE_WIDE
3235 *p++ = (Py_UNICODE) x;
3236 #else
3237 x -= 0x10000L;
3238 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3240 #endif
3241 } else {
3242 endinpos = s-starts;
3243 outpos = p-PyUnicode_AS_UNICODE(v);
3244 if (unicode_decode_call_errorhandler(
3245 errors, &errorHandler,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247 starts, size, &startinpos, &endinpos, &exc, &s,
3248 &v, &outpos, &p))
3249 goto onError;
3251 nextByte:
3254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3255 goto onError;
3256 Py_XDECREF(errorHandler);
3257 Py_XDECREF(exc);
3258 return (PyObject *)v;
3260 onError:
3261 Py_XDECREF(v);
3262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
3264 return NULL;
3267 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3268 Py_ssize_t size)
3270 PyObject *repr;
3271 char *p;
3272 char *q;
3274 static const char *hexdigit = "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276 const Py_ssize_t expandsize = 10;
3277 #else
3278 const Py_ssize_t expandsize = 6;
3279 #endif
3281 if (size > PY_SSIZE_T_MAX / expandsize)
3282 return PyErr_NoMemory();
3284 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3285 if (repr == NULL)
3286 return NULL;
3287 if (size == 0)
3288 return repr;
3290 p = q = PyString_AS_STRING(repr);
3291 while (size-- > 0) {
3292 Py_UNICODE ch = *s++;
3293 #ifdef Py_UNICODE_WIDE
3294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch >= 0x10000) {
3296 *p++ = '\\';
3297 *p++ = 'U';
3298 *p++ = hexdigit[(ch >> 28) & 0xf];
3299 *p++ = hexdigit[(ch >> 24) & 0xf];
3300 *p++ = hexdigit[(ch >> 20) & 0xf];
3301 *p++ = hexdigit[(ch >> 16) & 0xf];
3302 *p++ = hexdigit[(ch >> 12) & 0xf];
3303 *p++ = hexdigit[(ch >> 8) & 0xf];
3304 *p++ = hexdigit[(ch >> 4) & 0xf];
3305 *p++ = hexdigit[ch & 15];
3307 else
3308 #else
3309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch >= 0xD800 && ch < 0xDC00) {
3311 Py_UNICODE ch2;
3312 Py_UCS4 ucs;
3314 ch2 = *s++;
3315 size--;
3316 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318 *p++ = '\\';
3319 *p++ = 'U';
3320 *p++ = hexdigit[(ucs >> 28) & 0xf];
3321 *p++ = hexdigit[(ucs >> 24) & 0xf];
3322 *p++ = hexdigit[(ucs >> 20) & 0xf];
3323 *p++ = hexdigit[(ucs >> 16) & 0xf];
3324 *p++ = hexdigit[(ucs >> 12) & 0xf];
3325 *p++ = hexdigit[(ucs >> 8) & 0xf];
3326 *p++ = hexdigit[(ucs >> 4) & 0xf];
3327 *p++ = hexdigit[ucs & 0xf];
3328 continue;
3330 /* Fall through: isolated surrogates are copied as-is */
3331 s--;
3332 size++;
3334 #endif
3335 /* Map 16-bit characters to '\uxxxx' */
3336 if (ch >= 256) {
3337 *p++ = '\\';
3338 *p++ = 'u';
3339 *p++ = hexdigit[(ch >> 12) & 0xf];
3340 *p++ = hexdigit[(ch >> 8) & 0xf];
3341 *p++ = hexdigit[(ch >> 4) & 0xf];
3342 *p++ = hexdigit[ch & 15];
3344 /* Copy everything else as-is */
3345 else
3346 *p++ = (char) ch;
3348 *p = '\0';
3349 _PyString_Resize(&repr, p - q);
3350 return repr;
3353 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3355 if (!PyUnicode_Check(unicode)) {
3356 PyErr_BadArgument();
3357 return NULL;
3359 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3360 PyUnicode_GET_SIZE(unicode));
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3365 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3366 Py_ssize_t size,
3367 const char *errors)
3369 const char *starts = s;
3370 Py_ssize_t startinpos;
3371 Py_ssize_t endinpos;
3372 Py_ssize_t outpos;
3373 PyUnicodeObject *v;
3374 Py_UNICODE *p;
3375 const char *end;
3376 const char *reason;
3377 PyObject *errorHandler = NULL;
3378 PyObject *exc = NULL;
3380 #ifdef Py_UNICODE_WIDE
3381 Py_UNICODE unimax = PyUnicode_GetMax();
3382 #endif
3384 /* XXX overflow detection missing */
3385 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3386 if (v == NULL)
3387 goto onError;
3388 if (PyUnicode_GetSize((PyObject *)v) == 0)
3389 return (PyObject *)v;
3390 p = PyUnicode_AS_UNICODE(v);
3391 end = s + size;
3393 while (s < end) {
3394 memcpy(p, s, sizeof(Py_UNICODE));
3395 /* We have to sanity check the raw data, otherwise doom looms for
3396 some malformed UCS-4 data. */
3397 if (
3398 #ifdef Py_UNICODE_WIDE
3399 *p > unimax || *p < 0 ||
3400 #endif
3401 end-s < Py_UNICODE_SIZE
3404 startinpos = s - starts;
3405 if (end-s < Py_UNICODE_SIZE) {
3406 endinpos = end-starts;
3407 reason = "truncated input";
3409 else {
3410 endinpos = s - starts + Py_UNICODE_SIZE;
3411 reason = "illegal code point (> 0x10FFFF)";
3413 outpos = p - PyUnicode_AS_UNICODE(v);
3414 if (unicode_decode_call_errorhandler(
3415 errors, &errorHandler,
3416 "unicode_internal", reason,
3417 starts, size, &startinpos, &endinpos, &exc, &s,
3418 &v, &outpos, &p)) {
3419 goto onError;
3422 else {
3423 p++;
3424 s += Py_UNICODE_SIZE;
3428 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3429 goto onError;
3430 Py_XDECREF(errorHandler);
3431 Py_XDECREF(exc);
3432 return (PyObject *)v;
3434 onError:
3435 Py_XDECREF(v);
3436 Py_XDECREF(errorHandler);
3437 Py_XDECREF(exc);
3438 return NULL;
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3443 PyObject *PyUnicode_DecodeLatin1(const char *s,
3444 Py_ssize_t size,
3445 const char *errors)
3447 PyUnicodeObject *v;
3448 Py_UNICODE *p;
3450 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3451 if (size == 1) {
3452 Py_UNICODE r = *(unsigned char*)s;
3453 return PyUnicode_FromUnicode(&r, 1);
3456 v = _PyUnicode_New(size);
3457 if (v == NULL)
3458 goto onError;
3459 if (size == 0)
3460 return (PyObject *)v;
3461 p = PyUnicode_AS_UNICODE(v);
3462 while (size-- > 0)
3463 *p++ = (unsigned char)*s++;
3464 return (PyObject *)v;
3466 onError:
3467 Py_XDECREF(v);
3468 return NULL;
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject **exceptionObject,
3473 const char *encoding,
3474 const Py_UNICODE *unicode, Py_ssize_t size,
3475 Py_ssize_t startpos, Py_ssize_t endpos,
3476 const char *reason)
3478 if (*exceptionObject == NULL) {
3479 *exceptionObject = PyUnicodeEncodeError_Create(
3480 encoding, unicode, size, startpos, endpos, reason);
3482 else {
3483 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3484 goto onError;
3485 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3486 goto onError;
3487 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3488 goto onError;
3489 return;
3490 onError:
3491 Py_DECREF(*exceptionObject);
3492 *exceptionObject = NULL;
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject **exceptionObject,
3498 const char *encoding,
3499 const Py_UNICODE *unicode, Py_ssize_t size,
3500 Py_ssize_t startpos, Py_ssize_t endpos,
3501 const char *reason)
3503 make_encode_exception(exceptionObject,
3504 encoding, unicode, size, startpos, endpos, reason);
3505 if (*exceptionObject != NULL)
3506 PyCodec_StrictErrors(*exceptionObject);
3509 /* error handling callback helper:
3510 build arguments, call the callback and check the arguments,
3511 put the result into newpos and return the replacement string, which
3512 has to be freed by the caller */
3513 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3514 PyObject **errorHandler,
3515 const char *encoding, const char *reason,
3516 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3517 Py_ssize_t startpos, Py_ssize_t endpos,
3518 Py_ssize_t *newpos)
3520 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3522 PyObject *restuple;
3523 PyObject *resunicode;
3525 if (*errorHandler == NULL) {
3526 *errorHandler = PyCodec_LookupError(errors);
3527 if (*errorHandler == NULL)
3528 return NULL;
3531 make_encode_exception(exceptionObject,
3532 encoding, unicode, size, startpos, endpos, reason);
3533 if (*exceptionObject == NULL)
3534 return NULL;
3536 restuple = PyObject_CallFunctionObjArgs(
3537 *errorHandler, *exceptionObject, NULL);
3538 if (restuple == NULL)
3539 return NULL;
3540 if (!PyTuple_Check(restuple)) {
3541 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3542 Py_DECREF(restuple);
3543 return NULL;
3545 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3546 &resunicode, newpos)) {
3547 Py_DECREF(restuple);
3548 return NULL;
3550 if (*newpos<0)
3551 *newpos = size+*newpos;
3552 if (*newpos<0 || *newpos>size) {
3553 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3554 Py_DECREF(restuple);
3555 return NULL;
3557 Py_INCREF(resunicode);
3558 Py_DECREF(restuple);
3559 return resunicode;
3562 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3563 Py_ssize_t size,
3564 const char *errors,
3565 int limit)
3567 /* output object */
3568 PyObject *res;
3569 /* pointers to the beginning and end+1 of input */
3570 const Py_UNICODE *startp = p;
3571 const Py_UNICODE *endp = p + size;
3572 /* pointer to the beginning of the unencodable characters */
3573 /* const Py_UNICODE *badp = NULL; */
3574 /* pointer into the output */
3575 char *str;
3576 /* current output position */
3577 Py_ssize_t respos = 0;
3578 Py_ssize_t ressize;
3579 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3580 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581 PyObject *errorHandler = NULL;
3582 PyObject *exc = NULL;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585 int known_errorHandler = -1;
3587 /* allocate enough for a simple encoding without
3588 replacements, if we need more, we'll resize */
3589 res = PyString_FromStringAndSize(NULL, size);
3590 if (res == NULL)
3591 goto onError;
3592 if (size == 0)
3593 return res;
3594 str = PyString_AS_STRING(res);
3595 ressize = size;
3597 while (p<endp) {
3598 Py_UNICODE c = *p;
3600 /* can we encode this? */
3601 if (c<limit) {
3602 /* no overflow check, because we know that the space is enough */
3603 *str++ = (char)c;
3604 ++p;
3606 else {
3607 Py_ssize_t unicodepos = p-startp;
3608 Py_ssize_t requiredsize;
3609 PyObject *repunicode;
3610 Py_ssize_t repsize;
3611 Py_ssize_t newpos;
3612 Py_ssize_t respos;
3613 Py_UNICODE *uni2;
3614 /* startpos for collecting unencodable chars */
3615 const Py_UNICODE *collstart = p;
3616 const Py_UNICODE *collend = p;
3617 /* find all unecodable characters */
3618 while ((collend < endp) && ((*collend)>=limit))
3619 ++collend;
3620 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621 if (known_errorHandler==-1) {
3622 if ((errors==NULL) || (!strcmp(errors, "strict")))
3623 known_errorHandler = 1;
3624 else if (!strcmp(errors, "replace"))
3625 known_errorHandler = 2;
3626 else if (!strcmp(errors, "ignore"))
3627 known_errorHandler = 3;
3628 else if (!strcmp(errors, "xmlcharrefreplace"))
3629 known_errorHandler = 4;
3630 else
3631 known_errorHandler = 0;
3633 switch (known_errorHandler) {
3634 case 1: /* strict */
3635 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3636 goto onError;
3637 case 2: /* replace */
3638 while (collstart++<collend)
3639 *str++ = '?'; /* fall through */
3640 case 3: /* ignore */
3641 p = collend;
3642 break;
3643 case 4: /* xmlcharrefreplace */
3644 respos = str-PyString_AS_STRING(res);
3645 /* determine replacement size (temporarily (mis)uses p) */
3646 for (p = collstart, repsize = 0; p < collend; ++p) {
3647 if (*p<10)
3648 repsize += 2+1+1;
3649 else if (*p<100)
3650 repsize += 2+2+1;
3651 else if (*p<1000)
3652 repsize += 2+3+1;
3653 else if (*p<10000)
3654 repsize += 2+4+1;
3655 #ifndef Py_UNICODE_WIDE
3656 else
3657 repsize += 2+5+1;
3658 #else
3659 else if (*p<100000)
3660 repsize += 2+5+1;
3661 else if (*p<1000000)
3662 repsize += 2+6+1;
3663 else
3664 repsize += 2+7+1;
3665 #endif
3667 requiredsize = respos+repsize+(endp-collend);
3668 if (requiredsize > ressize) {
3669 if (requiredsize<2*ressize)
3670 requiredsize = 2*ressize;
3671 if (_PyString_Resize(&res, requiredsize))
3672 goto onError;
3673 str = PyString_AS_STRING(res) + respos;
3674 ressize = requiredsize;
3676 /* generate replacement (temporarily (mis)uses p) */
3677 for (p = collstart; p < collend; ++p) {
3678 str += sprintf(str, "&#%d;", (int)*p);
3680 p = collend;
3681 break;
3682 default:
3683 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3684 encoding, reason, startp, size, &exc,
3685 collstart-startp, collend-startp, &newpos);
3686 if (repunicode == NULL)
3687 goto onError;
3688 /* need more space? (at least enough for what we have+the
3689 replacement+the rest of the string, so we won't have to
3690 check space for encodable characters) */
3691 respos = str-PyString_AS_STRING(res);
3692 repsize = PyUnicode_GET_SIZE(repunicode);
3693 requiredsize = respos+repsize+(endp-collend);
3694 if (requiredsize > ressize) {
3695 if (requiredsize<2*ressize)
3696 requiredsize = 2*ressize;
3697 if (_PyString_Resize(&res, requiredsize)) {
3698 Py_DECREF(repunicode);
3699 goto onError;
3701 str = PyString_AS_STRING(res) + respos;
3702 ressize = requiredsize;
3704 /* check if there is anything unencodable in the replacement
3705 and copy it to the output */
3706 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3707 c = *uni2;
3708 if (c >= limit) {
3709 raise_encode_exception(&exc, encoding, startp, size,
3710 unicodepos, unicodepos+1, reason);
3711 Py_DECREF(repunicode);
3712 goto onError;
3714 *str = (char)c;
3716 p = startp + newpos;
3717 Py_DECREF(repunicode);
3721 /* Resize if we allocated to much */
3722 respos = str-PyString_AS_STRING(res);
3723 if (respos<ressize)
3724 /* If this falls res will be NULL */
3725 _PyString_Resize(&res, respos);
3726 Py_XDECREF(errorHandler);
3727 Py_XDECREF(exc);
3728 return res;
3730 onError:
3731 Py_XDECREF(res);
3732 Py_XDECREF(errorHandler);
3733 Py_XDECREF(exc);
3734 return NULL;
3737 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3738 Py_ssize_t size,
3739 const char *errors)
3741 return unicode_encode_ucs1(p, size, errors, 256);
3744 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3746 if (!PyUnicode_Check(unicode)) {
3747 PyErr_BadArgument();
3748 return NULL;
3750 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3751 PyUnicode_GET_SIZE(unicode),
3752 NULL);
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3757 PyObject *PyUnicode_DecodeASCII(const char *s,
3758 Py_ssize_t size,
3759 const char *errors)
3761 const char *starts = s;
3762 PyUnicodeObject *v;
3763 Py_UNICODE *p;
3764 Py_ssize_t startinpos;
3765 Py_ssize_t endinpos;
3766 Py_ssize_t outpos;
3767 const char *e;
3768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
3771 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772 if (size == 1 && *(unsigned char*)s < 128) {
3773 Py_UNICODE r = *(unsigned char*)s;
3774 return PyUnicode_FromUnicode(&r, 1);
3777 v = _PyUnicode_New(size);
3778 if (v == NULL)
3779 goto onError;
3780 if (size == 0)
3781 return (PyObject *)v;
3782 p = PyUnicode_AS_UNICODE(v);
3783 e = s + size;
3784 while (s < e) {
3785 register unsigned char c = (unsigned char)*s;
3786 if (c < 128) {
3787 *p++ = c;
3788 ++s;
3790 else {
3791 startinpos = s-starts;
3792 endinpos = startinpos + 1;
3793 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3794 if (unicode_decode_call_errorhandler(
3795 errors, &errorHandler,
3796 "ascii", "ordinal not in range(128)",
3797 starts, size, &startinpos, &endinpos, &exc, &s,
3798 &v, &outpos, &p))
3799 goto onError;
3802 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3803 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3804 goto onError;
3805 Py_XDECREF(errorHandler);
3806 Py_XDECREF(exc);
3807 return (PyObject *)v;
3809 onError:
3810 Py_XDECREF(v);
3811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
3813 return NULL;
3816 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3817 Py_ssize_t size,
3818 const char *errors)
3820 return unicode_encode_ucs1(p, size, errors, 128);
3823 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3825 if (!PyUnicode_Check(unicode)) {
3826 PyErr_BadArgument();
3827 return NULL;
3829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3830 PyUnicode_GET_SIZE(unicode),
3831 NULL);
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
3838 #if SIZEOF_INT < SIZEOF_SIZE_T
3839 #define NEED_RETRY
3840 #endif
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843 a) it assumes an incomplete character consists of a single byte, and
3844 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845 encodings, see IsDBCSLeadByteEx documentation. */
/* Return 1 when the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  The byte must satisfy IsDBCSLeadByte(); in addition
   either CharPrev() could not step back from it, or the character before
   it does not start with a lead byte, or that character starts exactly
   two bytes earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (candidate - before) == 2;
}
3859 * Decode MBCS string into unicode object. If 'final' is set, converts
3860 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3862 static int decode_mbcs(PyUnicodeObject **v,
3863 const char *s, /* MBCS string */
3864 int size, /* sizeof MBCS string */
3865 int final)
3867 Py_UNICODE *p;
3868 Py_ssize_t n = 0;
3869 int usize = 0;
3871 assert(size >= 0);
3873 /* Skip trailing lead-byte unless 'final' is set */
3874 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3875 --size;
3877 /* First get the size of the result */
3878 if (size > 0) {
3879 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3880 if (usize == 0) {
3881 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882 return -1;
3886 if (*v == NULL) {
3887 /* Create unicode object */
3888 *v = _PyUnicode_New(usize);
3889 if (*v == NULL)
3890 return -1;
3892 else {
3893 /* Extend unicode object */
3894 n = PyUnicode_GET_SIZE(*v);
3895 if (_PyUnicode_Resize(v, n + usize) < 0)
3896 return -1;
3899 /* Do the conversion */
3900 if (size > 0) {
3901 p = PyUnicode_AS_UNICODE(*v) + n;
3902 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3903 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3904 return -1;
3908 return size;
3911 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3912 Py_ssize_t size,
3913 const char *errors,
3914 Py_ssize_t *consumed)
3916 PyUnicodeObject *v = NULL;
3917 int done;
3919 if (consumed)
3920 *consumed = 0;
3922 #ifdef NEED_RETRY
3923 retry:
3924 if (size > INT_MAX)
3925 done = decode_mbcs(&v, s, INT_MAX, 0);
3926 else
3927 #endif
3928 done = decode_mbcs(&v, s, (int)size, !consumed);
3930 if (done < 0) {
3931 Py_XDECREF(v);
3932 return NULL;
3935 if (consumed)
3936 *consumed += done;
3938 #ifdef NEED_RETRY
3939 if (size > INT_MAX) {
3940 s += done;
3941 size -= done;
3942 goto retry;
3944 #endif
3946 return (PyObject *)v;
3949 PyObject *PyUnicode_DecodeMBCS(const char *s,
3950 Py_ssize_t size,
3951 const char *errors)
3953 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3957 * Convert unicode into string object (MBCS).
3958 * Returns 0 if succeed, -1 otherwise.
3960 static int encode_mbcs(PyObject **repr,
3961 const Py_UNICODE *p, /* unicode */
3962 int size) /* size of unicode */
3964 int mbcssize = 0;
3965 Py_ssize_t n = 0;
3967 assert(size >= 0);
3969 /* First get the size of the result */
3970 if (size > 0) {
3971 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3972 if (mbcssize == 0) {
3973 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3974 return -1;
3978 if (*repr == NULL) {
3979 /* Create string object */
3980 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3981 if (*repr == NULL)
3982 return -1;
3984 else {
3985 /* Extend string object */
3986 n = PyString_Size(*repr);
3987 if (_PyString_Resize(repr, n + mbcssize) < 0)
3988 return -1;
3991 /* Do the conversion */
3992 if (size > 0) {
3993 char *s = PyString_AS_STRING(*repr) + n;
3994 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3995 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3996 return -1;
4000 return 0;
4003 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4004 Py_ssize_t size,
4005 const char *errors)
4007 PyObject *repr = NULL;
4008 int ret;
4010 #ifdef NEED_RETRY
4011 retry:
4012 if (size > INT_MAX)
4013 ret = encode_mbcs(&repr, p, INT_MAX);
4014 else
4015 #endif
4016 ret = encode_mbcs(&repr, p, (int)size);
4018 if (ret < 0) {
4019 Py_XDECREF(repr);
4020 return NULL;
4023 #ifdef NEED_RETRY
4024 if (size > INT_MAX) {
4025 p += INT_MAX;
4026 size -= INT_MAX;
4027 goto retry;
4029 #endif
4031 return repr;
4034 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4036 if (!PyUnicode_Check(unicode)) {
4037 PyErr_BadArgument();
4038 return NULL;
4040 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4041 PyUnicode_GET_SIZE(unicode),
4042 NULL);
4045 #undef NEED_RETRY
4047 #endif /* MS_WINDOWS */
4049 /* --- Character Mapping Codec -------------------------------------------- */
/* Decode `size` bytes at `s` through a character mapping.

   - mapping == NULL: delegates to PyUnicode_DecodeLatin1().
   - mapping is exactly a unicode object: each input byte indexes into it;
     a byte beyond its length, or an entry equal to 0xfffe, counts as
     undefined and is passed to the decode error handler.
   - any other mapping: mapping[byte] is fetched with PyObject_GetItem().
     A LookupError is treated like None (undefined).  An int must lie in
     0..65535 and is stored as one character; a unicode value may hold
     zero, one or several characters; any other type raises TypeError.

   Returns a new unicode object trimmed to the characters written, or NULL
   with an exception set. */
4051 PyObject *PyUnicode_DecodeCharmap(const char *s,
4052 Py_ssize_t size,
4053 PyObject *mapping,
4054 const char *errors)
4056 const char *starts = s;
4057 Py_ssize_t startinpos;
4058 Py_ssize_t endinpos;
4059 Py_ssize_t outpos;
4060 const char *e;
4061 PyUnicodeObject *v;
4062 Py_UNICODE *p;
/* spare output slots left over from the last 1-n resize */
4063 Py_ssize_t extrachars = 0;
4064 PyObject *errorHandler = NULL;
4065 PyObject *exc = NULL;
4066 Py_UNICODE *mapstring = NULL;
4067 Py_ssize_t maplen = 0;
4069 /* Default to Latin-1 */
4070 if (mapping == NULL)
4071 return PyUnicode_DecodeLatin1(s, size, errors);
4073 v = _PyUnicode_New(size);
4074 if (v == NULL)
4075 goto onError;
4076 if (size == 0)
4077 return (PyObject *)v;
4078 p = PyUnicode_AS_UNICODE(v);
4079 e = s + size;
/* Fast path: the mapping is a plain unicode string used as a table */
4080 if (PyUnicode_CheckExact(mapping)) {
4081 mapstring = PyUnicode_AS_UNICODE(mapping);
4082 maplen = PyUnicode_GET_SIZE(mapping);
4083 while (s < e) {
4084 unsigned char ch = *s;
4085 Py_UNICODE x = 0xfffe; /* illegal value */
4087 if (ch < maplen)
4088 x = mapstring[ch];
4090 if (x == 0xfffe) {
4091 /* undefined mapping */
4092 outpos = p-PyUnicode_AS_UNICODE(v);
4093 startinpos = s-starts;
4094 endinpos = startinpos+1;
4095 if (unicode_decode_call_errorhandler(
4096 errors, &errorHandler,
4097 "charmap", "character maps to <undefined>",
4098 starts, size, &startinpos, &endinpos, &exc, &s,
4099 &v, &outpos, &p)) {
4100 goto onError;
/* s is not advanced here; the handler was given &s */
4102 continue;
4104 *p++ = x;
4105 ++s;
/* General path: look every byte up through the mapping protocol */
4108 else {
4109 while (s < e) {
4110 unsigned char ch = *s;
4111 PyObject *w, *x;
4113 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114 w = PyInt_FromLong((long)ch);
4115 if (w == NULL)
4116 goto onError;
4117 x = PyObject_GetItem(mapping, w);
4118 Py_DECREF(w);
4119 if (x == NULL) {
4120 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4121 /* No mapping found means: mapping is undefined. */
4122 PyErr_Clear();
4123 x = Py_None;
4124 Py_INCREF(x);
4125 } else
4126 goto onError;
4129 /* Apply mapping */
4130 if (PyInt_Check(x)) {
4131 long value = PyInt_AS_LONG(x);
4132 if (value < 0 || value > 65535) {
4133 PyErr_SetString(PyExc_TypeError,
4134 "character mapping must be in range(65536)");
4135 Py_DECREF(x);
4136 goto onError;
4138 *p++ = (Py_UNICODE)value;
4140 else if (x == Py_None) {
4141 /* undefined mapping */
4142 outpos = p-PyUnicode_AS_UNICODE(v);
4143 startinpos = s-starts;
4144 endinpos = startinpos+1;
4145 if (unicode_decode_call_errorhandler(
4146 errors, &errorHandler,
4147 "charmap", "character maps to <undefined>",
4148 starts, size, &startinpos, &endinpos, &exc, &s,
4149 &v, &outpos, &p)) {
4150 Py_DECREF(x);
4151 goto onError;
4153 Py_DECREF(x);
4154 continue;
4156 else if (PyUnicode_Check(x)) {
4157 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4159 if (targetsize == 1)
4160 /* 1-1 mapping */
4161 *p++ = *PyUnicode_AS_UNICODE(x);
4163 else if (targetsize > 1) {
4164 /* 1-n mapping */
4165 if (targetsize > extrachars) {
4166 /* resize first */
4167 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4168 Py_ssize_t needed = (targetsize - extrachars) + \
4169 (targetsize << 2);
4170 extrachars += needed;
4171 /* XXX overflow detection missing */
4172 if (_PyUnicode_Resize(&v,
4173 PyUnicode_GET_SIZE(v) + needed) < 0) {
4174 Py_DECREF(x);
4175 goto onError;
/* the buffer may have moved; rebuild the output cursor */
4177 p = PyUnicode_AS_UNICODE(v) + oldpos;
4179 Py_UNICODE_COPY(p,
4180 PyUnicode_AS_UNICODE(x),
4181 targetsize);
4182 p += targetsize;
4183 extrachars -= targetsize;
4185 /* 1-0 mapping: skip the character */
4187 else {
4188 /* wrong return value */
4189 PyErr_SetString(PyExc_TypeError,
4190 "character mapping must return integer, None or unicode");
4191 Py_DECREF(x);
4192 goto onError;
4194 Py_DECREF(x);
4195 ++s;
/* Trim the result down to the characters actually written */
4198 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4199 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4200 goto onError;
4201 Py_XDECREF(errorHandler);
4202 Py_XDECREF(exc);
4203 return (PyObject *)v;
4205 onError:
4206 Py_XDECREF(errorHandler);
4207 Py_XDECREF(exc);
4208 Py_XDECREF(v);
4209 return NULL;
4212 /* Charmap encoding: the lookup table */
4214 struct encoding_map{
4215 PyObject_HEAD
4216 unsigned char level1[32];
4217 int count2, count3;
4218 unsigned char level23[1];
4221 static PyObject*
4222 encoding_map_size(PyObject *obj, PyObject* args)
4224 struct encoding_map *map = (struct encoding_map*)obj;
4225 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4226 128*map->count3);
4229 static PyMethodDef encoding_map_methods[] = {
4230 {"size", encoding_map_size, METH_NOARGS,
4231 PyDoc_STR("Return the size (in bytes) of this object") },
4232 { 0 }
4235 static void
4236 encoding_map_dealloc(PyObject* o)
4238 PyObject_FREE(o);
/* Type object for struct encoding_map instances.  Apart from the name,
   the basic size and the default flags, only two slots are populated:
   tp_dealloc (encoding_map_dealloc) and tp_methods (encoding_map_methods).
   Every other slot is 0. */
4241 static PyTypeObject EncodingMapType = {
4242 PyVarObject_HEAD_INIT(NULL, 0)
4243 "EncodingMap", /*tp_name*/
4244 sizeof(struct encoding_map), /*tp_basicsize*/
4245 0, /*tp_itemsize*/
4246 /* methods */
4247 encoding_map_dealloc, /*tp_dealloc*/
4248 0, /*tp_print*/
4249 0, /*tp_getattr*/
4250 0, /*tp_setattr*/
4251 0, /*tp_compare*/
4252 0, /*tp_repr*/
4253 0, /*tp_as_number*/
4254 0, /*tp_as_sequence*/
4255 0, /*tp_as_mapping*/
4256 0, /*tp_hash*/
4257 0, /*tp_call*/
4258 0, /*tp_str*/
4259 0, /*tp_getattro*/
4260 0, /*tp_setattro*/
4261 0, /*tp_as_buffer*/
4262 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4263 0, /*tp_doc*/
4264 0, /*tp_traverse*/
4265 0, /*tp_clear*/
4266 0, /*tp_richcompare*/
4267 0, /*tp_weaklistoffset*/
4268 0, /*tp_iter*/
4269 0, /*tp_iternext*/
4270 encoding_map_methods, /*tp_methods*/
4271 0, /*tp_members*/
4272 0, /*tp_getset*/
4273 0, /*tp_base*/
4274 0, /*tp_dict*/
4275 0, /*tp_descr_get*/
4276 0, /*tp_descr_set*/
4277 0, /*tp_dictoffset*/
4278 0, /*tp_init*/
4279 0, /*tp_alloc*/
4280 0, /*tp_new*/
4281 0, /*tp_free*/
4282 0, /*tp_is_gc*/
/* Build an encoding lookup object from a 256-character decoding table.

   `string` must be a unicode object of length 256 (otherwise a
   bad-argument error is raised); string[i] is the character that byte i
   decodes to, and 0xFFFE marks an unmapped byte.

   Two result shapes are possible:
   - a dict mapping every table entry's ordinal to its byte value, used
     when string[0] is not 0, when a later entry is 0 or (wide builds)
     above 0xFFFF, or when 0xFF or more level-2/level-3 blocks would be
     needed;
   - otherwise an EncodingMap object holding a three-level table.

   Returns NULL with an exception set on failure. */
4285 PyObject*
4286 PyUnicode_BuildEncodingMap(PyObject* string)
4288 Py_UNICODE *decode;
4289 PyObject *result;
4290 struct encoding_map *mresult;
4291 int i;
4292 int need_dict = 0;
4293 unsigned char level1[32];
4294 unsigned char level2[512];
4295 unsigned char *mlevel1, *mlevel2, *mlevel3;
4296 int count2 = 0, count3 = 0;
4298 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4299 PyErr_BadArgument();
4300 return NULL;
4302 decode = PyUnicode_AS_UNICODE(string);
/* 0xFF marks a table slot that has no block assigned yet */
4303 memset(level1, 0xFF, sizeof level1);
4304 memset(level2, 0xFF, sizeof level2);
4306 /* If there isn't a one-to-one mapping of NULL to \0,
4307 or if there are non-BMP characters, we need to use
4308 a mapping dictionary. */
4309 if (decode[0] != 0)
4310 need_dict = 1;
/* First pass: count how many level-2 and level-3 blocks are needed */
4311 for (i = 1; i < 256; i++) {
4312 int l1, l2;
4313 if (decode[i] == 0
4314 #ifdef Py_UNICODE_WIDE
4315 || decode[i] > 0xFFFF
4316 #endif
4318 need_dict = 1;
4319 break;
4321 if (decode[i] == 0xFFFE)
4322 /* unmapped character */
4323 continue;
4324 l1 = decode[i] >> 11;
4325 l2 = decode[i] >> 7;
4326 if (level1[l1] == 0xFF)
4327 level1[l1] = count2++;
4328 if (level2[l2] == 0xFF)
4329 level2[l2] = count3++;
/* block numbers are stored in one byte and 0xFF is reserved */
4332 if (count2 >= 0xFF || count3 >= 0xFF)
4333 need_dict = 1;
4335 if (need_dict) {
4336 PyObject *result = PyDict_New();
4337 PyObject *key, *value;
4338 if (!result)
4339 return NULL;
4340 for (i = 0; i < 256; i++) {
4341 key = value = NULL;
4342 key = PyInt_FromLong(decode[i]);
4343 value = PyInt_FromLong(i);
4344 if (!key || !value)
4345 goto failed1;
4346 if (PyDict_SetItem(result, key, value) == -1)
4347 goto failed1;
4348 Py_DECREF(key);
4349 Py_DECREF(value);
4351 return result;
4352 failed1:
4353 Py_XDECREF(key);
4354 Py_XDECREF(value);
4355 Py_DECREF(result);
4356 return NULL;
4359 /* Create a three-level trie */
4360 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4361 16*count2 + 128*count3 - 1);
4362 if (!result)
4363 return PyErr_NoMemory();
4364 PyObject_Init(result, &EncodingMapType);
4365 mresult = (struct encoding_map*)result;
4366 mresult->count2 = count2;
4367 mresult->count3 = count3;
4368 mlevel1 = mresult->level1;
4369 mlevel2 = mresult->level23;
4370 mlevel3 = mresult->level23 + 16*count2;
4371 memcpy(mlevel1, level1, 32);
4372 memset(mlevel2, 0xFF, 16*count2);
4373 memset(mlevel3, 0, 128*count3);
/* Second pass: level-3 block numbers are handed out again from zero */
4374 count3 = 0;
4375 for (i = 1; i < 256; i++) {
4376 int o1, o2, o3, i2, i3;
4377 if (decode[i] == 0xFFFE)
4378 /* unmapped character */
4379 continue;
4380 o1 = decode[i]>>11;
4381 o2 = (decode[i]>>7) & 0xF;
4382 i2 = 16*mlevel1[o1] + o2;
4383 if (mlevel2[i2] == 0xFF)
4384 mlevel2[i2] = count3++;
4385 o3 = decode[i] & 0x7F;
4386 i3 = 128*mlevel2[i2] + o3;
4387 mlevel3[i3] = i;
4389 return result;
4392 static int
4393 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4395 struct encoding_map *map = (struct encoding_map*)mapping;
4396 int l1 = c>>11;
4397 int l2 = (c>>7) & 0xF;
4398 int l3 = c & 0x7F;
4399 int i;
4401 #ifdef Py_UNICODE_WIDE
4402 if (c > 0xFFFF) {
4403 return -1;
4405 #endif
4406 if (c == 0)
4407 return 0;
4408 /* level 1*/
4409 i = map->level1[l1];
4410 if (i == 0xFF) {
4411 return -1;
4413 /* level 2*/
4414 i = map->level23[16*i+l2];
4415 if (i == 0xFF) {
4416 return -1;
4418 /* level 3 */
4419 i = map->level23[16*map->count2 + 128*i + l3];
4420 if (i == 0) {
4421 return -1;
4423 return i;
4426 /* Lookup the character ch in the mapping. If the character
4427 can't be found, Py_None is returned (or NULL, if another
4428 error occurred). */
4429 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4431 PyObject *w = PyInt_FromLong((long)c);
4432 PyObject *x;
4434 if (w == NULL)
4435 return NULL;
4436 x = PyObject_GetItem(mapping, w);
4437 Py_DECREF(w);
4438 if (x == NULL) {
4439 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4440 /* No mapping found means: mapping is undefined. */
4441 PyErr_Clear();
4442 x = Py_None;
4443 Py_INCREF(x);
4444 return x;
4445 } else
4446 return NULL;
4448 else if (x == Py_None)
4449 return x;
4450 else if (PyInt_Check(x)) {
4451 long value = PyInt_AS_LONG(x);
4452 if (value < 0 || value > 255) {
4453 PyErr_SetString(PyExc_TypeError,
4454 "character mapping must be in range(256)");
4455 Py_DECREF(x);
4456 return NULL;
4458 return x;
4460 else if (PyString_Check(x))
4461 return x;
4462 else {
4463 /* wrong return value */
4464 PyErr_SetString(PyExc_TypeError,
4465 "character mapping must return integer, None or str");
4466 Py_DECREF(x);
4467 return NULL;
4471 static int
4472 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4474 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4475 /* exponentially overallocate to minimize reallocations */
4476 if (requiredsize < 2*outsize)
4477 requiredsize = 2*outsize;
4478 if (_PyString_Resize(outobj, requiredsize)) {
4479 return 0;
4481 return 1;
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or reallocation failed */
} charmapencode_result;
4487 /* lookup the character, put the result in the output string and adjust
4488 various state variables. Reallocate the output string if not enough
4489 space is available. Return a new reference to the object that
4490 was put in the output buffer, or Py_None, if the mapping was undefined
4491 (in which case no character was written) or NULL, if a
4492 reallocation error occurred. The caller must decref the result */
4493 static
4494 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4495 PyObject **outobj, Py_ssize_t *outpos)
4497 PyObject *rep;
4498 char *outstart;
4499 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4501 if (Py_TYPE(mapping) == &EncodingMapType) {
4502 int res = encoding_map_lookup(c, mapping);
4503 Py_ssize_t requiredsize = *outpos+1;
4504 if (res == -1)
4505 return enc_FAILED;
4506 if (outsize<requiredsize)
4507 if (!charmapencode_resize(outobj, outpos, requiredsize))
4508 return enc_EXCEPTION;
4509 outstart = PyString_AS_STRING(*outobj);
4510 outstart[(*outpos)++] = (char)res;
4511 return enc_SUCCESS;
4514 rep = charmapencode_lookup(c, mapping);
4515 if (rep==NULL)
4516 return enc_EXCEPTION;
4517 else if (rep==Py_None) {
4518 Py_DECREF(rep);
4519 return enc_FAILED;
4520 } else {
4521 if (PyInt_Check(rep)) {
4522 Py_ssize_t requiredsize = *outpos+1;
4523 if (outsize<requiredsize)
4524 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4525 Py_DECREF(rep);
4526 return enc_EXCEPTION;
4528 outstart = PyString_AS_STRING(*outobj);
4529 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4531 else {
4532 const char *repchars = PyString_AS_STRING(rep);
4533 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4534 Py_ssize_t requiredsize = *outpos+repsize;
4535 if (outsize<requiredsize)
4536 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4537 Py_DECREF(rep);
4538 return enc_EXCEPTION;
4540 outstart = PyString_AS_STRING(*outobj);
4541 memcpy(outstart + *outpos, repchars, repsize);
4542 *outpos += repsize;
4545 Py_DECREF(rep);
4546 return enc_SUCCESS;
4549 /* handle an error in PyUnicode_EncodeCharmap
4550 Return 0 on success, -1 on error */
4551 static
4552 int charmap_encoding_error(
4553 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4554 PyObject **exceptionObject,
4555 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4556 PyObject **res, Py_ssize_t *respos)
4558 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4559 Py_ssize_t repsize;
4560 Py_ssize_t newpos;
4561 Py_UNICODE *uni2;
4562 /* startpos for collecting unencodable chars */
4563 Py_ssize_t collstartpos = *inpos;
4564 Py_ssize_t collendpos = *inpos+1;
4565 Py_ssize_t collpos;
4566 char *encoding = "charmap";
4567 char *reason = "character maps to <undefined>";
4568 charmapencode_result x;
4570 /* find all unencodable characters */
4571 while (collendpos < size) {
4572 PyObject *rep;
4573 if (Py_TYPE(mapping) == &EncodingMapType) {
4574 int res = encoding_map_lookup(p[collendpos], mapping);
4575 if (res != -1)
4576 break;
4577 ++collendpos;
4578 continue;
4581 rep = charmapencode_lookup(p[collendpos], mapping);
4582 if (rep==NULL)
4583 return -1;
4584 else if (rep!=Py_None) {
4585 Py_DECREF(rep);
4586 break;
4588 Py_DECREF(rep);
4589 ++collendpos;
4591 /* cache callback name lookup
4592 * (if not done yet, i.e. it's the first error) */
4593 if (*known_errorHandler==-1) {
4594 if ((errors==NULL) || (!strcmp(errors, "strict")))
4595 *known_errorHandler = 1;
4596 else if (!strcmp(errors, "replace"))
4597 *known_errorHandler = 2;
4598 else if (!strcmp(errors, "ignore"))
4599 *known_errorHandler = 3;
4600 else if (!strcmp(errors, "xmlcharrefreplace"))
4601 *known_errorHandler = 4;
4602 else
4603 *known_errorHandler = 0;
4605 switch (*known_errorHandler) {
4606 case 1: /* strict */
4607 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4608 return -1;
4609 case 2: /* replace */
4610 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4611 x = charmapencode_output('?', mapping, res, respos);
4612 if (x==enc_EXCEPTION) {
4613 return -1;
4615 else if (x==enc_FAILED) {
4616 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617 return -1;
4620 /* fall through */
4621 case 3: /* ignore */
4622 *inpos = collendpos;
4623 break;
4624 case 4: /* xmlcharrefreplace */
4625 /* generate replacement (temporarily (mis)uses p) */
4626 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4627 char buffer[2+29+1+1];
4628 char *cp;
4629 sprintf(buffer, "&#%d;", (int)p[collpos]);
4630 for (cp = buffer; *cp; ++cp) {
4631 x = charmapencode_output(*cp, mapping, res, respos);
4632 if (x==enc_EXCEPTION)
4633 return -1;
4634 else if (x==enc_FAILED) {
4635 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4636 return -1;
4640 *inpos = collendpos;
4641 break;
4642 default:
4643 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4644 encoding, reason, p, size, exceptionObject,
4645 collstartpos, collendpos, &newpos);
4646 if (repunicode == NULL)
4647 return -1;
4648 /* generate replacement */
4649 repsize = PyUnicode_GET_SIZE(repunicode);
4650 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4651 x = charmapencode_output(*uni2, mapping, res, respos);
4652 if (x==enc_EXCEPTION) {
4653 return -1;
4655 else if (x==enc_FAILED) {
4656 Py_DECREF(repunicode);
4657 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658 return -1;
4661 *inpos = newpos;
4662 Py_DECREF(repunicode);
4664 return 0;
4667 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4668 Py_ssize_t size,
4669 PyObject *mapping,
4670 const char *errors)
4672 /* output object */
4673 PyObject *res = NULL;
4674 /* current input position */
4675 Py_ssize_t inpos = 0;
4676 /* current output position */
4677 Py_ssize_t respos = 0;
4678 PyObject *errorHandler = NULL;
4679 PyObject *exc = NULL;
4680 /* the following variable is used for caching string comparisons
4681 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682 * 3=ignore, 4=xmlcharrefreplace */
4683 int known_errorHandler = -1;
4685 /* Default to Latin-1 */
4686 if (mapping == NULL)
4687 return PyUnicode_EncodeLatin1(p, size, errors);
4689 /* allocate enough for a simple encoding without
4690 replacements, if we need more, we'll resize */
4691 res = PyString_FromStringAndSize(NULL, size);
4692 if (res == NULL)
4693 goto onError;
4694 if (size == 0)
4695 return res;
4697 while (inpos<size) {
4698 /* try to encode it */
4699 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4700 if (x==enc_EXCEPTION) /* error */
4701 goto onError;
4702 if (x==enc_FAILED) { /* unencodable character */
4703 if (charmap_encoding_error(p, size, &inpos, mapping,
4704 &exc,
4705 &known_errorHandler, &errorHandler, errors,
4706 &res, &respos)) {
4707 goto onError;
4710 else
4711 /* done with this character => adjust input position */
4712 ++inpos;
4715 /* Resize if we allocated to much */
4716 if (respos<PyString_GET_SIZE(res)) {
4717 if (_PyString_Resize(&res, respos))
4718 goto onError;
4720 Py_XDECREF(exc);
4721 Py_XDECREF(errorHandler);
4722 return res;
4724 onError:
4725 Py_XDECREF(res);
4726 Py_XDECREF(exc);
4727 Py_XDECREF(errorHandler);
4728 return NULL;
4731 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4732 PyObject *mapping)
4734 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4735 PyErr_BadArgument();
4736 return NULL;
4738 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4739 PyUnicode_GET_SIZE(unicode),
4740 mapping,
4741 NULL);
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject **exceptionObject,
4746 const Py_UNICODE *unicode, Py_ssize_t size,
4747 Py_ssize_t startpos, Py_ssize_t endpos,
4748 const char *reason)
4750 if (*exceptionObject == NULL) {
4751 *exceptionObject = PyUnicodeTranslateError_Create(
4752 unicode, size, startpos, endpos, reason);
4754 else {
4755 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4756 goto onError;
4757 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4758 goto onError;
4759 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4760 goto onError;
4761 return;
4762 onError:
4763 Py_DECREF(*exceptionObject);
4764 *exceptionObject = NULL;
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject **exceptionObject,
4770 const Py_UNICODE *unicode, Py_ssize_t size,
4771 Py_ssize_t startpos, Py_ssize_t endpos,
4772 const char *reason)
4774 make_translate_exception(exceptionObject,
4775 unicode, size, startpos, endpos, reason);
4776 if (*exceptionObject != NULL)
4777 PyCodec_StrictErrors(*exceptionObject);
4780 /* error handling callback helper:
4781 build arguments, call the callback and check the arguments,
4782 put the result into newpos and return the replacement string, which
4783 has to be freed by the caller */
4784 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4785 PyObject **errorHandler,
4786 const char *reason,
4787 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4788 Py_ssize_t startpos, Py_ssize_t endpos,
4789 Py_ssize_t *newpos)
4791 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4793 Py_ssize_t i_newpos;
4794 PyObject *restuple;
4795 PyObject *resunicode;
4797 if (*errorHandler == NULL) {
4798 *errorHandler = PyCodec_LookupError(errors);
4799 if (*errorHandler == NULL)
4800 return NULL;
4803 make_translate_exception(exceptionObject,
4804 unicode, size, startpos, endpos, reason);
4805 if (*exceptionObject == NULL)
4806 return NULL;
4808 restuple = PyObject_CallFunctionObjArgs(
4809 *errorHandler, *exceptionObject, NULL);
4810 if (restuple == NULL)
4811 return NULL;
4812 if (!PyTuple_Check(restuple)) {
4813 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4814 Py_DECREF(restuple);
4815 return NULL;
4817 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4818 &resunicode, &i_newpos)) {
4819 Py_DECREF(restuple);
4820 return NULL;
4822 if (i_newpos<0)
4823 *newpos = size+i_newpos;
4824 else
4825 *newpos = i_newpos;
4826 if (*newpos<0 || *newpos>size) {
4827 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4828 Py_DECREF(restuple);
4829 return NULL;
4831 Py_INCREF(resunicode);
4832 Py_DECREF(restuple);
4833 return resunicode;
4836 /* Lookup the character ch in the mapping and put the result in result,
4837 which must be decrefed by the caller.
4838 Return 0 on success, -1 on error */
4839 static
4840 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4842 PyObject *w = PyInt_FromLong((long)c);
4843 PyObject *x;
4845 if (w == NULL)
4846 return -1;
4847 x = PyObject_GetItem(mapping, w);
4848 Py_DECREF(w);
4849 if (x == NULL) {
4850 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4851 /* No mapping found means: use 1:1 mapping. */
4852 PyErr_Clear();
4853 *result = NULL;
4854 return 0;
4855 } else
4856 return -1;
4858 else if (x == Py_None) {
4859 *result = x;
4860 return 0;
4862 else if (PyInt_Check(x)) {
4863 long value = PyInt_AS_LONG(x);
4864 long max = PyUnicode_GetMax();
4865 if (value < 0 || value > max) {
4866 PyErr_Format(PyExc_TypeError,
4867 "character mapping must be in range(0x%lx)", max+1);
4868 Py_DECREF(x);
4869 return -1;
4871 *result = x;
4872 return 0;
4874 else if (PyUnicode_Check(x)) {
4875 *result = x;
4876 return 0;
4878 else {
4879 /* wrong return value */
4880 PyErr_SetString(PyExc_TypeError,
4881 "character mapping must return integer, None or unicode");
4882 Py_DECREF(x);
4883 return -1;
4886 /* ensure that *outobj is at least requiredsize characters long,
4887 if not reallocate and adjust various state variables.
4888 Return 0 on success, -1 on error */
4889 static
4890 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4891 Py_ssize_t requiredsize)
4893 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4894 if (requiredsize > oldsize) {
4895 /* remember old output position */
4896 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4897 /* exponentially overallocate to minimize reallocations */
4898 if (requiredsize < 2 * oldsize)
4899 requiredsize = 2 * oldsize;
4900 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4901 return -1;
4902 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4904 return 0;
4906 /* lookup the character, put the result in the output string and adjust
4907 various state variables. Return a new reference to the object that
4908 was put in the output buffer in *result, or Py_None, if the mapping was
4909 undefined (in which case no character was written).
4910 The called must decref result.
4911 Return 0 on success, -1 on error. */
4912 static
4913 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4914 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4915 PyObject **res)
4917 if (charmaptranslate_lookup(*curinp, mapping, res))
4918 return -1;
4919 if (*res==NULL) {
4920 /* not found => default to 1:1 mapping */
4921 *(*outp)++ = *curinp;
4923 else if (*res==Py_None)
4925 else if (PyInt_Check(*res)) {
4926 /* no overflow check, because we know that the space is enough */
4927 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4929 else if (PyUnicode_Check(*res)) {
4930 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4931 if (repsize==1) {
4932 /* no overflow check, because we know that the space is enough */
4933 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4935 else if (repsize!=0) {
4936 /* more than one character */
4937 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4938 (insize - (curinp-startinp)) +
4939 repsize - 1;
4940 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4941 return -1;
4942 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4943 *outp += repsize;
4946 else
4947 return -1;
4948 return 0;
4951 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4952 Py_ssize_t size,
4953 PyObject *mapping,
4954 const char *errors)
4956 /* output object */
4957 PyObject *res = NULL;
4958 /* pointers to the beginning and end+1 of input */
4959 const Py_UNICODE *startp = p;
4960 const Py_UNICODE *endp = p + size;
4961 /* pointer into the output */
4962 Py_UNICODE *str;
4963 /* current output position */
4964 Py_ssize_t respos = 0;
4965 char *reason = "character maps to <undefined>";
4966 PyObject *errorHandler = NULL;
4967 PyObject *exc = NULL;
4968 /* the following variable is used for caching string comparisons
4969 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970 * 3=ignore, 4=xmlcharrefreplace */
4971 int known_errorHandler = -1;
4973 if (mapping == NULL) {
4974 PyErr_BadArgument();
4975 return NULL;
4978 /* allocate enough for a simple 1:1 translation without
4979 replacements, if we need more, we'll resize */
4980 res = PyUnicode_FromUnicode(NULL, size);
4981 if (res == NULL)
4982 goto onError;
4983 if (size == 0)
4984 return res;
4985 str = PyUnicode_AS_UNICODE(res);
4987 while (p<endp) {
4988 /* try to encode it */
4989 PyObject *x = NULL;
4990 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4991 Py_XDECREF(x);
4992 goto onError;
4994 Py_XDECREF(x);
4995 if (x!=Py_None) /* it worked => adjust input pointer */
4996 ++p;
4997 else { /* untranslatable character */
4998 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4999 Py_ssize_t repsize;
5000 Py_ssize_t newpos;
5001 Py_UNICODE *uni2;
5002 /* startpos for collecting untranslatable chars */
5003 const Py_UNICODE *collstart = p;
5004 const Py_UNICODE *collend = p+1;
5005 const Py_UNICODE *coll;
5007 /* find all untranslatable characters */
5008 while (collend < endp) {
5009 if (charmaptranslate_lookup(*collend, mapping, &x))
5010 goto onError;
5011 Py_XDECREF(x);
5012 if (x!=Py_None)
5013 break;
5014 ++collend;
5016 /* cache callback name lookup
5017 * (if not done yet, i.e. it's the first error) */
5018 if (known_errorHandler==-1) {
5019 if ((errors==NULL) || (!strcmp(errors, "strict")))
5020 known_errorHandler = 1;
5021 else if (!strcmp(errors, "replace"))
5022 known_errorHandler = 2;
5023 else if (!strcmp(errors, "ignore"))
5024 known_errorHandler = 3;
5025 else if (!strcmp(errors, "xmlcharrefreplace"))
5026 known_errorHandler = 4;
5027 else
5028 known_errorHandler = 0;
5030 switch (known_errorHandler) {
5031 case 1: /* strict */
5032 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5033 goto onError;
5034 case 2: /* replace */
5035 /* No need to check for space, this is a 1:1 replacement */
5036 for (coll = collstart; coll<collend; ++coll)
5037 *str++ = '?';
5038 /* fall through */
5039 case 3: /* ignore */
5040 p = collend;
5041 break;
5042 case 4: /* xmlcharrefreplace */
5043 /* generate replacement (temporarily (mis)uses p) */
5044 for (p = collstart; p < collend; ++p) {
5045 char buffer[2+29+1+1];
5046 char *cp;
5047 sprintf(buffer, "&#%d;", (int)*p);
5048 if (charmaptranslate_makespace(&res, &str,
5049 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5050 goto onError;
5051 for (cp = buffer; *cp; ++cp)
5052 *str++ = *cp;
5054 p = collend;
5055 break;
5056 default:
5057 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5058 reason, startp, size, &exc,
5059 collstart-startp, collend-startp, &newpos);
5060 if (repunicode == NULL)
5061 goto onError;
5062 /* generate replacement */
5063 repsize = PyUnicode_GET_SIZE(repunicode);
5064 if (charmaptranslate_makespace(&res, &str,
5065 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5066 Py_DECREF(repunicode);
5067 goto onError;
5069 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5070 *str++ = *uni2;
5071 p = startp + newpos;
5072 Py_DECREF(repunicode);
5076 /* Resize if we allocated to much */
5077 respos = str-PyUnicode_AS_UNICODE(res);
5078 if (respos<PyUnicode_GET_SIZE(res)) {
5079 if (PyUnicode_Resize(&res, respos) < 0)
5080 goto onError;
5082 Py_XDECREF(exc);
5083 Py_XDECREF(errorHandler);
5084 return res;
5086 onError:
5087 Py_XDECREF(res);
5088 Py_XDECREF(exc);
5089 Py_XDECREF(errorHandler);
5090 return NULL;
5093 PyObject *PyUnicode_Translate(PyObject *str,
5094 PyObject *mapping,
5095 const char *errors)
5097 PyObject *result;
5099 str = PyUnicode_FromObject(str);
5100 if (str == NULL)
5101 goto onError;
5102 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5103 PyUnicode_GET_SIZE(str),
5104 mapping,
5105 errors);
5106 Py_DECREF(str);
5107 return result;
5109 onError:
5110 Py_XDECREF(str);
5111 return NULL;
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5116 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5117 Py_ssize_t length,
5118 char *output,
5119 const char *errors)
5121 Py_UNICODE *p, *end;
5122 PyObject *errorHandler = NULL;
5123 PyObject *exc = NULL;
5124 const char *encoding = "decimal";
5125 const char *reason = "invalid decimal Unicode string";
5126 /* the following variable is used for caching string comparisons
5127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128 int known_errorHandler = -1;
5130 if (output == NULL) {
5131 PyErr_BadArgument();
5132 return -1;
5135 p = s;
5136 end = s + length;
5137 while (p < end) {
5138 register Py_UNICODE ch = *p;
5139 int decimal;
5140 PyObject *repunicode;
5141 Py_ssize_t repsize;
5142 Py_ssize_t newpos;
5143 Py_UNICODE *uni2;
5144 Py_UNICODE *collstart;
5145 Py_UNICODE *collend;
5147 if (Py_UNICODE_ISSPACE(ch)) {
5148 *output++ = ' ';
5149 ++p;
5150 continue;
5152 decimal = Py_UNICODE_TODECIMAL(ch);
5153 if (decimal >= 0) {
5154 *output++ = '0' + decimal;
5155 ++p;
5156 continue;
5158 if (0 < ch && ch < 256) {
5159 *output++ = (char)ch;
5160 ++p;
5161 continue;
5163 /* All other characters are considered unencodable */
5164 collstart = p;
5165 collend = p+1;
5166 while (collend < end) {
5167 if ((0 < *collend && *collend < 256) ||
5168 !Py_UNICODE_ISSPACE(*collend) ||
5169 Py_UNICODE_TODECIMAL(*collend))
5170 break;
5172 /* cache callback name lookup
5173 * (if not done yet, i.e. it's the first error) */
5174 if (known_errorHandler==-1) {
5175 if ((errors==NULL) || (!strcmp(errors, "strict")))
5176 known_errorHandler = 1;
5177 else if (!strcmp(errors, "replace"))
5178 known_errorHandler = 2;
5179 else if (!strcmp(errors, "ignore"))
5180 known_errorHandler = 3;
5181 else if (!strcmp(errors, "xmlcharrefreplace"))
5182 known_errorHandler = 4;
5183 else
5184 known_errorHandler = 0;
5186 switch (known_errorHandler) {
5187 case 1: /* strict */
5188 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5189 goto onError;
5190 case 2: /* replace */
5191 for (p = collstart; p < collend; ++p)
5192 *output++ = '?';
5193 /* fall through */
5194 case 3: /* ignore */
5195 p = collend;
5196 break;
5197 case 4: /* xmlcharrefreplace */
5198 /* generate replacement (temporarily (mis)uses p) */
5199 for (p = collstart; p < collend; ++p)
5200 output += sprintf(output, "&#%d;", (int)*p);
5201 p = collend;
5202 break;
5203 default:
5204 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5205 encoding, reason, s, length, &exc,
5206 collstart-s, collend-s, &newpos);
5207 if (repunicode == NULL)
5208 goto onError;
5209 /* generate replacement */
5210 repsize = PyUnicode_GET_SIZE(repunicode);
5211 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5212 Py_UNICODE ch = *uni2;
5213 if (Py_UNICODE_ISSPACE(ch))
5214 *output++ = ' ';
5215 else {
5216 decimal = Py_UNICODE_TODECIMAL(ch);
5217 if (decimal >= 0)
5218 *output++ = '0' + decimal;
5219 else if (0 < ch && ch < 256)
5220 *output++ = (char)ch;
5221 else {
5222 Py_DECREF(repunicode);
5223 raise_encode_exception(&exc, encoding,
5224 s, length, collstart-s, collend-s, reason);
5225 goto onError;
5229 p = s + newpos;
5230 Py_DECREF(repunicode);
5233 /* 0-terminate the output string */
5234 *output++ = '\0';
5235 Py_XDECREF(exc);
5236 Py_XDECREF(errorHandler);
5237 return 0;
5239 onError:
5240 Py_XDECREF(exc);
5241 Py_XDECREF(errorHandler);
5242 return -1;
5245 /* --- Helpers ------------------------------------------------------------ */
5247 #include "stringlib/unicodedefs.h"
5249 #define FROM_UNICODE
5251 #include "stringlib/fastsearch.h"
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
/* Helper macro that normalizes the slice bounds held in the local
   variables `start` and `end` against (obj)->length: a negative value
   has the length added once and is then clamped to 0, and `end` is
   clamped to the length.  Wrapped in do { } while (0) so that it acts
   as a single statement; write it followed by a semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5270 Py_ssize_t PyUnicode_Count(PyObject *str,
5271 PyObject *substr,
5272 Py_ssize_t start,
5273 Py_ssize_t end)
5275 Py_ssize_t result;
5276 PyUnicodeObject* str_obj;
5277 PyUnicodeObject* sub_obj;
5279 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5280 if (!str_obj)
5281 return -1;
5282 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5283 if (!sub_obj) {
5284 Py_DECREF(str_obj);
5285 return -1;
5288 FIX_START_END(str_obj);
5290 result = stringlib_count(
5291 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5294 Py_DECREF(sub_obj);
5295 Py_DECREF(str_obj);
5297 return result;
5300 Py_ssize_t PyUnicode_Find(PyObject *str,
5301 PyObject *sub,
5302 Py_ssize_t start,
5303 Py_ssize_t end,
5304 int direction)
5306 Py_ssize_t result;
5308 str = PyUnicode_FromObject(str);
5309 if (!str)
5310 return -2;
5311 sub = PyUnicode_FromObject(sub);
5312 if (!sub) {
5313 Py_DECREF(str);
5314 return -2;
5317 if (direction > 0)
5318 result = stringlib_find_slice(
5319 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5320 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5321 start, end
5323 else
5324 result = stringlib_rfind_slice(
5325 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5326 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5327 start, end
5330 Py_DECREF(str);
5331 Py_DECREF(sub);
5333 return result;
5336 static
5337 int tailmatch(PyUnicodeObject *self,
5338 PyUnicodeObject *substring,
5339 Py_ssize_t start,
5340 Py_ssize_t end,
5341 int direction)
5343 if (substring->length == 0)
5344 return 1;
5346 FIX_START_END(self);
5348 end -= substring->length;
5349 if (end < start)
5350 return 0;
5352 if (direction > 0) {
5353 if (Py_UNICODE_MATCH(self, end, substring))
5354 return 1;
5355 } else {
5356 if (Py_UNICODE_MATCH(self, start, substring))
5357 return 1;
5360 return 0;
5363 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5364 PyObject *substr,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
5369 Py_ssize_t result;
5371 str = PyUnicode_FromObject(str);
5372 if (str == NULL)
5373 return -1;
5374 substr = PyUnicode_FromObject(substr);
5375 if (substr == NULL) {
5376 Py_DECREF(str);
5377 return -1;
5380 result = tailmatch((PyUnicodeObject *)str,
5381 (PyUnicodeObject *)substr,
5382 start, end, direction);
5383 Py_DECREF(str);
5384 Py_DECREF(substr);
5385 return result;
5388 /* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5391 static
5392 PyObject *fixup(PyUnicodeObject *self,
5393 int (*fixfct)(PyUnicodeObject *s))
5396 PyUnicodeObject *u;
5398 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5399 if (u == NULL)
5400 return NULL;
5402 Py_UNICODE_COPY(u->str, self->str, self->length);
5404 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5408 Py_INCREF(self);
5409 Py_DECREF(u);
5410 return (PyObject*) self;
5412 return (PyObject*) u;
5415 static
5416 int fixupper(PyUnicodeObject *self)
5418 Py_ssize_t len = self->length;
5419 Py_UNICODE *s = self->str;
5420 int status = 0;
5422 while (len-- > 0) {
5423 register Py_UNICODE ch;
5425 ch = Py_UNICODE_TOUPPER(*s);
5426 if (ch != *s) {
5427 status = 1;
5428 *s = ch;
5430 s++;
5433 return status;
5436 static
5437 int fixlower(PyUnicodeObject *self)
5439 Py_ssize_t len = self->length;
5440 Py_UNICODE *s = self->str;
5441 int status = 0;
5443 while (len-- > 0) {
5444 register Py_UNICODE ch;
5446 ch = Py_UNICODE_TOLOWER(*s);
5447 if (ch != *s) {
5448 status = 1;
5449 *s = ch;
5451 s++;
5454 return status;
5457 static
5458 int fixswapcase(PyUnicodeObject *self)
5460 Py_ssize_t len = self->length;
5461 Py_UNICODE *s = self->str;
5462 int status = 0;
5464 while (len-- > 0) {
5465 if (Py_UNICODE_ISUPPER(*s)) {
5466 *s = Py_UNICODE_TOLOWER(*s);
5467 status = 1;
5468 } else if (Py_UNICODE_ISLOWER(*s)) {
5469 *s = Py_UNICODE_TOUPPER(*s);
5470 status = 1;
5472 s++;
5475 return status;
5478 static
5479 int fixcapitalize(PyUnicodeObject *self)
5481 Py_ssize_t len = self->length;
5482 Py_UNICODE *s = self->str;
5483 int status = 0;
5485 if (len == 0)
5486 return 0;
5487 if (Py_UNICODE_ISLOWER(*s)) {
5488 *s = Py_UNICODE_TOUPPER(*s);
5489 status = 1;
5491 s++;
5492 while (--len > 0) {
5493 if (Py_UNICODE_ISUPPER(*s)) {
5494 *s = Py_UNICODE_TOLOWER(*s);
5495 status = 1;
5497 s++;
5499 return status;
5502 static
5503 int fixtitle(PyUnicodeObject *self)
5505 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506 register Py_UNICODE *e;
5507 int previous_is_cased;
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self) == 1) {
5511 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512 if (*p != ch) {
5513 *p = ch;
5514 return 1;
5516 else
5517 return 0;
5520 e = p + PyUnicode_GET_SIZE(self);
5521 previous_is_cased = 0;
5522 for (; p < e; p++) {
5523 register const Py_UNICODE ch = *p;
5525 if (previous_is_cased)
5526 *p = Py_UNICODE_TOLOWER(ch);
5527 else
5528 *p = Py_UNICODE_TOTITLE(ch);
5530 if (Py_UNICODE_ISLOWER(ch) ||
5531 Py_UNICODE_ISUPPER(ch) ||
5532 Py_UNICODE_ISTITLE(ch))
5533 previous_is_cased = 1;
5534 else
5535 previous_is_cased = 0;
5537 return 1;
5540 PyObject *
5541 PyUnicode_Join(PyObject *separator, PyObject *seq)
5543 PyObject *internal_separator = NULL;
5544 const Py_UNICODE blank = ' ';
5545 const Py_UNICODE *sep = &blank;
5546 Py_ssize_t seplen = 1;
5547 PyUnicodeObject *res = NULL; /* the result */
5548 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used; /* # used bytes */
5550 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5551 PyObject *fseq; /* PySequence_Fast(seq) */
5552 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5553 PyObject *item;
5554 Py_ssize_t i;
5556 fseq = PySequence_Fast(seq, "");
5557 if (fseq == NULL) {
5558 return NULL;
5561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5566 * is invariant.
5568 seqlen = PySequence_Fast_GET_SIZE(fseq);
5569 /* If empty sequence, return u"". */
5570 if (seqlen == 0) {
5571 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5572 goto Done;
5574 /* If singleton sequence with an exact Unicode, return that. */
5575 if (seqlen == 1) {
5576 item = PySequence_Fast_GET_ITEM(fseq, 0);
5577 if (PyUnicode_CheckExact(item)) {
5578 Py_INCREF(item);
5579 res = (PyUnicodeObject *)item;
5580 goto Done;
5584 /* At least two items to join, or one that isn't exact Unicode. */
5585 if (seqlen > 1) {
5586 /* Set up sep and seplen -- they're needed. */
5587 if (separator == NULL) {
5588 sep = &blank;
5589 seplen = 1;
5591 else {
5592 internal_separator = PyUnicode_FromObject(separator);
5593 if (internal_separator == NULL)
5594 goto onError;
5595 sep = PyUnicode_AS_UNICODE(internal_separator);
5596 seplen = PyUnicode_GET_SIZE(internal_separator);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen = PySequence_Fast_GET_SIZE(fseq);
5602 /* Get space. */
5603 res = _PyUnicode_New(res_alloc);
5604 if (res == NULL)
5605 goto onError;
5606 res_p = PyUnicode_AS_UNICODE(res);
5607 res_used = 0;
5609 for (i = 0; i < seqlen; ++i) {
5610 Py_ssize_t itemlen;
5611 Py_ssize_t new_res_used;
5613 item = PySequence_Fast_GET_ITEM(fseq, i);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5616 PyErr_Format(PyExc_TypeError,
5617 "sequence item %zd: expected string or Unicode,"
5618 " %.80s found",
5619 i, Py_TYPE(item)->tp_name);
5620 goto onError;
5622 item = PyUnicode_FromObject(item);
5623 if (item == NULL)
5624 goto onError;
5625 /* We own a reference to item from here on. */
5627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen = PySequence_Fast_GET_SIZE(fseq);
5630 /* Make sure we have enough space for the separator and the item. */
5631 itemlen = PyUnicode_GET_SIZE(item);
5632 new_res_used = res_used + itemlen;
5633 if (new_res_used < 0)
5634 goto Overflow;
5635 if (i < seqlen - 1) {
5636 new_res_used += seplen;
5637 if (new_res_used < 0)
5638 goto Overflow;
5640 if (new_res_used > res_alloc) {
5641 /* double allocated size until it's big enough */
5642 do {
5643 res_alloc += res_alloc;
5644 if (res_alloc <= 0)
5645 goto Overflow;
5646 } while (new_res_used > res_alloc);
5647 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5648 Py_DECREF(item);
5649 goto onError;
5651 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5656 res_p += itemlen;
5657 if (i < seqlen - 1) {
5658 Py_UNICODE_COPY(res_p, sep, seplen);
5659 res_p += seplen;
5661 Py_DECREF(item);
5662 res_used = new_res_used;
5665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5668 if (_PyUnicode_Resize(&res, res_used) < 0)
5669 goto onError;
5671 Done:
5672 Py_XDECREF(internal_separator);
5673 Py_DECREF(fseq);
5674 return (PyObject *)res;
5676 Overflow:
5677 PyErr_SetString(PyExc_OverflowError,
5678 "join() result is too long for a Python string");
5679 Py_DECREF(item);
5680 /* fall through */
5682 onError:
5683 Py_XDECREF(internal_separator);
5684 Py_DECREF(fseq);
5685 Py_XDECREF(res);
5686 return NULL;
5689 static
5690 PyUnicodeObject *pad(PyUnicodeObject *self,
5691 Py_ssize_t left,
5692 Py_ssize_t right,
5693 Py_UNICODE fill)
5695 PyUnicodeObject *u;
5697 if (left < 0)
5698 left = 0;
5699 if (right < 0)
5700 right = 0;
5702 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5703 Py_INCREF(self);
5704 return self;
5707 if (left > PY_SSIZE_T_MAX - self->length ||
5708 right > PY_SSIZE_T_MAX - (left + self->length)) {
5709 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710 return NULL;
5712 u = _PyUnicode_New(left + self->length + right);
5713 if (u) {
5714 if (left)
5715 Py_UNICODE_FILL(u->str, fill, left);
5716 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717 if (right)
5718 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5721 return u;
/* Append the slice data[left:right] as a new unicode object to `list`.
   Relies on the enclosing function providing the locals `str` and
   `list` and an `onError` label, which is jumped to if creating the
   slice or appending it fails.  Wrapped in do { } while (0) so that it
   acts as a single statement; write it followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
5735 static
5736 PyObject *split_whitespace(PyUnicodeObject *self,
5737 PyObject *list,
5738 Py_ssize_t maxcount)
5740 register Py_ssize_t i;
5741 register Py_ssize_t j;
5742 Py_ssize_t len = self->length;
5743 PyObject *str;
5744 register const Py_UNICODE *buf = self->str;
5746 for (i = j = 0; i < len; ) {
5747 /* find a token */
5748 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5749 i++;
5750 j = i;
5751 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5752 i++;
5753 if (j < i) {
5754 if (maxcount-- <= 0)
5755 break;
5756 SPLIT_APPEND(buf, j, i);
5757 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5758 i++;
5759 j = i;
5762 if (j < len) {
5763 SPLIT_APPEND(buf, j, len);
5765 return list;
5767 onError:
5768 Py_DECREF(list);
5769 return NULL;
5772 PyObject *PyUnicode_Splitlines(PyObject *string,
5773 int keepends)
5775 register Py_ssize_t i;
5776 register Py_ssize_t j;
5777 Py_ssize_t len;
5778 PyObject *list;
5779 PyObject *str;
5780 Py_UNICODE *data;
5782 string = PyUnicode_FromObject(string);
5783 if (string == NULL)
5784 return NULL;
5785 data = PyUnicode_AS_UNICODE(string);
5786 len = PyUnicode_GET_SIZE(string);
5788 list = PyList_New(0);
5789 if (!list)
5790 goto onError;
5792 for (i = j = 0; i < len; ) {
5793 Py_ssize_t eol;
5795 /* Find a line and append it */
5796 while (i < len && !BLOOM_LINEBREAK(data[i]))
5797 i++;
5799 /* Skip the line break reading CRLF as one line break */
5800 eol = i;
5801 if (i < len) {
5802 if (data[i] == '\r' && i + 1 < len &&
5803 data[i+1] == '\n')
5804 i += 2;
5805 else
5806 i++;
5807 if (keepends)
5808 eol = i;
5810 SPLIT_APPEND(data, j, eol);
5811 j = i;
5813 if (j < len) {
5814 SPLIT_APPEND(data, j, len);
5817 Py_DECREF(string);
5818 return list;
5820 onError:
5821 Py_XDECREF(list);
5822 Py_DECREF(string);
5823 return NULL;
5826 static
5827 PyObject *split_char(PyUnicodeObject *self,
5828 PyObject *list,
5829 Py_UNICODE ch,
5830 Py_ssize_t maxcount)
5832 register Py_ssize_t i;
5833 register Py_ssize_t j;
5834 Py_ssize_t len = self->length;
5835 PyObject *str;
5836 register const Py_UNICODE *buf = self->str;
5838 for (i = j = 0; i < len; ) {
5839 if (buf[i] == ch) {
5840 if (maxcount-- <= 0)
5841 break;
5842 SPLIT_APPEND(buf, j, i);
5843 i = j = i + 1;
5844 } else
5845 i++;
5847 if (j <= len) {
5848 SPLIT_APPEND(buf, j, len);
5850 return list;
5852 onError:
5853 Py_DECREF(list);
5854 return NULL;
5857 static
5858 PyObject *split_substring(PyUnicodeObject *self,
5859 PyObject *list,
5860 PyUnicodeObject *substring,
5861 Py_ssize_t maxcount)
5863 register Py_ssize_t i;
5864 register Py_ssize_t j;
5865 Py_ssize_t len = self->length;
5866 Py_ssize_t sublen = substring->length;
5867 PyObject *str;
5869 for (i = j = 0; i <= len - sublen; ) {
5870 if (Py_UNICODE_MATCH(self, i, substring)) {
5871 if (maxcount-- <= 0)
5872 break;
5873 SPLIT_APPEND(self->str, j, i);
5874 i = j = i + sublen;
5875 } else
5876 i++;
5878 if (j <= len) {
5879 SPLIT_APPEND(self->str, j, len);
5881 return list;
5883 onError:
5884 Py_DECREF(list);
5885 return NULL;
5888 static
5889 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5890 PyObject *list,
5891 Py_ssize_t maxcount)
5893 register Py_ssize_t i;
5894 register Py_ssize_t j;
5895 Py_ssize_t len = self->length;
5896 PyObject *str;
5897 register const Py_UNICODE *buf = self->str;
5899 for (i = j = len - 1; i >= 0; ) {
5900 /* find a token */
5901 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5902 i--;
5903 j = i;
5904 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5905 i--;
5906 if (j > i) {
5907 if (maxcount-- <= 0)
5908 break;
5909 SPLIT_APPEND(buf, i + 1, j + 1);
5910 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5911 i--;
5912 j = i;
5915 if (j >= 0) {
5916 SPLIT_APPEND(buf, 0, j + 1);
5918 if (PyList_Reverse(list) < 0)
5919 goto onError;
5920 return list;
5922 onError:
5923 Py_DECREF(list);
5924 return NULL;
5927 static
5928 PyObject *rsplit_char(PyUnicodeObject *self,
5929 PyObject *list,
5930 Py_UNICODE ch,
5931 Py_ssize_t maxcount)
5933 register Py_ssize_t i;
5934 register Py_ssize_t j;
5935 Py_ssize_t len = self->length;
5936 PyObject *str;
5937 register const Py_UNICODE *buf = self->str;
5939 for (i = j = len - 1; i >= 0; ) {
5940 if (buf[i] == ch) {
5941 if (maxcount-- <= 0)
5942 break;
5943 SPLIT_APPEND(buf, i + 1, j + 1);
5944 j = i = i - 1;
5945 } else
5946 i--;
5948 if (j >= -1) {
5949 SPLIT_APPEND(buf, 0, j + 1);
5951 if (PyList_Reverse(list) < 0)
5952 goto onError;
5953 return list;
5955 onError:
5956 Py_DECREF(list);
5957 return NULL;
5960 static
5961 PyObject *rsplit_substring(PyUnicodeObject *self,
5962 PyObject *list,
5963 PyUnicodeObject *substring,
5964 Py_ssize_t maxcount)
5966 register Py_ssize_t i;
5967 register Py_ssize_t j;
5968 Py_ssize_t len = self->length;
5969 Py_ssize_t sublen = substring->length;
5970 PyObject *str;
5972 for (i = len - sublen, j = len; i >= 0; ) {
5973 if (Py_UNICODE_MATCH(self, i, substring)) {
5974 if (maxcount-- <= 0)
5975 break;
5976 SPLIT_APPEND(self->str, i + sublen, j);
5977 j = i;
5978 i -= sublen;
5979 } else
5980 i--;
5982 if (j >= 0) {
5983 SPLIT_APPEND(self->str, 0, j);
5985 if (PyList_Reverse(list) < 0)
5986 goto onError;
5987 return list;
5989 onError:
5990 Py_DECREF(list);
5991 return NULL;
5994 #undef SPLIT_APPEND
5996 static
5997 PyObject *split(PyUnicodeObject *self,
5998 PyUnicodeObject *substring,
5999 Py_ssize_t maxcount)
6001 PyObject *list;
6003 if (maxcount < 0)
6004 maxcount = PY_SSIZE_T_MAX;
6006 list = PyList_New(0);
6007 if (!list)
6008 return NULL;
6010 if (substring == NULL)
6011 return split_whitespace(self,list,maxcount);
6013 else if (substring->length == 1)
6014 return split_char(self,list,substring->str[0],maxcount);
6016 else if (substring->length == 0) {
6017 Py_DECREF(list);
6018 PyErr_SetString(PyExc_ValueError, "empty separator");
6019 return NULL;
6021 else
6022 return split_substring(self,list,substring,maxcount);
6025 static
6026 PyObject *rsplit(PyUnicodeObject *self,
6027 PyUnicodeObject *substring,
6028 Py_ssize_t maxcount)
6030 PyObject *list;
6032 if (maxcount < 0)
6033 maxcount = PY_SSIZE_T_MAX;
6035 list = PyList_New(0);
6036 if (!list)
6037 return NULL;
6039 if (substring == NULL)
6040 return rsplit_whitespace(self,list,maxcount);
6042 else if (substring->length == 1)
6043 return rsplit_char(self,list,substring->str[0],maxcount);
6045 else if (substring->length == 0) {
6046 Py_DECREF(list);
6047 PyErr_SetString(PyExc_ValueError, "empty separator");
6048 return NULL;
6050 else
6051 return rsplit_substring(self,list,substring,maxcount);
6054 static
6055 PyObject *replace(PyUnicodeObject *self,
6056 PyUnicodeObject *str1,
6057 PyUnicodeObject *str2,
6058 Py_ssize_t maxcount)
6060 PyUnicodeObject *u;
6062 if (maxcount < 0)
6063 maxcount = PY_SSIZE_T_MAX;
6065 if (str1->length == str2->length) {
6066 /* same length */
6067 Py_ssize_t i;
6068 if (str1->length == 1) {
6069 /* replace characters */
6070 Py_UNICODE u1, u2;
6071 if (!findchar(self->str, self->length, str1->str[0]))
6072 goto nothing;
6073 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6074 if (!u)
6075 return NULL;
6076 Py_UNICODE_COPY(u->str, self->str, self->length);
6077 u1 = str1->str[0];
6078 u2 = str2->str[0];
6079 for (i = 0; i < u->length; i++)
6080 if (u->str[i] == u1) {
6081 if (--maxcount < 0)
6082 break;
6083 u->str[i] = u2;
6085 } else {
6086 i = fastsearch(
6087 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6089 if (i < 0)
6090 goto nothing;
6091 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6092 if (!u)
6093 return NULL;
6094 Py_UNICODE_COPY(u->str, self->str, self->length);
6095 while (i <= self->length - str1->length)
6096 if (Py_UNICODE_MATCH(self, i, str1)) {
6097 if (--maxcount < 0)
6098 break;
6099 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6100 i += str1->length;
6101 } else
6102 i++;
6104 } else {
6106 Py_ssize_t n, i, j, e;
6107 Py_ssize_t product, new_size, delta;
6108 Py_UNICODE *p;
6110 /* replace strings */
6111 n = stringlib_count(self->str, self->length, str1->str, str1->length);
6112 if (n > maxcount)
6113 n = maxcount;
6114 if (n == 0)
6115 goto nothing;
6116 /* new_size = self->length + n * (str2->length - str1->length)); */
6117 delta = (str2->length - str1->length);
6118 if (delta == 0) {
6119 new_size = self->length;
6120 } else {
6121 product = n * (str2->length - str1->length);
6122 if ((product / (str2->length - str1->length)) != n) {
6123 PyErr_SetString(PyExc_OverflowError,
6124 "replace string is too long");
6125 return NULL;
6127 new_size = self->length + product;
6128 if (new_size < 0) {
6129 PyErr_SetString(PyExc_OverflowError,
6130 "replace string is too long");
6131 return NULL;
6134 u = _PyUnicode_New(new_size);
6135 if (!u)
6136 return NULL;
6137 i = 0;
6138 p = u->str;
6139 e = self->length - str1->length;
6140 if (str1->length > 0) {
6141 while (n-- > 0) {
6142 /* look for next match */
6143 j = i;
6144 while (j <= e) {
6145 if (Py_UNICODE_MATCH(self, j, str1))
6146 break;
6147 j++;
6149 if (j > i) {
6150 if (j > e)
6151 break;
6152 /* copy unchanged part [i:j] */
6153 Py_UNICODE_COPY(p, self->str+i, j-i);
6154 p += j - i;
6156 /* copy substitution string */
6157 if (str2->length > 0) {
6158 Py_UNICODE_COPY(p, str2->str, str2->length);
6159 p += str2->length;
6161 i = j + str1->length;
6163 if (i < self->length)
6164 /* copy tail [i:] */
6165 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6166 } else {
6167 /* interleave */
6168 while (n > 0) {
6169 Py_UNICODE_COPY(p, str2->str, str2->length);
6170 p += str2->length;
6171 if (--n <= 0)
6172 break;
6173 *p++ = self->str[i++];
6175 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6178 return (PyObject *) u;
6180 nothing:
6181 /* nothing to replace; return original string (when possible) */
6182 if (PyUnicode_CheckExact(self)) {
6183 Py_INCREF(self);
6184 return (PyObject *) self;
6186 return PyUnicode_FromUnicode(self->str, self->length);
6189 /* --- Unicode Object Methods --------------------------------------------- */
6191 PyDoc_STRVAR(title__doc__,
6192 "S.title() -> unicode\n\
6194 Return a titlecased version of S, i.e. words start with title case\n\
6195 characters, all remaining cased characters have lower case.");
6197 static PyObject*
6198 unicode_title(PyUnicodeObject *self)
6200 return fixup(self, fixtitle);
6203 PyDoc_STRVAR(capitalize__doc__,
6204 "S.capitalize() -> unicode\n\
6206 Return a capitalized version of S, i.e. make the first character\n\
6207 have upper case.");
6209 static PyObject*
6210 unicode_capitalize(PyUnicodeObject *self)
6212 return fixup(self, fixcapitalize);
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize each word with
   fixup()/fixcapitalize and join the words with a single blank. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

 onError:
    /* item is NULL here when capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6253 /* Argument converter. Coerces to a single unicode character */
6255 static int
6256 convert_uc(PyObject *obj, void *addr)
6258 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6259 PyObject *uniobj;
6260 Py_UNICODE *unistr;
6262 uniobj = PyUnicode_FromObject(obj);
6263 if (uniobj == NULL) {
6264 PyErr_SetString(PyExc_TypeError,
6265 "The fill character cannot be converted to Unicode");
6266 return 0;
6268 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6269 PyErr_SetString(PyExc_TypeError,
6270 "The fill character must be exactly one character long");
6271 Py_DECREF(uniobj);
6272 return 0;
6274 unistr = PyUnicode_AS_UNICODE(uniobj);
6275 *fillcharloc = unistr[0];
6276 Py_DECREF(uniobj);
6277 return 1;
6280 PyDoc_STRVAR(center__doc__,
6281 "S.center(width[, fillchar]) -> unicode\n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6286 static PyObject *
6287 unicode_center(PyUnicodeObject *self, PyObject *args)
6289 Py_ssize_t marg, left;
6290 Py_ssize_t width;
6291 Py_UNICODE fillchar = ' ';
6293 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6294 return NULL;
6296 if (self->length >= width && PyUnicode_CheckExact(self)) {
6297 Py_INCREF(self);
6298 return (PyObject*) self;
6301 marg = width - self->length;
6302 left = marg / 2 + (marg & width & 1);
6304 return (PyObject*) pad(self, left, marg - left, fillchar);
6307 #if 0
6309 /* This code should go into some future Unicode collation support
6310 module. The basic comparison should compare ordinals on a naive
6311 basis (this is what Java does and thus Jython too). */
6313 /* speedy UTF-16 code point order comparison */
6314 /* gleaned from: */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6317 static short utf16Fixup[32] =
6319 0, 0, 0, 0, 0, 0, 0, 0,
6320 0, 0, 0, 0, 0, 0, 0, 0,
6321 0, 0, 0, 0, 0, 0, 0, 0,
6322 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6325 static int
6326 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6328 Py_ssize_t len1, len2;
6330 Py_UNICODE *s1 = str1->str;
6331 Py_UNICODE *s2 = str2->str;
6333 len1 = str1->length;
6334 len2 = str2->length;
6336 while (len1 > 0 && len2 > 0) {
6337 Py_UNICODE c1, c2;
6339 c1 = *s1++;
6340 c2 = *s2++;
6342 if (c1 > (1<<11) * 26)
6343 c1 += utf16Fixup[c1>>11];
6344 if (c2 > (1<<11) * 26)
6345 c2 += utf16Fixup[c2>>11];
6346 /* now c1 and c2 are in UTF-32-compatible order */
6348 if (c1 != c2)
6349 return (c1 < c2) ? -1 : 1;
6351 len1--; len2--;
6354 return (len1 < len2) ? -1 : (len1 != len2);
6357 #else
6359 static int
6360 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6362 register Py_ssize_t len1, len2;
6364 Py_UNICODE *s1 = str1->str;
6365 Py_UNICODE *s2 = str2->str;
6367 len1 = str1->length;
6368 len2 = str2->length;
6370 while (len1 > 0 && len2 > 0) {
6371 Py_UNICODE c1, c2;
6373 c1 = *s1++;
6374 c2 = *s2++;
6376 if (c1 != c2)
6377 return (c1 < c2) ? -1 : 1;
6379 len1--; len2--;
6382 return (len1 < len2) ? -1 : (len1 != len2);
6385 #endif
6387 int PyUnicode_Compare(PyObject *left,
6388 PyObject *right)
6390 PyUnicodeObject *u = NULL, *v = NULL;
6391 int result;
6393 /* Coerce the two arguments */
6394 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6395 if (u == NULL)
6396 goto onError;
6397 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6398 if (v == NULL)
6399 goto onError;
6401 /* Shortcut for empty or interned objects */
6402 if (v == u) {
6403 Py_DECREF(u);
6404 Py_DECREF(v);
6405 return 0;
6408 result = unicode_compare(u, v);
6410 Py_DECREF(u);
6411 Py_DECREF(v);
6412 return result;
6414 onError:
6415 Py_XDECREF(u);
6416 Py_XDECREF(v);
6417 return -1;
6420 PyObject *PyUnicode_RichCompare(PyObject *left,
6421 PyObject *right,
6422 int op)
6424 int result;
6426 result = PyUnicode_Compare(left, right);
6427 if (result == -1 && PyErr_Occurred())
6428 goto onError;
6430 /* Convert the return value to a Boolean */
6431 switch (op) {
6432 case Py_EQ:
6433 result = (result == 0);
6434 break;
6435 case Py_NE:
6436 result = (result != 0);
6437 break;
6438 case Py_LE:
6439 result = (result <= 0);
6440 break;
6441 case Py_GE:
6442 result = (result >= 0);
6443 break;
6444 case Py_LT:
6445 result = (result == -1);
6446 break;
6447 case Py_GT:
6448 result = (result == 1);
6449 break;
6451 return PyBool_FromLong(result);
6453 onError:
6455 /* Standard case
6457 Type errors mean that PyUnicode_FromObject() could not convert
6458 one of the arguments (usually the right hand side) to Unicode,
6459 ie. we can't handle the comparison request. However, it is
6460 possible that the other object knows a comparison method, which
6461 is why we return Py_NotImplemented to give the other object a
6462 chance.
6465 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6466 PyErr_Clear();
6467 Py_INCREF(Py_NotImplemented);
6468 return Py_NotImplemented;
6470 if (op != Py_EQ && op != Py_NE)
6471 return NULL;
6473 /* Equality comparison.
6475 This is a special case: we silence any PyExc_UnicodeDecodeError
6476 and instead turn it into a PyErr_UnicodeWarning.
6479 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6480 return NULL;
6481 PyErr_Clear();
6482 if (PyErr_Warn(PyExc_UnicodeWarning,
6483 (op == Py_EQ) ?
6484 "Unicode equal comparison "
6485 "failed to convert both arguments to Unicode - "
6486 "interpreting them as being unequal" :
6487 "Unicode unequal comparison "
6488 "failed to convert both arguments to Unicode - "
6489 "interpreting them as being unequal"
6490 ) < 0)
6491 return NULL;
6492 result = (op == Py_NE);
6493 return PyBool_FromLong(result);
6496 int PyUnicode_Contains(PyObject *container,
6497 PyObject *element)
6499 PyObject *str, *sub;
6500 int result;
6502 /* Coerce the two arguments */
6503 sub = PyUnicode_FromObject(element);
6504 if (!sub) {
6505 return -1;
6508 str = PyUnicode_FromObject(container);
6509 if (!str) {
6510 Py_DECREF(sub);
6511 return -1;
6514 result = stringlib_contains_obj(str, sub);
6516 Py_DECREF(str);
6517 Py_DECREF(sub);
6519 return result;
6522 /* Concat to string or Unicode object giving a new Unicode object. */
6524 PyObject *PyUnicode_Concat(PyObject *left,
6525 PyObject *right)
6527 PyUnicodeObject *u = NULL, *v = NULL, *w;
6529 /* Coerce the two arguments */
6530 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6531 if (u == NULL)
6532 goto onError;
6533 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6534 if (v == NULL)
6535 goto onError;
6537 /* Shortcuts */
6538 if (v == unicode_empty) {
6539 Py_DECREF(v);
6540 return (PyObject *)u;
6542 if (u == unicode_empty) {
6543 Py_DECREF(u);
6544 return (PyObject *)v;
6547 /* Concat the two Unicode strings */
6548 w = _PyUnicode_New(u->length + v->length);
6549 if (w == NULL)
6550 goto onError;
6551 Py_UNICODE_COPY(w->str, u->str, u->length);
6552 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6554 Py_DECREF(u);
6555 Py_DECREF(v);
6556 return (PyObject *)w;
6558 onError:
6559 Py_XDECREF(u);
6560 Py_XDECREF(v);
6561 return NULL;
6564 PyDoc_STRVAR(count__doc__,
6565 "S.count(sub[, start[, end]]) -> int\n\
6567 Return the number of non-overlapping occurrences of substring sub in\n\
6568 Unicode string S[start:end]. Optional arguments start and end are\n\
6569 interpreted as in slice notation.");
6571 static PyObject *
6572 unicode_count(PyUnicodeObject *self, PyObject *args)
6574 PyUnicodeObject *substring;
6575 Py_ssize_t start = 0;
6576 Py_ssize_t end = PY_SSIZE_T_MAX;
6577 PyObject *result;
6579 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6580 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6581 return NULL;
6583 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6584 (PyObject *)substring);
6585 if (substring == NULL)
6586 return NULL;
6588 FIX_START_END(self);
6590 result = PyInt_FromSsize_t(
6591 stringlib_count(self->str + start, end - start,
6592 substring->str, substring->length)
6595 Py_DECREF(substring);
6597 return result;
6600 PyDoc_STRVAR(encode__doc__,
6601 "S.encode([encoding[,errors]]) -> string or unicode\n\
6603 Encodes S using the codec registered for encoding. encoding defaults\n\
6604 to the default encoding. errors may be given to set a different error\n\
6605 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6606 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6607 'xmlcharrefreplace' as well as any other name registered with\n\
6608 codecs.register_error that can handle UnicodeEncodeErrors.");
6610 static PyObject *
6611 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6613 static char *kwlist[] = {"encoding", "errors", 0};
6614 char *encoding = NULL;
6615 char *errors = NULL;
6616 PyObject *v;
6618 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6619 kwlist, &encoding, &errors))
6620 return NULL;
6621 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6622 if (v == NULL)
6623 goto onError;
6624 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6625 PyErr_Format(PyExc_TypeError,
6626 "encoder did not return a string/unicode object "
6627 "(type=%.400s)",
6628 Py_TYPE(v)->tp_name);
6629 Py_DECREF(v);
6630 return NULL;
6632 return v;
6634 onError:
6635 return NULL;
6638 PyDoc_STRVAR(decode__doc__,
6639 "S.decode([encoding[,errors]]) -> string or unicode\n\
6641 Decodes S using the codec registered for encoding. encoding defaults\n\
6642 to the default encoding. errors may be given to set a different error\n\
6643 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6644 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6645 as well as any other name registerd with codecs.register_error that is\n\
6646 able to handle UnicodeDecodeErrors.");
6648 static PyObject *
6649 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6651 static char *kwlist[] = {"encoding", "errors", 0};
6652 char *encoding = NULL;
6653 char *errors = NULL;
6654 PyObject *v;
6656 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6657 kwlist, &encoding, &errors))
6658 return NULL;
6659 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6660 if (v == NULL)
6661 goto onError;
6662 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6663 PyErr_Format(PyExc_TypeError,
6664 "decoder did not return a string/unicode object "
6665 "(type=%.400s)",
6666 Py_TYPE(v)->tp_name);
6667 Py_DECREF(v);
6668 return NULL;
6670 return v;
6672 onError:
6673 return NULL;
6676 PyDoc_STRVAR(expandtabs__doc__,
6677 "S.expandtabs([tabsize]) -> unicode\n\
6679 Return a copy of S where all tab characters are expanded using spaces.\n\
6680 If tabsize is not given, a tab size of 8 characters is assumed.");
6682 static PyObject*
6683 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6685 Py_UNICODE *e;
6686 Py_UNICODE *p;
6687 Py_UNICODE *q;
6688 Py_UNICODE *qe;
6689 Py_ssize_t i, j, incr;
6690 PyUnicodeObject *u;
6691 int tabsize = 8;
6693 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6694 return NULL;
6696 /* First pass: determine size of output string */
6697 i = 0; /* chars up to and including most recent \n or \r */
6698 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6699 e = self->str + self->length; /* end of input */
6700 for (p = self->str; p < e; p++)
6701 if (*p == '\t') {
6702 if (tabsize > 0) {
6703 incr = tabsize - (j % tabsize); /* cannot overflow */
6704 if (j > PY_SSIZE_T_MAX - incr)
6705 goto overflow1;
6706 j += incr;
6709 else {
6710 if (j > PY_SSIZE_T_MAX - 1)
6711 goto overflow1;
6712 j++;
6713 if (*p == '\n' || *p == '\r') {
6714 if (i > PY_SSIZE_T_MAX - j)
6715 goto overflow1;
6716 i += j;
6717 j = 0;
6721 if (i > PY_SSIZE_T_MAX - j)
6722 goto overflow1;
6724 /* Second pass: create output string and fill it */
6725 u = _PyUnicode_New(i + j);
6726 if (!u)
6727 return NULL;
6729 j = 0; /* same as in first pass */
6730 q = u->str; /* next output char */
6731 qe = u->str + u->length; /* end of output */
6733 for (p = self->str; p < e; p++)
6734 if (*p == '\t') {
6735 if (tabsize > 0) {
6736 i = tabsize - (j % tabsize);
6737 j += i;
6738 while (i--) {
6739 if (q >= qe)
6740 goto overflow2;
6741 *q++ = ' ';
6745 else {
6746 if (q >= qe)
6747 goto overflow2;
6748 *q++ = *p;
6749 j++;
6750 if (*p == '\n' || *p == '\r')
6751 j = 0;
6754 return (PyObject*) u;
6756 overflow2:
6757 Py_DECREF(u);
6758 overflow1:
6759 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6760 return NULL;
6763 PyDoc_STRVAR(find__doc__,
6764 "S.find(sub [,start [,end]]) -> int\n\
6766 Return the lowest index in S where substring sub is found,\n\
6767 such that sub is contained within s[start:end]. Optional\n\
6768 arguments start and end are interpreted as in slice notation.\n\
6770 Return -1 on failure.");
6772 static PyObject *
6773 unicode_find(PyUnicodeObject *self, PyObject *args)
6775 PyObject *substring;
6776 Py_ssize_t start;
6777 Py_ssize_t end;
6778 Py_ssize_t result;
6780 if (!_ParseTupleFinds(args, &substring, &start, &end))
6781 return NULL;
6783 result = stringlib_find_slice(
6784 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6785 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6786 start, end
6789 Py_DECREF(substring);
6791 return PyInt_FromSsize_t(result);
6794 static PyObject *
6795 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6797 if (index < 0 || index >= self->length) {
6798 PyErr_SetString(PyExc_IndexError, "string index out of range");
6799 return NULL;
6802 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6805 static long
6806 unicode_hash(PyUnicodeObject *self)
6808 /* Since Unicode objects compare equal to their ASCII string
6809 counterparts, they should use the individual character values
6810 as basis for their hash value. This is needed to assure that
6811 strings and Unicode objects behave in the same way as
6812 dictionary keys. */
6814 register Py_ssize_t len;
6815 register Py_UNICODE *p;
6816 register long x;
6818 if (self->hash != -1)
6819 return self->hash;
6820 len = PyUnicode_GET_SIZE(self);
6821 p = PyUnicode_AS_UNICODE(self);
6822 x = *p << 7;
6823 while (--len >= 0)
6824 x = (1000003*x) ^ *p++;
6825 x ^= PyUnicode_GET_SIZE(self);
6826 if (x == -1)
6827 x = -2;
6828 self->hash = x;
6829 return x;
6832 PyDoc_STRVAR(index__doc__,
6833 "S.index(sub [,start [,end]]) -> int\n\
6835 Like S.find() but raise ValueError when the substring is not found.");
6837 static PyObject *
6838 unicode_index(PyUnicodeObject *self, PyObject *args)
6840 Py_ssize_t result;
6841 PyObject *substring;
6842 Py_ssize_t start;
6843 Py_ssize_t end;
6845 if (!_ParseTupleFinds(args, &substring, &start, &end))
6846 return NULL;
6848 result = stringlib_find_slice(
6849 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6850 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6851 start, end
6854 Py_DECREF(substring);
6856 if (result < 0) {
6857 PyErr_SetString(PyExc_ValueError, "substring not found");
6858 return NULL;
6861 return PyInt_FromSsize_t(result);
6864 PyDoc_STRVAR(islower__doc__,
6865 "S.islower() -> bool\n\
6867 Return True if all cased characters in S are lowercase and there is\n\
6868 at least one cased character in S, False otherwise.");
6870 static PyObject*
6871 unicode_islower(PyUnicodeObject *self)
6873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6874 register const Py_UNICODE *e;
6875 int cased;
6877 /* Shortcut for single character strings */
6878 if (PyUnicode_GET_SIZE(self) == 1)
6879 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6881 /* Special case for empty strings */
6882 if (PyUnicode_GET_SIZE(self) == 0)
6883 return PyBool_FromLong(0);
6885 e = p + PyUnicode_GET_SIZE(self);
6886 cased = 0;
6887 for (; p < e; p++) {
6888 register const Py_UNICODE ch = *p;
6890 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6891 return PyBool_FromLong(0);
6892 else if (!cased && Py_UNICODE_ISLOWER(ch))
6893 cased = 1;
6895 return PyBool_FromLong(cased);
6898 PyDoc_STRVAR(isupper__doc__,
6899 "S.isupper() -> bool\n\
6901 Return True if all cased characters in S are uppercase and there is\n\
6902 at least one cased character in S, False otherwise.");
6904 static PyObject*
6905 unicode_isupper(PyUnicodeObject *self)
6907 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6908 register const Py_UNICODE *e;
6909 int cased;
6911 /* Shortcut for single character strings */
6912 if (PyUnicode_GET_SIZE(self) == 1)
6913 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6915 /* Special case for empty strings */
6916 if (PyUnicode_GET_SIZE(self) == 0)
6917 return PyBool_FromLong(0);
6919 e = p + PyUnicode_GET_SIZE(self);
6920 cased = 0;
6921 for (; p < e; p++) {
6922 register const Py_UNICODE ch = *p;
6924 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6925 return PyBool_FromLong(0);
6926 else if (!cased && Py_UNICODE_ISUPPER(ch))
6927 cased = 1;
6929 return PyBool_FromLong(cased);
6932 PyDoc_STRVAR(istitle__doc__,
6933 "S.istitle() -> bool\n\
6935 Return True if S is a titlecased string and there is at least one\n\
6936 character in S, i.e. upper- and titlecase characters may only\n\
6937 follow uncased characters and lowercase characters only cased ones.\n\
6938 Return False otherwise.");
6940 static PyObject*
6941 unicode_istitle(PyUnicodeObject *self)
6943 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6944 register const Py_UNICODE *e;
6945 int cased, previous_is_cased;
6947 /* Shortcut for single character strings */
6948 if (PyUnicode_GET_SIZE(self) == 1)
6949 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6950 (Py_UNICODE_ISUPPER(*p) != 0));
6952 /* Special case for empty strings */
6953 if (PyUnicode_GET_SIZE(self) == 0)
6954 return PyBool_FromLong(0);
6956 e = p + PyUnicode_GET_SIZE(self);
6957 cased = 0;
6958 previous_is_cased = 0;
6959 for (; p < e; p++) {
6960 register const Py_UNICODE ch = *p;
6962 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6963 if (previous_is_cased)
6964 return PyBool_FromLong(0);
6965 previous_is_cased = 1;
6966 cased = 1;
6968 else if (Py_UNICODE_ISLOWER(ch)) {
6969 if (!previous_is_cased)
6970 return PyBool_FromLong(0);
6971 previous_is_cased = 1;
6972 cased = 1;
6974 else
6975 previous_is_cased = 0;
6977 return PyBool_FromLong(cased);
6980 PyDoc_STRVAR(isspace__doc__,
6981 "S.isspace() -> bool\n\
6983 Return True if all characters in S are whitespace\n\
6984 and there is at least one character in S, False otherwise.");
6986 static PyObject*
6987 unicode_isspace(PyUnicodeObject *self)
6989 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6990 register const Py_UNICODE *e;
6992 /* Shortcut for single character strings */
6993 if (PyUnicode_GET_SIZE(self) == 1 &&
6994 Py_UNICODE_ISSPACE(*p))
6995 return PyBool_FromLong(1);
6997 /* Special case for empty strings */
6998 if (PyUnicode_GET_SIZE(self) == 0)
6999 return PyBool_FromLong(0);
7001 e = p + PyUnicode_GET_SIZE(self);
7002 for (; p < e; p++) {
7003 if (!Py_UNICODE_ISSPACE(*p))
7004 return PyBool_FromLong(0);
7006 return PyBool_FromLong(1);
7009 PyDoc_STRVAR(isalpha__doc__,
7010 "S.isalpha() -> bool\n\
7012 Return True if all characters in S are alphabetic\n\
7013 and there is at least one character in S, False otherwise.");
7015 static PyObject*
7016 unicode_isalpha(PyUnicodeObject *self)
7018 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7019 register const Py_UNICODE *e;
7021 /* Shortcut for single character strings */
7022 if (PyUnicode_GET_SIZE(self) == 1 &&
7023 Py_UNICODE_ISALPHA(*p))
7024 return PyBool_FromLong(1);
7026 /* Special case for empty strings */
7027 if (PyUnicode_GET_SIZE(self) == 0)
7028 return PyBool_FromLong(0);
7030 e = p + PyUnicode_GET_SIZE(self);
7031 for (; p < e; p++) {
7032 if (!Py_UNICODE_ISALPHA(*p))
7033 return PyBool_FromLong(0);
7035 return PyBool_FromLong(1);
7038 PyDoc_STRVAR(isalnum__doc__,
7039 "S.isalnum() -> bool\n\
7041 Return True if all characters in S are alphanumeric\n\
7042 and there is at least one character in S, False otherwise.");
7044 static PyObject*
7045 unicode_isalnum(PyUnicodeObject *self)
7047 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7048 register const Py_UNICODE *e;
7050 /* Shortcut for single character strings */
7051 if (PyUnicode_GET_SIZE(self) == 1 &&
7052 Py_UNICODE_ISALNUM(*p))
7053 return PyBool_FromLong(1);
7055 /* Special case for empty strings */
7056 if (PyUnicode_GET_SIZE(self) == 0)
7057 return PyBool_FromLong(0);
7059 e = p + PyUnicode_GET_SIZE(self);
7060 for (; p < e; p++) {
7061 if (!Py_UNICODE_ISALNUM(*p))
7062 return PyBool_FromLong(0);
7064 return PyBool_FromLong(1);
7067 PyDoc_STRVAR(isdecimal__doc__,
7068 "S.isdecimal() -> bool\n\
7070 Return True if there are only decimal characters in S,\n\
7071 False otherwise.");
7073 static PyObject*
7074 unicode_isdecimal(PyUnicodeObject *self)
7076 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7077 register const Py_UNICODE *e;
7079 /* Shortcut for single character strings */
7080 if (PyUnicode_GET_SIZE(self) == 1 &&
7081 Py_UNICODE_ISDECIMAL(*p))
7082 return PyBool_FromLong(1);
7084 /* Special case for empty strings */
7085 if (PyUnicode_GET_SIZE(self) == 0)
7086 return PyBool_FromLong(0);
7088 e = p + PyUnicode_GET_SIZE(self);
7089 for (; p < e; p++) {
7090 if (!Py_UNICODE_ISDECIMAL(*p))
7091 return PyBool_FromLong(0);
7093 return PyBool_FromLong(1);
7096 PyDoc_STRVAR(isdigit__doc__,
7097 "S.isdigit() -> bool\n\
7099 Return True if all characters in S are digits\n\
7100 and there is at least one character in S, False otherwise.");
7102 static PyObject*
7103 unicode_isdigit(PyUnicodeObject *self)
7105 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7106 register const Py_UNICODE *e;
7108 /* Shortcut for single character strings */
7109 if (PyUnicode_GET_SIZE(self) == 1 &&
7110 Py_UNICODE_ISDIGIT(*p))
7111 return PyBool_FromLong(1);
7113 /* Special case for empty strings */
7114 if (PyUnicode_GET_SIZE(self) == 0)
7115 return PyBool_FromLong(0);
7117 e = p + PyUnicode_GET_SIZE(self);
7118 for (; p < e; p++) {
7119 if (!Py_UNICODE_ISDIGIT(*p))
7120 return PyBool_FromLong(0);
7122 return PyBool_FromLong(1);
7125 PyDoc_STRVAR(isnumeric__doc__,
7126 "S.isnumeric() -> bool\n\
7128 Return True if there are only numeric characters in S,\n\
7129 False otherwise.");
7131 static PyObject*
7132 unicode_isnumeric(PyUnicodeObject *self)
7134 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7135 register const Py_UNICODE *e;
7137 /* Shortcut for single character strings */
7138 if (PyUnicode_GET_SIZE(self) == 1 &&
7139 Py_UNICODE_ISNUMERIC(*p))
7140 return PyBool_FromLong(1);
7142 /* Special case for empty strings */
7143 if (PyUnicode_GET_SIZE(self) == 0)
7144 return PyBool_FromLong(0);
7146 e = p + PyUnicode_GET_SIZE(self);
7147 for (; p < e; p++) {
7148 if (!Py_UNICODE_ISNUMERIC(*p))
7149 return PyBool_FromLong(0);
7151 return PyBool_FromLong(1);
7154 PyDoc_STRVAR(join__doc__,
7155 "S.join(iterable) -> unicode\n\
7157 Return a string which is the concatenation of the strings in the\n\
7158 iterable. The separator between elements is S.");
7160 static PyObject*
7161 unicode_join(PyObject *self, PyObject *data)
7163 return PyUnicode_Join(self, data);
7166 static Py_ssize_t
7167 unicode_length(PyUnicodeObject *self)
7169 return self->length;
7172 PyDoc_STRVAR(ljust__doc__,
7173 "S.ljust(width[, fillchar]) -> int\n\
7175 Return S left-justified in a Unicode string of length width. Padding is\n\
7176 done using the specified fill character (default is a space).");
7178 static PyObject *
7179 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7181 Py_ssize_t width;
7182 Py_UNICODE fillchar = ' ';
7184 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7185 return NULL;
7187 if (self->length >= width && PyUnicode_CheckExact(self)) {
7188 Py_INCREF(self);
7189 return (PyObject*) self;
7192 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7195 PyDoc_STRVAR(lower__doc__,
7196 "S.lower() -> unicode\n\
7198 Return a copy of the string S converted to lowercase.");
7200 static PyObject*
7201 unicode_lower(PyUnicodeObject *self)
7203 return fixup(self, fixlower);
/* Strip modes; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string past its "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7215 /* externally visible for str.strip(unicode) */
7216 PyObject *
7217 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7219 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7220 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7221 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7222 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7223 Py_ssize_t i, j;
7225 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7227 i = 0;
7228 if (striptype != RIGHTSTRIP) {
7229 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7230 i++;
7234 j = len;
7235 if (striptype != LEFTSTRIP) {
7236 do {
7237 j--;
7238 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7239 j++;
7242 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7243 Py_INCREF(self);
7244 return (PyObject*)self;
7246 else
7247 return PyUnicode_FromUnicode(s+i, j-i);
7251 static PyObject *
7252 do_strip(PyUnicodeObject *self, int striptype)
7254 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7255 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7257 i = 0;
7258 if (striptype != RIGHTSTRIP) {
7259 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7260 i++;
7264 j = len;
7265 if (striptype != LEFTSTRIP) {
7266 do {
7267 j--;
7268 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7269 j++;
7272 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7273 Py_INCREF(self);
7274 return (PyObject*)self;
7276 else
7277 return PyUnicode_FromUnicode(s+i, j-i);
7281 static PyObject *
7282 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7284 PyObject *sep = NULL;
7286 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7287 return NULL;
7289 if (sep != NULL && sep != Py_None) {
7290 if (PyUnicode_Check(sep))
7291 return _PyUnicode_XStrip(self, striptype, sep);
7292 else if (PyString_Check(sep)) {
7293 PyObject *res;
7294 sep = PyUnicode_FromObject(sep);
7295 if (sep==NULL)
7296 return NULL;
7297 res = _PyUnicode_XStrip(self, striptype, sep);
7298 Py_DECREF(sep);
7299 return res;
7301 else {
7302 PyErr_Format(PyExc_TypeError,
7303 "%s arg must be None, unicode or str",
7304 STRIPNAME(striptype));
7305 return NULL;
7309 return do_strip(self, striptype);
7313 PyDoc_STRVAR(strip__doc__,
7314 "S.strip([chars]) -> unicode\n\
7316 Return a copy of the string S with leading and trailing\n\
7317 whitespace removed.\n\
7318 If chars is given and not None, remove characters in chars instead.\n\
7319 If chars is a str, it will be converted to unicode before stripping");
7321 static PyObject *
7322 unicode_strip(PyUnicodeObject *self, PyObject *args)
7324 if (PyTuple_GET_SIZE(args) == 0)
7325 return do_strip(self, BOTHSTRIP); /* Common case */
7326 else
7327 return do_argstrip(self, BOTHSTRIP, args);
7331 PyDoc_STRVAR(lstrip__doc__,
7332 "S.lstrip([chars]) -> unicode\n\
7334 Return a copy of the string S with leading whitespace removed.\n\
7335 If chars is given and not None, remove characters in chars instead.\n\
7336 If chars is a str, it will be converted to unicode before stripping");
7338 static PyObject *
7339 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7341 if (PyTuple_GET_SIZE(args) == 0)
7342 return do_strip(self, LEFTSTRIP); /* Common case */
7343 else
7344 return do_argstrip(self, LEFTSTRIP, args);
7348 PyDoc_STRVAR(rstrip__doc__,
7349 "S.rstrip([chars]) -> unicode\n\
7351 Return a copy of the string S with trailing whitespace removed.\n\
7352 If chars is given and not None, remove characters in chars instead.\n\
7353 If chars is a str, it will be converted to unicode before stripping");
7355 static PyObject *
7356 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7358 if (PyTuple_GET_SIZE(args) == 0)
7359 return do_strip(self, RIGHTSTRIP); /* Common case */
7360 else
7361 return do_argstrip(self, RIGHTSTRIP, args);
7365 static PyObject*
7366 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7368 PyUnicodeObject *u;
7369 Py_UNICODE *p;
7370 Py_ssize_t nchars;
7371 size_t nbytes;
7373 if (len < 0)
7374 len = 0;
7376 if (len == 1 && PyUnicode_CheckExact(str)) {
7377 /* no repeat, return original string */
7378 Py_INCREF(str);
7379 return (PyObject*) str;
7382 /* ensure # of chars needed doesn't overflow int and # of bytes
7383 * needed doesn't overflow size_t
7385 nchars = len * str->length;
7386 if (len && nchars / len != str->length) {
7387 PyErr_SetString(PyExc_OverflowError,
7388 "repeated string is too long");
7389 return NULL;
7391 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7392 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7393 PyErr_SetString(PyExc_OverflowError,
7394 "repeated string is too long");
7395 return NULL;
7397 u = _PyUnicode_New(nchars);
7398 if (!u)
7399 return NULL;
7401 p = u->str;
7403 if (str->length == 1 && len > 0) {
7404 Py_UNICODE_FILL(p, str->str[0], len);
7405 } else {
7406 Py_ssize_t done = 0; /* number of characters copied this far */
7407 if (done < nchars) {
7408 Py_UNICODE_COPY(p, str->str, str->length);
7409 done = str->length;
7411 while (done < nchars) {
7412 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7413 Py_UNICODE_COPY(p+done, p, n);
7414 done += n;
7418 return (PyObject*) u;
7421 PyObject *PyUnicode_Replace(PyObject *obj,
7422 PyObject *subobj,
7423 PyObject *replobj,
7424 Py_ssize_t maxcount)
7426 PyObject *self;
7427 PyObject *str1;
7428 PyObject *str2;
7429 PyObject *result;
7431 self = PyUnicode_FromObject(obj);
7432 if (self == NULL)
7433 return NULL;
7434 str1 = PyUnicode_FromObject(subobj);
7435 if (str1 == NULL) {
7436 Py_DECREF(self);
7437 return NULL;
7439 str2 = PyUnicode_FromObject(replobj);
7440 if (str2 == NULL) {
7441 Py_DECREF(self);
7442 Py_DECREF(str1);
7443 return NULL;
7445 result = replace((PyUnicodeObject *)self,
7446 (PyUnicodeObject *)str1,
7447 (PyUnicodeObject *)str2,
7448 maxcount);
7449 Py_DECREF(self);
7450 Py_DECREF(str1);
7451 Py_DECREF(str2);
7452 return result;
7455 PyDoc_STRVAR(replace__doc__,
7456 "S.replace (old, new[, count]) -> unicode\n\
7458 Return a copy of S with all occurrences of substring\n\
7459 old replaced by new. If the optional argument count is\n\
7460 given, only the first count occurrences are replaced.");
7462 static PyObject*
7463 unicode_replace(PyUnicodeObject *self, PyObject *args)
7465 PyUnicodeObject *str1;
7466 PyUnicodeObject *str2;
7467 Py_ssize_t maxcount = -1;
7468 PyObject *result;
7470 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7471 return NULL;
7472 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7473 if (str1 == NULL)
7474 return NULL;
7475 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7476 if (str2 == NULL) {
7477 Py_DECREF(str1);
7478 return NULL;
7481 result = replace(self, str1, str2, maxcount);
7483 Py_DECREF(str1);
7484 Py_DECREF(str2);
7485 return result;
/* repr() slot for unicode objects: hands the object's character buffer
   and its size to unicodeescape_string() and returns that call's result.
   NOTE(review): the remaining argument(s) of the call are not visible
   here -- confirm against the complete source before relying on them. */
7488 static
7489 PyObject *unicode_repr(PyObject *unicode)
7491 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7492 PyUnicode_GET_SIZE(unicode),
7496 PyDoc_STRVAR(rfind__doc__,
7497 "S.rfind(sub [,start [,end]]) -> int\n\
7499 Return the highest index in S where substring sub is found,\n\
7500 such that sub is contained within s[start:end]. Optional\n\
7501 arguments start and end are interpreted as in slice notation.\n\
7503 Return -1 on failure.");
7505 static PyObject *
7506 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7508 PyObject *substring;
7509 Py_ssize_t start;
7510 Py_ssize_t end;
7511 Py_ssize_t result;
7513 if (!_ParseTupleFinds(args, &substring, &start, &end))
7514 return NULL;
7516 result = stringlib_rfind_slice(
7517 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7518 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7519 start, end
7522 Py_DECREF(substring);
7524 return PyInt_FromSsize_t(result);
7527 PyDoc_STRVAR(rindex__doc__,
7528 "S.rindex(sub [,start [,end]]) -> int\n\
7530 Like S.rfind() but raise ValueError when the substring is not found.");
7532 static PyObject *
7533 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7535 PyObject *substring;
7536 Py_ssize_t start;
7537 Py_ssize_t end;
7538 Py_ssize_t result;
7540 if (!_ParseTupleFinds(args, &substring, &start, &end))
7541 return NULL;
7543 result = stringlib_rfind_slice(
7544 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7545 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7546 start, end
7549 Py_DECREF(substring);
7551 if (result < 0) {
7552 PyErr_SetString(PyExc_ValueError, "substring not found");
7553 return NULL;
7555 return PyInt_FromSsize_t(result);
7558 PyDoc_STRVAR(rjust__doc__,
7559 "S.rjust(width[, fillchar]) -> unicode\n\
7561 Return S right-justified in a Unicode string of length width. Padding is\n\
7562 done using the specified fill character (default is a space).");
7564 static PyObject *
7565 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7567 Py_ssize_t width;
7568 Py_UNICODE fillchar = ' ';
7570 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7571 return NULL;
7573 if (self->length >= width && PyUnicode_CheckExact(self)) {
7574 Py_INCREF(self);
7575 return (PyObject*) self;
7578 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7581 static PyObject*
7582 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7584 /* standard clamping */
7585 if (start < 0)
7586 start = 0;
7587 if (end < 0)
7588 end = 0;
7589 if (end > self->length)
7590 end = self->length;
7591 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7592 /* full slice, return original string */
7593 Py_INCREF(self);
7594 return (PyObject*) self;
7596 if (start > end)
7597 start = end;
7598 /* copy slice */
7599 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7600 end - start);
7603 PyObject *PyUnicode_Split(PyObject *s,
7604 PyObject *sep,
7605 Py_ssize_t maxsplit)
7607 PyObject *result;
7609 s = PyUnicode_FromObject(s);
7610 if (s == NULL)
7611 return NULL;
7612 if (sep != NULL) {
7613 sep = PyUnicode_FromObject(sep);
7614 if (sep == NULL) {
7615 Py_DECREF(s);
7616 return NULL;
7620 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7622 Py_DECREF(s);
7623 Py_XDECREF(sep);
7624 return result;
7627 PyDoc_STRVAR(split__doc__,
7628 "S.split([sep [,maxsplit]]) -> list of strings\n\
7630 Return a list of the words in S, using sep as the\n\
7631 delimiter string. If maxsplit is given, at most maxsplit\n\
7632 splits are done. If sep is not specified or is None, any\n\
7633 whitespace string is a separator and empty strings are\n\
7634 removed from the result.");
7636 static PyObject*
7637 unicode_split(PyUnicodeObject *self, PyObject *args)
7639 PyObject *substring = Py_None;
7640 Py_ssize_t maxcount = -1;
7642 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7643 return NULL;
7645 if (substring == Py_None)
7646 return split(self, NULL, maxcount);
7647 else if (PyUnicode_Check(substring))
7648 return split(self, (PyUnicodeObject *)substring, maxcount);
7649 else
7650 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7653 PyObject *
7654 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7656 PyObject* str_obj;
7657 PyObject* sep_obj;
7658 PyObject* out;
7660 str_obj = PyUnicode_FromObject(str_in);
7661 if (!str_obj)
7662 return NULL;
7663 sep_obj = PyUnicode_FromObject(sep_in);
7664 if (!sep_obj) {
7665 Py_DECREF(str_obj);
7666 return NULL;
7669 out = stringlib_partition(
7670 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7671 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7674 Py_DECREF(sep_obj);
7675 Py_DECREF(str_obj);
7677 return out;
7681 PyObject *
7682 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7684 PyObject* str_obj;
7685 PyObject* sep_obj;
7686 PyObject* out;
7688 str_obj = PyUnicode_FromObject(str_in);
7689 if (!str_obj)
7690 return NULL;
7691 sep_obj = PyUnicode_FromObject(sep_in);
7692 if (!sep_obj) {
7693 Py_DECREF(str_obj);
7694 return NULL;
7697 out = stringlib_rpartition(
7698 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7699 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7702 Py_DECREF(sep_obj);
7703 Py_DECREF(str_obj);
7705 return out;
7708 PyDoc_STRVAR(partition__doc__,
7709 "S.partition(sep) -> (head, sep, tail)\n\
7711 Search for the separator sep in S, and return the part before it,\n\
7712 the separator itself, and the part after it. If the separator is not\n\
7713 found, return S and two empty strings.");
7715 static PyObject*
7716 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7718 return PyUnicode_Partition((PyObject *)self, separator);
7721 PyDoc_STRVAR(rpartition__doc__,
7722 "S.rpartition(sep) -> (tail, sep, head)\n\
7724 Search for the separator sep in S, starting at the end of S, and return\n\
7725 the part before it, the separator itself, and the part after it. If the\n\
7726 separator is not found, return two empty strings and S.");
7728 static PyObject*
7729 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7731 return PyUnicode_RPartition((PyObject *)self, separator);
7734 PyObject *PyUnicode_RSplit(PyObject *s,
7735 PyObject *sep,
7736 Py_ssize_t maxsplit)
7738 PyObject *result;
7740 s = PyUnicode_FromObject(s);
7741 if (s == NULL)
7742 return NULL;
7743 if (sep != NULL) {
7744 sep = PyUnicode_FromObject(sep);
7745 if (sep == NULL) {
7746 Py_DECREF(s);
7747 return NULL;
7751 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7753 Py_DECREF(s);
7754 Py_XDECREF(sep);
7755 return result;
7758 PyDoc_STRVAR(rsplit__doc__,
7759 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7761 Return a list of the words in S, using sep as the\n\
7762 delimiter string, starting at the end of the string and\n\
7763 working to the front. If maxsplit is given, at most maxsplit\n\
7764 splits are done. If sep is not specified, any whitespace string\n\
7765 is a separator.");
7767 static PyObject*
7768 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7770 PyObject *substring = Py_None;
7771 Py_ssize_t maxcount = -1;
7773 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7774 return NULL;
7776 if (substring == Py_None)
7777 return rsplit(self, NULL, maxcount);
7778 else if (PyUnicode_Check(substring))
7779 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7780 else
7781 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7784 PyDoc_STRVAR(splitlines__doc__,
7785 "S.splitlines([keepends]) -> list of strings\n\
7787 Return a list of the lines in S, breaking at line boundaries.\n\
7788 Line breaks are not included in the resulting list unless keepends\n\
7789 is given and true.");
7791 static PyObject*
7792 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7794 int keepends = 0;
7796 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7797 return NULL;
7799 return PyUnicode_Splitlines((PyObject *)self, keepends);
7802 static
7803 PyObject *unicode_str(PyUnicodeObject *self)
7805 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7808 PyDoc_STRVAR(swapcase__doc__,
7809 "S.swapcase() -> unicode\n\
7811 Return a copy of S with uppercase characters converted to lowercase\n\
7812 and vice versa.");
7814 static PyObject*
7815 unicode_swapcase(PyUnicodeObject *self)
7817 return fixup(self, fixswapcase);
7820 PyDoc_STRVAR(translate__doc__,
7821 "S.translate(table) -> unicode\n\
7823 Return a copy of the string S, where all characters have been mapped\n\
7824 through the given translation table, which must be a mapping of\n\
7825 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7826 Unmapped characters are left untouched. Characters mapped to None\n\
7827 are deleted.");
7829 static PyObject*
7830 unicode_translate(PyUnicodeObject *self, PyObject *table)
7832 return PyUnicode_TranslateCharmap(self->str,
7833 self->length,
7834 table,
7835 "ignore");
7838 PyDoc_STRVAR(upper__doc__,
7839 "S.upper() -> unicode\n\
7841 Return a copy of S converted to uppercase.");
7843 static PyObject*
7844 unicode_upper(PyUnicodeObject *self)
7846 return fixup(self, fixupper);
7849 PyDoc_STRVAR(zfill__doc__,
7850 "S.zfill(width) -> unicode\n\
7852 Pad a numeric string S with zeros on the left, to fill a field\n\
7853 of the specified width. The string S is never truncated.");
7855 static PyObject *
7856 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7858 Py_ssize_t fill;
7859 PyUnicodeObject *u;
7861 Py_ssize_t width;
7862 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7863 return NULL;
7865 if (self->length >= width) {
7866 if (PyUnicode_CheckExact(self)) {
7867 Py_INCREF(self);
7868 return (PyObject*) self;
7870 else
7871 return PyUnicode_FromUnicode(
7872 PyUnicode_AS_UNICODE(self),
7873 PyUnicode_GET_SIZE(self)
7877 fill = width - self->length;
7879 u = pad(self, fill, 0, '0');
7881 if (u == NULL)
7882 return NULL;
7884 if (u->str[fill] == '+' || u->str[fill] == '-') {
7885 /* move sign to beginning of string */
7886 u->str[0] = u->str[fill];
7887 u->str[fill] = '0';
7890 return (PyObject*) u;
#if 0
/* Compiled out.  Returns the current value of `numfree` as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7901 PyDoc_STRVAR(startswith__doc__,
7902 "S.startswith(prefix[, start[, end]]) -> bool\n\
7904 Return True if S starts with the specified prefix, False otherwise.\n\
7905 With optional start, test S beginning at that position.\n\
7906 With optional end, stop comparing S at that position.\n\
7907 prefix can also be a tuple of strings to try.");
7909 static PyObject *
7910 unicode_startswith(PyUnicodeObject *self,
7911 PyObject *args)
7913 PyObject *subobj;
7914 PyUnicodeObject *substring;
7915 Py_ssize_t start = 0;
7916 Py_ssize_t end = PY_SSIZE_T_MAX;
7917 int result;
7919 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7920 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7921 return NULL;
7922 if (PyTuple_Check(subobj)) {
7923 Py_ssize_t i;
7924 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7925 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7926 PyTuple_GET_ITEM(subobj, i));
7927 if (substring == NULL)
7928 return NULL;
7929 result = tailmatch(self, substring, start, end, -1);
7930 Py_DECREF(substring);
7931 if (result) {
7932 Py_RETURN_TRUE;
7935 /* nothing matched */
7936 Py_RETURN_FALSE;
7938 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7939 if (substring == NULL)
7940 return NULL;
7941 result = tailmatch(self, substring, start, end, -1);
7942 Py_DECREF(substring);
7943 return PyBool_FromLong(result);
7947 PyDoc_STRVAR(endswith__doc__,
7948 "S.endswith(suffix[, start[, end]]) -> bool\n\
7950 Return True if S ends with the specified suffix, False otherwise.\n\
7951 With optional start, test S beginning at that position.\n\
7952 With optional end, stop comparing S at that position.\n\
7953 suffix can also be a tuple of strings to try.");
7955 static PyObject *
7956 unicode_endswith(PyUnicodeObject *self,
7957 PyObject *args)
7959 PyObject *subobj;
7960 PyUnicodeObject *substring;
7961 Py_ssize_t start = 0;
7962 Py_ssize_t end = PY_SSIZE_T_MAX;
7963 int result;
7965 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7966 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7967 return NULL;
7968 if (PyTuple_Check(subobj)) {
7969 Py_ssize_t i;
7970 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7971 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7972 PyTuple_GET_ITEM(subobj, i));
7973 if (substring == NULL)
7974 return NULL;
7975 result = tailmatch(self, substring, start, end, +1);
7976 Py_DECREF(substring);
7977 if (result) {
7978 Py_RETURN_TRUE;
7981 Py_RETURN_FALSE;
7983 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7984 if (substring == NULL)
7985 return NULL;
7987 result = tailmatch(self, substring, start, end, +1);
7988 Py_DECREF(substring);
7989 return PyBool_FromLong(result);
7993 /* Implements do_string_format, which is unicode because of stringlib */
7994 #include "stringlib/string_format.h"
7996 PyDoc_STRVAR(format__doc__,
7997 "S.format(*args, **kwargs) -> unicode\n\
8001 static PyObject *
8002 unicode__format__(PyObject *self, PyObject *args)
8004 PyObject *format_spec;
8005 PyObject *result = NULL;
8006 PyObject *tmp = NULL;
8008 /* If 2.x, convert format_spec to the same type as value */
8009 /* This is to allow things like u''.format('') */
8010 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
8011 goto done;
8012 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
8013 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
8014 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
8015 goto done;
8017 tmp = PyObject_Unicode(format_spec);
8018 if (tmp == NULL)
8019 goto done;
8020 format_spec = tmp;
8022 result = _PyUnicode_FormatAdvanced(self,
8023 PyUnicode_AS_UNICODE(format_spec),
8024 PyUnicode_GET_SIZE(format_spec));
8025 done:
8026 Py_XDECREF(tmp);
8027 return result;
8030 PyDoc_STRVAR(p_format__doc__,
8031 "S.__format__(format_spec) -> unicode\n\
8035 static PyObject *
8036 unicode__sizeof__(PyUnicodeObject *v)
8038 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
8039 sizeof(Py_UNICODE) * (v->length + 1));
8042 PyDoc_STRVAR(sizeof__doc__,
8043 "S.__sizeof__() -> size of S in memory, in bytes\n\
8047 static PyObject *
8048 unicode_getnewargs(PyUnicodeObject *v)
8050 return Py_BuildValue("(u#)", v->str, v->length);
8054 static PyMethodDef unicode_methods[] = {
8056 /* Order is according to common usage: often used methods should
8057 appear first, since lookup is done sequentially. */
8059 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
8060 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8061 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8062 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8063 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8064 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8065 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8066 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8067 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8068 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8069 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8070 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8071 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8072 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8073 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8074 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8075 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
8076 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8077 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8078 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8079 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8080 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8081 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8082 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8083 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8084 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8085 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8086 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8087 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8088 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8089 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8090 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8091 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8092 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8093 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8094 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8095 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8096 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8097 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8098 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8099 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8100 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8101 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8102 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8103 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8104 #if 0
8105 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8106 #endif
8108 #if 0
8109 /* This one is just used for debugging the implementation. */
8110 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8111 #endif
8113 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
8114 {NULL, NULL}
8117 static PyObject *
8118 unicode_mod(PyObject *v, PyObject *w)
8120 if (!PyUnicode_Check(v)) {
8121 Py_INCREF(Py_NotImplemented);
8122 return Py_NotImplemented;
8124 return PyUnicode_Format(v, w);
8127 static PyNumberMethods unicode_as_number = {
8128 0, /*nb_add*/
8129 0, /*nb_subtract*/
8130 0, /*nb_multiply*/
8131 0, /*nb_divide*/
8132 unicode_mod, /*nb_remainder*/
8135 static PySequenceMethods unicode_as_sequence = {
8136 (lenfunc) unicode_length, /* sq_length */
8137 PyUnicode_Concat, /* sq_concat */
8138 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8139 (ssizeargfunc) unicode_getitem, /* sq_item */
8140 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8141 0, /* sq_ass_item */
8142 0, /* sq_ass_slice */
8143 PyUnicode_Contains, /* sq_contains */
8146 static PyObject*
8147 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8149 if (PyIndex_Check(item)) {
8150 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8151 if (i == -1 && PyErr_Occurred())
8152 return NULL;
8153 if (i < 0)
8154 i += PyUnicode_GET_SIZE(self);
8155 return unicode_getitem(self, i);
8156 } else if (PySlice_Check(item)) {
8157 Py_ssize_t start, stop, step, slicelength, cur, i;
8158 Py_UNICODE* source_buf;
8159 Py_UNICODE* result_buf;
8160 PyObject* result;
8162 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8163 &start, &stop, &step, &slicelength) < 0) {
8164 return NULL;
8167 if (slicelength <= 0) {
8168 return PyUnicode_FromUnicode(NULL, 0);
8169 } else if (start == 0 && step == 1 && slicelength == self->length &&
8170 PyUnicode_CheckExact(self)) {
8171 Py_INCREF(self);
8172 return (PyObject *)self;
8173 } else if (step == 1) {
8174 return PyUnicode_FromUnicode(self->str + start, slicelength);
8175 } else {
8176 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8177 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8178 sizeof(Py_UNICODE));
8180 if (result_buf == NULL)
8181 return PyErr_NoMemory();
8183 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8184 result_buf[i] = source_buf[cur];
8187 result = PyUnicode_FromUnicode(result_buf, slicelength);
8188 PyObject_FREE(result_buf);
8189 return result;
8191 } else {
8192 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8193 return NULL;
8197 static PyMappingMethods unicode_as_mapping = {
8198 (lenfunc)unicode_length, /* mp_length */
8199 (binaryfunc)unicode_subscript, /* mp_subscript */
8200 (objobjargproc)0, /* mp_ass_subscript */
8203 static Py_ssize_t
8204 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8205 Py_ssize_t index,
8206 const void **ptr)
8208 if (index != 0) {
8209 PyErr_SetString(PyExc_SystemError,
8210 "accessing non-existent unicode segment");
8211 return -1;
8213 *ptr = (void *) self->str;
8214 return PyUnicode_GET_DATA_SIZE(self);
8217 static Py_ssize_t
8218 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8219 const void **ptr)
8221 PyErr_SetString(PyExc_TypeError,
8222 "cannot use unicode as modifiable buffer");
8223 return -1;
8226 static int
8227 unicode_buffer_getsegcount(PyUnicodeObject *self,
8228 Py_ssize_t *lenp)
8230 if (lenp)
8231 *lenp = PyUnicode_GET_DATA_SIZE(self);
8232 return 1;
8235 static Py_ssize_t
8236 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8237 Py_ssize_t index,
8238 const void **ptr)
8240 PyObject *str;
8242 if (index != 0) {
8243 PyErr_SetString(PyExc_SystemError,
8244 "accessing non-existent unicode segment");
8245 return -1;
8247 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8248 if (str == NULL)
8249 return -1;
8250 *ptr = (void *) PyString_AS_STRING(str);
8251 return PyString_GET_SIZE(str);
8254 /* Helpers for PyUnicode_Format() */
8256 static PyObject *
8257 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8259 Py_ssize_t argidx = *p_argidx;
8260 if (argidx < arglen) {
8261 (*p_argidx)++;
8262 if (arglen < 0)
8263 return args;
8264 else
8265 return PyTuple_GetItem(args, argidx);
8267 PyErr_SetString(PyExc_TypeError,
8268 "not enough arguments for format string");
8269 return NULL;
/* Bit flags, one bit each, that are OR-ed together into a single `flags`
   value by the formatting code. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
8278 static Py_ssize_t
8279 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8281 register Py_ssize_t i;
8282 Py_ssize_t len = strlen(charbuffer);
8283 for (i = len - 1; i >= 0; i--)
8284 buffer[i] = (Py_UNICODE) charbuffer[i];
8286 return len;
8289 static int
8290 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8292 Py_ssize_t result;
8294 PyOS_snprintf((char *)buffer, len, format, x);
8295 result = strtounicode(buffer, (char *)buffer);
8296 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8299 /* XXX To save some code duplication, formatfloat/long/int could have been
8300 shared with stringobject.c, converting from 8-bit to Unicode after the
8301 formatting is done. */
8303 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8305 static PyObject *
8306 formatfloat(PyObject *v, int flags, int prec, int type)
8308 char *p;
8309 PyObject *result;
8310 double x;
8312 x = PyFloat_AsDouble(v);
8313 if (x == -1.0 && PyErr_Occurred())
8314 return NULL;
8316 if (prec < 0)
8317 prec = 6;
8319 p = PyOS_double_to_string(x, type, prec,
8320 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8321 if (p == NULL)
8322 return NULL;
8323 result = PyUnicode_FromStringAndSize(p, strlen(p));
8324 PyMem_Free(p);
8325 return result;
8328 static PyObject*
8329 formatlong(PyObject *val, int flags, int prec, int type)
8331 char *buf;
8332 int i, len;
8333 PyObject *str; /* temporary string object. */
8334 PyUnicodeObject *result;
8336 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8337 if (!str)
8338 return NULL;
8339 result = _PyUnicode_New(len);
8340 if (!result) {
8341 Py_DECREF(str);
8342 return NULL;
8344 for (i = 0; i < len; i++)
8345 result->str[i] = buf[i];
8346 result->str[len] = 0;
8347 Py_DECREF(str);
8348 return (PyObject*)result;
8351 static int
8352 formatint(Py_UNICODE *buf,
8353 size_t buflen,
8354 int flags,
8355 int prec,
8356 int type,
8357 PyObject *v)
8359 /* fmt = '%#.' + `prec` + 'l' + `type`
8360 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8361 * + 1 + 1
8362 * = 24
8364 char fmt[64]; /* plenty big enough! */
8365 char *sign;
8366 long x;
8368 x = PyInt_AsLong(v);
8369 if (x == -1 && PyErr_Occurred())
8370 return -1;
8371 if (x < 0 && type == 'u') {
8372 type = 'd';
8374 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8375 sign = "-";
8376 else
8377 sign = "";
8378 if (prec < 0)
8379 prec = 1;
8381 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8382 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8384 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8385 PyErr_SetString(PyExc_OverflowError,
8386 "formatted integer is too long (precision too large?)");
8387 return -1;
8390 if ((flags & F_ALT) &&
8391 (type == 'x' || type == 'X')) {
8392 /* When converting under %#x or %#X, there are a number
8393 * of issues that cause pain:
8394 * - when 0 is being converted, the C standard leaves off
8395 * the '0x' or '0X', which is inconsistent with other
8396 * %#x/%#X conversions and inconsistent with Python's
8397 * hex() function
8398 * - there are platforms that violate the standard and
8399 * convert 0 with the '0x' or '0X'
8400 * (Metrowerks, Compaq Tru64)
8401 * - there are platforms that give '0x' when converting
8402 * under %#X, but convert 0 in accordance with the
8403 * standard (OS/2 EMX)
8405 * We can achieve the desired consistency by inserting our
8406 * own '0x' or '0X' prefix, and substituting %x/%X in place
8407 * of %#x/%#X.
8409 * Note that this is the same approach as used in
8410 * formatint() in stringobject.c
8412 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8413 sign, type, prec, type);
8415 else {
8416 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8417 sign, (flags&F_ALT) ? "#" : "",
8418 prec, type);
8420 if (sign[0])
8421 return longtounicode(buf, buflen, fmt, -x);
8422 else
8423 return longtounicode(buf, buflen, fmt, x);
8426 static int
8427 formatchar(Py_UNICODE *buf,
8428 size_t buflen,
8429 PyObject *v)
8431 /* presume that the buffer is at least 2 characters long */
8432 if (PyUnicode_Check(v)) {
8433 if (PyUnicode_GET_SIZE(v) != 1)
8434 goto onError;
8435 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8438 else if (PyString_Check(v)) {
8439 if (PyString_GET_SIZE(v) != 1)
8440 goto onError;
8441 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8444 else {
8445 /* Integer input truncated to a character */
8446 long x;
8447 x = PyInt_AsLong(v);
8448 if (x == -1 && PyErr_Occurred())
8449 goto onError;
8450 #ifdef Py_UNICODE_WIDE
8451 if (x < 0 || x > 0x10ffff) {
8452 PyErr_SetString(PyExc_OverflowError,
8453 "%c arg not in range(0x110000) "
8454 "(wide Python build)");
8455 return -1;
8457 #else
8458 if (x < 0 || x > 0xffff) {
8459 PyErr_SetString(PyExc_OverflowError,
8460 "%c arg not in range(0x10000) "
8461 "(narrow Python build)");
8462 return -1;
8464 #endif
8465 buf[0] = (Py_UNICODE) x;
8467 buf[1] = '\0';
8468 return 1;
8470 onError:
8471 PyErr_SetString(PyExc_TypeError,
8472 "%c requires int or char");
8473 return -1;
8476 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8478 FORMATBUFLEN is the length of the buffer in which the ints &
8479 chars are formatted. XXX This is a magic number. Each formatting
8480 routine does bounds checking to ensure no overflow, but a better
8481 solution may be to malloc a buffer of appropriate size for each
8482 format. For now, the current solution is sufficient.
8484 #define FORMATBUFLEN (size_t)120
8486 PyObject *PyUnicode_Format(PyObject *format,
8487 PyObject *args)
8489 Py_UNICODE *fmt, *res;
8490 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8491 int args_owned = 0;
8492 PyUnicodeObject *result = NULL;
8493 PyObject *dict = NULL;
8494 PyObject *uformat;
8496 if (format == NULL || args == NULL) {
8497 PyErr_BadInternalCall();
8498 return NULL;
8500 uformat = PyUnicode_FromObject(format);
8501 if (uformat == NULL)
8502 return NULL;
8503 fmt = PyUnicode_AS_UNICODE(uformat);
8504 fmtcnt = PyUnicode_GET_SIZE(uformat);
8506 reslen = rescnt = fmtcnt + 100;
8507 result = _PyUnicode_New(reslen);
8508 if (result == NULL)
8509 goto onError;
8510 res = PyUnicode_AS_UNICODE(result);
8512 if (PyTuple_Check(args)) {
8513 arglen = PyTuple_Size(args);
8514 argidx = 0;
8516 else {
8517 arglen = -1;
8518 argidx = -2;
8520 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8521 !PyObject_TypeCheck(args, &PyBaseString_Type))
8522 dict = args;
8524 while (--fmtcnt >= 0) {
8525 if (*fmt != '%') {
8526 if (--rescnt < 0) {
8527 rescnt = fmtcnt + 100;
8528 reslen += rescnt;
8529 if (_PyUnicode_Resize(&result, reslen) < 0)
8530 goto onError;
8531 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8532 --rescnt;
8534 *res++ = *fmt++;
8536 else {
8537 /* Got a format specifier */
8538 int flags = 0;
8539 Py_ssize_t width = -1;
8540 int prec = -1;
8541 Py_UNICODE c = '\0';
8542 Py_UNICODE fill;
8543 int isnumok;
8544 PyObject *v = NULL;
8545 PyObject *temp = NULL;
8546 Py_UNICODE *pbuf;
8547 Py_UNICODE sign;
8548 Py_ssize_t len;
8549 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8551 fmt++;
8552 if (*fmt == '(') {
8553 Py_UNICODE *keystart;
8554 Py_ssize_t keylen;
8555 PyObject *key;
8556 int pcount = 1;
8558 if (dict == NULL) {
8559 PyErr_SetString(PyExc_TypeError,
8560 "format requires a mapping");
8561 goto onError;
8563 ++fmt;
8564 --fmtcnt;
8565 keystart = fmt;
8566 /* Skip over balanced parentheses */
8567 while (pcount > 0 && --fmtcnt >= 0) {
8568 if (*fmt == ')')
8569 --pcount;
8570 else if (*fmt == '(')
8571 ++pcount;
8572 fmt++;
8574 keylen = fmt - keystart - 1;
8575 if (fmtcnt < 0 || pcount > 0) {
8576 PyErr_SetString(PyExc_ValueError,
8577 "incomplete format key");
8578 goto onError;
8580 #if 0
8581 /* keys are converted to strings using UTF-8 and
8582 then looked up since Python uses strings to hold
8583 variables names etc. in its namespaces and we
8584 wouldn't want to break common idioms. */
8585 key = PyUnicode_EncodeUTF8(keystart,
8586 keylen,
8587 NULL);
8588 #else
8589 key = PyUnicode_FromUnicode(keystart, keylen);
8590 #endif
8591 if (key == NULL)
8592 goto onError;
8593 if (args_owned) {
8594 Py_DECREF(args);
8595 args_owned = 0;
8597 args = PyObject_GetItem(dict, key);
8598 Py_DECREF(key);
8599 if (args == NULL) {
8600 goto onError;
8602 args_owned = 1;
8603 arglen = -1;
8604 argidx = -2;
8606 while (--fmtcnt >= 0) {
8607 switch (c = *fmt++) {
8608 case '-': flags |= F_LJUST; continue;
8609 case '+': flags |= F_SIGN; continue;
8610 case ' ': flags |= F_BLANK; continue;
8611 case '#': flags |= F_ALT; continue;
8612 case '0': flags |= F_ZERO; continue;
8614 break;
8616 if (c == '*') {
8617 v = getnextarg(args, arglen, &argidx);
8618 if (v == NULL)
8619 goto onError;
8620 if (!PyInt_Check(v)) {
8621 PyErr_SetString(PyExc_TypeError,
8622 "* wants int");
8623 goto onError;
8625 width = PyInt_AsLong(v);
8626 if (width < 0) {
8627 flags |= F_LJUST;
8628 width = -width;
8630 if (--fmtcnt >= 0)
8631 c = *fmt++;
8633 else if (c >= '0' && c <= '9') {
8634 width = c - '0';
8635 while (--fmtcnt >= 0) {
8636 c = *fmt++;
8637 if (c < '0' || c > '9')
8638 break;
8639 if ((width*10) / 10 != width) {
8640 PyErr_SetString(PyExc_ValueError,
8641 "width too big");
8642 goto onError;
8644 width = width*10 + (c - '0');
8647 if (c == '.') {
8648 prec = 0;
8649 if (--fmtcnt >= 0)
8650 c = *fmt++;
8651 if (c == '*') {
8652 v = getnextarg(args, arglen, &argidx);
8653 if (v == NULL)
8654 goto onError;
8655 if (!PyInt_Check(v)) {
8656 PyErr_SetString(PyExc_TypeError,
8657 "* wants int");
8658 goto onError;
8660 prec = PyInt_AsLong(v);
8661 if (prec < 0)
8662 prec = 0;
8663 if (--fmtcnt >= 0)
8664 c = *fmt++;
8666 else if (c >= '0' && c <= '9') {
8667 prec = c - '0';
8668 while (--fmtcnt >= 0) {
8669 c = Py_CHARMASK(*fmt++);
8670 if (c < '0' || c > '9')
8671 break;
8672 if ((prec*10) / 10 != prec) {
8673 PyErr_SetString(PyExc_ValueError,
8674 "prec too big");
8675 goto onError;
8677 prec = prec*10 + (c - '0');
8680 } /* prec */
8681 if (fmtcnt >= 0) {
8682 if (c == 'h' || c == 'l' || c == 'L') {
8683 if (--fmtcnt >= 0)
8684 c = *fmt++;
8687 if (fmtcnt < 0) {
8688 PyErr_SetString(PyExc_ValueError,
8689 "incomplete format");
8690 goto onError;
8692 if (c != '%') {
8693 v = getnextarg(args, arglen, &argidx);
8694 if (v == NULL)
8695 goto onError;
8697 sign = 0;
8698 fill = ' ';
8699 switch (c) {
8701 case '%':
8702 pbuf = formatbuf;
8703 /* presume that buffer length is at least 1 */
8704 pbuf[0] = '%';
8705 len = 1;
8706 break;
8708 case 's':
8709 case 'r':
8710 if (PyUnicode_Check(v) && c == 's') {
8711 temp = v;
8712 Py_INCREF(temp);
8714 else {
8715 PyObject *unicode;
8716 if (c == 's')
8717 temp = PyObject_Unicode(v);
8718 else
8719 temp = PyObject_Repr(v);
8720 if (temp == NULL)
8721 goto onError;
8722 if (PyUnicode_Check(temp))
8723 /* nothing to do */;
8724 else if (PyString_Check(temp)) {
8725 /* convert to string to Unicode */
8726 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8727 PyString_GET_SIZE(temp),
8728 NULL,
8729 "strict");
8730 Py_DECREF(temp);
8731 temp = unicode;
8732 if (temp == NULL)
8733 goto onError;
8735 else {
8736 Py_DECREF(temp);
8737 PyErr_SetString(PyExc_TypeError,
8738 "%s argument has non-string str()");
8739 goto onError;
8742 pbuf = PyUnicode_AS_UNICODE(temp);
8743 len = PyUnicode_GET_SIZE(temp);
8744 if (prec >= 0 && len > prec)
8745 len = prec;
8746 break;
8748 case 'i':
8749 case 'd':
8750 case 'u':
8751 case 'o':
8752 case 'x':
8753 case 'X':
8754 if (c == 'i')
8755 c = 'd';
8756 isnumok = 0;
8757 if (PyNumber_Check(v)) {
8758 PyObject *iobj=NULL;
8760 if (PyInt_Check(v) || (PyLong_Check(v))) {
8761 iobj = v;
8762 Py_INCREF(iobj);
8764 else {
8765 iobj = PyNumber_Int(v);
8766 if (iobj==NULL) iobj = PyNumber_Long(v);
8768 if (iobj!=NULL) {
8769 if (PyInt_Check(iobj)) {
8770 isnumok = 1;
8771 pbuf = formatbuf;
8772 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8773 flags, prec, c, iobj);
8774 Py_DECREF(iobj);
8775 if (len < 0)
8776 goto onError;
8777 sign = 1;
8779 else if (PyLong_Check(iobj)) {
8780 isnumok = 1;
8781 temp = formatlong(iobj, flags, prec, c);
8782 Py_DECREF(iobj);
8783 if (!temp)
8784 goto onError;
8785 pbuf = PyUnicode_AS_UNICODE(temp);
8786 len = PyUnicode_GET_SIZE(temp);
8787 sign = 1;
8789 else {
8790 Py_DECREF(iobj);
8794 if (!isnumok) {
8795 PyErr_Format(PyExc_TypeError,
8796 "%%%c format: a number is required, "
8797 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8798 goto onError;
8800 if (flags & F_ZERO)
8801 fill = '0';
8802 break;
8804 case 'e':
8805 case 'E':
8806 case 'f':
8807 case 'F':
8808 case 'g':
8809 case 'G':
8810 temp = formatfloat(v, flags, prec, c);
8811 if (temp == NULL)
8812 goto onError;
8813 pbuf = PyUnicode_AS_UNICODE(temp);
8814 len = PyUnicode_GET_SIZE(temp);
8815 sign = 1;
8816 if (flags & F_ZERO)
8817 fill = '0';
8818 break;
8820 case 'c':
8821 pbuf = formatbuf;
8822 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8823 if (len < 0)
8824 goto onError;
8825 break;
8827 default:
8828 PyErr_Format(PyExc_ValueError,
8829 "unsupported format character '%c' (0x%x) "
8830 "at index %zd",
8831 (31<=c && c<=126) ? (char)c : '?',
8832 (int)c,
8833 (Py_ssize_t)(fmt - 1 -
8834 PyUnicode_AS_UNICODE(uformat)));
8835 goto onError;
8837 if (sign) {
8838 if (*pbuf == '-' || *pbuf == '+') {
8839 sign = *pbuf++;
8840 len--;
8842 else if (flags & F_SIGN)
8843 sign = '+';
8844 else if (flags & F_BLANK)
8845 sign = ' ';
8846 else
8847 sign = 0;
8849 if (width < len)
8850 width = len;
8851 if (rescnt - (sign != 0) < width) {
8852 reslen -= rescnt;
8853 rescnt = width + fmtcnt + 100;
8854 reslen += rescnt;
8855 if (reslen < 0) {
8856 Py_XDECREF(temp);
8857 PyErr_NoMemory();
8858 goto onError;
8860 if (_PyUnicode_Resize(&result, reslen) < 0) {
8861 Py_XDECREF(temp);
8862 goto onError;
8864 res = PyUnicode_AS_UNICODE(result)
8865 + reslen - rescnt;
8867 if (sign) {
8868 if (fill != ' ')
8869 *res++ = sign;
8870 rescnt--;
8871 if (width > len)
8872 width--;
8874 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8875 assert(pbuf[0] == '0');
8876 assert(pbuf[1] == c);
8877 if (fill != ' ') {
8878 *res++ = *pbuf++;
8879 *res++ = *pbuf++;
8881 rescnt -= 2;
8882 width -= 2;
8883 if (width < 0)
8884 width = 0;
8885 len -= 2;
8887 if (width > len && !(flags & F_LJUST)) {
8888 do {
8889 --rescnt;
8890 *res++ = fill;
8891 } while (--width > len);
8893 if (fill == ' ') {
8894 if (sign)
8895 *res++ = sign;
8896 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8897 assert(pbuf[0] == '0');
8898 assert(pbuf[1] == c);
8899 *res++ = *pbuf++;
8900 *res++ = *pbuf++;
8903 Py_UNICODE_COPY(res, pbuf, len);
8904 res += len;
8905 rescnt -= len;
8906 while (--width >= len) {
8907 --rescnt;
8908 *res++ = ' ';
8910 if (dict && (argidx < arglen) && c != '%') {
8911 PyErr_SetString(PyExc_TypeError,
8912 "not all arguments converted during string formatting");
8913 Py_XDECREF(temp);
8914 goto onError;
8916 Py_XDECREF(temp);
8917 } /* '%' */
8918 } /* until end */
8919 if (argidx < arglen && !dict) {
8920 PyErr_SetString(PyExc_TypeError,
8921 "not all arguments converted during string formatting");
8922 goto onError;
8925 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8926 goto onError;
8927 if (args_owned) {
8928 Py_DECREF(args);
8930 Py_DECREF(uformat);
8931 return (PyObject *)result;
8933 onError:
8934 Py_XDECREF(result);
8935 Py_DECREF(uformat);
8936 if (args_owned) {
8937 Py_DECREF(args);
8939 return NULL;
/* Buffer-interface slot table for the unicode type.  It is installed as
   tp_as_buffer in PyUnicode_Type.  The casts name the role of each entry:
   read buffer, write buffer, segment count and character buffer, in that
   order. */
8942 static PyBufferProcs unicode_as_buffer = {
8943 (readbufferproc) unicode_buffer_getreadbuf,
8944 (writebufferproc) unicode_buffer_getwritebuf,
8945 (segcountproc) unicode_buffer_getsegcount,
8946 (charbufferproc) unicode_buffer_getcharbuf,
8949 static PyObject *
8950 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8952 static PyObject *
8953 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8955 PyObject *x = NULL;
8956 static char *kwlist[] = {"string", "encoding", "errors", 0};
8957 char *encoding = NULL;
8958 char *errors = NULL;
8960 if (type != &PyUnicode_Type)
8961 return unicode_subtype_new(type, args, kwds);
8962 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8963 kwlist, &x, &encoding, &errors))
8964 return NULL;
8965 if (x == NULL)
8966 return (PyObject *)_PyUnicode_New(0);
8967 if (encoding == NULL && errors == NULL)
8968 return PyObject_Unicode(x);
8969 else
8970 return PyUnicode_FromEncodedObject(x, encoding, errors);
8973 static PyObject *
8974 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8976 PyUnicodeObject *tmp, *pnew;
8977 Py_ssize_t n;
8979 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8980 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8981 if (tmp == NULL)
8982 return NULL;
8983 assert(PyUnicode_Check(tmp));
8984 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8985 if (pnew == NULL) {
8986 Py_DECREF(tmp);
8987 return NULL;
8989 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8990 if (pnew->str == NULL) {
8991 _Py_ForgetReference((PyObject *)pnew);
8992 PyObject_Del(pnew);
8993 Py_DECREF(tmp);
8994 return PyErr_NoMemory();
8996 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8997 pnew->length = n;
8998 pnew->hash = tmp->hash;
8999 Py_DECREF(tmp);
9000 return (PyObject *)pnew;
/* Docstring of the unicode type; referenced as tp_doc in PyUnicode_Type. */
9003 PyDoc_STRVAR(unicode_doc,
9004 "unicode(string [, encoding[, errors]]) -> object\n\
9006 Create a new Unicode object from the given encoded string.\n\
9007 encoding defaults to the current default string encoding.\n\
9008 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for "unicode".

   The initializer is positional: each value is followed by a comment
   naming the slot it fills.  Highlights visible in the table below:
   - the base type is PyBaseString_Type (tp_base);
   - the type is subclassable (Py_TPFLAGS_BASETYPE) and flagged as a
     unicode subclass root (Py_TPFLAGS_UNICODE_SUBCLASS);
   - instances are created by unicode_new and released with PyObject_Del;
   - slots set to 0 are left unset here. */
9010 PyTypeObject PyUnicode_Type = {
9011 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9012 "unicode", /* tp_name */
9013 sizeof(PyUnicodeObject), /* tp_basicsize */
9014 0, /* tp_itemsize */
9015 /* Slots */
9016 (destructor)unicode_dealloc, /* tp_dealloc */
9017 0, /* tp_print */
9018 0, /* tp_getattr */
9019 0, /* tp_setattr */
9020 0, /* tp_compare */
9021 unicode_repr, /* tp_repr */
9022 &unicode_as_number, /* tp_as_number */
9023 &unicode_as_sequence, /* tp_as_sequence */
9024 &unicode_as_mapping, /* tp_as_mapping */
9025 (hashfunc) unicode_hash, /* tp_hash*/
9026 0, /* tp_call*/
9027 (reprfunc) unicode_str, /* tp_str */
9028 PyObject_GenericGetAttr, /* tp_getattro */
9029 0, /* tp_setattro */
9030 &unicode_as_buffer, /* tp_as_buffer */
9031 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
9032 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
9033 unicode_doc, /* tp_doc */
9034 0, /* tp_traverse */
9035 0, /* tp_clear */
9036 PyUnicode_RichCompare, /* tp_richcompare */
9037 0, /* tp_weaklistoffset */
9038 0, /* tp_iter */
9039 0, /* tp_iternext */
9040 unicode_methods, /* tp_methods */
9041 0, /* tp_members */
9042 0, /* tp_getset */
9043 &PyBaseString_Type, /* tp_base */
9044 0, /* tp_dict */
9045 0, /* tp_descr_get */
9046 0, /* tp_descr_set */
9047 0, /* tp_dictoffset */
9048 0, /* tp_init */
9049 0, /* tp_alloc */
9050 unicode_new, /* tp_new */
9051 PyObject_Del, /* tp_free */
9054 /* Initialize the Unicode implementation */
9056 void _PyUnicode_Init(void)
9058 int i;
9060 /* XXX - move this array to unicodectype.c ? */
9061 Py_UNICODE linebreak[] = {
9062 0x000A, /* LINE FEED */
9063 0x000D, /* CARRIAGE RETURN */
9064 0x001C, /* FILE SEPARATOR */
9065 0x001D, /* GROUP SEPARATOR */
9066 0x001E, /* RECORD SEPARATOR */
9067 0x0085, /* NEXT LINE */
9068 0x2028, /* LINE SEPARATOR */
9069 0x2029, /* PARAGRAPH SEPARATOR */
9072 /* Init the implementation */
9073 free_list = NULL;
9074 numfree = 0;
9075 unicode_empty = _PyUnicode_New(0);
9076 if (!unicode_empty)
9077 return;
9079 strcpy(unicode_default_encoding, "ascii");
9080 for (i = 0; i < 256; i++)
9081 unicode_latin1[i] = NULL;
9082 if (PyType_Ready(&PyUnicode_Type) < 0)
9083 Py_FatalError("Can't initialize 'unicode'");
9085 /* initialize the linebreak bloom filter */
9086 bloom_linebreak = make_bloom_mask(
9087 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9090 PyType_Ready(&EncodingMapType);
9093 /* Finalize the Unicode implementation */
9096 PyUnicode_ClearFreeList(void)
9098 int freelist_size = numfree;
9099 PyUnicodeObject *u;
9101 for (u = free_list; u != NULL;) {
9102 PyUnicodeObject *v = u;
9103 u = *(PyUnicodeObject **)u;
9104 if (v->str)
9105 PyObject_DEL(v->str);
9106 Py_XDECREF(v->defenc);
9107 PyObject_Del(v);
9108 numfree--;
9110 free_list = NULL;
9111 assert(numfree == 0);
9112 return freelist_size;
9115 void
9116 _PyUnicode_Fini(void)
9118 int i;
9120 Py_XDECREF(unicode_empty);
9121 unicode_empty = NULL;
9123 for (i = 0; i < 256; i++) {
9124 if (unicode_latin1[i]) {
9125 Py_DECREF(unicode_latin1[i]);
9126 unicode_latin1[i] = NULL;
9129 (void)PyUnicode_ClearFreeList();
9132 #ifdef __cplusplus
9134 #endif
9138 Local variables:
9139 c-basic-offset: 4
9140 indent-tabs-mode: nil
9141 End: