Updated with fix for #3126.
[python.git] / Objects / unicodeobject.c
blob840efb9de3998f3993628e4d9492b33fb62a7754
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
/* Maximum number of Unicode objects parked on the free list */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "stay alive" optimization.

   Objects put on the free list keep their character buffer allocated
   as long as their length is below this limit.  This reduces malloc()
   overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by character code (0..127); an entry of 1 marks
   a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
       0x09 HORIZONTAL TABULATION, 0x0A LINE FEED, 0x0B VERTICAL TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
/* Fast detection of ASCII line break characters.
   The table is indexed by character code (0..127); an entry of 1 marks
   a line break character.  The table is only ever read, so it is const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
/* --- Bloom Filters ----------------------------------------------------- */

/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple a single bitmask is used; the five least
   significant bits of each Unicode character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.
   The shifted constant is unsigned long so that bit 31 can be reached
   without overflowing a signed int, and mask is parenthesized so that
   an arbitrary expression may be passed. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line break test: a table lookup for ASCII, otherwise the bloom mask is
   consulted before the full Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
205 /* calculate simple bloom-style bitmask for a given unicode string */
207 long mask;
208 Py_ssize_t i;
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
214 return mask;
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
219 Py_ssize_t i;
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
225 return 0;
/* Membership test: the cheap bloom mask check runs first and the exact
   unicode_member() scan only when the mask matches.  The expansion is
   fully parenthesized so that it can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
233 static
234 int unicode_resize(register PyUnicodeObject *unicode,
235 Py_ssize_t length)
237 void *oldstr;
239 /* Shortcut if there's nothing much to do. */
240 if (unicode->length == length)
241 goto reset;
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
251 PyErr_SetString(PyExc_SystemError,
252 "can't resize shared unicode objects");
253 return -1;
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
259 it contains). */
261 oldstr = unicode->str;
262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
264 if (!unicode->str) {
265 unicode->str = (Py_UNICODE *)oldstr;
266 PyErr_NoMemory();
267 return -1;
269 unicode->str[length] = 0;
270 unicode->length = length;
272 reset:
273 /* Reset the object caches */
274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
278 unicode->hash = -1;
280 return 0;
283 /* We allocate one more byte to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
291 static
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
294 register PyUnicodeObject *unicode;
296 /* Optimization for empty strings */
297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
302 /* Unicode freelist & memory allocation */
303 if (free_list) {
304 unicode = free_list;
305 free_list = *(PyUnicodeObject **)unicode;
306 numfree--;
307 if (unicode->str) {
308 /* Keep-Alive optimization: we only upsize the buffer,
309 never downsize it. */
310 if ((unicode->length < length) &&
311 unicode_resize(unicode, length) < 0) {
312 PyObject_DEL(unicode->str);
313 goto onError;
316 else {
317 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
318 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
320 PyObject_INIT(unicode, &PyUnicode_Type);
322 else {
323 size_t new_size;
324 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
325 if (unicode == NULL)
326 return NULL;
327 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
328 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
331 if (!unicode->str) {
332 PyErr_NoMemory();
333 goto onError;
335 /* Initialize the first element to guard against cases where
336 * the caller fails before initializing str -- unicode_resize()
337 * reads str[0], and the Keep-Alive optimization can keep memory
338 * allocated for str alive across a call to unicode_dealloc(unicode).
339 * We don't want unicode_resize to read uninitialized memory in
340 * that case.
342 unicode->str[0] = 0;
343 unicode->str[length] = 0;
344 unicode->length = length;
345 unicode->hash = -1;
346 unicode->defenc = NULL;
347 return unicode;
349 onError:
350 _Py_ForgetReference((PyObject *)unicode);
351 PyObject_Del(unicode);
352 return NULL;
355 static
356 void unicode_dealloc(register PyUnicodeObject *unicode)
358 if (PyUnicode_CheckExact(unicode) &&
359 numfree < PyUnicode_MAXFREELIST) {
360 /* Keep-Alive optimization */
361 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
362 PyObject_DEL(unicode->str);
363 unicode->str = NULL;
364 unicode->length = 0;
366 if (unicode->defenc) {
367 Py_DECREF(unicode->defenc);
368 unicode->defenc = NULL;
370 /* Add to free list */
371 *(PyUnicodeObject **)unicode = free_list;
372 free_list = unicode;
373 numfree++;
375 else {
376 PyObject_DEL(unicode->str);
377 Py_XDECREF(unicode->defenc);
378 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
382 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
384 register PyUnicodeObject *v;
386 /* Argument checks */
387 if (unicode == NULL) {
388 PyErr_BadInternalCall();
389 return -1;
391 v = (PyUnicodeObject *)*unicode;
392 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
393 PyErr_BadInternalCall();
394 return -1;
397 /* Resizing unicode_empty and single character objects is not
398 possible since these are being shared. We simply return a fresh
399 copy with the same Unicode content. */
400 if (v->length != length &&
401 (v == unicode_empty || v->length == 1)) {
402 PyUnicodeObject *w = _PyUnicode_New(length);
403 if (w == NULL)
404 return -1;
405 Py_UNICODE_COPY(w->str, v->str,
406 length < v->length ? length : v->length);
407 Py_DECREF(*unicode);
408 *unicode = (PyObject *)w;
409 return 0;
412 /* Note that we don't have to modify *unicode for unshared Unicode
413 objects, since we can modify them in-place. */
414 return unicode_resize(v, length);
417 /* Internal API for use in unicodeobject.c only ! */
418 #define _PyUnicode_Resize(unicodevar, length) \
419 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
421 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
422 Py_ssize_t size)
424 PyUnicodeObject *unicode;
426 /* If the Unicode data is known at construction time, we can apply
427 some optimizations which share commonly used objects. */
428 if (u != NULL) {
430 /* Optimization for empty strings */
431 if (size == 0 && unicode_empty != NULL) {
432 Py_INCREF(unicode_empty);
433 return (PyObject *)unicode_empty;
436 /* Single character Unicode objects in the Latin-1 range are
437 shared when using this constructor */
438 if (size == 1 && *u < 256) {
439 unicode = unicode_latin1[*u];
440 if (!unicode) {
441 unicode = _PyUnicode_New(1);
442 if (!unicode)
443 return NULL;
444 unicode->str[0] = *u;
445 unicode_latin1[*u] = unicode;
447 Py_INCREF(unicode);
448 return (PyObject *)unicode;
452 unicode = _PyUnicode_New(size);
453 if (!unicode)
454 return NULL;
456 /* Copy the Unicode data into the new object */
457 if (u != NULL)
458 Py_UNICODE_COPY(unicode->str, u, size);
460 return (PyObject *)unicode;
463 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
465 PyUnicodeObject *unicode;
467 if (size < 0) {
468 PyErr_SetString(PyExc_SystemError,
469 "Negative size passed to PyUnicode_FromStringAndSize");
470 return NULL;
473 /* If the Unicode data is known at construction time, we can apply
474 some optimizations which share commonly used objects.
475 Also, this means the input must be UTF-8, so fall back to the
476 UTF-8 decoder at the end. */
477 if (u != NULL) {
479 /* Optimization for empty strings */
480 if (size == 0 && unicode_empty != NULL) {
481 Py_INCREF(unicode_empty);
482 return (PyObject *)unicode_empty;
485 /* Single characters are shared when using this constructor.
486 Restrict to ASCII, since the input must be UTF-8. */
487 if (size == 1 && Py_CHARMASK(*u) < 128) {
488 unicode = unicode_latin1[Py_CHARMASK(*u)];
489 if (!unicode) {
490 unicode = _PyUnicode_New(1);
491 if (!unicode)
492 return NULL;
493 unicode->str[0] = Py_CHARMASK(*u);
494 unicode_latin1[Py_CHARMASK(*u)] = unicode;
496 Py_INCREF(unicode);
497 return (PyObject *)unicode;
500 return PyUnicode_DecodeUTF8(u, size, NULL);
503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
507 return (PyObject *)unicode;
510 PyObject *PyUnicode_FromString(const char *u)
512 size_t size = strlen(u);
513 if (size > PY_SSIZE_T_MAX) {
514 PyErr_SetString(PyExc_OverflowError, "input too long");
515 return NULL;
518 return PyUnicode_FromStringAndSize(u, size);
521 #ifdef HAVE_WCHAR_H
523 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
524 Py_ssize_t size)
526 PyUnicodeObject *unicode;
528 if (w == NULL) {
529 PyErr_BadInternalCall();
530 return NULL;
533 unicode = _PyUnicode_New(size);
534 if (!unicode)
535 return NULL;
537 /* Copy the wchar_t data into the new object */
538 #ifdef HAVE_USABLE_WCHAR_T
539 memcpy(unicode->str, w, size * sizeof(wchar_t));
540 #else
542 register Py_UNICODE *u;
543 register Py_ssize_t i;
544 u = PyUnicode_AS_UNICODE(unicode);
545 for (i = size; i > 0; i--)
546 *u++ = *w++;
548 #endif
550 return (PyObject *)unicode;
553 static void
554 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
556 *fmt++ = '%';
557 if (width) {
558 if (zeropad)
559 *fmt++ = '0';
560 fmt += sprintf(fmt, "%d", width);
562 if (precision)
563 fmt += sprintf(fmt, ".%d", precision);
564 if (longflag)
565 *fmt++ = 'l';
566 else if (size_tflag) {
567 char *f = PY_FORMAT_SIZE_T;
568 while (*f)
569 *fmt++ = *f++;
571 *fmt++ = c;
572 *fmt = '\0';
575 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
577 PyObject *
578 PyUnicode_FromFormatV(const char *format, va_list vargs)
580 va_list count;
581 Py_ssize_t callcount = 0;
582 PyObject **callresults = NULL;
583 PyObject **callresult = NULL;
584 Py_ssize_t n = 0;
585 int width = 0;
586 int precision = 0;
587 int zeropad;
588 const char* f;
589 Py_UNICODE *s;
590 PyObject *string;
591 /* used by sprintf */
592 char buffer[21];
593 /* use abuffer instead of buffer, if we need more space
594 * (which can happen if there's a format specifier with width). */
595 char *abuffer = NULL;
596 char *realbuffer;
597 Py_ssize_t abuffersize = 0;
598 char fmt[60]; /* should be enough for %0width.precisionld */
599 const char *copy;
601 #ifdef VA_LIST_IS_ARRAY
602 Py_MEMCPY(count, vargs, sizeof(va_list));
603 #else
604 #ifdef __va_copy
605 __va_copy(count, vargs);
606 #else
607 count = vargs;
608 #endif
609 #endif
610 /* step 1: count the number of %S/%R format specifications
611 * (we call PyObject_Str()/PyObject_Repr() for these objects
612 * once during step 3 and put the result in an array) */
613 for (f = format; *f; f++) {
614 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
615 ++callcount;
617 /* step 2: allocate memory for the results of
618 * PyObject_Str()/PyObject_Repr() calls */
619 if (callcount) {
620 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
621 if (!callresults) {
622 PyErr_NoMemory();
623 return NULL;
625 callresult = callresults;
627 /* step 3: figure out how large a buffer we need */
628 for (f = format; *f; f++) {
629 if (*f == '%') {
630 const char* p = f;
631 width = 0;
632 while (isdigit((unsigned)*f))
633 width = (width*10) + *f++ - '0';
634 while (*++f && *f != '%' && !isalpha((unsigned)*f))
637 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
638 * they don't affect the amount of space we reserve.
640 if ((*f == 'l' || *f == 'z') &&
641 (f[1] == 'd' || f[1] == 'u'))
642 ++f;
644 switch (*f) {
645 case 'c':
646 (void)va_arg(count, int);
647 /* fall through... */
648 case '%':
649 n++;
650 break;
651 case 'd': case 'u': case 'i': case 'x':
652 (void) va_arg(count, int);
653 /* 20 bytes is enough to hold a 64-bit
654 integer. Decimal takes the most space.
655 This isn't enough for octal.
656 If a width is specified we need more
657 (which we allocate later). */
658 if (width < 20)
659 width = 20;
660 n += width;
661 if (abuffersize < width)
662 abuffersize = width;
663 break;
664 case 's':
666 /* UTF-8 */
667 unsigned char*s;
668 s = va_arg(count, unsigned char*);
669 while (*s) {
670 if (*s < 128) {
671 n++; s++;
672 } else if (*s < 0xc0) {
673 /* invalid UTF-8 */
674 n++; s++;
675 } else if (*s < 0xc0) {
676 n++;
677 s++; if(!*s)break;
678 s++;
679 } else if (*s < 0xe0) {
680 n++;
681 s++; if(!*s)break;
682 s++; if(!*s)break;
683 s++;
684 } else {
685 #ifdef Py_UNICODE_WIDE
686 n++;
687 #else
688 n+=2;
689 #endif
690 s++; if(!*s)break;
691 s++; if(!*s)break;
692 s++; if(!*s)break;
693 s++;
696 break;
698 case 'U':
700 PyObject *obj = va_arg(count, PyObject *);
701 assert(obj && PyUnicode_Check(obj));
702 n += PyUnicode_GET_SIZE(obj);
703 break;
705 case 'V':
707 PyObject *obj = va_arg(count, PyObject *);
708 const char *str = va_arg(count, const char *);
709 assert(obj || str);
710 assert(!obj || PyUnicode_Check(obj));
711 if (obj)
712 n += PyUnicode_GET_SIZE(obj);
713 else
714 n += strlen(str);
715 break;
717 case 'S':
719 PyObject *obj = va_arg(count, PyObject *);
720 PyObject *str;
721 assert(obj);
722 str = PyObject_Str(obj);
723 if (!str)
724 goto fail;
725 n += PyUnicode_GET_SIZE(str);
726 /* Remember the str and switch to the next slot */
727 *callresult++ = str;
728 break;
730 case 'R':
732 PyObject *obj = va_arg(count, PyObject *);
733 PyObject *repr;
734 assert(obj);
735 repr = PyObject_Repr(obj);
736 if (!repr)
737 goto fail;
738 n += PyUnicode_GET_SIZE(repr);
739 /* Remember the repr and switch to the next slot */
740 *callresult++ = repr;
741 break;
743 case 'p':
744 (void) va_arg(count, int);
745 /* maximum 64-bit pointer representation:
746 * 0xffffffffffffffff
747 * so 19 characters is enough.
748 * XXX I count 18 -- what's the extra for?
750 n += 19;
751 break;
752 default:
753 /* if we stumble upon an unknown
754 formatting code, copy the rest of
755 the format string to the output
756 string. (we cannot just skip the
757 code, since there's no way to know
758 what's in the argument list) */
759 n += strlen(p);
760 goto expand;
762 } else
763 n++;
765 expand:
766 if (abuffersize > 20) {
767 abuffer = PyObject_Malloc(abuffersize);
768 if (!abuffer) {
769 PyErr_NoMemory();
770 goto fail;
772 realbuffer = abuffer;
774 else
775 realbuffer = buffer;
776 /* step 4: fill the buffer */
777 /* Since we've analyzed how much space we need for the worst case,
778 we don't have to resize the string.
779 There can be no errors beyond this point. */
780 string = PyUnicode_FromUnicode(NULL, n);
781 if (!string)
782 goto fail;
784 s = PyUnicode_AS_UNICODE(string);
785 callresult = callresults;
787 for (f = format; *f; f++) {
788 if (*f == '%') {
789 const char* p = f++;
790 int longflag = 0;
791 int size_tflag = 0;
792 zeropad = (*f == '0');
793 /* parse the width.precision part */
794 width = 0;
795 while (isdigit((unsigned)*f))
796 width = (width*10) + *f++ - '0';
797 precision = 0;
798 if (*f == '.') {
799 f++;
800 while (isdigit((unsigned)*f))
801 precision = (precision*10) + *f++ - '0';
803 /* handle the long flag, but only for %ld and %lu.
804 others can be added when necessary. */
805 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
806 longflag = 1;
807 ++f;
809 /* handle the size_t flag. */
810 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
811 size_tflag = 1;
812 ++f;
815 switch (*f) {
816 case 'c':
817 *s++ = va_arg(vargs, int);
818 break;
819 case 'd':
820 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
821 if (longflag)
822 sprintf(realbuffer, fmt, va_arg(vargs, long));
823 else if (size_tflag)
824 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
825 else
826 sprintf(realbuffer, fmt, va_arg(vargs, int));
827 appendstring(realbuffer);
828 break;
829 case 'u':
830 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
831 if (longflag)
832 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
833 else if (size_tflag)
834 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
835 else
836 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
837 appendstring(realbuffer);
838 break;
839 case 'i':
840 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
841 sprintf(realbuffer, fmt, va_arg(vargs, int));
842 appendstring(realbuffer);
843 break;
844 case 'x':
845 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
846 sprintf(realbuffer, fmt, va_arg(vargs, int));
847 appendstring(realbuffer);
848 break;
849 case 's':
851 /* Parameter must be UTF-8 encoded.
852 In case of encoding errors, use
853 the replacement character. */
854 PyObject *u;
855 p = va_arg(vargs, char*);
856 u = PyUnicode_DecodeUTF8(p, strlen(p),
857 "replace");
858 if (!u)
859 goto fail;
860 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
861 PyUnicode_GET_SIZE(u));
862 s += PyUnicode_GET_SIZE(u);
863 Py_DECREF(u);
864 break;
866 case 'U':
868 PyObject *obj = va_arg(vargs, PyObject *);
869 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
870 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
871 s += size;
872 break;
874 case 'V':
876 PyObject *obj = va_arg(vargs, PyObject *);
877 const char *str = va_arg(vargs, const char *);
878 if (obj) {
879 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
880 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
881 s += size;
882 } else {
883 appendstring(str);
885 break;
887 case 'S':
888 case 'R':
890 Py_UNICODE *ucopy;
891 Py_ssize_t usize;
892 Py_ssize_t upos;
893 /* unused, since we already have the result */
894 (void) va_arg(vargs, PyObject *);
895 ucopy = PyUnicode_AS_UNICODE(*callresult);
896 usize = PyUnicode_GET_SIZE(*callresult);
897 for (upos = 0; upos<usize;)
898 *s++ = ucopy[upos++];
899 /* We're done with the unicode()/repr() => forget it */
900 Py_DECREF(*callresult);
901 /* switch to next unicode()/repr() result */
902 ++callresult;
903 break;
905 case 'p':
906 sprintf(buffer, "%p", va_arg(vargs, void*));
907 /* %p is ill-defined: ensure leading 0x. */
908 if (buffer[1] == 'X')
909 buffer[1] = 'x';
910 else if (buffer[1] != 'x') {
911 memmove(buffer+2, buffer, strlen(buffer)+1);
912 buffer[0] = '0';
913 buffer[1] = 'x';
915 appendstring(buffer);
916 break;
917 case '%':
918 *s++ = '%';
919 break;
920 default:
921 appendstring(p);
922 goto end;
924 } else
925 *s++ = *f;
928 end:
929 if (callresults)
930 PyObject_Free(callresults);
931 if (abuffer)
932 PyObject_Free(abuffer);
933 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
934 return string;
935 fail:
936 if (callresults) {
937 PyObject **callresult2 = callresults;
938 while (callresult2 < callresult) {
939 Py_DECREF(*callresult2);
940 ++callresult2;
942 PyObject_Free(callresults);
944 if (abuffer)
945 PyObject_Free(abuffer);
946 return NULL;
949 #undef appendstring
951 PyObject *
952 PyUnicode_FromFormat(const char *format, ...)
954 PyObject* ret;
955 va_list vargs;
957 #ifdef HAVE_STDARG_PROTOTYPES
958 va_start(vargs, format);
959 #else
960 va_start(vargs);
961 #endif
962 ret = PyUnicode_FromFormatV(format, vargs);
963 va_end(vargs);
964 return ret;
967 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
968 wchar_t *w,
969 Py_ssize_t size)
971 if (unicode == NULL) {
972 PyErr_BadInternalCall();
973 return -1;
976 /* If possible, try to copy the 0-termination as well */
977 if (size > PyUnicode_GET_SIZE(unicode))
978 size = PyUnicode_GET_SIZE(unicode) + 1;
980 #ifdef HAVE_USABLE_WCHAR_T
981 memcpy(w, unicode->str, size * sizeof(wchar_t));
982 #else
984 register Py_UNICODE *u;
985 register Py_ssize_t i;
986 u = PyUnicode_AS_UNICODE(unicode);
987 for (i = size; i > 0; i--)
988 *w++ = *u++;
990 #endif
992 if (size > PyUnicode_GET_SIZE(unicode))
993 return PyUnicode_GET_SIZE(unicode);
994 else
995 return size;
998 #endif
1000 PyObject *PyUnicode_FromOrdinal(int ordinal)
1002 Py_UNICODE s[1];
1004 #ifdef Py_UNICODE_WIDE
1005 if (ordinal < 0 || ordinal > 0x10ffff) {
1006 PyErr_SetString(PyExc_ValueError,
1007 "unichr() arg not in range(0x110000) "
1008 "(wide Python build)");
1009 return NULL;
1011 #else
1012 if (ordinal < 0 || ordinal > 0xffff) {
1013 PyErr_SetString(PyExc_ValueError,
1014 "unichr() arg not in range(0x10000) "
1015 "(narrow Python build)");
1016 return NULL;
1018 #endif
1020 s[0] = (Py_UNICODE)ordinal;
1021 return PyUnicode_FromUnicode(s, 1);
1024 PyObject *PyUnicode_FromObject(register PyObject *obj)
1026 /* XXX Perhaps we should make this API an alias of
1027 PyObject_Unicode() instead ?! */
1028 if (PyUnicode_CheckExact(obj)) {
1029 Py_INCREF(obj);
1030 return obj;
1032 if (PyUnicode_Check(obj)) {
1033 /* For a Unicode subtype that's not a Unicode object,
1034 return a true Unicode object with the same data. */
1035 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1036 PyUnicode_GET_SIZE(obj));
1038 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1041 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1042 const char *encoding,
1043 const char *errors)
1045 const char *s = NULL;
1046 Py_ssize_t len;
1047 PyObject *v;
1049 if (obj == NULL) {
1050 PyErr_BadInternalCall();
1051 return NULL;
1054 #if 0
1055 /* For b/w compatibility we also accept Unicode objects provided
1056 that no encodings is given and then redirect to
1057 PyObject_Unicode() which then applies the additional logic for
1058 Unicode subclasses.
1060 NOTE: This API should really only be used for object which
1061 represent *encoded* Unicode !
1064 if (PyUnicode_Check(obj)) {
1065 if (encoding) {
1066 PyErr_SetString(PyExc_TypeError,
1067 "decoding Unicode is not supported");
1068 return NULL;
1070 return PyObject_Unicode(obj);
1072 #else
1073 if (PyUnicode_Check(obj)) {
1074 PyErr_SetString(PyExc_TypeError,
1075 "decoding Unicode is not supported");
1076 return NULL;
1078 #endif
1080 /* Coerce object */
1081 if (PyString_Check(obj)) {
1082 s = PyString_AS_STRING(obj);
1083 len = PyString_GET_SIZE(obj);
1085 else if (PyByteArray_Check(obj)) {
1086 /* Python 2.x specific */
1087 PyErr_Format(PyExc_TypeError,
1088 "decoding bytearray is not supported");
1089 return NULL;
1091 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1092 /* Overwrite the error message with something more useful in
1093 case of a TypeError. */
1094 if (PyErr_ExceptionMatches(PyExc_TypeError))
1095 PyErr_Format(PyExc_TypeError,
1096 "coercing to Unicode: need string or buffer, "
1097 "%.80s found",
1098 Py_TYPE(obj)->tp_name);
1099 goto onError;
1102 /* Convert to Unicode */
1103 if (len == 0) {
1104 Py_INCREF(unicode_empty);
1105 v = (PyObject *)unicode_empty;
1107 else
1108 v = PyUnicode_Decode(s, len, encoding, errors);
1110 return v;
1112 onError:
1113 return NULL;
1116 PyObject *PyUnicode_Decode(const char *s,
1117 Py_ssize_t size,
1118 const char *encoding,
1119 const char *errors)
1121 PyObject *buffer = NULL, *unicode;
1123 if (encoding == NULL)
1124 encoding = PyUnicode_GetDefaultEncoding();
1126 /* Shortcuts for common default encodings */
1127 if (strcmp(encoding, "utf-8") == 0)
1128 return PyUnicode_DecodeUTF8(s, size, errors);
1129 else if (strcmp(encoding, "latin-1") == 0)
1130 return PyUnicode_DecodeLatin1(s, size, errors);
1131 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132 else if (strcmp(encoding, "mbcs") == 0)
1133 return PyUnicode_DecodeMBCS(s, size, errors);
1134 #endif
1135 else if (strcmp(encoding, "ascii") == 0)
1136 return PyUnicode_DecodeASCII(s, size, errors);
1138 /* Decode via the codec registry */
1139 buffer = PyBuffer_FromMemory((void *)s, size);
1140 if (buffer == NULL)
1141 goto onError;
1142 unicode = PyCodec_Decode(buffer, encoding, errors);
1143 if (unicode == NULL)
1144 goto onError;
1145 if (!PyUnicode_Check(unicode)) {
1146 PyErr_Format(PyExc_TypeError,
1147 "decoder did not return an unicode object (type=%.400s)",
1148 Py_TYPE(unicode)->tp_name);
1149 Py_DECREF(unicode);
1150 goto onError;
1152 Py_DECREF(buffer);
1153 return unicode;
1155 onError:
1156 Py_XDECREF(buffer);
1157 return NULL;
1160 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1161 const char *encoding,
1162 const char *errors)
1164 PyObject *v;
1166 if (!PyUnicode_Check(unicode)) {
1167 PyErr_BadArgument();
1168 goto onError;
1171 if (encoding == NULL)
1172 encoding = PyUnicode_GetDefaultEncoding();
1174 /* Decode via the codec registry */
1175 v = PyCodec_Decode(unicode, encoding, errors);
1176 if (v == NULL)
1177 goto onError;
1178 return v;
1180 onError:
1181 return NULL;
1184 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1185 Py_ssize_t size,
1186 const char *encoding,
1187 const char *errors)
1189 PyObject *v, *unicode;
1191 unicode = PyUnicode_FromUnicode(s, size);
1192 if (unicode == NULL)
1193 return NULL;
1194 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1195 Py_DECREF(unicode);
1196 return v;
1199 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1200 const char *encoding,
1201 const char *errors)
1203 PyObject *v;
1205 if (!PyUnicode_Check(unicode)) {
1206 PyErr_BadArgument();
1207 goto onError;
1210 if (encoding == NULL)
1211 encoding = PyUnicode_GetDefaultEncoding();
1213 /* Encode via the codec registry */
1214 v = PyCodec_Encode(unicode, encoding, errors);
1215 if (v == NULL)
1216 goto onError;
1217 return v;
1219 onError:
1220 return NULL;
1223 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1224 const char *encoding,
1225 const char *errors)
1227 PyObject *v;
1229 if (!PyUnicode_Check(unicode)) {
1230 PyErr_BadArgument();
1231 goto onError;
1234 if (encoding == NULL)
1235 encoding = PyUnicode_GetDefaultEncoding();
1237 /* Shortcuts for common default encodings */
1238 if (errors == NULL) {
1239 if (strcmp(encoding, "utf-8") == 0)
1240 return PyUnicode_AsUTF8String(unicode);
1241 else if (strcmp(encoding, "latin-1") == 0)
1242 return PyUnicode_AsLatin1String(unicode);
1243 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1244 else if (strcmp(encoding, "mbcs") == 0)
1245 return PyUnicode_AsMBCSString(unicode);
1246 #endif
1247 else if (strcmp(encoding, "ascii") == 0)
1248 return PyUnicode_AsASCIIString(unicode);
1251 /* Encode via the codec registry */
1252 v = PyCodec_Encode(unicode, encoding, errors);
1253 if (v == NULL)
1254 goto onError;
1255 if (!PyString_Check(v)) {
1256 PyErr_Format(PyExc_TypeError,
1257 "encoder did not return a string object (type=%.400s)",
1258 Py_TYPE(v)->tp_name);
1259 Py_DECREF(v);
1260 goto onError;
1262 return v;
1264 onError:
1265 return NULL;
1268 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1269 const char *errors)
1271 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1273 if (v)
1274 return v;
1275 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1276 if (v && errors == NULL)
1277 ((PyUnicodeObject *)unicode)->defenc = v;
1278 return v;
1281 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1283 if (!PyUnicode_Check(unicode)) {
1284 PyErr_BadArgument();
1285 goto onError;
1287 return PyUnicode_AS_UNICODE(unicode);
1289 onError:
1290 return NULL;
1293 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1295 if (!PyUnicode_Check(unicode)) {
1296 PyErr_BadArgument();
1297 goto onError;
1299 return PyUnicode_GET_SIZE(unicode);
1301 onError:
1302 return -1;
1305 const char *PyUnicode_GetDefaultEncoding(void)
1307 return unicode_default_encoding;
1310 int PyUnicode_SetDefaultEncoding(const char *encoding)
1312 PyObject *v;
1314 /* Make sure the encoding is valid. As side effect, this also
1315 loads the encoding into the codec registry cache. */
1316 v = _PyCodec_Lookup(encoding);
1317 if (v == NULL)
1318 goto onError;
1319 Py_DECREF(v);
1320 strncpy(unicode_default_encoding,
1321 encoding,
1322 sizeof(unicode_default_encoding));
1323 return 0;
1325 onError:
1326 return -1;
1329 /* error handling callback helper:
1330 build arguments, call the callback and check the arguments,
1331 if no exception occurred, copy the replacement to the output
1332 and adjust various state variables.
1333 return 0 on success, -1 on error
1336 static
1337 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1338 const char *encoding, const char *reason,
1339 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1340 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1341 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1343 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1345 PyObject *restuple = NULL;
1346 PyObject *repunicode = NULL;
1347 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1348 Py_ssize_t requiredsize;
1349 Py_ssize_t newpos;
1350 Py_UNICODE *repptr;
1351 Py_ssize_t repsize;
1352 int res = -1;
1354 if (*errorHandler == NULL) {
1355 *errorHandler = PyCodec_LookupError(errors);
1356 if (*errorHandler == NULL)
1357 goto onError;
1360 if (*exceptionObject == NULL) {
1361 *exceptionObject = PyUnicodeDecodeError_Create(
1362 encoding, input, insize, *startinpos, *endinpos, reason);
1363 if (*exceptionObject == NULL)
1364 goto onError;
1366 else {
1367 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1368 goto onError;
1369 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1370 goto onError;
1371 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1372 goto onError;
1375 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1376 if (restuple == NULL)
1377 goto onError;
1378 if (!PyTuple_Check(restuple)) {
1379 PyErr_Format(PyExc_TypeError, &argparse[4]);
1380 goto onError;
1382 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1383 goto onError;
1384 if (newpos<0)
1385 newpos = insize+newpos;
1386 if (newpos<0 || newpos>insize) {
1387 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1388 goto onError;
1391 /* need more space? (at least enough for what we
1392 have+the replacement+the rest of the string (starting
1393 at the new input position), so we won't have to check space
1394 when there are no errors in the rest of the string) */
1395 repptr = PyUnicode_AS_UNICODE(repunicode);
1396 repsize = PyUnicode_GET_SIZE(repunicode);
1397 requiredsize = *outpos + repsize + insize-newpos;
1398 if (requiredsize > outsize) {
1399 if (requiredsize<2*outsize)
1400 requiredsize = 2*outsize;
1401 if (PyUnicode_Resize(output, requiredsize) < 0)
1402 goto onError;
1403 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1405 *endinpos = newpos;
1406 *inptr = input + newpos;
1407 Py_UNICODE_COPY(*outptr, repptr, repsize);
1408 *outptr += repsize;
1409 *outpos += repsize;
1410 /* we made it! */
1411 res = 0;
1413 onError:
1414 Py_XDECREF(restuple);
1415 return res;
1418 /* --- UTF-7 Codec -------------------------------------------------------- */
1420 /* see RFC2152 for details */
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if code point c must be base64-encoded.  encodeO / encodeWS
   select whether the optional "Set O" and whitespace classes of
   utf7_special are treated as special too. */
#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||        \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 digit. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Written as explicit ASCII
   range checks instead of isalnum(): isalnum() is locale dependent and
   undefined for values outside the unsigned char range, both of which
   can occur for the Py_UNICODE values passed in here.
   Note: c is evaluated more than once; pass a plain variable. */
#define B64CHAR(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ||                      \
     ((c) >= 'a' && (c) <= 'z') ||                      \
     ((c) >= '0' && (c) <= '9') ||                      \
     (c) == '+' || (c) == '/')

/* Inverse of B64: value of base64 digit c (only valid if B64CHAR(c)). */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to *out while at least six pending bits remain in
   ch; `bits` is the number of pending bits and is updated in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit code units to *out while at least sixteen pending bits
   remain in ch.  Relies on the expanding function providing `errmsg`
   and a `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }

1484 PyObject *PyUnicode_DecodeUTF7(const char *s,
1485 Py_ssize_t size,
1486 const char *errors)
1488 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1491 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1492 Py_ssize_t size,
1493 const char *errors,
1494 Py_ssize_t *consumed)
1496 const char *starts = s;
1497 Py_ssize_t startinpos;
1498 Py_ssize_t endinpos;
1499 Py_ssize_t outpos;
1500 const char *e;
1501 PyUnicodeObject *unicode;
1502 Py_UNICODE *p;
1503 const char *errmsg = "";
1504 int inShift = 0;
1505 unsigned int bitsleft = 0;
1506 unsigned long charsleft = 0;
1507 int surrogate = 0;
1508 PyObject *errorHandler = NULL;
1509 PyObject *exc = NULL;
1511 unicode = _PyUnicode_New(size);
1512 if (!unicode)
1513 return NULL;
1514 if (size == 0) {
1515 if (consumed)
1516 *consumed = 0;
1517 return (PyObject *)unicode;
1520 p = unicode->str;
1521 e = s + size;
1523 while (s < e) {
1524 Py_UNICODE ch;
1525 restart:
1526 ch = *s;
1528 if (inShift) {
1529 if ((ch == '-') || !B64CHAR(ch)) {
1530 inShift = 0;
1531 s++;
1533 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1534 if (bitsleft >= 6) {
1535 /* The shift sequence has a partial character in it. If
1536 bitsleft < 6 then we could just classify it as padding
1537 but that is not the case here */
1539 errmsg = "partial character in shift sequence";
1540 goto utf7Error;
1542 /* According to RFC2152 the remaining bits should be zero. We
1543 choose to signal an error/insert a replacement character
1544 here so indicate the potential of a misencoded character. */
1546 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1547 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1548 errmsg = "non-zero padding bits in shift sequence";
1549 goto utf7Error;
1552 if (ch == '-') {
1553 if ((s < e) && (*(s) == '-')) {
1554 *p++ = '-';
1555 inShift = 1;
1557 } else if (SPECIAL(ch,0,0)) {
1558 errmsg = "unexpected special character";
1559 goto utf7Error;
1560 } else {
1561 *p++ = ch;
1563 } else {
1564 charsleft = (charsleft << 6) | UB64(ch);
1565 bitsleft += 6;
1566 s++;
1567 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1570 else if ( ch == '+' ) {
1571 startinpos = s-starts;
1572 s++;
1573 if (s < e && *s == '-') {
1574 s++;
1575 *p++ = '+';
1576 } else
1578 inShift = 1;
1579 bitsleft = 0;
1582 else if (SPECIAL(ch,0,0)) {
1583 startinpos = s-starts;
1584 errmsg = "unexpected special character";
1585 s++;
1586 goto utf7Error;
1588 else {
1589 *p++ = ch;
1590 s++;
1592 continue;
1593 utf7Error:
1594 outpos = p-PyUnicode_AS_UNICODE(unicode);
1595 endinpos = s-starts;
1596 if (unicode_decode_call_errorhandler(
1597 errors, &errorHandler,
1598 "utf7", errmsg,
1599 starts, size, &startinpos, &endinpos, &exc, &s,
1600 (PyObject **)&unicode, &outpos, &p))
1601 goto onError;
1604 if (inShift && !consumed) {
1605 outpos = p-PyUnicode_AS_UNICODE(unicode);
1606 endinpos = size;
1607 if (unicode_decode_call_errorhandler(
1608 errors, &errorHandler,
1609 "utf7", "unterminated shift sequence",
1610 starts, size, &startinpos, &endinpos, &exc, &s,
1611 (PyObject **)&unicode, &outpos, &p))
1612 goto onError;
1613 if (s < e)
1614 goto restart;
1616 if (consumed) {
1617 if(inShift)
1618 *consumed = startinpos;
1619 else
1620 *consumed = s-starts;
1623 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1624 goto onError;
1626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
1628 return (PyObject *)unicode;
1630 onError:
1631 Py_XDECREF(errorHandler);
1632 Py_XDECREF(exc);
1633 Py_DECREF(unicode);
1634 return NULL;
1638 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1639 Py_ssize_t size,
1640 int encodeSetO,
1641 int encodeWhiteSpace,
1642 const char *errors)
1644 PyObject *v;
1645 /* It might be possible to tighten this worst case */
1646 Py_ssize_t cbAllocated = 5 * size;
1647 int inShift = 0;
1648 Py_ssize_t i = 0;
1649 unsigned int bitsleft = 0;
1650 unsigned long charsleft = 0;
1651 char * out;
1652 char * start;
1654 if (size == 0)
1655 return PyString_FromStringAndSize(NULL, 0);
1657 v = PyString_FromStringAndSize(NULL, cbAllocated);
1658 if (v == NULL)
1659 return NULL;
1661 start = out = PyString_AS_STRING(v);
1662 for (;i < size; ++i) {
1663 Py_UNICODE ch = s[i];
1665 if (!inShift) {
1666 if (ch == '+') {
1667 *out++ = '+';
1668 *out++ = '-';
1669 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1670 charsleft = ch;
1671 bitsleft = 16;
1672 *out++ = '+';
1673 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1674 inShift = bitsleft > 0;
1675 } else {
1676 *out++ = (char) ch;
1678 } else {
1679 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1680 *out++ = B64(charsleft << (6-bitsleft));
1681 charsleft = 0;
1682 bitsleft = 0;
1683 /* Characters not in the BASE64 set implicitly unshift the sequence
1684 so no '-' is required, except if the character is itself a '-' */
1685 if (B64CHAR(ch) || ch == '-') {
1686 *out++ = '-';
1688 inShift = 0;
1689 *out++ = (char) ch;
1690 } else {
1691 bitsleft += 16;
1692 charsleft = (charsleft << 16) | ch;
1693 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1695 /* If the next character is special then we dont' need to terminate
1696 the shift sequence. If the next character is not a BASE64 character
1697 or '-' then the shift sequence will be terminated implicitly and we
1698 don't have to insert a '-'. */
1700 if (bitsleft == 0) {
1701 if (i + 1 < size) {
1702 Py_UNICODE ch2 = s[i+1];
1704 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1706 } else if (B64CHAR(ch2) || ch2 == '-') {
1707 *out++ = '-';
1708 inShift = 0;
1709 } else {
1710 inShift = 0;
1714 else {
1715 *out++ = '-';
1716 inShift = 0;
1722 if (bitsleft) {
1723 *out++= B64(charsleft << (6-bitsleft) );
1724 *out++ = '-';
1727 _PyString_Resize(&v, out - start);
1728 return v;
1731 #undef SPECIAL
1732 #undef B64
1733 #undef B64CHAR
1734 #undef UB64
1735 #undef ENCODE
1736 #undef DECODE
1738 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Sequence length implied by each possible UTF-8 lead byte.  The table
   is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

1762 PyObject *PyUnicode_DecodeUTF8(const char *s,
1763 Py_ssize_t size,
1764 const char *errors)
1766 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1769 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1770 Py_ssize_t size,
1771 const char *errors,
1772 Py_ssize_t *consumed)
1774 const char *starts = s;
1775 int n;
1776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
1779 const char *e;
1780 PyUnicodeObject *unicode;
1781 Py_UNICODE *p;
1782 const char *errmsg = "";
1783 PyObject *errorHandler = NULL;
1784 PyObject *exc = NULL;
1786 /* Note: size will always be longer than the resulting Unicode
1787 character count */
1788 unicode = _PyUnicode_New(size);
1789 if (!unicode)
1790 return NULL;
1791 if (size == 0) {
1792 if (consumed)
1793 *consumed = 0;
1794 return (PyObject *)unicode;
1797 /* Unpack UTF-8 encoded data */
1798 p = unicode->str;
1799 e = s + size;
1801 while (s < e) {
1802 Py_UCS4 ch = (unsigned char)*s;
1804 if (ch < 0x80) {
1805 *p++ = (Py_UNICODE)ch;
1806 s++;
1807 continue;
1810 n = utf8_code_length[ch];
1812 if (s + n > e) {
1813 if (consumed)
1814 break;
1815 else {
1816 errmsg = "unexpected end of data";
1817 startinpos = s-starts;
1818 endinpos = size;
1819 goto utf8Error;
1823 switch (n) {
1825 case 0:
1826 errmsg = "unexpected code byte";
1827 startinpos = s-starts;
1828 endinpos = startinpos+1;
1829 goto utf8Error;
1831 case 1:
1832 errmsg = "internal error";
1833 startinpos = s-starts;
1834 endinpos = startinpos+1;
1835 goto utf8Error;
1837 case 2:
1838 if ((s[1] & 0xc0) != 0x80) {
1839 errmsg = "invalid data";
1840 startinpos = s-starts;
1841 endinpos = startinpos+2;
1842 goto utf8Error;
1844 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1845 if (ch < 0x80) {
1846 startinpos = s-starts;
1847 endinpos = startinpos+2;
1848 errmsg = "illegal encoding";
1849 goto utf8Error;
1851 else
1852 *p++ = (Py_UNICODE)ch;
1853 break;
1855 case 3:
1856 if ((s[1] & 0xc0) != 0x80 ||
1857 (s[2] & 0xc0) != 0x80) {
1858 errmsg = "invalid data";
1859 startinpos = s-starts;
1860 endinpos = startinpos+3;
1861 goto utf8Error;
1863 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1864 if (ch < 0x0800) {
1865 /* Note: UTF-8 encodings of surrogates are considered
1866 legal UTF-8 sequences;
1868 XXX For wide builds (UCS-4) we should probably try
1869 to recombine the surrogates into a single code
1870 unit.
1872 errmsg = "illegal encoding";
1873 startinpos = s-starts;
1874 endinpos = startinpos+3;
1875 goto utf8Error;
1877 else
1878 *p++ = (Py_UNICODE)ch;
1879 break;
1881 case 4:
1882 if ((s[1] & 0xc0) != 0x80 ||
1883 (s[2] & 0xc0) != 0x80 ||
1884 (s[3] & 0xc0) != 0x80) {
1885 errmsg = "invalid data";
1886 startinpos = s-starts;
1887 endinpos = startinpos+4;
1888 goto utf8Error;
1890 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1891 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1892 /* validate and convert to UTF-16 */
1893 if ((ch < 0x10000) /* minimum value allowed for 4
1894 byte encoding */
1895 || (ch > 0x10ffff)) /* maximum value allowed for
1896 UTF-16 */
1898 errmsg = "illegal encoding";
1899 startinpos = s-starts;
1900 endinpos = startinpos+4;
1901 goto utf8Error;
1903 #ifdef Py_UNICODE_WIDE
1904 *p++ = (Py_UNICODE)ch;
1905 #else
1906 /* compute and append the two surrogates: */
1908 /* translate from 10000..10FFFF to 0..FFFF */
1909 ch -= 0x10000;
1911 /* high surrogate = top 10 bits added to D800 */
1912 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1914 /* low surrogate = bottom 10 bits added to DC00 */
1915 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1916 #endif
1917 break;
1919 default:
1920 /* Other sizes are only needed for UCS-4 */
1921 errmsg = "unsupported Unicode code range";
1922 startinpos = s-starts;
1923 endinpos = startinpos+n;
1924 goto utf8Error;
1926 s += n;
1927 continue;
1929 utf8Error:
1930 outpos = p-PyUnicode_AS_UNICODE(unicode);
1931 if (unicode_decode_call_errorhandler(
1932 errors, &errorHandler,
1933 "utf8", errmsg,
1934 starts, size, &startinpos, &endinpos, &exc, &s,
1935 (PyObject **)&unicode, &outpos, &p))
1936 goto onError;
1938 if (consumed)
1939 *consumed = s-starts;
1941 /* Adjust length */
1942 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1943 goto onError;
1945 Py_XDECREF(errorHandler);
1946 Py_XDECREF(exc);
1947 return (PyObject *)unicode;
1949 onError:
1950 Py_XDECREF(errorHandler);
1951 Py_XDECREF(exc);
1952 Py_DECREF(unicode);
1953 return NULL;
1956 /* Allocation strategy: if the string is short, convert into a stack buffer
1957 and allocate exactly as much space needed at the end. Else allocate the
1958 maximum possible needed (4 result bytes per Unicode character), and return
1959 the excess memory at the end.
1961 PyObject *
1962 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1963 Py_ssize_t size,
1964 const char *errors)
1966 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1968 Py_ssize_t i; /* index into s of next input byte */
1969 PyObject *v; /* result string object */
1970 char *p; /* next free byte in output buffer */
1971 Py_ssize_t nallocated; /* number of result bytes allocated */
1972 Py_ssize_t nneeded; /* number of result bytes needed */
1973 char stackbuf[MAX_SHORT_UNICHARS * 4];
1975 assert(s != NULL);
1976 assert(size >= 0);
1978 if (size <= MAX_SHORT_UNICHARS) {
1979 /* Write into the stack buffer; nallocated can't overflow.
1980 * At the end, we'll allocate exactly as much heap space as it
1981 * turns out we need.
1983 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1984 v = NULL; /* will allocate after we're done */
1985 p = stackbuf;
1987 else {
1988 /* Overallocate on the heap, and give the excess back at the end. */
1989 nallocated = size * 4;
1990 if (nallocated / 4 != size) /* overflow! */
1991 return PyErr_NoMemory();
1992 v = PyString_FromStringAndSize(NULL, nallocated);
1993 if (v == NULL)
1994 return NULL;
1995 p = PyString_AS_STRING(v);
1998 for (i = 0; i < size;) {
1999 Py_UCS4 ch = s[i++];
2001 if (ch < 0x80)
2002 /* Encode ASCII */
2003 *p++ = (char) ch;
2005 else if (ch < 0x0800) {
2006 /* Encode Latin-1 */
2007 *p++ = (char)(0xc0 | (ch >> 6));
2008 *p++ = (char)(0x80 | (ch & 0x3f));
2010 else {
2011 /* Encode UCS2 Unicode ordinals */
2012 if (ch < 0x10000) {
2013 /* Special case: check for high surrogate */
2014 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2015 Py_UCS4 ch2 = s[i];
2016 /* Check for low surrogate and combine the two to
2017 form a UCS4 value */
2018 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2019 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2020 i++;
2021 goto encodeUCS4;
2023 /* Fall through: handles isolated high surrogates */
2025 *p++ = (char)(0xe0 | (ch >> 12));
2026 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2027 *p++ = (char)(0x80 | (ch & 0x3f));
2028 continue;
2030 encodeUCS4:
2031 /* Encode UCS4 Unicode ordinals */
2032 *p++ = (char)(0xf0 | (ch >> 18));
2033 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2034 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2035 *p++ = (char)(0x80 | (ch & 0x3f));
2039 if (v == NULL) {
2040 /* This was stack allocated. */
2041 nneeded = p - stackbuf;
2042 assert(nneeded <= nallocated);
2043 v = PyString_FromStringAndSize(stackbuf, nneeded);
2045 else {
2046 /* Cut back to size actually needed. */
2047 nneeded = p - PyString_AS_STRING(v);
2048 assert(nneeded <= nallocated);
2049 _PyString_Resize(&v, nneeded);
2051 return v;
2053 #undef MAX_SHORT_UNICHARS
2056 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2058 if (!PyUnicode_Check(unicode)) {
2059 PyErr_BadArgument();
2060 return NULL;
2062 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2063 PyUnicode_GET_SIZE(unicode),
2064 NULL);
2067 /* --- UTF-32 Codec ------------------------------------------------------- */
2069 PyObject *
2070 PyUnicode_DecodeUTF32(const char *s,
2071 Py_ssize_t size,
2072 const char *errors,
2073 int *byteorder)
2075 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2078 PyObject *
2079 PyUnicode_DecodeUTF32Stateful(const char *s,
2080 Py_ssize_t size,
2081 const char *errors,
2082 int *byteorder,
2083 Py_ssize_t *consumed)
2085 const char *starts = s;
2086 Py_ssize_t startinpos;
2087 Py_ssize_t endinpos;
2088 Py_ssize_t outpos;
2089 PyUnicodeObject *unicode;
2090 Py_UNICODE *p;
2091 #ifndef Py_UNICODE_WIDE
2092 int i, pairs;
2093 #else
2094 const int pairs = 0;
2095 #endif
2096 const unsigned char *q, *e;
2097 int bo = 0; /* assume native ordering by default */
2098 const char *errmsg = "";
2099 /* Offsets from q for retrieving bytes in the right order. */
2100 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2101 int iorder[] = {0, 1, 2, 3};
2102 #else
2103 int iorder[] = {3, 2, 1, 0};
2104 #endif
2105 PyObject *errorHandler = NULL;
2106 PyObject *exc = NULL;
2107 /* On narrow builds we split characters outside the BMP into two
2108 codepoints => count how much extra space we need. */
2109 #ifndef Py_UNICODE_WIDE
2110 for (i = pairs = 0; i < size/4; i++)
2111 if (((Py_UCS4 *)s)[i] >= 0x10000)
2112 pairs++;
2113 #endif
2115 /* This might be one to much, because of a BOM */
2116 unicode = _PyUnicode_New((size+3)/4+pairs);
2117 if (!unicode)
2118 return NULL;
2119 if (size == 0)
2120 return (PyObject *)unicode;
2122 /* Unpack UTF-32 encoded data */
2123 p = unicode->str;
2124 q = (unsigned char *)s;
2125 e = q + size;
2127 if (byteorder)
2128 bo = *byteorder;
2130 /* Check for BOM marks (U+FEFF) in the input and adjust current
2131 byte order setting accordingly. In native mode, the leading BOM
2132 mark is skipped, in all other modes, it is copied to the output
2133 stream as-is (giving a ZWNBSP character). */
2134 if (bo == 0) {
2135 if (size >= 4) {
2136 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2137 (q[iorder[1]] << 8) | q[iorder[0]];
2138 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2139 if (bom == 0x0000FEFF) {
2140 q += 4;
2141 bo = -1;
2143 else if (bom == 0xFFFE0000) {
2144 q += 4;
2145 bo = 1;
2147 #else
2148 if (bom == 0x0000FEFF) {
2149 q += 4;
2150 bo = 1;
2152 else if (bom == 0xFFFE0000) {
2153 q += 4;
2154 bo = -1;
2156 #endif
2160 if (bo == -1) {
2161 /* force LE */
2162 iorder[0] = 0;
2163 iorder[1] = 1;
2164 iorder[2] = 2;
2165 iorder[3] = 3;
2167 else if (bo == 1) {
2168 /* force BE */
2169 iorder[0] = 3;
2170 iorder[1] = 2;
2171 iorder[2] = 1;
2172 iorder[3] = 0;
2175 while (q < e) {
2176 Py_UCS4 ch;
2177 /* remaining bytes at the end? (size should be divisible by 4) */
2178 if (e-q<4) {
2179 if (consumed)
2180 break;
2181 errmsg = "truncated data";
2182 startinpos = ((const char *)q)-starts;
2183 endinpos = ((const char *)e)-starts;
2184 goto utf32Error;
2185 /* The remaining input chars are ignored if the callback
2186 chooses to skip the input */
2188 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2189 (q[iorder[1]] << 8) | q[iorder[0]];
2191 if (ch >= 0x110000)
2193 errmsg = "codepoint not in range(0x110000)";
2194 startinpos = ((const char *)q)-starts;
2195 endinpos = startinpos+4;
2196 goto utf32Error;
2198 #ifndef Py_UNICODE_WIDE
2199 if (ch >= 0x10000)
2201 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2202 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2204 else
2205 #endif
2206 *p++ = ch;
2207 q += 4;
2208 continue;
2209 utf32Error:
2210 outpos = p-PyUnicode_AS_UNICODE(unicode);
2211 if (unicode_decode_call_errorhandler(
2212 errors, &errorHandler,
2213 "utf32", errmsg,
2214 starts, size, &startinpos, &endinpos, &exc, &s,
2215 (PyObject **)&unicode, &outpos, &p))
2216 goto onError;
2219 if (byteorder)
2220 *byteorder = bo;
2222 if (consumed)
2223 *consumed = (const char *)q-starts;
2225 /* Adjust length */
2226 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2227 goto onError;
2229 Py_XDECREF(errorHandler);
2230 Py_XDECREF(exc);
2231 return (PyObject *)unicode;
2233 onError:
2234 Py_DECREF(unicode);
2235 Py_XDECREF(errorHandler);
2236 Py_XDECREF(exc);
2237 return NULL;
2240 PyObject *
2241 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2242 Py_ssize_t size,
2243 const char *errors,
2244 int byteorder)
2246 PyObject *v;
2247 unsigned char *p;
2248 #ifndef Py_UNICODE_WIDE
2249 int i, pairs;
2250 #else
2251 const int pairs = 0;
2252 #endif
2253 /* Offsets from p for storing byte pairs in the right order. */
2254 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2255 int iorder[] = {0, 1, 2, 3};
2256 #else
2257 int iorder[] = {3, 2, 1, 0};
2258 #endif
2260 #define STORECHAR(CH) \
2261 do { \
2262 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2263 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2264 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2265 p[iorder[0]] = (CH) & 0xff; \
2266 p += 4; \
2267 } while(0)
2269 /* In narrow builds we can output surrogate pairs as one codepoint,
2270 so we need less space. */
2271 #ifndef Py_UNICODE_WIDE
2272 for (i = pairs = 0; i < size-1; i++)
2273 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2274 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2275 pairs++;
2276 #endif
2277 v = PyString_FromStringAndSize(NULL,
2278 4 * (size - pairs + (byteorder == 0)));
2279 if (v == NULL)
2280 return NULL;
2282 p = (unsigned char *)PyString_AS_STRING(v);
2283 if (byteorder == 0)
2284 STORECHAR(0xFEFF);
2285 if (size == 0)
2286 return v;
2288 if (byteorder == -1) {
2289 /* force LE */
2290 iorder[0] = 0;
2291 iorder[1] = 1;
2292 iorder[2] = 2;
2293 iorder[3] = 3;
2295 else if (byteorder == 1) {
2296 /* force BE */
2297 iorder[0] = 3;
2298 iorder[1] = 2;
2299 iorder[2] = 1;
2300 iorder[3] = 0;
2303 while (size-- > 0) {
2304 Py_UCS4 ch = *s++;
2305 #ifndef Py_UNICODE_WIDE
2306 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2307 Py_UCS4 ch2 = *s;
2308 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2309 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2310 s++;
2311 size--;
2314 #endif
2315 STORECHAR(ch);
2317 return v;
2318 #undef STORECHAR
2321 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2323 if (!PyUnicode_Check(unicode)) {
2324 PyErr_BadArgument();
2325 return NULL;
2327 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2328 PyUnicode_GET_SIZE(unicode),
2329 NULL,
2333 /* --- UTF-16 Codec ------------------------------------------------------- */
2335 PyObject *
2336 PyUnicode_DecodeUTF16(const char *s,
2337 Py_ssize_t size,
2338 const char *errors,
2339 int *byteorder)
2341 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2344 PyObject *
2345 PyUnicode_DecodeUTF16Stateful(const char *s,
2346 Py_ssize_t size,
2347 const char *errors,
2348 int *byteorder,
2349 Py_ssize_t *consumed)
2351 const char *starts = s;
2352 Py_ssize_t startinpos;
2353 Py_ssize_t endinpos;
2354 Py_ssize_t outpos;
2355 PyUnicodeObject *unicode;
2356 Py_UNICODE *p;
2357 const unsigned char *q, *e;
2358 int bo = 0; /* assume native ordering by default */
2359 const char *errmsg = "";
2360 /* Offsets from q for retrieving byte pairs in the right order. */
2361 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2362 int ihi = 1, ilo = 0;
2363 #else
2364 int ihi = 0, ilo = 1;
2365 #endif
2366 PyObject *errorHandler = NULL;
2367 PyObject *exc = NULL;
2369 /* Note: size will always be longer than the resulting Unicode
2370 character count */
2371 unicode = _PyUnicode_New(size);
2372 if (!unicode)
2373 return NULL;
2374 if (size == 0)
2375 return (PyObject *)unicode;
2377 /* Unpack UTF-16 encoded data */
2378 p = unicode->str;
2379 q = (unsigned char *)s;
2380 e = q + size;
2382 if (byteorder)
2383 bo = *byteorder;
2385 /* Check for BOM marks (U+FEFF) in the input and adjust current
2386 byte order setting accordingly. In native mode, the leading BOM
2387 mark is skipped, in all other modes, it is copied to the output
2388 stream as-is (giving a ZWNBSP character). */
2389 if (bo == 0) {
2390 if (size >= 2) {
2391 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2392 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2393 if (bom == 0xFEFF) {
2394 q += 2;
2395 bo = -1;
2397 else if (bom == 0xFFFE) {
2398 q += 2;
2399 bo = 1;
2401 #else
2402 if (bom == 0xFEFF) {
2403 q += 2;
2404 bo = 1;
2406 else if (bom == 0xFFFE) {
2407 q += 2;
2408 bo = -1;
2410 #endif
2414 if (bo == -1) {
2415 /* force LE */
2416 ihi = 1;
2417 ilo = 0;
2419 else if (bo == 1) {
2420 /* force BE */
2421 ihi = 0;
2422 ilo = 1;
2425 while (q < e) {
2426 Py_UNICODE ch;
2427 /* remaining bytes at the end? (size should be even) */
2428 if (e-q<2) {
2429 if (consumed)
2430 break;
2431 errmsg = "truncated data";
2432 startinpos = ((const char *)q)-starts;
2433 endinpos = ((const char *)e)-starts;
2434 goto utf16Error;
2435 /* The remaining input chars are ignored if the callback
2436 chooses to skip the input */
2438 ch = (q[ihi] << 8) | q[ilo];
2440 q += 2;
2442 if (ch < 0xD800 || ch > 0xDFFF) {
2443 *p++ = ch;
2444 continue;
2447 /* UTF-16 code pair: */
2448 if (q >= e) {
2449 errmsg = "unexpected end of data";
2450 startinpos = (((const char *)q)-2)-starts;
2451 endinpos = ((const char *)e)-starts;
2452 goto utf16Error;
2454 if (0xD800 <= ch && ch <= 0xDBFF) {
2455 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2456 q += 2;
2457 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2458 #ifndef Py_UNICODE_WIDE
2459 *p++ = ch;
2460 *p++ = ch2;
2461 #else
2462 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2463 #endif
2464 continue;
2466 else {
2467 errmsg = "illegal UTF-16 surrogate";
2468 startinpos = (((const char *)q)-4)-starts;
2469 endinpos = startinpos+2;
2470 goto utf16Error;
2474 errmsg = "illegal encoding";
2475 startinpos = (((const char *)q)-2)-starts;
2476 endinpos = startinpos+2;
2477 /* Fall through to report the error */
2479 utf16Error:
2480 outpos = p-PyUnicode_AS_UNICODE(unicode);
2481 if (unicode_decode_call_errorhandler(
2482 errors, &errorHandler,
2483 "utf16", errmsg,
2484 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2485 (PyObject **)&unicode, &outpos, &p))
2486 goto onError;
2489 if (byteorder)
2490 *byteorder = bo;
2492 if (consumed)
2493 *consumed = (const char *)q-starts;
2495 /* Adjust length */
2496 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2497 goto onError;
2499 Py_XDECREF(errorHandler);
2500 Py_XDECREF(exc);
2501 return (PyObject *)unicode;
2503 onError:
2504 Py_DECREF(unicode);
2505 Py_XDECREF(errorHandler);
2506 Py_XDECREF(exc);
2507 return NULL;
2510 PyObject *
2511 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2512 Py_ssize_t size,
2513 const char *errors,
2514 int byteorder)
2516 PyObject *v;
2517 unsigned char *p;
2518 #ifdef Py_UNICODE_WIDE
2519 int i, pairs;
2520 #else
2521 const int pairs = 0;
2522 #endif
2523 /* Offsets from p for storing byte pairs in the right order. */
2524 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2525 int ihi = 1, ilo = 0;
2526 #else
2527 int ihi = 0, ilo = 1;
2528 #endif
2530 #define STORECHAR(CH) \
2531 do { \
2532 p[ihi] = ((CH) >> 8) & 0xff; \
2533 p[ilo] = (CH) & 0xff; \
2534 p += 2; \
2535 } while(0)
2537 #ifdef Py_UNICODE_WIDE
2538 for (i = pairs = 0; i < size; i++)
2539 if (s[i] >= 0x10000)
2540 pairs++;
2541 #endif
2542 v = PyString_FromStringAndSize(NULL,
2543 2 * (size + pairs + (byteorder == 0)));
2544 if (v == NULL)
2545 return NULL;
2547 p = (unsigned char *)PyString_AS_STRING(v);
2548 if (byteorder == 0)
2549 STORECHAR(0xFEFF);
2550 if (size == 0)
2551 return v;
2553 if (byteorder == -1) {
2554 /* force LE */
2555 ihi = 1;
2556 ilo = 0;
2558 else if (byteorder == 1) {
2559 /* force BE */
2560 ihi = 0;
2561 ilo = 1;
2564 while (size-- > 0) {
2565 Py_UNICODE ch = *s++;
2566 Py_UNICODE ch2 = 0;
2567 #ifdef Py_UNICODE_WIDE
2568 if (ch >= 0x10000) {
2569 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2570 ch = 0xD800 | ((ch-0x10000) >> 10);
2572 #endif
2573 STORECHAR(ch);
2574 if (ch2)
2575 STORECHAR(ch2);
2577 return v;
2578 #undef STORECHAR
2581 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2583 if (!PyUnicode_Check(unicode)) {
2584 PyErr_BadArgument();
2585 return NULL;
2587 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2588 PyUnicode_GET_SIZE(unicode),
2589 NULL,
2593 /* --- Unicode Escape Codec ----------------------------------------------- */
2595 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2597 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2598 Py_ssize_t size,
2599 const char *errors)
2601 const char *starts = s;
2602 Py_ssize_t startinpos;
2603 Py_ssize_t endinpos;
2604 Py_ssize_t outpos;
2605 int i;
2606 PyUnicodeObject *v;
2607 Py_UNICODE *p;
2608 const char *end;
2609 char* message;
2610 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2611 PyObject *errorHandler = NULL;
2612 PyObject *exc = NULL;
2614 /* Escaped strings will always be longer than the resulting
2615 Unicode string, so we start with size here and then reduce the
2616 length after conversion to the true value.
2617 (but if the error callback returns a long replacement string
2618 we'll have to allocate more space) */
2619 v = _PyUnicode_New(size);
2620 if (v == NULL)
2621 goto onError;
2622 if (size == 0)
2623 return (PyObject *)v;
2625 p = PyUnicode_AS_UNICODE(v);
2626 end = s + size;
2628 while (s < end) {
2629 unsigned char c;
2630 Py_UNICODE x;
2631 int digits;
2633 /* Non-escape characters are interpreted as Unicode ordinals */
2634 if (*s != '\\') {
2635 *p++ = (unsigned char) *s++;
2636 continue;
2639 startinpos = s-starts;
2640 /* \ - Escapes */
2641 s++;
2642 c = *s++;
2643 if (s > end)
2644 c = '\0'; /* Invalid after \ */
2645 switch (c) {
2647 /* \x escapes */
2648 case '\n': break;
2649 case '\\': *p++ = '\\'; break;
2650 case '\'': *p++ = '\''; break;
2651 case '\"': *p++ = '\"'; break;
2652 case 'b': *p++ = '\b'; break;
2653 case 'f': *p++ = '\014'; break; /* FF */
2654 case 't': *p++ = '\t'; break;
2655 case 'n': *p++ = '\n'; break;
2656 case 'r': *p++ = '\r'; break;
2657 case 'v': *p++ = '\013'; break; /* VT */
2658 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2660 /* \OOO (octal) escapes */
2661 case '0': case '1': case '2': case '3':
2662 case '4': case '5': case '6': case '7':
2663 x = s[-1] - '0';
2664 if (s < end && '0' <= *s && *s <= '7') {
2665 x = (x<<3) + *s++ - '0';
2666 if (s < end && '0' <= *s && *s <= '7')
2667 x = (x<<3) + *s++ - '0';
2669 *p++ = x;
2670 break;
2672 /* hex escapes */
2673 /* \xXX */
2674 case 'x':
2675 digits = 2;
2676 message = "truncated \\xXX escape";
2677 goto hexescape;
2679 /* \uXXXX */
2680 case 'u':
2681 digits = 4;
2682 message = "truncated \\uXXXX escape";
2683 goto hexescape;
2685 /* \UXXXXXXXX */
2686 case 'U':
2687 digits = 8;
2688 message = "truncated \\UXXXXXXXX escape";
2689 hexescape:
2690 chr = 0;
2691 outpos = p-PyUnicode_AS_UNICODE(v);
2692 if (s+digits>end) {
2693 endinpos = size;
2694 if (unicode_decode_call_errorhandler(
2695 errors, &errorHandler,
2696 "unicodeescape", "end of string in escape sequence",
2697 starts, size, &startinpos, &endinpos, &exc, &s,
2698 (PyObject **)&v, &outpos, &p))
2699 goto onError;
2700 goto nextByte;
2702 for (i = 0; i < digits; ++i) {
2703 c = (unsigned char) s[i];
2704 if (!isxdigit(c)) {
2705 endinpos = (s+i+1)-starts;
2706 if (unicode_decode_call_errorhandler(
2707 errors, &errorHandler,
2708 "unicodeescape", message,
2709 starts, size, &startinpos, &endinpos, &exc, &s,
2710 (PyObject **)&v, &outpos, &p))
2711 goto onError;
2712 goto nextByte;
2714 chr = (chr<<4) & ~0xF;
2715 if (c >= '0' && c <= '9')
2716 chr += c - '0';
2717 else if (c >= 'a' && c <= 'f')
2718 chr += 10 + c - 'a';
2719 else
2720 chr += 10 + c - 'A';
2722 s += i;
2723 if (chr == 0xffffffff && PyErr_Occurred())
2724 /* _decoding_error will have already written into the
2725 target buffer. */
2726 break;
2727 store:
2728 /* when we get here, chr is a 32-bit unicode character */
2729 if (chr <= 0xffff)
2730 /* UCS-2 character */
2731 *p++ = (Py_UNICODE) chr;
2732 else if (chr <= 0x10ffff) {
2733 /* UCS-4 character. Either store directly, or as
2734 surrogate pair. */
2735 #ifdef Py_UNICODE_WIDE
2736 *p++ = chr;
2737 #else
2738 chr -= 0x10000L;
2739 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2740 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2741 #endif
2742 } else {
2743 endinpos = s-starts;
2744 outpos = p-PyUnicode_AS_UNICODE(v);
2745 if (unicode_decode_call_errorhandler(
2746 errors, &errorHandler,
2747 "unicodeescape", "illegal Unicode character",
2748 starts, size, &startinpos, &endinpos, &exc, &s,
2749 (PyObject **)&v, &outpos, &p))
2750 goto onError;
2752 break;
2754 /* \N{name} */
2755 case 'N':
2756 message = "malformed \\N character escape";
2757 if (ucnhash_CAPI == NULL) {
2758 /* load the unicode data module */
2759 PyObject *m, *api;
2760 m = PyImport_ImportModuleNoBlock("unicodedata");
2761 if (m == NULL)
2762 goto ucnhashError;
2763 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2764 Py_DECREF(m);
2765 if (api == NULL)
2766 goto ucnhashError;
2767 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2768 Py_DECREF(api);
2769 if (ucnhash_CAPI == NULL)
2770 goto ucnhashError;
2772 if (*s == '{') {
2773 const char *start = s+1;
2774 /* look for the closing brace */
2775 while (*s != '}' && s < end)
2776 s++;
2777 if (s > start && s < end && *s == '}') {
2778 /* found a name. look it up in the unicode database */
2779 message = "unknown Unicode character name";
2780 s++;
2781 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2782 goto store;
2785 endinpos = s-starts;
2786 outpos = p-PyUnicode_AS_UNICODE(v);
2787 if (unicode_decode_call_errorhandler(
2788 errors, &errorHandler,
2789 "unicodeescape", message,
2790 starts, size, &startinpos, &endinpos, &exc, &s,
2791 (PyObject **)&v, &outpos, &p))
2792 goto onError;
2793 break;
2795 default:
2796 if (s > end) {
2797 message = "\\ at end of string";
2798 s--;
2799 endinpos = s-starts;
2800 outpos = p-PyUnicode_AS_UNICODE(v);
2801 if (unicode_decode_call_errorhandler(
2802 errors, &errorHandler,
2803 "unicodeescape", message,
2804 starts, size, &startinpos, &endinpos, &exc, &s,
2805 (PyObject **)&v, &outpos, &p))
2806 goto onError;
2808 else {
2809 *p++ = '\\';
2810 *p++ = (unsigned char)s[-1];
2812 break;
2814 nextByte:
2817 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2818 goto onError;
2819 Py_XDECREF(errorHandler);
2820 Py_XDECREF(exc);
2821 return (PyObject *)v;
2823 ucnhashError:
2824 PyErr_SetString(
2825 PyExc_UnicodeError,
2826 "\\N escapes not supported (can't load unicodedata module)"
2828 Py_XDECREF(v);
2829 Py_XDECREF(errorHandler);
2830 Py_XDECREF(exc);
2831 return NULL;
2833 onError:
2834 Py_XDECREF(v);
2835 Py_XDECREF(errorHandler);
2836 Py_XDECREF(exc);
2837 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2847 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2848 Py_ssize_t size,
2849 Py_UNICODE ch)
2851 /* like wcschr, but doesn't stop at NULL characters */
2853 while (size-- > 0) {
2854 if (*s == ch)
2855 return s;
2856 s++;
2859 return NULL;
2862 static
2863 PyObject *unicodeescape_string(const Py_UNICODE *s,
2864 Py_ssize_t size,
2865 int quotes)
2867 PyObject *repr;
2868 char *p;
2870 static const char *hexdigit = "0123456789abcdef";
2872 /* XXX(nnorwitz): rather than over-allocating, it would be
2873 better to choose a different scheme. Perhaps scan the
2874 first N-chars of the string and allocate based on that size.
2876 /* Initial allocation is based on the longest-possible unichr
2877 escape.
2879 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2880 unichr, so in this case it's the longest unichr escape. In
2881 narrow (UTF-16) builds this is five chars per source unichr
2882 since there are two unichrs in the surrogate pair, so in narrow
2883 (UTF-16) builds it's not the longest unichr escape.
2885 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2886 so in the narrow (UTF-16) build case it's the longest unichr
2887 escape.
2890 repr = PyString_FromStringAndSize(NULL,
2892 #ifdef Py_UNICODE_WIDE
2893 + 10*size
2894 #else
2895 + 6*size
2896 #endif
2897 + 1);
2898 if (repr == NULL)
2899 return NULL;
2901 p = PyString_AS_STRING(repr);
2903 if (quotes) {
2904 *p++ = 'u';
2905 *p++ = (findchar(s, size, '\'') &&
2906 !findchar(s, size, '"')) ? '"' : '\'';
2908 while (size-- > 0) {
2909 Py_UNICODE ch = *s++;
2911 /* Escape quotes and backslashes */
2912 if ((quotes &&
2913 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2914 *p++ = '\\';
2915 *p++ = (char) ch;
2916 continue;
2919 #ifdef Py_UNICODE_WIDE
2920 /* Map 21-bit characters to '\U00xxxxxx' */
2921 else if (ch >= 0x10000) {
2922 *p++ = '\\';
2923 *p++ = 'U';
2924 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2925 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2926 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2927 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2928 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2929 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2930 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2931 *p++ = hexdigit[ch & 0x0000000F];
2932 continue;
2934 #else
2935 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2936 else if (ch >= 0xD800 && ch < 0xDC00) {
2937 Py_UNICODE ch2;
2938 Py_UCS4 ucs;
2940 ch2 = *s++;
2941 size--;
2942 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2943 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2944 *p++ = '\\';
2945 *p++ = 'U';
2946 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2947 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2948 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2949 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2950 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2951 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2952 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2953 *p++ = hexdigit[ucs & 0x0000000F];
2954 continue;
2956 /* Fall through: isolated surrogates are copied as-is */
2957 s--;
2958 size++;
2960 #endif
2962 /* Map 16-bit characters to '\uxxxx' */
2963 if (ch >= 256) {
2964 *p++ = '\\';
2965 *p++ = 'u';
2966 *p++ = hexdigit[(ch >> 12) & 0x000F];
2967 *p++ = hexdigit[(ch >> 8) & 0x000F];
2968 *p++ = hexdigit[(ch >> 4) & 0x000F];
2969 *p++ = hexdigit[ch & 0x000F];
2972 /* Map special whitespace to '\t', \n', '\r' */
2973 else if (ch == '\t') {
2974 *p++ = '\\';
2975 *p++ = 't';
2977 else if (ch == '\n') {
2978 *p++ = '\\';
2979 *p++ = 'n';
2981 else if (ch == '\r') {
2982 *p++ = '\\';
2983 *p++ = 'r';
2986 /* Map non-printable US ASCII to '\xhh' */
2987 else if (ch < ' ' || ch >= 0x7F) {
2988 *p++ = '\\';
2989 *p++ = 'x';
2990 *p++ = hexdigit[(ch >> 4) & 0x000F];
2991 *p++ = hexdigit[ch & 0x000F];
2994 /* Copy everything else as-is */
2995 else
2996 *p++ = (char) ch;
2998 if (quotes)
2999 *p++ = PyString_AS_STRING(repr)[1];
3001 *p = '\0';
3002 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3003 return repr;
3006 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3007 Py_ssize_t size)
3009 return unicodeescape_string(s, size, 0);
3012 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3014 if (!PyUnicode_Check(unicode)) {
3015 PyErr_BadArgument();
3016 return NULL;
3018 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3019 PyUnicode_GET_SIZE(unicode));
3022 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3024 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3025 Py_ssize_t size,
3026 const char *errors)
3028 const char *starts = s;
3029 Py_ssize_t startinpos;
3030 Py_ssize_t endinpos;
3031 Py_ssize_t outpos;
3032 PyUnicodeObject *v;
3033 Py_UNICODE *p;
3034 const char *end;
3035 const char *bs;
3036 PyObject *errorHandler = NULL;
3037 PyObject *exc = NULL;
3039 /* Escaped strings will always be longer than the resulting
3040 Unicode string, so we start with size here and then reduce the
3041 length after conversion to the true value. (But decoding error
3042 handler might have to resize the string) */
3043 v = _PyUnicode_New(size);
3044 if (v == NULL)
3045 goto onError;
3046 if (size == 0)
3047 return (PyObject *)v;
3048 p = PyUnicode_AS_UNICODE(v);
3049 end = s + size;
3050 while (s < end) {
3051 unsigned char c;
3052 Py_UCS4 x;
3053 int i;
3054 int count;
3056 /* Non-escape characters are interpreted as Unicode ordinals */
3057 if (*s != '\\') {
3058 *p++ = (unsigned char)*s++;
3059 continue;
3061 startinpos = s-starts;
3063 /* \u-escapes are only interpreted iff the number of leading
3064 backslashes if odd */
3065 bs = s;
3066 for (;s < end;) {
3067 if (*s != '\\')
3068 break;
3069 *p++ = (unsigned char)*s++;
3071 if (((s - bs) & 1) == 0 ||
3072 s >= end ||
3073 (*s != 'u' && *s != 'U')) {
3074 continue;
3076 p--;
3077 count = *s=='u' ? 4 : 8;
3078 s++;
3080 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3081 outpos = p-PyUnicode_AS_UNICODE(v);
3082 for (x = 0, i = 0; i < count; ++i, ++s) {
3083 c = (unsigned char)*s;
3084 if (!isxdigit(c)) {
3085 endinpos = s-starts;
3086 if (unicode_decode_call_errorhandler(
3087 errors, &errorHandler,
3088 "rawunicodeescape", "truncated \\uXXXX",
3089 starts, size, &startinpos, &endinpos, &exc, &s,
3090 (PyObject **)&v, &outpos, &p))
3091 goto onError;
3092 goto nextByte;
3094 x = (x<<4) & ~0xF;
3095 if (c >= '0' && c <= '9')
3096 x += c - '0';
3097 else if (c >= 'a' && c <= 'f')
3098 x += 10 + c - 'a';
3099 else
3100 x += 10 + c - 'A';
3102 if (x <= 0xffff)
3103 /* UCS-2 character */
3104 *p++ = (Py_UNICODE) x;
3105 else if (x <= 0x10ffff) {
3106 /* UCS-4 character. Either store directly, or as
3107 surrogate pair. */
3108 #ifdef Py_UNICODE_WIDE
3109 *p++ = (Py_UNICODE) x;
3110 #else
3111 x -= 0x10000L;
3112 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3113 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3114 #endif
3115 } else {
3116 endinpos = s-starts;
3117 outpos = p-PyUnicode_AS_UNICODE(v);
3118 if (unicode_decode_call_errorhandler(
3119 errors, &errorHandler,
3120 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3121 starts, size, &startinpos, &endinpos, &exc, &s,
3122 (PyObject **)&v, &outpos, &p))
3123 goto onError;
3125 nextByte:
3128 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3129 goto onError;
3130 Py_XDECREF(errorHandler);
3131 Py_XDECREF(exc);
3132 return (PyObject *)v;
3134 onError:
3135 Py_XDECREF(v);
3136 Py_XDECREF(errorHandler);
3137 Py_XDECREF(exc);
3138 return NULL;
3141 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3142 Py_ssize_t size)
3144 PyObject *repr;
3145 char *p;
3146 char *q;
3148 static const char *hexdigit = "0123456789abcdef";
3150 #ifdef Py_UNICODE_WIDE
3151 repr = PyString_FromStringAndSize(NULL, 10 * size);
3152 #else
3153 repr = PyString_FromStringAndSize(NULL, 6 * size);
3154 #endif
3155 if (repr == NULL)
3156 return NULL;
3157 if (size == 0)
3158 return repr;
3160 p = q = PyString_AS_STRING(repr);
3161 while (size-- > 0) {
3162 Py_UNICODE ch = *s++;
3163 #ifdef Py_UNICODE_WIDE
3164 /* Map 32-bit characters to '\Uxxxxxxxx' */
3165 if (ch >= 0x10000) {
3166 *p++ = '\\';
3167 *p++ = 'U';
3168 *p++ = hexdigit[(ch >> 28) & 0xf];
3169 *p++ = hexdigit[(ch >> 24) & 0xf];
3170 *p++ = hexdigit[(ch >> 20) & 0xf];
3171 *p++ = hexdigit[(ch >> 16) & 0xf];
3172 *p++ = hexdigit[(ch >> 12) & 0xf];
3173 *p++ = hexdigit[(ch >> 8) & 0xf];
3174 *p++ = hexdigit[(ch >> 4) & 0xf];
3175 *p++ = hexdigit[ch & 15];
3177 else
3178 #else
3179 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3180 if (ch >= 0xD800 && ch < 0xDC00) {
3181 Py_UNICODE ch2;
3182 Py_UCS4 ucs;
3184 ch2 = *s++;
3185 size--;
3186 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3187 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3188 *p++ = '\\';
3189 *p++ = 'U';
3190 *p++ = hexdigit[(ucs >> 28) & 0xf];
3191 *p++ = hexdigit[(ucs >> 24) & 0xf];
3192 *p++ = hexdigit[(ucs >> 20) & 0xf];
3193 *p++ = hexdigit[(ucs >> 16) & 0xf];
3194 *p++ = hexdigit[(ucs >> 12) & 0xf];
3195 *p++ = hexdigit[(ucs >> 8) & 0xf];
3196 *p++ = hexdigit[(ucs >> 4) & 0xf];
3197 *p++ = hexdigit[ucs & 0xf];
3198 continue;
3200 /* Fall through: isolated surrogates are copied as-is */
3201 s--;
3202 size++;
3204 #endif
3205 /* Map 16-bit characters to '\uxxxx' */
3206 if (ch >= 256) {
3207 *p++ = '\\';
3208 *p++ = 'u';
3209 *p++ = hexdigit[(ch >> 12) & 0xf];
3210 *p++ = hexdigit[(ch >> 8) & 0xf];
3211 *p++ = hexdigit[(ch >> 4) & 0xf];
3212 *p++ = hexdigit[ch & 15];
3214 /* Copy everything else as-is */
3215 else
3216 *p++ = (char) ch;
3218 *p = '\0';
3219 _PyString_Resize(&repr, p - q);
3220 return repr;
3223 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3225 if (!PyUnicode_Check(unicode)) {
3226 PyErr_BadArgument();
3227 return NULL;
3229 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3230 PyUnicode_GET_SIZE(unicode));
3233 /* --- Unicode Internal Codec ------------------------------------------- */
3235 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3236 Py_ssize_t size,
3237 const char *errors)
3239 const char *starts = s;
3240 Py_ssize_t startinpos;
3241 Py_ssize_t endinpos;
3242 Py_ssize_t outpos;
3243 PyUnicodeObject *v;
3244 Py_UNICODE *p;
3245 const char *end;
3246 const char *reason;
3247 PyObject *errorHandler = NULL;
3248 PyObject *exc = NULL;
3250 #ifdef Py_UNICODE_WIDE
3251 Py_UNICODE unimax = PyUnicode_GetMax();
3252 #endif
3254 /* XXX overflow detection missing */
3255 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3256 if (v == NULL)
3257 goto onError;
3258 if (PyUnicode_GetSize((PyObject *)v) == 0)
3259 return (PyObject *)v;
3260 p = PyUnicode_AS_UNICODE(v);
3261 end = s + size;
3263 while (s < end) {
3264 memcpy(p, s, sizeof(Py_UNICODE));
3265 /* We have to sanity check the raw data, otherwise doom looms for
3266 some malformed UCS-4 data. */
3267 if (
3268 #ifdef Py_UNICODE_WIDE
3269 *p > unimax || *p < 0 ||
3270 #endif
3271 end-s < Py_UNICODE_SIZE
3274 startinpos = s - starts;
3275 if (end-s < Py_UNICODE_SIZE) {
3276 endinpos = end-starts;
3277 reason = "truncated input";
3279 else {
3280 endinpos = s - starts + Py_UNICODE_SIZE;
3281 reason = "illegal code point (> 0x10FFFF)";
3283 outpos = p - PyUnicode_AS_UNICODE(v);
3284 if (unicode_decode_call_errorhandler(
3285 errors, &errorHandler,
3286 "unicode_internal", reason,
3287 starts, size, &startinpos, &endinpos, &exc, &s,
3288 (PyObject **)&v, &outpos, &p)) {
3289 goto onError;
3292 else {
3293 p++;
3294 s += Py_UNICODE_SIZE;
3298 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3299 goto onError;
3300 Py_XDECREF(errorHandler);
3301 Py_XDECREF(exc);
3302 return (PyObject *)v;
3304 onError:
3305 Py_XDECREF(v);
3306 Py_XDECREF(errorHandler);
3307 Py_XDECREF(exc);
3308 return NULL;
3311 /* --- Latin-1 Codec ------------------------------------------------------ */
3313 PyObject *PyUnicode_DecodeLatin1(const char *s,
3314 Py_ssize_t size,
3315 const char *errors)
3317 PyUnicodeObject *v;
3318 Py_UNICODE *p;
3320 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3321 if (size == 1) {
3322 Py_UNICODE r = *(unsigned char*)s;
3323 return PyUnicode_FromUnicode(&r, 1);
3326 v = _PyUnicode_New(size);
3327 if (v == NULL)
3328 goto onError;
3329 if (size == 0)
3330 return (PyObject *)v;
3331 p = PyUnicode_AS_UNICODE(v);
3332 while (size-- > 0)
3333 *p++ = (unsigned char)*s++;
3334 return (PyObject *)v;
3336 onError:
3337 Py_XDECREF(v);
3338 return NULL;
3341 /* create or adjust a UnicodeEncodeError */
3342 static void make_encode_exception(PyObject **exceptionObject,
3343 const char *encoding,
3344 const Py_UNICODE *unicode, Py_ssize_t size,
3345 Py_ssize_t startpos, Py_ssize_t endpos,
3346 const char *reason)
3348 if (*exceptionObject == NULL) {
3349 *exceptionObject = PyUnicodeEncodeError_Create(
3350 encoding, unicode, size, startpos, endpos, reason);
3352 else {
3353 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3354 goto onError;
3355 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3356 goto onError;
3357 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3358 goto onError;
3359 return;
3360 onError:
3361 Py_DECREF(*exceptionObject);
3362 *exceptionObject = NULL;
3366 /* raises a UnicodeEncodeError */
3367 static void raise_encode_exception(PyObject **exceptionObject,
3368 const char *encoding,
3369 const Py_UNICODE *unicode, Py_ssize_t size,
3370 Py_ssize_t startpos, Py_ssize_t endpos,
3371 const char *reason)
3373 make_encode_exception(exceptionObject,
3374 encoding, unicode, size, startpos, endpos, reason);
3375 if (*exceptionObject != NULL)
3376 PyCodec_StrictErrors(*exceptionObject);
3379 /* error handling callback helper:
3380 build arguments, call the callback and check the arguments,
3381 put the result into newpos and return the replacement string, which
3382 has to be freed by the caller */
3383 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3384 PyObject **errorHandler,
3385 const char *encoding, const char *reason,
3386 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3387 Py_ssize_t startpos, Py_ssize_t endpos,
3388 Py_ssize_t *newpos)
3390 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3392 PyObject *restuple;
3393 PyObject *resunicode;
3395 if (*errorHandler == NULL) {
3396 *errorHandler = PyCodec_LookupError(errors);
3397 if (*errorHandler == NULL)
3398 return NULL;
3401 make_encode_exception(exceptionObject,
3402 encoding, unicode, size, startpos, endpos, reason);
3403 if (*exceptionObject == NULL)
3404 return NULL;
3406 restuple = PyObject_CallFunctionObjArgs(
3407 *errorHandler, *exceptionObject, NULL);
3408 if (restuple == NULL)
3409 return NULL;
3410 if (!PyTuple_Check(restuple)) {
3411 PyErr_Format(PyExc_TypeError, &argparse[4]);
3412 Py_DECREF(restuple);
3413 return NULL;
3415 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3416 &resunicode, newpos)) {
3417 Py_DECREF(restuple);
3418 return NULL;
3420 if (*newpos<0)
3421 *newpos = size+*newpos;
3422 if (*newpos<0 || *newpos>size) {
3423 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3424 Py_DECREF(restuple);
3425 return NULL;
3427 Py_INCREF(resunicode);
3428 Py_DECREF(restuple);
3429 return resunicode;
3432 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3433 Py_ssize_t size,
3434 const char *errors,
3435 int limit)
3437 /* output object */
3438 PyObject *res;
3439 /* pointers to the beginning and end+1 of input */
3440 const Py_UNICODE *startp = p;
3441 const Py_UNICODE *endp = p + size;
3442 /* pointer to the beginning of the unencodable characters */
3443 /* const Py_UNICODE *badp = NULL; */
3444 /* pointer into the output */
3445 char *str;
3446 /* current output position */
3447 Py_ssize_t respos = 0;
3448 Py_ssize_t ressize;
3449 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3450 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3451 PyObject *errorHandler = NULL;
3452 PyObject *exc = NULL;
3453 /* the following variable is used for caching string comparisons
3454 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3455 int known_errorHandler = -1;
3457 /* allocate enough for a simple encoding without
3458 replacements, if we need more, we'll resize */
3459 res = PyString_FromStringAndSize(NULL, size);
3460 if (res == NULL)
3461 goto onError;
3462 if (size == 0)
3463 return res;
3464 str = PyString_AS_STRING(res);
3465 ressize = size;
3467 while (p<endp) {
3468 Py_UNICODE c = *p;
3470 /* can we encode this? */
3471 if (c<limit) {
3472 /* no overflow check, because we know that the space is enough */
3473 *str++ = (char)c;
3474 ++p;
3476 else {
3477 Py_ssize_t unicodepos = p-startp;
3478 Py_ssize_t requiredsize;
3479 PyObject *repunicode;
3480 Py_ssize_t repsize;
3481 Py_ssize_t newpos;
3482 Py_ssize_t respos;
3483 Py_UNICODE *uni2;
3484 /* startpos for collecting unencodable chars */
3485 const Py_UNICODE *collstart = p;
3486 const Py_UNICODE *collend = p;
3487 /* find all unecodable characters */
3488 while ((collend < endp) && ((*collend)>=limit))
3489 ++collend;
3490 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3491 if (known_errorHandler==-1) {
3492 if ((errors==NULL) || (!strcmp(errors, "strict")))
3493 known_errorHandler = 1;
3494 else if (!strcmp(errors, "replace"))
3495 known_errorHandler = 2;
3496 else if (!strcmp(errors, "ignore"))
3497 known_errorHandler = 3;
3498 else if (!strcmp(errors, "xmlcharrefreplace"))
3499 known_errorHandler = 4;
3500 else
3501 known_errorHandler = 0;
3503 switch (known_errorHandler) {
3504 case 1: /* strict */
3505 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3506 goto onError;
3507 case 2: /* replace */
3508 while (collstart++<collend)
3509 *str++ = '?'; /* fall through */
3510 case 3: /* ignore */
3511 p = collend;
3512 break;
3513 case 4: /* xmlcharrefreplace */
3514 respos = str-PyString_AS_STRING(res);
3515 /* determine replacement size (temporarily (mis)uses p) */
3516 for (p = collstart, repsize = 0; p < collend; ++p) {
3517 if (*p<10)
3518 repsize += 2+1+1;
3519 else if (*p<100)
3520 repsize += 2+2+1;
3521 else if (*p<1000)
3522 repsize += 2+3+1;
3523 else if (*p<10000)
3524 repsize += 2+4+1;
3525 #ifndef Py_UNICODE_WIDE
3526 else
3527 repsize += 2+5+1;
3528 #else
3529 else if (*p<100000)
3530 repsize += 2+5+1;
3531 else if (*p<1000000)
3532 repsize += 2+6+1;
3533 else
3534 repsize += 2+7+1;
3535 #endif
3537 requiredsize = respos+repsize+(endp-collend);
3538 if (requiredsize > ressize) {
3539 if (requiredsize<2*ressize)
3540 requiredsize = 2*ressize;
3541 if (_PyString_Resize(&res, requiredsize))
3542 goto onError;
3543 str = PyString_AS_STRING(res) + respos;
3544 ressize = requiredsize;
3546 /* generate replacement (temporarily (mis)uses p) */
3547 for (p = collstart; p < collend; ++p) {
3548 str += sprintf(str, "&#%d;", (int)*p);
3550 p = collend;
3551 break;
3552 default:
3553 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3554 encoding, reason, startp, size, &exc,
3555 collstart-startp, collend-startp, &newpos);
3556 if (repunicode == NULL)
3557 goto onError;
3558 /* need more space? (at least enough for what we
3559 have+the replacement+the rest of the string, so
3560 we won't have to check space for encodable characters) */
3561 respos = str-PyString_AS_STRING(res);
3562 repsize = PyUnicode_GET_SIZE(repunicode);
3563 requiredsize = respos+repsize+(endp-collend);
3564 if (requiredsize > ressize) {
3565 if (requiredsize<2*ressize)
3566 requiredsize = 2*ressize;
3567 if (_PyString_Resize(&res, requiredsize)) {
3568 Py_DECREF(repunicode);
3569 goto onError;
3571 str = PyString_AS_STRING(res) + respos;
3572 ressize = requiredsize;
3574 /* check if there is anything unencodable in the replacement
3575 and copy it to the output */
3576 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3577 c = *uni2;
3578 if (c >= limit) {
3579 raise_encode_exception(&exc, encoding, startp, size,
3580 unicodepos, unicodepos+1, reason);
3581 Py_DECREF(repunicode);
3582 goto onError;
3584 *str = (char)c;
3586 p = startp + newpos;
3587 Py_DECREF(repunicode);
3591 /* Resize if we allocated to much */
3592 respos = str-PyString_AS_STRING(res);
3593 if (respos<ressize)
3594 /* If this falls res will be NULL */
3595 _PyString_Resize(&res, respos);
3596 Py_XDECREF(errorHandler);
3597 Py_XDECREF(exc);
3598 return res;
3600 onError:
3601 Py_XDECREF(res);
3602 Py_XDECREF(errorHandler);
3603 Py_XDECREF(exc);
3604 return NULL;
3607 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3608 Py_ssize_t size,
3609 const char *errors)
3611 return unicode_encode_ucs1(p, size, errors, 256);
3614 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3616 if (!PyUnicode_Check(unicode)) {
3617 PyErr_BadArgument();
3618 return NULL;
3620 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3621 PyUnicode_GET_SIZE(unicode),
3622 NULL);
3625 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3627 PyObject *PyUnicode_DecodeASCII(const char *s,
3628 Py_ssize_t size,
3629 const char *errors)
3631 const char *starts = s;
3632 PyUnicodeObject *v;
3633 Py_UNICODE *p;
3634 Py_ssize_t startinpos;
3635 Py_ssize_t endinpos;
3636 Py_ssize_t outpos;
3637 const char *e;
3638 PyObject *errorHandler = NULL;
3639 PyObject *exc = NULL;
3641 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3642 if (size == 1 && *(unsigned char*)s < 128) {
3643 Py_UNICODE r = *(unsigned char*)s;
3644 return PyUnicode_FromUnicode(&r, 1);
3647 v = _PyUnicode_New(size);
3648 if (v == NULL)
3649 goto onError;
3650 if (size == 0)
3651 return (PyObject *)v;
3652 p = PyUnicode_AS_UNICODE(v);
3653 e = s + size;
3654 while (s < e) {
3655 register unsigned char c = (unsigned char)*s;
3656 if (c < 128) {
3657 *p++ = c;
3658 ++s;
3660 else {
3661 startinpos = s-starts;
3662 endinpos = startinpos + 1;
3663 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3664 if (unicode_decode_call_errorhandler(
3665 errors, &errorHandler,
3666 "ascii", "ordinal not in range(128)",
3667 starts, size, &startinpos, &endinpos, &exc, &s,
3668 (PyObject **)&v, &outpos, &p))
3669 goto onError;
3672 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3673 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3674 goto onError;
3675 Py_XDECREF(errorHandler);
3676 Py_XDECREF(exc);
3677 return (PyObject *)v;
3679 onError:
3680 Py_XDECREF(v);
3681 Py_XDECREF(errorHandler);
3682 Py_XDECREF(exc);
3683 return NULL;
3686 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3687 Py_ssize_t size,
3688 const char *errors)
3690 return unicode_encode_ucs1(p, size, errors, 128);
3693 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3695 if (!PyUnicode_Check(unicode)) {
3696 PyErr_BadArgument();
3697 return NULL;
3699 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3700 PyUnicode_GET_SIZE(unicode),
3701 NULL);
3704 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3706 /* --- MBCS codecs for Windows -------------------------------------------- */
3708 #if SIZEOF_INT < SIZEOF_SSIZE_T
3709 #define NEED_RETRY
3710 #endif
3712 /* XXX This code is limited to "true" double-byte encodings, as
3713 a) it assumes an incomplete character consists of a single byte, and
3714 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3715 encodings, see IsDBCSLeadByteEx documentation. */
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   really starts a character, 0 otherwise. A byte that IsDBCSLeadByte()
   accepts is only counted when the previous character (as found by
   CharPrev) is the byte itself, is not a lead byte, or is a complete
   two-byte character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3729 * Decode MBCS string into unicode object. If 'final' is set, converts
3730 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3732 static int decode_mbcs(PyUnicodeObject **v,
3733 const char *s, /* MBCS string */
3734 int size, /* sizeof MBCS string */
3735 int final)
3737 Py_UNICODE *p;
3738 Py_ssize_t n = 0;
3739 int usize = 0;
3741 assert(size >= 0);
3743 /* Skip trailing lead-byte unless 'final' is set */
3744 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3745 --size;
3747 /* First get the size of the result */
3748 if (size > 0) {
3749 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3750 if (usize == 0) {
3751 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3752 return -1;
3756 if (*v == NULL) {
3757 /* Create unicode object */
3758 *v = _PyUnicode_New(usize);
3759 if (*v == NULL)
3760 return -1;
3762 else {
3763 /* Extend unicode object */
3764 n = PyUnicode_GET_SIZE(*v);
3765 if (_PyUnicode_Resize(v, n + usize) < 0)
3766 return -1;
3769 /* Do the conversion */
3770 if (size > 0) {
3771 p = PyUnicode_AS_UNICODE(*v) + n;
3772 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3773 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3774 return -1;
3778 return size;
3781 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3782 Py_ssize_t size,
3783 const char *errors,
3784 Py_ssize_t *consumed)
3786 PyUnicodeObject *v = NULL;
3787 int done;
3789 if (consumed)
3790 *consumed = 0;
3792 #ifdef NEED_RETRY
3793 retry:
3794 if (size > INT_MAX)
3795 done = decode_mbcs(&v, s, INT_MAX, 0);
3796 else
3797 #endif
3798 done = decode_mbcs(&v, s, (int)size, !consumed);
3800 if (done < 0) {
3801 Py_XDECREF(v);
3802 return NULL;
3805 if (consumed)
3806 *consumed += done;
3808 #ifdef NEED_RETRY
3809 if (size > INT_MAX) {
3810 s += done;
3811 size -= done;
3812 goto retry;
3814 #endif
3816 return (PyObject *)v;
3819 PyObject *PyUnicode_DecodeMBCS(const char *s,
3820 Py_ssize_t size,
3821 const char *errors)
3823 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3827 * Convert unicode into string object (MBCS).
3828 * Returns 0 if succeed, -1 otherwise.
3830 static int encode_mbcs(PyObject **repr,
3831 const Py_UNICODE *p, /* unicode */
3832 int size) /* size of unicode */
3834 int mbcssize = 0;
3835 Py_ssize_t n = 0;
3837 assert(size >= 0);
3839 /* First get the size of the result */
3840 if (size > 0) {
3841 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3842 if (mbcssize == 0) {
3843 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3844 return -1;
3848 if (*repr == NULL) {
3849 /* Create string object */
3850 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3851 if (*repr == NULL)
3852 return -1;
3854 else {
3855 /* Extend string object */
3856 n = PyString_Size(*repr);
3857 if (_PyString_Resize(repr, n + mbcssize) < 0)
3858 return -1;
3861 /* Do the conversion */
3862 if (size > 0) {
3863 char *s = PyString_AS_STRING(*repr) + n;
3864 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3865 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3866 return -1;
3870 return 0;
3873 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3874 Py_ssize_t size,
3875 const char *errors)
3877 PyObject *repr = NULL;
3878 int ret;
3880 #ifdef NEED_RETRY
3881 retry:
3882 if (size > INT_MAX)
3883 ret = encode_mbcs(&repr, p, INT_MAX);
3884 else
3885 #endif
3886 ret = encode_mbcs(&repr, p, (int)size);
3888 if (ret < 0) {
3889 Py_XDECREF(repr);
3890 return NULL;
3893 #ifdef NEED_RETRY
3894 if (size > INT_MAX) {
3895 p += INT_MAX;
3896 size -= INT_MAX;
3897 goto retry;
3899 #endif
3901 return repr;
3904 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3906 if (!PyUnicode_Check(unicode)) {
3907 PyErr_BadArgument();
3908 return NULL;
3910 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3911 PyUnicode_GET_SIZE(unicode),
3912 NULL);
3915 #undef NEED_RETRY
3917 #endif /* MS_WINDOWS */
3919 /* --- Character Mapping Codec -------------------------------------------- */
3921 PyObject *PyUnicode_DecodeCharmap(const char *s,
3922 Py_ssize_t size,
3923 PyObject *mapping,
3924 const char *errors)
3926 const char *starts = s;
3927 Py_ssize_t startinpos;
3928 Py_ssize_t endinpos;
3929 Py_ssize_t outpos;
3930 const char *e;
3931 PyUnicodeObject *v;
3932 Py_UNICODE *p;
3933 Py_ssize_t extrachars = 0;
3934 PyObject *errorHandler = NULL;
3935 PyObject *exc = NULL;
3936 Py_UNICODE *mapstring = NULL;
3937 Py_ssize_t maplen = 0;
3939 /* Default to Latin-1 */
3940 if (mapping == NULL)
3941 return PyUnicode_DecodeLatin1(s, size, errors);
3943 v = _PyUnicode_New(size);
3944 if (v == NULL)
3945 goto onError;
3946 if (size == 0)
3947 return (PyObject *)v;
3948 p = PyUnicode_AS_UNICODE(v);
3949 e = s + size;
3950 if (PyUnicode_CheckExact(mapping)) {
3951 mapstring = PyUnicode_AS_UNICODE(mapping);
3952 maplen = PyUnicode_GET_SIZE(mapping);
3953 while (s < e) {
3954 unsigned char ch = *s;
3955 Py_UNICODE x = 0xfffe; /* illegal value */
3957 if (ch < maplen)
3958 x = mapstring[ch];
3960 if (x == 0xfffe) {
3961 /* undefined mapping */
3962 outpos = p-PyUnicode_AS_UNICODE(v);
3963 startinpos = s-starts;
3964 endinpos = startinpos+1;
3965 if (unicode_decode_call_errorhandler(
3966 errors, &errorHandler,
3967 "charmap", "character maps to <undefined>",
3968 starts, size, &startinpos, &endinpos, &exc, &s,
3969 (PyObject **)&v, &outpos, &p)) {
3970 goto onError;
3972 continue;
3974 *p++ = x;
3975 ++s;
3978 else {
3979 while (s < e) {
3980 unsigned char ch = *s;
3981 PyObject *w, *x;
3983 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3984 w = PyInt_FromLong((long)ch);
3985 if (w == NULL)
3986 goto onError;
3987 x = PyObject_GetItem(mapping, w);
3988 Py_DECREF(w);
3989 if (x == NULL) {
3990 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3991 /* No mapping found means: mapping is undefined. */
3992 PyErr_Clear();
3993 x = Py_None;
3994 Py_INCREF(x);
3995 } else
3996 goto onError;
3999 /* Apply mapping */
4000 if (PyInt_Check(x)) {
4001 long value = PyInt_AS_LONG(x);
4002 if (value < 0 || value > 65535) {
4003 PyErr_SetString(PyExc_TypeError,
4004 "character mapping must be in range(65536)");
4005 Py_DECREF(x);
4006 goto onError;
4008 *p++ = (Py_UNICODE)value;
4010 else if (x == Py_None) {
4011 /* undefined mapping */
4012 outpos = p-PyUnicode_AS_UNICODE(v);
4013 startinpos = s-starts;
4014 endinpos = startinpos+1;
4015 if (unicode_decode_call_errorhandler(
4016 errors, &errorHandler,
4017 "charmap", "character maps to <undefined>",
4018 starts, size, &startinpos, &endinpos, &exc, &s,
4019 (PyObject **)&v, &outpos, &p)) {
4020 Py_DECREF(x);
4021 goto onError;
4023 Py_DECREF(x);
4024 continue;
4026 else if (PyUnicode_Check(x)) {
4027 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4029 if (targetsize == 1)
4030 /* 1-1 mapping */
4031 *p++ = *PyUnicode_AS_UNICODE(x);
4033 else if (targetsize > 1) {
4034 /* 1-n mapping */
4035 if (targetsize > extrachars) {
4036 /* resize first */
4037 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4038 Py_ssize_t needed = (targetsize - extrachars) + \
4039 (targetsize << 2);
4040 extrachars += needed;
4041 /* XXX overflow detection missing */
4042 if (_PyUnicode_Resize(&v,
4043 PyUnicode_GET_SIZE(v) + needed) < 0) {
4044 Py_DECREF(x);
4045 goto onError;
4047 p = PyUnicode_AS_UNICODE(v) + oldpos;
4049 Py_UNICODE_COPY(p,
4050 PyUnicode_AS_UNICODE(x),
4051 targetsize);
4052 p += targetsize;
4053 extrachars -= targetsize;
4055 /* 1-0 mapping: skip the character */
4057 else {
4058 /* wrong return value */
4059 PyErr_SetString(PyExc_TypeError,
4060 "character mapping must return integer, None or unicode");
4061 Py_DECREF(x);
4062 goto onError;
4064 Py_DECREF(x);
4065 ++s;
4068 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4069 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4070 goto onError;
4071 Py_XDECREF(errorHandler);
4072 Py_XDECREF(exc);
4073 return (PyObject *)v;
4075 onError:
4076 Py_XDECREF(errorHandler);
4077 Py_XDECREF(exc);
4078 Py_XDECREF(v);
4079 return NULL;
4082 /* Charmap encoding: the lookup table */
4084 struct encoding_map{
4085 PyObject_HEAD
4086 unsigned char level1[32];
4087 int count2, count3;
4088 unsigned char level23[1];
4091 static PyObject*
4092 encoding_map_size(PyObject *obj, PyObject* args)
4094 struct encoding_map *map = (struct encoding_map*)obj;
4095 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4096 128*map->count3);
4099 static PyMethodDef encoding_map_methods[] = {
4100 {"size", encoding_map_size, METH_NOARGS,
4101 PyDoc_STR("Return the size (in bytes) of this object") },
4102 { 0 }
4105 static void
4106 encoding_map_dealloc(PyObject* o)
4108 PyObject_FREE(o);
4111 static PyTypeObject EncodingMapType = {
4112 PyVarObject_HEAD_INIT(NULL, 0)
4113 "EncodingMap", /*tp_name*/
4114 sizeof(struct encoding_map), /*tp_basicsize*/
4115 0, /*tp_itemsize*/
4116 /* methods */
4117 encoding_map_dealloc, /*tp_dealloc*/
4118 0, /*tp_print*/
4119 0, /*tp_getattr*/
4120 0, /*tp_setattr*/
4121 0, /*tp_compare*/
4122 0, /*tp_repr*/
4123 0, /*tp_as_number*/
4124 0, /*tp_as_sequence*/
4125 0, /*tp_as_mapping*/
4126 0, /*tp_hash*/
4127 0, /*tp_call*/
4128 0, /*tp_str*/
4129 0, /*tp_getattro*/
4130 0, /*tp_setattro*/
4131 0, /*tp_as_buffer*/
4132 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4133 0, /*tp_doc*/
4134 0, /*tp_traverse*/
4135 0, /*tp_clear*/
4136 0, /*tp_richcompare*/
4137 0, /*tp_weaklistoffset*/
4138 0, /*tp_iter*/
4139 0, /*tp_iternext*/
4140 encoding_map_methods, /*tp_methods*/
4141 0, /*tp_members*/
4142 0, /*tp_getset*/
4143 0, /*tp_base*/
4144 0, /*tp_dict*/
4145 0, /*tp_descr_get*/
4146 0, /*tp_descr_set*/
4147 0, /*tp_dictoffset*/
4148 0, /*tp_init*/
4149 0, /*tp_alloc*/
4150 0, /*tp_new*/
4151 0, /*tp_free*/
4152 0, /*tp_is_gc*/
4155 PyObject*
4156 PyUnicode_BuildEncodingMap(PyObject* string)
4158 Py_UNICODE *decode;
4159 PyObject *result;
4160 struct encoding_map *mresult;
4161 int i;
4162 int need_dict = 0;
4163 unsigned char level1[32];
4164 unsigned char level2[512];
4165 unsigned char *mlevel1, *mlevel2, *mlevel3;
4166 int count2 = 0, count3 = 0;
4168 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4169 PyErr_BadArgument();
4170 return NULL;
4172 decode = PyUnicode_AS_UNICODE(string);
4173 memset(level1, 0xFF, sizeof level1);
4174 memset(level2, 0xFF, sizeof level2);
4176 /* If there isn't a one-to-one mapping of NULL to \0,
4177 or if there are non-BMP characters, we need to use
4178 a mapping dictionary. */
4179 if (decode[0] != 0)
4180 need_dict = 1;
4181 for (i = 1; i < 256; i++) {
4182 int l1, l2;
4183 if (decode[i] == 0
4184 #ifdef Py_UNICODE_WIDE
4185 || decode[i] > 0xFFFF
4186 #endif
4188 need_dict = 1;
4189 break;
4191 if (decode[i] == 0xFFFE)
4192 /* unmapped character */
4193 continue;
4194 l1 = decode[i] >> 11;
4195 l2 = decode[i] >> 7;
4196 if (level1[l1] == 0xFF)
4197 level1[l1] = count2++;
4198 if (level2[l2] == 0xFF)
4199 level2[l2] = count3++;
4202 if (count2 >= 0xFF || count3 >= 0xFF)
4203 need_dict = 1;
4205 if (need_dict) {
4206 PyObject *result = PyDict_New();
4207 PyObject *key, *value;
4208 if (!result)
4209 return NULL;
4210 for (i = 0; i < 256; i++) {
4211 key = value = NULL;
4212 key = PyInt_FromLong(decode[i]);
4213 value = PyInt_FromLong(i);
4214 if (!key || !value)
4215 goto failed1;
4216 if (PyDict_SetItem(result, key, value) == -1)
4217 goto failed1;
4218 Py_DECREF(key);
4219 Py_DECREF(value);
4221 return result;
4222 failed1:
4223 Py_XDECREF(key);
4224 Py_XDECREF(value);
4225 Py_DECREF(result);
4226 return NULL;
4229 /* Create a three-level trie */
4230 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4231 16*count2 + 128*count3 - 1);
4232 if (!result)
4233 return PyErr_NoMemory();
4234 PyObject_Init(result, &EncodingMapType);
4235 mresult = (struct encoding_map*)result;
4236 mresult->count2 = count2;
4237 mresult->count3 = count3;
4238 mlevel1 = mresult->level1;
4239 mlevel2 = mresult->level23;
4240 mlevel3 = mresult->level23 + 16*count2;
4241 memcpy(mlevel1, level1, 32);
4242 memset(mlevel2, 0xFF, 16*count2);
4243 memset(mlevel3, 0, 128*count3);
4244 count3 = 0;
4245 for (i = 1; i < 256; i++) {
4246 int o1, o2, o3, i2, i3;
4247 if (decode[i] == 0xFFFE)
4248 /* unmapped character */
4249 continue;
4250 o1 = decode[i]>>11;
4251 o2 = (decode[i]>>7) & 0xF;
4252 i2 = 16*mlevel1[o1] + o2;
4253 if (mlevel2[i2] == 0xFF)
4254 mlevel2[i2] = count3++;
4255 o3 = decode[i] & 0x7F;
4256 i3 = 128*mlevel2[i2] + o3;
4257 mlevel3[i3] = i;
4259 return result;
4262 static int
4263 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4265 struct encoding_map *map = (struct encoding_map*)mapping;
4266 int l1 = c>>11;
4267 int l2 = (c>>7) & 0xF;
4268 int l3 = c & 0x7F;
4269 int i;
4271 #ifdef Py_UNICODE_WIDE
4272 if (c > 0xFFFF) {
4273 return -1;
4275 #endif
4276 if (c == 0)
4277 return 0;
4278 /* level 1*/
4279 i = map->level1[l1];
4280 if (i == 0xFF) {
4281 return -1;
4283 /* level 2*/
4284 i = map->level23[16*i+l2];
4285 if (i == 0xFF) {
4286 return -1;
4288 /* level 3 */
4289 i = map->level23[16*map->count2 + 128*i + l3];
4290 if (i == 0) {
4291 return -1;
4293 return i;
4296 /* Lookup the character ch in the mapping. If the character
4297 can't be found, Py_None is returned (or NULL, if another
4298 error occurred). */
4299 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4301 PyObject *w = PyInt_FromLong((long)c);
4302 PyObject *x;
4304 if (w == NULL)
4305 return NULL;
4306 x = PyObject_GetItem(mapping, w);
4307 Py_DECREF(w);
4308 if (x == NULL) {
4309 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4310 /* No mapping found means: mapping is undefined. */
4311 PyErr_Clear();
4312 x = Py_None;
4313 Py_INCREF(x);
4314 return x;
4315 } else
4316 return NULL;
4318 else if (x == Py_None)
4319 return x;
4320 else if (PyInt_Check(x)) {
4321 long value = PyInt_AS_LONG(x);
4322 if (value < 0 || value > 255) {
4323 PyErr_SetString(PyExc_TypeError,
4324 "character mapping must be in range(256)");
4325 Py_DECREF(x);
4326 return NULL;
4328 return x;
4330 else if (PyString_Check(x))
4331 return x;
4332 else {
4333 /* wrong return value */
4334 PyErr_SetString(PyExc_TypeError,
4335 "character mapping must return integer, None or str");
4336 Py_DECREF(x);
4337 return NULL;
4341 static int
4342 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4344 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4345 /* exponentially overallocate to minimize reallocations */
4346 if (requiredsize < 2*outsize)
4347 requiredsize = 2*outsize;
4348 if (_PyString_Resize(outobj, requiredsize)) {
4349 return 0;
4351 return 1;
/* Outcome of one charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* the lookup or a resize failed */
} charmapencode_result;
4357 /* lookup the character, put the result in the output string and adjust
4358 various state variables. Reallocate the output string if not enough
4359 space is available. Return a new reference to the object that
4360 was put in the output buffer, or Py_None, if the mapping was undefined
4361 (in which case no character was written) or NULL, if a
4362 reallocation error occurred. The caller must decref the result */
4363 static
4364 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4365 PyObject **outobj, Py_ssize_t *outpos)
4367 PyObject *rep;
4368 char *outstart;
4369 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4371 if (Py_TYPE(mapping) == &EncodingMapType) {
4372 int res = encoding_map_lookup(c, mapping);
4373 Py_ssize_t requiredsize = *outpos+1;
4374 if (res == -1)
4375 return enc_FAILED;
4376 if (outsize<requiredsize)
4377 if (!charmapencode_resize(outobj, outpos, requiredsize))
4378 return enc_EXCEPTION;
4379 outstart = PyString_AS_STRING(*outobj);
4380 outstart[(*outpos)++] = (char)res;
4381 return enc_SUCCESS;
4384 rep = charmapencode_lookup(c, mapping);
4385 if (rep==NULL)
4386 return enc_EXCEPTION;
4387 else if (rep==Py_None) {
4388 Py_DECREF(rep);
4389 return enc_FAILED;
4390 } else {
4391 if (PyInt_Check(rep)) {
4392 Py_ssize_t requiredsize = *outpos+1;
4393 if (outsize<requiredsize)
4394 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4395 Py_DECREF(rep);
4396 return enc_EXCEPTION;
4398 outstart = PyString_AS_STRING(*outobj);
4399 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4401 else {
4402 const char *repchars = PyString_AS_STRING(rep);
4403 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4404 Py_ssize_t requiredsize = *outpos+repsize;
4405 if (outsize<requiredsize)
4406 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4407 Py_DECREF(rep);
4408 return enc_EXCEPTION;
4410 outstart = PyString_AS_STRING(*outobj);
4411 memcpy(outstart + *outpos, repchars, repsize);
4412 *outpos += repsize;
4415 Py_DECREF(rep);
4416 return enc_SUCCESS;
4419 /* handle an error in PyUnicode_EncodeCharmap
4420 Return 0 on success, -1 on error */
4421 static
4422 int charmap_encoding_error(
4423 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4424 PyObject **exceptionObject,
4425 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4426 PyObject **res, Py_ssize_t *respos)
4428 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4429 Py_ssize_t repsize;
4430 Py_ssize_t newpos;
4431 Py_UNICODE *uni2;
4432 /* startpos for collecting unencodable chars */
4433 Py_ssize_t collstartpos = *inpos;
4434 Py_ssize_t collendpos = *inpos+1;
4435 Py_ssize_t collpos;
4436 char *encoding = "charmap";
4437 char *reason = "character maps to <undefined>";
4438 charmapencode_result x;
4440 /* find all unencodable characters */
4441 while (collendpos < size) {
4442 PyObject *rep;
4443 if (Py_TYPE(mapping) == &EncodingMapType) {
4444 int res = encoding_map_lookup(p[collendpos], mapping);
4445 if (res != -1)
4446 break;
4447 ++collendpos;
4448 continue;
4451 rep = charmapencode_lookup(p[collendpos], mapping);
4452 if (rep==NULL)
4453 return -1;
4454 else if (rep!=Py_None) {
4455 Py_DECREF(rep);
4456 break;
4458 Py_DECREF(rep);
4459 ++collendpos;
4461 /* cache callback name lookup
4462 * (if not done yet, i.e. it's the first error) */
4463 if (*known_errorHandler==-1) {
4464 if ((errors==NULL) || (!strcmp(errors, "strict")))
4465 *known_errorHandler = 1;
4466 else if (!strcmp(errors, "replace"))
4467 *known_errorHandler = 2;
4468 else if (!strcmp(errors, "ignore"))
4469 *known_errorHandler = 3;
4470 else if (!strcmp(errors, "xmlcharrefreplace"))
4471 *known_errorHandler = 4;
4472 else
4473 *known_errorHandler = 0;
4475 switch (*known_errorHandler) {
4476 case 1: /* strict */
4477 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4478 return -1;
4479 case 2: /* replace */
4480 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4481 x = charmapencode_output('?', mapping, res, respos);
4482 if (x==enc_EXCEPTION) {
4483 return -1;
4485 else if (x==enc_FAILED) {
4486 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4487 return -1;
4490 /* fall through */
4491 case 3: /* ignore */
4492 *inpos = collendpos;
4493 break;
4494 case 4: /* xmlcharrefreplace */
4495 /* generate replacement (temporarily (mis)uses p) */
4496 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4497 char buffer[2+29+1+1];
4498 char *cp;
4499 sprintf(buffer, "&#%d;", (int)p[collpos]);
4500 for (cp = buffer; *cp; ++cp) {
4501 x = charmapencode_output(*cp, mapping, res, respos);
4502 if (x==enc_EXCEPTION)
4503 return -1;
4504 else if (x==enc_FAILED) {
4505 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4506 return -1;
4510 *inpos = collendpos;
4511 break;
4512 default:
4513 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4514 encoding, reason, p, size, exceptionObject,
4515 collstartpos, collendpos, &newpos);
4516 if (repunicode == NULL)
4517 return -1;
4518 /* generate replacement */
4519 repsize = PyUnicode_GET_SIZE(repunicode);
4520 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4521 x = charmapencode_output(*uni2, mapping, res, respos);
4522 if (x==enc_EXCEPTION) {
4523 return -1;
4525 else if (x==enc_FAILED) {
4526 Py_DECREF(repunicode);
4527 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4528 return -1;
4531 *inpos = newpos;
4532 Py_DECREF(repunicode);
4534 return 0;
4537 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4538 Py_ssize_t size,
4539 PyObject *mapping,
4540 const char *errors)
4542 /* output object */
4543 PyObject *res = NULL;
4544 /* current input position */
4545 Py_ssize_t inpos = 0;
4546 /* current output position */
4547 Py_ssize_t respos = 0;
4548 PyObject *errorHandler = NULL;
4549 PyObject *exc = NULL;
4550 /* the following variable is used for caching string comparisons
4551 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4552 * 3=ignore, 4=xmlcharrefreplace */
4553 int known_errorHandler = -1;
4555 /* Default to Latin-1 */
4556 if (mapping == NULL)
4557 return PyUnicode_EncodeLatin1(p, size, errors);
4559 /* allocate enough for a simple encoding without
4560 replacements, if we need more, we'll resize */
4561 res = PyString_FromStringAndSize(NULL, size);
4562 if (res == NULL)
4563 goto onError;
4564 if (size == 0)
4565 return res;
4567 while (inpos<size) {
4568 /* try to encode it */
4569 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4570 if (x==enc_EXCEPTION) /* error */
4571 goto onError;
4572 if (x==enc_FAILED) { /* unencodable character */
4573 if (charmap_encoding_error(p, size, &inpos, mapping,
4574 &exc,
4575 &known_errorHandler, &errorHandler, errors,
4576 &res, &respos)) {
4577 goto onError;
4580 else
4581 /* done with this character => adjust input position */
4582 ++inpos;
4585 /* Resize if we allocated to much */
4586 if (respos<PyString_GET_SIZE(res)) {
4587 if (_PyString_Resize(&res, respos))
4588 goto onError;
4590 Py_XDECREF(exc);
4591 Py_XDECREF(errorHandler);
4592 return res;
4594 onError:
4595 Py_XDECREF(res);
4596 Py_XDECREF(exc);
4597 Py_XDECREF(errorHandler);
4598 return NULL;
4601 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4602 PyObject *mapping)
4604 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4605 PyErr_BadArgument();
4606 return NULL;
4608 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4609 PyUnicode_GET_SIZE(unicode),
4610 mapping,
4611 NULL);
4614 /* create or adjust a UnicodeTranslateError */
4615 static void make_translate_exception(PyObject **exceptionObject,
4616 const Py_UNICODE *unicode, Py_ssize_t size,
4617 Py_ssize_t startpos, Py_ssize_t endpos,
4618 const char *reason)
4620 if (*exceptionObject == NULL) {
4621 *exceptionObject = PyUnicodeTranslateError_Create(
4622 unicode, size, startpos, endpos, reason);
4624 else {
4625 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4626 goto onError;
4627 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4628 goto onError;
4629 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4630 goto onError;
4631 return;
4632 onError:
4633 Py_DECREF(*exceptionObject);
4634 *exceptionObject = NULL;
4638 /* raises a UnicodeTranslateError */
4639 static void raise_translate_exception(PyObject **exceptionObject,
4640 const Py_UNICODE *unicode, Py_ssize_t size,
4641 Py_ssize_t startpos, Py_ssize_t endpos,
4642 const char *reason)
4644 make_translate_exception(exceptionObject,
4645 unicode, size, startpos, endpos, reason);
4646 if (*exceptionObject != NULL)
4647 PyCodec_StrictErrors(*exceptionObject);
4650 /* error handling callback helper:
4651 build arguments, call the callback and check the arguments,
4652 put the result into newpos and return the replacement string, which
4653 has to be freed by the caller */
4654 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4655 PyObject **errorHandler,
4656 const char *reason,
4657 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4658 Py_ssize_t startpos, Py_ssize_t endpos,
4659 Py_ssize_t *newpos)
4661 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4663 Py_ssize_t i_newpos;
4664 PyObject *restuple;
4665 PyObject *resunicode;
4667 if (*errorHandler == NULL) {
4668 *errorHandler = PyCodec_LookupError(errors);
4669 if (*errorHandler == NULL)
4670 return NULL;
4673 make_translate_exception(exceptionObject,
4674 unicode, size, startpos, endpos, reason);
4675 if (*exceptionObject == NULL)
4676 return NULL;
4678 restuple = PyObject_CallFunctionObjArgs(
4679 *errorHandler, *exceptionObject, NULL);
4680 if (restuple == NULL)
4681 return NULL;
4682 if (!PyTuple_Check(restuple)) {
4683 PyErr_Format(PyExc_TypeError, &argparse[4]);
4684 Py_DECREF(restuple);
4685 return NULL;
4687 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4688 &resunicode, &i_newpos)) {
4689 Py_DECREF(restuple);
4690 return NULL;
4692 if (i_newpos<0)
4693 *newpos = size+i_newpos;
4694 else
4695 *newpos = i_newpos;
4696 if (*newpos<0 || *newpos>size) {
4697 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4698 Py_DECREF(restuple);
4699 return NULL;
4701 Py_INCREF(resunicode);
4702 Py_DECREF(restuple);
4703 return resunicode;
4706 /* Lookup the character ch in the mapping and put the result in result,
4707 which must be decrefed by the caller.
4708 Return 0 on success, -1 on error */
4709 static
4710 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4712 PyObject *w = PyInt_FromLong((long)c);
4713 PyObject *x;
4715 if (w == NULL)
4716 return -1;
4717 x = PyObject_GetItem(mapping, w);
4718 Py_DECREF(w);
4719 if (x == NULL) {
4720 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4721 /* No mapping found means: use 1:1 mapping. */
4722 PyErr_Clear();
4723 *result = NULL;
4724 return 0;
4725 } else
4726 return -1;
4728 else if (x == Py_None) {
4729 *result = x;
4730 return 0;
4732 else if (PyInt_Check(x)) {
4733 long value = PyInt_AS_LONG(x);
4734 long max = PyUnicode_GetMax();
4735 if (value < 0 || value > max) {
4736 PyErr_Format(PyExc_TypeError,
4737 "character mapping must be in range(0x%lx)", max+1);
4738 Py_DECREF(x);
4739 return -1;
4741 *result = x;
4742 return 0;
4744 else if (PyUnicode_Check(x)) {
4745 *result = x;
4746 return 0;
4748 else {
4749 /* wrong return value */
4750 PyErr_SetString(PyExc_TypeError,
4751 "character mapping must return integer, None or unicode");
4752 Py_DECREF(x);
4753 return -1;
4756 /* ensure that *outobj is at least requiredsize characters long,
4757 if not reallocate and adjust various state variables.
4758 Return 0 on success, -1 on error */
4759 static
4760 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4761 Py_ssize_t requiredsize)
4763 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4764 if (requiredsize > oldsize) {
4765 /* remember old output position */
4766 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4767 /* exponentially overallocate to minimize reallocations */
4768 if (requiredsize < 2 * oldsize)
4769 requiredsize = 2 * oldsize;
4770 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4771 return -1;
4772 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4774 return 0;
4776 /* lookup the character, put the result in the output string and adjust
4777 various state variables. Return a new reference to the object that
4778 was put in the output buffer in *result, or Py_None, if the mapping was
4779 undefined (in which case no character was written).
4780 The called must decref result.
4781 Return 0 on success, -1 on error. */
4782 static
4783 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4784 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4785 PyObject **res)
4787 if (charmaptranslate_lookup(*curinp, mapping, res))
4788 return -1;
4789 if (*res==NULL) {
4790 /* not found => default to 1:1 mapping */
4791 *(*outp)++ = *curinp;
4793 else if (*res==Py_None)
4795 else if (PyInt_Check(*res)) {
4796 /* no overflow check, because we know that the space is enough */
4797 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4799 else if (PyUnicode_Check(*res)) {
4800 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4801 if (repsize==1) {
4802 /* no overflow check, because we know that the space is enough */
4803 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4805 else if (repsize!=0) {
4806 /* more than one character */
4807 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4808 (insize - (curinp-startinp)) +
4809 repsize - 1;
4810 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4811 return -1;
4812 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4813 *outp += repsize;
4816 else
4817 return -1;
4818 return 0;
4821 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4822 Py_ssize_t size,
4823 PyObject *mapping,
4824 const char *errors)
4826 /* output object */
4827 PyObject *res = NULL;
4828 /* pointers to the beginning and end+1 of input */
4829 const Py_UNICODE *startp = p;
4830 const Py_UNICODE *endp = p + size;
4831 /* pointer into the output */
4832 Py_UNICODE *str;
4833 /* current output position */
4834 Py_ssize_t respos = 0;
4835 char *reason = "character maps to <undefined>";
4836 PyObject *errorHandler = NULL;
4837 PyObject *exc = NULL;
4838 /* the following variable is used for caching string comparisons
4839 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4840 * 3=ignore, 4=xmlcharrefreplace */
4841 int known_errorHandler = -1;
4843 if (mapping == NULL) {
4844 PyErr_BadArgument();
4845 return NULL;
4848 /* allocate enough for a simple 1:1 translation without
4849 replacements, if we need more, we'll resize */
4850 res = PyUnicode_FromUnicode(NULL, size);
4851 if (res == NULL)
4852 goto onError;
4853 if (size == 0)
4854 return res;
4855 str = PyUnicode_AS_UNICODE(res);
4857 while (p<endp) {
4858 /* try to encode it */
4859 PyObject *x = NULL;
4860 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4861 Py_XDECREF(x);
4862 goto onError;
4864 Py_XDECREF(x);
4865 if (x!=Py_None) /* it worked => adjust input pointer */
4866 ++p;
4867 else { /* untranslatable character */
4868 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4869 Py_ssize_t repsize;
4870 Py_ssize_t newpos;
4871 Py_UNICODE *uni2;
4872 /* startpos for collecting untranslatable chars */
4873 const Py_UNICODE *collstart = p;
4874 const Py_UNICODE *collend = p+1;
4875 const Py_UNICODE *coll;
4877 /* find all untranslatable characters */
4878 while (collend < endp) {
4879 if (charmaptranslate_lookup(*collend, mapping, &x))
4880 goto onError;
4881 Py_XDECREF(x);
4882 if (x!=Py_None)
4883 break;
4884 ++collend;
4886 /* cache callback name lookup
4887 * (if not done yet, i.e. it's the first error) */
4888 if (known_errorHandler==-1) {
4889 if ((errors==NULL) || (!strcmp(errors, "strict")))
4890 known_errorHandler = 1;
4891 else if (!strcmp(errors, "replace"))
4892 known_errorHandler = 2;
4893 else if (!strcmp(errors, "ignore"))
4894 known_errorHandler = 3;
4895 else if (!strcmp(errors, "xmlcharrefreplace"))
4896 known_errorHandler = 4;
4897 else
4898 known_errorHandler = 0;
4900 switch (known_errorHandler) {
4901 case 1: /* strict */
4902 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4903 goto onError;
4904 case 2: /* replace */
4905 /* No need to check for space, this is a 1:1 replacement */
4906 for (coll = collstart; coll<collend; ++coll)
4907 *str++ = '?';
4908 /* fall through */
4909 case 3: /* ignore */
4910 p = collend;
4911 break;
4912 case 4: /* xmlcharrefreplace */
4913 /* generate replacement (temporarily (mis)uses p) */
4914 for (p = collstart; p < collend; ++p) {
4915 char buffer[2+29+1+1];
4916 char *cp;
4917 sprintf(buffer, "&#%d;", (int)*p);
4918 if (charmaptranslate_makespace(&res, &str,
4919 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4920 goto onError;
4921 for (cp = buffer; *cp; ++cp)
4922 *str++ = *cp;
4924 p = collend;
4925 break;
4926 default:
4927 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4928 reason, startp, size, &exc,
4929 collstart-startp, collend-startp, &newpos);
4930 if (repunicode == NULL)
4931 goto onError;
4932 /* generate replacement */
4933 repsize = PyUnicode_GET_SIZE(repunicode);
4934 if (charmaptranslate_makespace(&res, &str,
4935 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4936 Py_DECREF(repunicode);
4937 goto onError;
4939 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4940 *str++ = *uni2;
4941 p = startp + newpos;
4942 Py_DECREF(repunicode);
4946 /* Resize if we allocated to much */
4947 respos = str-PyUnicode_AS_UNICODE(res);
4948 if (respos<PyUnicode_GET_SIZE(res)) {
4949 if (_PyUnicode_Resize(&res, respos) < 0)
4950 goto onError;
4952 Py_XDECREF(exc);
4953 Py_XDECREF(errorHandler);
4954 return res;
4956 onError:
4957 Py_XDECREF(res);
4958 Py_XDECREF(exc);
4959 Py_XDECREF(errorHandler);
4960 return NULL;
4963 PyObject *PyUnicode_Translate(PyObject *str,
4964 PyObject *mapping,
4965 const char *errors)
4967 PyObject *result;
4969 str = PyUnicode_FromObject(str);
4970 if (str == NULL)
4971 goto onError;
4972 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4973 PyUnicode_GET_SIZE(str),
4974 mapping,
4975 errors);
4976 Py_DECREF(str);
4977 return result;
4979 onError:
4980 Py_XDECREF(str);
4981 return NULL;
4984 /* --- Decimal Encoder ---------------------------------------------------- */
4986 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4987 Py_ssize_t length,
4988 char *output,
4989 const char *errors)
4991 Py_UNICODE *p, *end;
4992 PyObject *errorHandler = NULL;
4993 PyObject *exc = NULL;
4994 const char *encoding = "decimal";
4995 const char *reason = "invalid decimal Unicode string";
4996 /* the following variable is used for caching string comparisons
4997 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4998 int known_errorHandler = -1;
5000 if (output == NULL) {
5001 PyErr_BadArgument();
5002 return -1;
5005 p = s;
5006 end = s + length;
5007 while (p < end) {
5008 register Py_UNICODE ch = *p;
5009 int decimal;
5010 PyObject *repunicode;
5011 Py_ssize_t repsize;
5012 Py_ssize_t newpos;
5013 Py_UNICODE *uni2;
5014 Py_UNICODE *collstart;
5015 Py_UNICODE *collend;
5017 if (Py_UNICODE_ISSPACE(ch)) {
5018 *output++ = ' ';
5019 ++p;
5020 continue;
5022 decimal = Py_UNICODE_TODECIMAL(ch);
5023 if (decimal >= 0) {
5024 *output++ = '0' + decimal;
5025 ++p;
5026 continue;
5028 if (0 < ch && ch < 256) {
5029 *output++ = (char)ch;
5030 ++p;
5031 continue;
5033 /* All other characters are considered unencodable */
5034 collstart = p;
5035 collend = p+1;
5036 while (collend < end) {
5037 if ((0 < *collend && *collend < 256) ||
5038 !Py_UNICODE_ISSPACE(*collend) ||
5039 Py_UNICODE_TODECIMAL(*collend))
5040 break;
5042 /* cache callback name lookup
5043 * (if not done yet, i.e. it's the first error) */
5044 if (known_errorHandler==-1) {
5045 if ((errors==NULL) || (!strcmp(errors, "strict")))
5046 known_errorHandler = 1;
5047 else if (!strcmp(errors, "replace"))
5048 known_errorHandler = 2;
5049 else if (!strcmp(errors, "ignore"))
5050 known_errorHandler = 3;
5051 else if (!strcmp(errors, "xmlcharrefreplace"))
5052 known_errorHandler = 4;
5053 else
5054 known_errorHandler = 0;
5056 switch (known_errorHandler) {
5057 case 1: /* strict */
5058 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5059 goto onError;
5060 case 2: /* replace */
5061 for (p = collstart; p < collend; ++p)
5062 *output++ = '?';
5063 /* fall through */
5064 case 3: /* ignore */
5065 p = collend;
5066 break;
5067 case 4: /* xmlcharrefreplace */
5068 /* generate replacement (temporarily (mis)uses p) */
5069 for (p = collstart; p < collend; ++p)
5070 output += sprintf(output, "&#%d;", (int)*p);
5071 p = collend;
5072 break;
5073 default:
5074 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5075 encoding, reason, s, length, &exc,
5076 collstart-s, collend-s, &newpos);
5077 if (repunicode == NULL)
5078 goto onError;
5079 /* generate replacement */
5080 repsize = PyUnicode_GET_SIZE(repunicode);
5081 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5082 Py_UNICODE ch = *uni2;
5083 if (Py_UNICODE_ISSPACE(ch))
5084 *output++ = ' ';
5085 else {
5086 decimal = Py_UNICODE_TODECIMAL(ch);
5087 if (decimal >= 0)
5088 *output++ = '0' + decimal;
5089 else if (0 < ch && ch < 256)
5090 *output++ = (char)ch;
5091 else {
5092 Py_DECREF(repunicode);
5093 raise_encode_exception(&exc, encoding,
5094 s, length, collstart-s, collend-s, reason);
5095 goto onError;
5099 p = s + newpos;
5100 Py_DECREF(repunicode);
5103 /* 0-terminate the output string */
5104 *output++ = '\0';
5105 Py_XDECREF(exc);
5106 Py_XDECREF(errorHandler);
5107 return 0;
5109 onError:
5110 Py_XDECREF(exc);
5111 Py_XDECREF(errorHandler);
5112 return -1;
5115 /* --- Helpers ------------------------------------------------------------ */
5117 #include "stringlib/unicodedefs.h"
5119 #define FROM_UNICODE
5121 #include "stringlib/fastsearch.h"
5123 #include "stringlib/count.h"
5124 #include "stringlib/find.h"
5125 #include "stringlib/partition.h"
/* helper macro to fixup start/end slice values
 *
 * Operates on variables named `start` and `end` in the calling scope and on
 * (obj)->length.  A negative start/end is taken relative to the length and
 * then clamped at 0; an end beyond the length is cut back to the length.
 * start is NOT capped at the length.
 *
 * The expansion is a sequence of statements (not a single statement), so it
 * must not be used as the unbraced body of an if/else or a loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5140 Py_ssize_t PyUnicode_Count(PyObject *str,
5141 PyObject *substr,
5142 Py_ssize_t start,
5143 Py_ssize_t end)
5145 Py_ssize_t result;
5146 PyUnicodeObject* str_obj;
5147 PyUnicodeObject* sub_obj;
5149 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5150 if (!str_obj)
5151 return -1;
5152 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5153 if (!sub_obj) {
5154 Py_DECREF(str_obj);
5155 return -1;
5158 FIX_START_END(str_obj);
5160 result = stringlib_count(
5161 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5164 Py_DECREF(sub_obj);
5165 Py_DECREF(str_obj);
5167 return result;
5170 Py_ssize_t PyUnicode_Find(PyObject *str,
5171 PyObject *sub,
5172 Py_ssize_t start,
5173 Py_ssize_t end,
5174 int direction)
5176 Py_ssize_t result;
5178 str = PyUnicode_FromObject(str);
5179 if (!str)
5180 return -2;
5181 sub = PyUnicode_FromObject(sub);
5182 if (!sub) {
5183 Py_DECREF(str);
5184 return -2;
5187 if (direction > 0)
5188 result = stringlib_find_slice(
5189 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5190 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5191 start, end
5193 else
5194 result = stringlib_rfind_slice(
5195 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5196 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5197 start, end
5200 Py_DECREF(str);
5201 Py_DECREF(sub);
5203 return result;
5206 static
5207 int tailmatch(PyUnicodeObject *self,
5208 PyUnicodeObject *substring,
5209 Py_ssize_t start,
5210 Py_ssize_t end,
5211 int direction)
5213 if (substring->length == 0)
5214 return 1;
5216 FIX_START_END(self);
5218 end -= substring->length;
5219 if (end < start)
5220 return 0;
5222 if (direction > 0) {
5223 if (Py_UNICODE_MATCH(self, end, substring))
5224 return 1;
5225 } else {
5226 if (Py_UNICODE_MATCH(self, start, substring))
5227 return 1;
5230 return 0;
5233 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5234 PyObject *substr,
5235 Py_ssize_t start,
5236 Py_ssize_t end,
5237 int direction)
5239 Py_ssize_t result;
5241 str = PyUnicode_FromObject(str);
5242 if (str == NULL)
5243 return -1;
5244 substr = PyUnicode_FromObject(substr);
5245 if (substr == NULL) {
5246 Py_DECREF(str);
5247 return -1;
5250 result = tailmatch((PyUnicodeObject *)str,
5251 (PyUnicodeObject *)substr,
5252 start, end, direction);
5253 Py_DECREF(str);
5254 Py_DECREF(substr);
5255 return result;
5258 /* Apply fixfct filter to the Unicode object self and return a
5259 reference to the modified object */
5261 static
5262 PyObject *fixup(PyUnicodeObject *self,
5263 int (*fixfct)(PyUnicodeObject *s))
5266 PyUnicodeObject *u;
5268 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5269 if (u == NULL)
5270 return NULL;
5272 Py_UNICODE_COPY(u->str, self->str, self->length);
5274 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5275 /* fixfct should return TRUE if it modified the buffer. If
5276 FALSE, return a reference to the original buffer instead
5277 (to save space, not time) */
5278 Py_INCREF(self);
5279 Py_DECREF(u);
5280 return (PyObject*) self;
5282 return (PyObject*) u;
5285 static
5286 int fixupper(PyUnicodeObject *self)
5288 Py_ssize_t len = self->length;
5289 Py_UNICODE *s = self->str;
5290 int status = 0;
5292 while (len-- > 0) {
5293 register Py_UNICODE ch;
5295 ch = Py_UNICODE_TOUPPER(*s);
5296 if (ch != *s) {
5297 status = 1;
5298 *s = ch;
5300 s++;
5303 return status;
5306 static
5307 int fixlower(PyUnicodeObject *self)
5309 Py_ssize_t len = self->length;
5310 Py_UNICODE *s = self->str;
5311 int status = 0;
5313 while (len-- > 0) {
5314 register Py_UNICODE ch;
5316 ch = Py_UNICODE_TOLOWER(*s);
5317 if (ch != *s) {
5318 status = 1;
5319 *s = ch;
5321 s++;
5324 return status;
5327 static
5328 int fixswapcase(PyUnicodeObject *self)
5330 Py_ssize_t len = self->length;
5331 Py_UNICODE *s = self->str;
5332 int status = 0;
5334 while (len-- > 0) {
5335 if (Py_UNICODE_ISUPPER(*s)) {
5336 *s = Py_UNICODE_TOLOWER(*s);
5337 status = 1;
5338 } else if (Py_UNICODE_ISLOWER(*s)) {
5339 *s = Py_UNICODE_TOUPPER(*s);
5340 status = 1;
5342 s++;
5345 return status;
5348 static
5349 int fixcapitalize(PyUnicodeObject *self)
5351 Py_ssize_t len = self->length;
5352 Py_UNICODE *s = self->str;
5353 int status = 0;
5355 if (len == 0)
5356 return 0;
5357 if (Py_UNICODE_ISLOWER(*s)) {
5358 *s = Py_UNICODE_TOUPPER(*s);
5359 status = 1;
5361 s++;
5362 while (--len > 0) {
5363 if (Py_UNICODE_ISUPPER(*s)) {
5364 *s = Py_UNICODE_TOLOWER(*s);
5365 status = 1;
5367 s++;
5369 return status;
5372 static
5373 int fixtitle(PyUnicodeObject *self)
5375 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5376 register Py_UNICODE *e;
5377 int previous_is_cased;
5379 /* Shortcut for single character strings */
5380 if (PyUnicode_GET_SIZE(self) == 1) {
5381 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5382 if (*p != ch) {
5383 *p = ch;
5384 return 1;
5386 else
5387 return 0;
5390 e = p + PyUnicode_GET_SIZE(self);
5391 previous_is_cased = 0;
5392 for (; p < e; p++) {
5393 register const Py_UNICODE ch = *p;
5395 if (previous_is_cased)
5396 *p = Py_UNICODE_TOLOWER(ch);
5397 else
5398 *p = Py_UNICODE_TOTITLE(ch);
5400 if (Py_UNICODE_ISLOWER(ch) ||
5401 Py_UNICODE_ISUPPER(ch) ||
5402 Py_UNICODE_ISTITLE(ch))
5403 previous_is_cased = 1;
5404 else
5405 previous_is_cased = 0;
5407 return 1;
5410 PyObject *
5411 PyUnicode_Join(PyObject *separator, PyObject *seq)
5413 PyObject *internal_separator = NULL;
5414 const Py_UNICODE blank = ' ';
5415 const Py_UNICODE *sep = &blank;
5416 Py_ssize_t seplen = 1;
5417 PyUnicodeObject *res = NULL; /* the result */
5418 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5419 Py_ssize_t res_used; /* # used bytes */
5420 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5421 PyObject *fseq; /* PySequence_Fast(seq) */
5422 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5423 PyObject *item;
5424 Py_ssize_t i;
5426 fseq = PySequence_Fast(seq, "");
5427 if (fseq == NULL) {
5428 return NULL;
5431 /* Grrrr. A codec may be invoked to convert str objects to
5432 * Unicode, and so it's possible to call back into Python code
5433 * during PyUnicode_FromObject(), and so it's possible for a sick
5434 * codec to change the size of fseq (if seq is a list). Therefore
5435 * we have to keep refetching the size -- can't assume seqlen
5436 * is invariant.
5438 seqlen = PySequence_Fast_GET_SIZE(fseq);
5439 /* If empty sequence, return u"". */
5440 if (seqlen == 0) {
5441 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5442 goto Done;
5444 /* If singleton sequence with an exact Unicode, return that. */
5445 if (seqlen == 1) {
5446 item = PySequence_Fast_GET_ITEM(fseq, 0);
5447 if (PyUnicode_CheckExact(item)) {
5448 Py_INCREF(item);
5449 res = (PyUnicodeObject *)item;
5450 goto Done;
5454 /* At least two items to join, or one that isn't exact Unicode. */
5455 if (seqlen > 1) {
5456 /* Set up sep and seplen -- they're needed. */
5457 if (separator == NULL) {
5458 sep = &blank;
5459 seplen = 1;
5461 else {
5462 internal_separator = PyUnicode_FromObject(separator);
5463 if (internal_separator == NULL)
5464 goto onError;
5465 sep = PyUnicode_AS_UNICODE(internal_separator);
5466 seplen = PyUnicode_GET_SIZE(internal_separator);
5467 /* In case PyUnicode_FromObject() mutated seq. */
5468 seqlen = PySequence_Fast_GET_SIZE(fseq);
5472 /* Get space. */
5473 res = _PyUnicode_New(res_alloc);
5474 if (res == NULL)
5475 goto onError;
5476 res_p = PyUnicode_AS_UNICODE(res);
5477 res_used = 0;
5479 for (i = 0; i < seqlen; ++i) {
5480 Py_ssize_t itemlen;
5481 Py_ssize_t new_res_used;
5483 item = PySequence_Fast_GET_ITEM(fseq, i);
5484 /* Convert item to Unicode. */
5485 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5486 PyErr_Format(PyExc_TypeError,
5487 "sequence item %zd: expected string or Unicode,"
5488 " %.80s found",
5489 i, Py_TYPE(item)->tp_name);
5490 goto onError;
5492 item = PyUnicode_FromObject(item);
5493 if (item == NULL)
5494 goto onError;
5495 /* We own a reference to item from here on. */
5497 /* In case PyUnicode_FromObject() mutated seq. */
5498 seqlen = PySequence_Fast_GET_SIZE(fseq);
5500 /* Make sure we have enough space for the separator and the item. */
5501 itemlen = PyUnicode_GET_SIZE(item);
5502 new_res_used = res_used + itemlen;
5503 if (new_res_used < 0)
5504 goto Overflow;
5505 if (i < seqlen - 1) {
5506 new_res_used += seplen;
5507 if (new_res_used < 0)
5508 goto Overflow;
5510 if (new_res_used > res_alloc) {
5511 /* double allocated size until it's big enough */
5512 do {
5513 res_alloc += res_alloc;
5514 if (res_alloc <= 0)
5515 goto Overflow;
5516 } while (new_res_used > res_alloc);
5517 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5518 Py_DECREF(item);
5519 goto onError;
5521 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5524 /* Copy item, and maybe the separator. */
5525 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5526 res_p += itemlen;
5527 if (i < seqlen - 1) {
5528 Py_UNICODE_COPY(res_p, sep, seplen);
5529 res_p += seplen;
5531 Py_DECREF(item);
5532 res_used = new_res_used;
5535 /* Shrink res to match the used area; this probably can't fail,
5536 * but it's cheap to check.
5538 if (_PyUnicode_Resize(&res, res_used) < 0)
5539 goto onError;
5541 Done:
5542 Py_XDECREF(internal_separator);
5543 Py_DECREF(fseq);
5544 return (PyObject *)res;
5546 Overflow:
5547 PyErr_SetString(PyExc_OverflowError,
5548 "join() result is too long for a Python string");
5549 Py_DECREF(item);
5550 /* fall through */
5552 onError:
5553 Py_XDECREF(internal_separator);
5554 Py_DECREF(fseq);
5555 Py_XDECREF(res);
5556 return NULL;
5559 static
5560 PyUnicodeObject *pad(PyUnicodeObject *self,
5561 Py_ssize_t left,
5562 Py_ssize_t right,
5563 Py_UNICODE fill)
5565 PyUnicodeObject *u;
5567 if (left < 0)
5568 left = 0;
5569 if (right < 0)
5570 right = 0;
5572 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5573 Py_INCREF(self);
5574 return self;
5577 u = _PyUnicode_New(left + self->length + right);
5578 if (u) {
5579 if (left)
5580 Py_UNICODE_FILL(u->str, fill, left);
5581 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5582 if (right)
5583 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5586 return u;
/* Append data[left:right] as a new unicode object to `list`.
 *
 * Relies on the calling function providing a `PyObject *str` scratch
 * variable, a `list` object and an `onError` label; control jumps to
 * onError if either the object creation or the append fails.  The
 * temporary reference in str is released in both outcomes of the append. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
5600 static
5601 PyObject *split_whitespace(PyUnicodeObject *self,
5602 PyObject *list,
5603 Py_ssize_t maxcount)
5605 register Py_ssize_t i;
5606 register Py_ssize_t j;
5607 Py_ssize_t len = self->length;
5608 PyObject *str;
5609 register const Py_UNICODE *buf = self->str;
5611 for (i = j = 0; i < len; ) {
5612 /* find a token */
5613 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5614 i++;
5615 j = i;
5616 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5617 i++;
5618 if (j < i) {
5619 if (maxcount-- <= 0)
5620 break;
5621 SPLIT_APPEND(buf, j, i);
5622 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5623 i++;
5624 j = i;
5627 if (j < len) {
5628 SPLIT_APPEND(buf, j, len);
5630 return list;
5632 onError:
5633 Py_DECREF(list);
5634 return NULL;
5637 PyObject *PyUnicode_Splitlines(PyObject *string,
5638 int keepends)
5640 register Py_ssize_t i;
5641 register Py_ssize_t j;
5642 Py_ssize_t len;
5643 PyObject *list;
5644 PyObject *str;
5645 Py_UNICODE *data;
5647 string = PyUnicode_FromObject(string);
5648 if (string == NULL)
5649 return NULL;
5650 data = PyUnicode_AS_UNICODE(string);
5651 len = PyUnicode_GET_SIZE(string);
5653 list = PyList_New(0);
5654 if (!list)
5655 goto onError;
5657 for (i = j = 0; i < len; ) {
5658 Py_ssize_t eol;
5660 /* Find a line and append it */
5661 while (i < len && !BLOOM_LINEBREAK(data[i]))
5662 i++;
5664 /* Skip the line break reading CRLF as one line break */
5665 eol = i;
5666 if (i < len) {
5667 if (data[i] == '\r' && i + 1 < len &&
5668 data[i+1] == '\n')
5669 i += 2;
5670 else
5671 i++;
5672 if (keepends)
5673 eol = i;
5675 SPLIT_APPEND(data, j, eol);
5676 j = i;
5678 if (j < len) {
5679 SPLIT_APPEND(data, j, len);
5682 Py_DECREF(string);
5683 return list;
5685 onError:
5686 Py_XDECREF(list);
5687 Py_DECREF(string);
5688 return NULL;
5691 static
5692 PyObject *split_char(PyUnicodeObject *self,
5693 PyObject *list,
5694 Py_UNICODE ch,
5695 Py_ssize_t maxcount)
5697 register Py_ssize_t i;
5698 register Py_ssize_t j;
5699 Py_ssize_t len = self->length;
5700 PyObject *str;
5701 register const Py_UNICODE *buf = self->str;
5703 for (i = j = 0; i < len; ) {
5704 if (buf[i] == ch) {
5705 if (maxcount-- <= 0)
5706 break;
5707 SPLIT_APPEND(buf, j, i);
5708 i = j = i + 1;
5709 } else
5710 i++;
5712 if (j <= len) {
5713 SPLIT_APPEND(buf, j, len);
5715 return list;
5717 onError:
5718 Py_DECREF(list);
5719 return NULL;
5722 static
5723 PyObject *split_substring(PyUnicodeObject *self,
5724 PyObject *list,
5725 PyUnicodeObject *substring,
5726 Py_ssize_t maxcount)
5728 register Py_ssize_t i;
5729 register Py_ssize_t j;
5730 Py_ssize_t len = self->length;
5731 Py_ssize_t sublen = substring->length;
5732 PyObject *str;
5734 for (i = j = 0; i <= len - sublen; ) {
5735 if (Py_UNICODE_MATCH(self, i, substring)) {
5736 if (maxcount-- <= 0)
5737 break;
5738 SPLIT_APPEND(self->str, j, i);
5739 i = j = i + sublen;
5740 } else
5741 i++;
5743 if (j <= len) {
5744 SPLIT_APPEND(self->str, j, len);
5746 return list;
5748 onError:
5749 Py_DECREF(list);
5750 return NULL;
5753 static
5754 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5755 PyObject *list,
5756 Py_ssize_t maxcount)
5758 register Py_ssize_t i;
5759 register Py_ssize_t j;
5760 Py_ssize_t len = self->length;
5761 PyObject *str;
5762 register const Py_UNICODE *buf = self->str;
5764 for (i = j = len - 1; i >= 0; ) {
5765 /* find a token */
5766 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5767 i--;
5768 j = i;
5769 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5770 i--;
5771 if (j > i) {
5772 if (maxcount-- <= 0)
5773 break;
5774 SPLIT_APPEND(buf, i + 1, j + 1);
5775 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5776 i--;
5777 j = i;
5780 if (j >= 0) {
5781 SPLIT_APPEND(buf, 0, j + 1);
5783 if (PyList_Reverse(list) < 0)
5784 goto onError;
5785 return list;
5787 onError:
5788 Py_DECREF(list);
5789 return NULL;
5792 static
5793 PyObject *rsplit_char(PyUnicodeObject *self,
5794 PyObject *list,
5795 Py_UNICODE ch,
5796 Py_ssize_t maxcount)
5798 register Py_ssize_t i;
5799 register Py_ssize_t j;
5800 Py_ssize_t len = self->length;
5801 PyObject *str;
5802 register const Py_UNICODE *buf = self->str;
5804 for (i = j = len - 1; i >= 0; ) {
5805 if (buf[i] == ch) {
5806 if (maxcount-- <= 0)
5807 break;
5808 SPLIT_APPEND(buf, i + 1, j + 1);
5809 j = i = i - 1;
5810 } else
5811 i--;
5813 if (j >= -1) {
5814 SPLIT_APPEND(buf, 0, j + 1);
5816 if (PyList_Reverse(list) < 0)
5817 goto onError;
5818 return list;
5820 onError:
5821 Py_DECREF(list);
5822 return NULL;
5825 static
5826 PyObject *rsplit_substring(PyUnicodeObject *self,
5827 PyObject *list,
5828 PyUnicodeObject *substring,
5829 Py_ssize_t maxcount)
5831 register Py_ssize_t i;
5832 register Py_ssize_t j;
5833 Py_ssize_t len = self->length;
5834 Py_ssize_t sublen = substring->length;
5835 PyObject *str;
5837 for (i = len - sublen, j = len; i >= 0; ) {
5838 if (Py_UNICODE_MATCH(self, i, substring)) {
5839 if (maxcount-- <= 0)
5840 break;
5841 SPLIT_APPEND(self->str, i + sublen, j);
5842 j = i;
5843 i -= sublen;
5844 } else
5845 i--;
5847 if (j >= 0) {
5848 SPLIT_APPEND(self->str, 0, j);
5850 if (PyList_Reverse(list) < 0)
5851 goto onError;
5852 return list;
5854 onError:
5855 Py_DECREF(list);
5856 return NULL;
5859 #undef SPLIT_APPEND
5861 static
5862 PyObject *split(PyUnicodeObject *self,
5863 PyUnicodeObject *substring,
5864 Py_ssize_t maxcount)
5866 PyObject *list;
5868 if (maxcount < 0)
5869 maxcount = PY_SSIZE_T_MAX;
5871 list = PyList_New(0);
5872 if (!list)
5873 return NULL;
5875 if (substring == NULL)
5876 return split_whitespace(self,list,maxcount);
5878 else if (substring->length == 1)
5879 return split_char(self,list,substring->str[0],maxcount);
5881 else if (substring->length == 0) {
5882 Py_DECREF(list);
5883 PyErr_SetString(PyExc_ValueError, "empty separator");
5884 return NULL;
5886 else
5887 return split_substring(self,list,substring,maxcount);
5890 static
5891 PyObject *rsplit(PyUnicodeObject *self,
5892 PyUnicodeObject *substring,
5893 Py_ssize_t maxcount)
5895 PyObject *list;
5897 if (maxcount < 0)
5898 maxcount = PY_SSIZE_T_MAX;
5900 list = PyList_New(0);
5901 if (!list)
5902 return NULL;
5904 if (substring == NULL)
5905 return rsplit_whitespace(self,list,maxcount);
5907 else if (substring->length == 1)
5908 return rsplit_char(self,list,substring->str[0],maxcount);
5910 else if (substring->length == 0) {
5911 Py_DECREF(list);
5912 PyErr_SetString(PyExc_ValueError, "empty separator");
5913 return NULL;
5915 else
5916 return rsplit_substring(self,list,substring,maxcount);
5919 static
5920 PyObject *replace(PyUnicodeObject *self,
5921 PyUnicodeObject *str1,
5922 PyUnicodeObject *str2,
5923 Py_ssize_t maxcount)
5925 PyUnicodeObject *u;
5927 if (maxcount < 0)
5928 maxcount = PY_SSIZE_T_MAX;
5930 if (str1->length == str2->length) {
5931 /* same length */
5932 Py_ssize_t i;
5933 if (str1->length == 1) {
5934 /* replace characters */
5935 Py_UNICODE u1, u2;
5936 if (!findchar(self->str, self->length, str1->str[0]))
5937 goto nothing;
5938 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5939 if (!u)
5940 return NULL;
5941 Py_UNICODE_COPY(u->str, self->str, self->length);
5942 u1 = str1->str[0];
5943 u2 = str2->str[0];
5944 for (i = 0; i < u->length; i++)
5945 if (u->str[i] == u1) {
5946 if (--maxcount < 0)
5947 break;
5948 u->str[i] = u2;
5950 } else {
5951 i = fastsearch(
5952 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5954 if (i < 0)
5955 goto nothing;
5956 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5957 if (!u)
5958 return NULL;
5959 Py_UNICODE_COPY(u->str, self->str, self->length);
5960 while (i <= self->length - str1->length)
5961 if (Py_UNICODE_MATCH(self, i, str1)) {
5962 if (--maxcount < 0)
5963 break;
5964 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5965 i += str1->length;
5966 } else
5967 i++;
5969 } else {
5971 Py_ssize_t n, i, j, e;
5972 Py_ssize_t product, new_size, delta;
5973 Py_UNICODE *p;
5975 /* replace strings */
5976 n = stringlib_count(self->str, self->length, str1->str, str1->length);
5977 if (n > maxcount)
5978 n = maxcount;
5979 if (n == 0)
5980 goto nothing;
5981 /* new_size = self->length + n * (str2->length - str1->length)); */
5982 delta = (str2->length - str1->length);
5983 if (delta == 0) {
5984 new_size = self->length;
5985 } else {
5986 product = n * (str2->length - str1->length);
5987 if ((product / (str2->length - str1->length)) != n) {
5988 PyErr_SetString(PyExc_OverflowError,
5989 "replace string is too long");
5990 return NULL;
5992 new_size = self->length + product;
5993 if (new_size < 0) {
5994 PyErr_SetString(PyExc_OverflowError,
5995 "replace string is too long");
5996 return NULL;
5999 u = _PyUnicode_New(new_size);
6000 if (!u)
6001 return NULL;
6002 i = 0;
6003 p = u->str;
6004 e = self->length - str1->length;
6005 if (str1->length > 0) {
6006 while (n-- > 0) {
6007 /* look for next match */
6008 j = i;
6009 while (j <= e) {
6010 if (Py_UNICODE_MATCH(self, j, str1))
6011 break;
6012 j++;
6014 if (j > i) {
6015 if (j > e)
6016 break;
6017 /* copy unchanged part [i:j] */
6018 Py_UNICODE_COPY(p, self->str+i, j-i);
6019 p += j - i;
6021 /* copy substitution string */
6022 if (str2->length > 0) {
6023 Py_UNICODE_COPY(p, str2->str, str2->length);
6024 p += str2->length;
6026 i = j + str1->length;
6028 if (i < self->length)
6029 /* copy tail [i:] */
6030 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6031 } else {
6032 /* interleave */
6033 while (n > 0) {
6034 Py_UNICODE_COPY(p, str2->str, str2->length);
6035 p += str2->length;
6036 if (--n <= 0)
6037 break;
6038 *p++ = self->str[i++];
6040 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6043 return (PyObject *) u;
6045 nothing:
6046 /* nothing to replace; return original string (when possible) */
6047 if (PyUnicode_CheckExact(self)) {
6048 Py_INCREF(self);
6049 return (PyObject *) self;
6051 return PyUnicode_FromUnicode(self->str, self->length);
6054 /* --- Unicode Object Methods --------------------------------------------- */
6056 PyDoc_STRVAR(title__doc__,
6057 "S.title() -> unicode\n\
6059 Return a titlecased version of S, i.e. words start with title case\n\
6060 characters, all remaining cased characters have lower case.");
6062 static PyObject*
6063 unicode_title(PyUnicodeObject *self)
6065 return fixup(self, fixtitle);
6068 PyDoc_STRVAR(capitalize__doc__,
6069 "S.capitalize() -> unicode\n\
6071 Return a capitalized version of S, i.e. make the first character\n\
6072 have upper case.");
6074 static PyObject*
6075 unicode_capitalize(PyUnicodeObject *self)
6077 return fixup(self, fixcapitalize);
#if 0
/* Disabled code: kept for reference, not compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item = NULL;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* replace the list slot: drop the old word, store the new one */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* item is NULL here if fixup() failed, so NULL is returned */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6118 /* Argument converter. Coerces to a single unicode character */
6120 static int
6121 convert_uc(PyObject *obj, void *addr)
6123 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6124 PyObject *uniobj;
6125 Py_UNICODE *unistr;
6127 uniobj = PyUnicode_FromObject(obj);
6128 if (uniobj == NULL) {
6129 PyErr_SetString(PyExc_TypeError,
6130 "The fill character cannot be converted to Unicode");
6131 return 0;
6133 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6134 PyErr_SetString(PyExc_TypeError,
6135 "The fill character must be exactly one character long");
6136 Py_DECREF(uniobj);
6137 return 0;
6139 unistr = PyUnicode_AS_UNICODE(uniobj);
6140 *fillcharloc = unistr[0];
6141 Py_DECREF(uniobj);
6142 return 1;
6145 PyDoc_STRVAR(center__doc__,
6146 "S.center(width[, fillchar]) -> unicode\n\
6148 Return S centered in a Unicode string of length width. Padding is\n\
6149 done using the specified fill character (default is a space)");
6151 static PyObject *
6152 unicode_center(PyUnicodeObject *self, PyObject *args)
6154 Py_ssize_t marg, left;
6155 Py_ssize_t width;
6156 Py_UNICODE fillchar = ' ';
6158 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6159 return NULL;
6161 if (self->length >= width && PyUnicode_CheckExact(self)) {
6162 Py_INCREF(self);
6163 return (PyObject*) self;
6166 marg = width - self->length;
6167 left = marg / 2 + (marg & width & 1);
6169 return (PyObject*) pad(self, left, marg - left, fillchar);
6172 #if 0
6174 /* This code should go into some future Unicode collation support
6175 module. The basic comparison should compare ordinals on a naive
6176 basis (this is what Java does and thus JPython too). */
6178 /* speedy UTF-16 code point order comparison */
6179 /* gleaned from: */
6180 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6182 static short utf16Fixup[32] =
6184 0, 0, 0, 0, 0, 0, 0, 0,
6185 0, 0, 0, 0, 0, 0, 0, 0,
6186 0, 0, 0, 0, 0, 0, 0, 0,
6187 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6190 static int
6191 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6193 Py_ssize_t len1, len2;
6195 Py_UNICODE *s1 = str1->str;
6196 Py_UNICODE *s2 = str2->str;
6198 len1 = str1->length;
6199 len2 = str2->length;
6201 while (len1 > 0 && len2 > 0) {
6202 Py_UNICODE c1, c2;
6204 c1 = *s1++;
6205 c2 = *s2++;
6207 if (c1 > (1<<11) * 26)
6208 c1 += utf16Fixup[c1>>11];
6209 if (c2 > (1<<11) * 26)
6210 c2 += utf16Fixup[c2>>11];
6211 /* now c1 and c2 are in UTF-32-compatible order */
6213 if (c1 != c2)
6214 return (c1 < c2) ? -1 : 1;
6216 len1--; len2--;
6219 return (len1 < len2) ? -1 : (len1 != len2);
6222 #else
6224 static int
6225 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6227 register Py_ssize_t len1, len2;
6229 Py_UNICODE *s1 = str1->str;
6230 Py_UNICODE *s2 = str2->str;
6232 len1 = str1->length;
6233 len2 = str2->length;
6235 while (len1 > 0 && len2 > 0) {
6236 Py_UNICODE c1, c2;
6238 c1 = *s1++;
6239 c2 = *s2++;
6241 if (c1 != c2)
6242 return (c1 < c2) ? -1 : 1;
6244 len1--; len2--;
6247 return (len1 < len2) ? -1 : (len1 != len2);
6250 #endif
6252 int PyUnicode_Compare(PyObject *left,
6253 PyObject *right)
6255 PyUnicodeObject *u = NULL, *v = NULL;
6256 int result;
6258 /* Coerce the two arguments */
6259 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6260 if (u == NULL)
6261 goto onError;
6262 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6263 if (v == NULL)
6264 goto onError;
6266 /* Shortcut for empty or interned objects */
6267 if (v == u) {
6268 Py_DECREF(u);
6269 Py_DECREF(v);
6270 return 0;
6273 result = unicode_compare(u, v);
6275 Py_DECREF(u);
6276 Py_DECREF(v);
6277 return result;
6279 onError:
6280 Py_XDECREF(u);
6281 Py_XDECREF(v);
6282 return -1;
6285 PyObject *PyUnicode_RichCompare(PyObject *left,
6286 PyObject *right,
6287 int op)
6289 int result;
6291 result = PyUnicode_Compare(left, right);
6292 if (result == -1 && PyErr_Occurred())
6293 goto onError;
6295 /* Convert the return value to a Boolean */
6296 switch (op) {
6297 case Py_EQ:
6298 result = (result == 0);
6299 break;
6300 case Py_NE:
6301 result = (result != 0);
6302 break;
6303 case Py_LE:
6304 result = (result <= 0);
6305 break;
6306 case Py_GE:
6307 result = (result >= 0);
6308 break;
6309 case Py_LT:
6310 result = (result == -1);
6311 break;
6312 case Py_GT:
6313 result = (result == 1);
6314 break;
6316 return PyBool_FromLong(result);
6318 onError:
6320 /* Standard case
6322 Type errors mean that PyUnicode_FromObject() could not convert
6323 one of the arguments (usually the right hand side) to Unicode,
6324 ie. we can't handle the comparison request. However, it is
6325 possible that the other object knows a comparison method, which
6326 is why we return Py_NotImplemented to give the other object a
6327 chance.
6330 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6331 PyErr_Clear();
6332 Py_INCREF(Py_NotImplemented);
6333 return Py_NotImplemented;
6335 if (op != Py_EQ && op != Py_NE)
6336 return NULL;
6338 /* Equality comparison.
6340 This is a special case: we silence any PyExc_UnicodeDecodeError
6341 and instead turn it into a PyErr_UnicodeWarning.
6344 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6345 return NULL;
6346 PyErr_Clear();
6347 if (PyErr_Warn(PyExc_UnicodeWarning,
6348 (op == Py_EQ) ?
6349 "Unicode equal comparison "
6350 "failed to convert both arguments to Unicode - "
6351 "interpreting them as being unequal" :
6352 "Unicode unequal comparison "
6353 "failed to convert both arguments to Unicode - "
6354 "interpreting them as being unequal"
6355 ) < 0)
6356 return NULL;
6357 result = (op == Py_NE);
6358 return PyBool_FromLong(result);
6361 int PyUnicode_Contains(PyObject *container,
6362 PyObject *element)
6364 PyObject *str, *sub;
6365 int result;
6367 /* Coerce the two arguments */
6368 sub = PyUnicode_FromObject(element);
6369 if (!sub) {
6370 PyErr_SetString(PyExc_TypeError,
6371 "'in <string>' requires string as left operand");
6372 return -1;
6375 str = PyUnicode_FromObject(container);
6376 if (!str) {
6377 Py_DECREF(sub);
6378 return -1;
6381 result = stringlib_contains_obj(str, sub);
6383 Py_DECREF(str);
6384 Py_DECREF(sub);
6386 return result;
6389 /* Concat to string or Unicode object giving a new Unicode object. */
6391 PyObject *PyUnicode_Concat(PyObject *left,
6392 PyObject *right)
6394 PyUnicodeObject *u = NULL, *v = NULL, *w;
6396 /* Coerce the two arguments */
6397 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6398 if (u == NULL)
6399 goto onError;
6400 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6401 if (v == NULL)
6402 goto onError;
6404 /* Shortcuts */
6405 if (v == unicode_empty) {
6406 Py_DECREF(v);
6407 return (PyObject *)u;
6409 if (u == unicode_empty) {
6410 Py_DECREF(u);
6411 return (PyObject *)v;
6414 /* Concat the two Unicode strings */
6415 w = _PyUnicode_New(u->length + v->length);
6416 if (w == NULL)
6417 goto onError;
6418 Py_UNICODE_COPY(w->str, u->str, u->length);
6419 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6421 Py_DECREF(u);
6422 Py_DECREF(v);
6423 return (PyObject *)w;
6425 onError:
6426 Py_XDECREF(u);
6427 Py_XDECREF(v);
6428 return NULL;
6431 PyDoc_STRVAR(count__doc__,
6432 "S.count(sub[, start[, end]]) -> int\n\
6434 Return the number of non-overlapping occurrences of substring sub in\n\
6435 Unicode string S[start:end]. Optional arguments start and end are\n\
6436 interpreted as in slice notation.");
6438 static PyObject *
6439 unicode_count(PyUnicodeObject *self, PyObject *args)
6441 PyUnicodeObject *substring;
6442 Py_ssize_t start = 0;
6443 Py_ssize_t end = PY_SSIZE_T_MAX;
6444 PyObject *result;
6446 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6447 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6448 return NULL;
6450 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6451 (PyObject *)substring);
6452 if (substring == NULL)
6453 return NULL;
6455 FIX_START_END(self);
6457 result = PyInt_FromSsize_t(
6458 stringlib_count(self->str + start, end - start,
6459 substring->str, substring->length)
6462 Py_DECREF(substring);
6464 return result;
6467 PyDoc_STRVAR(encode__doc__,
6468 "S.encode([encoding[,errors]]) -> string or unicode\n\
6470 Encodes S using the codec registered for encoding. encoding defaults\n\
6471 to the default encoding. errors may be given to set a different error\n\
6472 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6473 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6474 'xmlcharrefreplace' as well as any other name registered with\n\
6475 codecs.register_error that can handle UnicodeEncodeErrors.");
6477 static PyObject *
6478 unicode_encode(PyUnicodeObject *self, PyObject *args)
6480 char *encoding = NULL;
6481 char *errors = NULL;
6482 PyObject *v;
6484 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6485 return NULL;
6486 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6487 if (v == NULL)
6488 goto onError;
6489 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6490 PyErr_Format(PyExc_TypeError,
6491 "encoder did not return a string/unicode object "
6492 "(type=%.400s)",
6493 Py_TYPE(v)->tp_name);
6494 Py_DECREF(v);
6495 return NULL;
6497 return v;
6499 onError:
6500 return NULL;
6503 PyDoc_STRVAR(decode__doc__,
6504 "S.decode([encoding[,errors]]) -> string or unicode\n\
6506 Decodes S using the codec registered for encoding. encoding defaults\n\
6507 to the default encoding. errors may be given to set a different error\n\
6508 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6509 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6510 as well as any other name registerd with codecs.register_error that is\n\
6511 able to handle UnicodeDecodeErrors.");
6513 static PyObject *
6514 unicode_decode(PyUnicodeObject *self, PyObject *args)
6516 char *encoding = NULL;
6517 char *errors = NULL;
6518 PyObject *v;
6520 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6521 return NULL;
6522 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6523 if (v == NULL)
6524 goto onError;
6525 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6526 PyErr_Format(PyExc_TypeError,
6527 "decoder did not return a string/unicode object "
6528 "(type=%.400s)",
6529 Py_TYPE(v)->tp_name);
6530 Py_DECREF(v);
6531 return NULL;
6533 return v;
6535 onError:
6536 return NULL;
6539 PyDoc_STRVAR(expandtabs__doc__,
6540 "S.expandtabs([tabsize]) -> unicode\n\
6542 Return a copy of S where all tab characters are expanded using spaces.\n\
6543 If tabsize is not given, a tab size of 8 characters is assumed.");
6545 static PyObject*
6546 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6548 Py_UNICODE *e;
6549 Py_UNICODE *p;
6550 Py_UNICODE *q;
6551 Py_UNICODE *qe;
6552 Py_ssize_t i, j, incr;
6553 PyUnicodeObject *u;
6554 int tabsize = 8;
6556 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6557 return NULL;
6559 /* First pass: determine size of output string */
6560 i = 0; /* chars up to and including most recent \n or \r */
6561 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6562 e = self->str + self->length; /* end of input */
6563 for (p = self->str; p < e; p++)
6564 if (*p == '\t') {
6565 if (tabsize > 0) {
6566 incr = tabsize - (j % tabsize); /* cannot overflow */
6567 if (j > PY_SSIZE_T_MAX - incr)
6568 goto overflow1;
6569 j += incr;
6572 else {
6573 if (j > PY_SSIZE_T_MAX - 1)
6574 goto overflow1;
6575 j++;
6576 if (*p == '\n' || *p == '\r') {
6577 if (i > PY_SSIZE_T_MAX - j)
6578 goto overflow1;
6579 i += j;
6580 j = 0;
6584 if (i > PY_SSIZE_T_MAX - j)
6585 goto overflow1;
6587 /* Second pass: create output string and fill it */
6588 u = _PyUnicode_New(i + j);
6589 if (!u)
6590 return NULL;
6592 j = 0; /* same as in first pass */
6593 q = u->str; /* next output char */
6594 qe = u->str + u->length; /* end of output */
6596 for (p = self->str; p < e; p++)
6597 if (*p == '\t') {
6598 if (tabsize > 0) {
6599 i = tabsize - (j % tabsize);
6600 j += i;
6601 while (i--) {
6602 if (q >= qe)
6603 goto overflow2;
6604 *q++ = ' ';
6608 else {
6609 if (q >= qe)
6610 goto overflow2;
6611 *q++ = *p;
6612 j++;
6613 if (*p == '\n' || *p == '\r')
6614 j = 0;
6617 return (PyObject*) u;
6619 overflow2:
6620 Py_DECREF(u);
6621 overflow1:
6622 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6623 return NULL;
6626 PyDoc_STRVAR(find__doc__,
6627 "S.find(sub [,start [,end]]) -> int\n\
6629 Return the lowest index in S where substring sub is found,\n\
6630 such that sub is contained within s[start:end]. Optional\n\
6631 arguments start and end are interpreted as in slice notation.\n\
6633 Return -1 on failure.");
6635 static PyObject *
6636 unicode_find(PyUnicodeObject *self, PyObject *args)
6638 PyObject *substring;
6639 Py_ssize_t start;
6640 Py_ssize_t end;
6641 Py_ssize_t result;
6643 if (!_ParseTupleFinds(args, &substring, &start, &end))
6644 return NULL;
6646 result = stringlib_find_slice(
6647 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6648 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6649 start, end
6652 Py_DECREF(substring);
6654 return PyInt_FromSsize_t(result);
6657 static PyObject *
6658 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6660 if (index < 0 || index >= self->length) {
6661 PyErr_SetString(PyExc_IndexError, "string index out of range");
6662 return NULL;
6665 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6668 static long
6669 unicode_hash(PyUnicodeObject *self)
6671 /* Since Unicode objects compare equal to their ASCII string
6672 counterparts, they should use the individual character values
6673 as basis for their hash value. This is needed to assure that
6674 strings and Unicode objects behave in the same way as
6675 dictionary keys. */
6677 register Py_ssize_t len;
6678 register Py_UNICODE *p;
6679 register long x;
6681 if (self->hash != -1)
6682 return self->hash;
6683 len = PyUnicode_GET_SIZE(self);
6684 p = PyUnicode_AS_UNICODE(self);
6685 x = *p << 7;
6686 while (--len >= 0)
6687 x = (1000003*x) ^ *p++;
6688 x ^= PyUnicode_GET_SIZE(self);
6689 if (x == -1)
6690 x = -2;
6691 self->hash = x;
6692 return x;
6695 PyDoc_STRVAR(index__doc__,
6696 "S.index(sub [,start [,end]]) -> int\n\
6698 Like S.find() but raise ValueError when the substring is not found.");
6700 static PyObject *
6701 unicode_index(PyUnicodeObject *self, PyObject *args)
6703 Py_ssize_t result;
6704 PyObject *substring;
6705 Py_ssize_t start;
6706 Py_ssize_t end;
6708 if (!_ParseTupleFinds(args, &substring, &start, &end))
6709 return NULL;
6711 result = stringlib_find_slice(
6712 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6713 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6714 start, end
6717 Py_DECREF(substring);
6719 if (result < 0) {
6720 PyErr_SetString(PyExc_ValueError, "substring not found");
6721 return NULL;
6724 return PyInt_FromSsize_t(result);
6727 PyDoc_STRVAR(islower__doc__,
6728 "S.islower() -> bool\n\
6730 Return True if all cased characters in S are lowercase and there is\n\
6731 at least one cased character in S, False otherwise.");
6733 static PyObject*
6734 unicode_islower(PyUnicodeObject *self)
6736 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6737 register const Py_UNICODE *e;
6738 int cased;
6740 /* Shortcut for single character strings */
6741 if (PyUnicode_GET_SIZE(self) == 1)
6742 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6744 /* Special case for empty strings */
6745 if (PyUnicode_GET_SIZE(self) == 0)
6746 return PyBool_FromLong(0);
6748 e = p + PyUnicode_GET_SIZE(self);
6749 cased = 0;
6750 for (; p < e; p++) {
6751 register const Py_UNICODE ch = *p;
6753 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6754 return PyBool_FromLong(0);
6755 else if (!cased && Py_UNICODE_ISLOWER(ch))
6756 cased = 1;
6758 return PyBool_FromLong(cased);
6761 PyDoc_STRVAR(isupper__doc__,
6762 "S.isupper() -> bool\n\
6764 Return True if all cased characters in S are uppercase and there is\n\
6765 at least one cased character in S, False otherwise.");
6767 static PyObject*
6768 unicode_isupper(PyUnicodeObject *self)
6770 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6771 register const Py_UNICODE *e;
6772 int cased;
6774 /* Shortcut for single character strings */
6775 if (PyUnicode_GET_SIZE(self) == 1)
6776 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6778 /* Special case for empty strings */
6779 if (PyUnicode_GET_SIZE(self) == 0)
6780 return PyBool_FromLong(0);
6782 e = p + PyUnicode_GET_SIZE(self);
6783 cased = 0;
6784 for (; p < e; p++) {
6785 register const Py_UNICODE ch = *p;
6787 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6788 return PyBool_FromLong(0);
6789 else if (!cased && Py_UNICODE_ISUPPER(ch))
6790 cased = 1;
6792 return PyBool_FromLong(cased);
6795 PyDoc_STRVAR(istitle__doc__,
6796 "S.istitle() -> bool\n\
6798 Return True if S is a titlecased string and there is at least one\n\
6799 character in S, i.e. upper- and titlecase characters may only\n\
6800 follow uncased characters and lowercase characters only cased ones.\n\
6801 Return False otherwise.");
6803 static PyObject*
6804 unicode_istitle(PyUnicodeObject *self)
6806 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6807 register const Py_UNICODE *e;
6808 int cased, previous_is_cased;
6810 /* Shortcut for single character strings */
6811 if (PyUnicode_GET_SIZE(self) == 1)
6812 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6813 (Py_UNICODE_ISUPPER(*p) != 0));
6815 /* Special case for empty strings */
6816 if (PyUnicode_GET_SIZE(self) == 0)
6817 return PyBool_FromLong(0);
6819 e = p + PyUnicode_GET_SIZE(self);
6820 cased = 0;
6821 previous_is_cased = 0;
6822 for (; p < e; p++) {
6823 register const Py_UNICODE ch = *p;
6825 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6826 if (previous_is_cased)
6827 return PyBool_FromLong(0);
6828 previous_is_cased = 1;
6829 cased = 1;
6831 else if (Py_UNICODE_ISLOWER(ch)) {
6832 if (!previous_is_cased)
6833 return PyBool_FromLong(0);
6834 previous_is_cased = 1;
6835 cased = 1;
6837 else
6838 previous_is_cased = 0;
6840 return PyBool_FromLong(cased);
6843 PyDoc_STRVAR(isspace__doc__,
6844 "S.isspace() -> bool\n\
6846 Return True if all characters in S are whitespace\n\
6847 and there is at least one character in S, False otherwise.");
6849 static PyObject*
6850 unicode_isspace(PyUnicodeObject *self)
6852 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6853 register const Py_UNICODE *e;
6855 /* Shortcut for single character strings */
6856 if (PyUnicode_GET_SIZE(self) == 1 &&
6857 Py_UNICODE_ISSPACE(*p))
6858 return PyBool_FromLong(1);
6860 /* Special case for empty strings */
6861 if (PyUnicode_GET_SIZE(self) == 0)
6862 return PyBool_FromLong(0);
6864 e = p + PyUnicode_GET_SIZE(self);
6865 for (; p < e; p++) {
6866 if (!Py_UNICODE_ISSPACE(*p))
6867 return PyBool_FromLong(0);
6869 return PyBool_FromLong(1);
6872 PyDoc_STRVAR(isalpha__doc__,
6873 "S.isalpha() -> bool\n\
6875 Return True if all characters in S are alphabetic\n\
6876 and there is at least one character in S, False otherwise.");
6878 static PyObject*
6879 unicode_isalpha(PyUnicodeObject *self)
6881 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6882 register const Py_UNICODE *e;
6884 /* Shortcut for single character strings */
6885 if (PyUnicode_GET_SIZE(self) == 1 &&
6886 Py_UNICODE_ISALPHA(*p))
6887 return PyBool_FromLong(1);
6889 /* Special case for empty strings */
6890 if (PyUnicode_GET_SIZE(self) == 0)
6891 return PyBool_FromLong(0);
6893 e = p + PyUnicode_GET_SIZE(self);
6894 for (; p < e; p++) {
6895 if (!Py_UNICODE_ISALPHA(*p))
6896 return PyBool_FromLong(0);
6898 return PyBool_FromLong(1);
6901 PyDoc_STRVAR(isalnum__doc__,
6902 "S.isalnum() -> bool\n\
6904 Return True if all characters in S are alphanumeric\n\
6905 and there is at least one character in S, False otherwise.");
6907 static PyObject*
6908 unicode_isalnum(PyUnicodeObject *self)
6910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6911 register const Py_UNICODE *e;
6913 /* Shortcut for single character strings */
6914 if (PyUnicode_GET_SIZE(self) == 1 &&
6915 Py_UNICODE_ISALNUM(*p))
6916 return PyBool_FromLong(1);
6918 /* Special case for empty strings */
6919 if (PyUnicode_GET_SIZE(self) == 0)
6920 return PyBool_FromLong(0);
6922 e = p + PyUnicode_GET_SIZE(self);
6923 for (; p < e; p++) {
6924 if (!Py_UNICODE_ISALNUM(*p))
6925 return PyBool_FromLong(0);
6927 return PyBool_FromLong(1);
6930 PyDoc_STRVAR(isdecimal__doc__,
6931 "S.isdecimal() -> bool\n\
6933 Return True if there are only decimal characters in S,\n\
6934 False otherwise.");
6936 static PyObject*
6937 unicode_isdecimal(PyUnicodeObject *self)
6939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6940 register const Py_UNICODE *e;
6942 /* Shortcut for single character strings */
6943 if (PyUnicode_GET_SIZE(self) == 1 &&
6944 Py_UNICODE_ISDECIMAL(*p))
6945 return PyBool_FromLong(1);
6947 /* Special case for empty strings */
6948 if (PyUnicode_GET_SIZE(self) == 0)
6949 return PyBool_FromLong(0);
6951 e = p + PyUnicode_GET_SIZE(self);
6952 for (; p < e; p++) {
6953 if (!Py_UNICODE_ISDECIMAL(*p))
6954 return PyBool_FromLong(0);
6956 return PyBool_FromLong(1);
6959 PyDoc_STRVAR(isdigit__doc__,
6960 "S.isdigit() -> bool\n\
6962 Return True if all characters in S are digits\n\
6963 and there is at least one character in S, False otherwise.");
6965 static PyObject*
6966 unicode_isdigit(PyUnicodeObject *self)
6968 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6969 register const Py_UNICODE *e;
6971 /* Shortcut for single character strings */
6972 if (PyUnicode_GET_SIZE(self) == 1 &&
6973 Py_UNICODE_ISDIGIT(*p))
6974 return PyBool_FromLong(1);
6976 /* Special case for empty strings */
6977 if (PyUnicode_GET_SIZE(self) == 0)
6978 return PyBool_FromLong(0);
6980 e = p + PyUnicode_GET_SIZE(self);
6981 for (; p < e; p++) {
6982 if (!Py_UNICODE_ISDIGIT(*p))
6983 return PyBool_FromLong(0);
6985 return PyBool_FromLong(1);
6988 PyDoc_STRVAR(isnumeric__doc__,
6989 "S.isnumeric() -> bool\n\
6991 Return True if there are only numeric characters in S,\n\
6992 False otherwise.");
6994 static PyObject*
6995 unicode_isnumeric(PyUnicodeObject *self)
6997 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6998 register const Py_UNICODE *e;
7000 /* Shortcut for single character strings */
7001 if (PyUnicode_GET_SIZE(self) == 1 &&
7002 Py_UNICODE_ISNUMERIC(*p))
7003 return PyBool_FromLong(1);
7005 /* Special case for empty strings */
7006 if (PyUnicode_GET_SIZE(self) == 0)
7007 return PyBool_FromLong(0);
7009 e = p + PyUnicode_GET_SIZE(self);
7010 for (; p < e; p++) {
7011 if (!Py_UNICODE_ISNUMERIC(*p))
7012 return PyBool_FromLong(0);
7014 return PyBool_FromLong(1);
7017 PyDoc_STRVAR(join__doc__,
7018 "S.join(sequence) -> unicode\n\
7020 Return a string which is the concatenation of the strings in the\n\
7021 sequence. The separator between elements is S.");
7023 static PyObject*
7024 unicode_join(PyObject *self, PyObject *data)
7026 return PyUnicode_Join(self, data);
7029 static Py_ssize_t
7030 unicode_length(PyUnicodeObject *self)
7032 return self->length;
7035 PyDoc_STRVAR(ljust__doc__,
7036 "S.ljust(width[, fillchar]) -> int\n\
7038 Return S left justified in a Unicode string of length width. Padding is\n\
7039 done using the specified fill character (default is a space).");
7041 static PyObject *
7042 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7044 Py_ssize_t width;
7045 Py_UNICODE fillchar = ' ';
7047 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7048 return NULL;
7050 if (self->length >= width && PyUnicode_CheckExact(self)) {
7051 Py_INCREF(self);
7052 return (PyObject*) self;
7055 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7058 PyDoc_STRVAR(lower__doc__,
7059 "S.lower() -> unicode\n\
7061 Return a copy of the string S converted to lowercase.");
7063 static PyObject*
7064 unicode_lower(PyUnicodeObject *self)
7066 return fixup(self, fixlower);
/* Strip modes; each value is also an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for mode i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7078 /* externally visible for str.strip(unicode) */
7079 PyObject *
7080 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7082 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7083 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7084 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7085 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7086 Py_ssize_t i, j;
7088 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7090 i = 0;
7091 if (striptype != RIGHTSTRIP) {
7092 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7093 i++;
7097 j = len;
7098 if (striptype != LEFTSTRIP) {
7099 do {
7100 j--;
7101 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7102 j++;
7105 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7106 Py_INCREF(self);
7107 return (PyObject*)self;
7109 else
7110 return PyUnicode_FromUnicode(s+i, j-i);
7114 static PyObject *
7115 do_strip(PyUnicodeObject *self, int striptype)
7117 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7118 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7120 i = 0;
7121 if (striptype != RIGHTSTRIP) {
7122 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7123 i++;
7127 j = len;
7128 if (striptype != LEFTSTRIP) {
7129 do {
7130 j--;
7131 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7132 j++;
7135 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7136 Py_INCREF(self);
7137 return (PyObject*)self;
7139 else
7140 return PyUnicode_FromUnicode(s+i, j-i);
7144 static PyObject *
7145 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7147 PyObject *sep = NULL;
7149 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7150 return NULL;
7152 if (sep != NULL && sep != Py_None) {
7153 if (PyUnicode_Check(sep))
7154 return _PyUnicode_XStrip(self, striptype, sep);
7155 else if (PyString_Check(sep)) {
7156 PyObject *res;
7157 sep = PyUnicode_FromObject(sep);
7158 if (sep==NULL)
7159 return NULL;
7160 res = _PyUnicode_XStrip(self, striptype, sep);
7161 Py_DECREF(sep);
7162 return res;
7164 else {
7165 PyErr_Format(PyExc_TypeError,
7166 "%s arg must be None, unicode or str",
7167 STRIPNAME(striptype));
7168 return NULL;
7172 return do_strip(self, striptype);
7176 PyDoc_STRVAR(strip__doc__,
7177 "S.strip([chars]) -> unicode\n\
7179 Return a copy of the string S with leading and trailing\n\
7180 whitespace removed.\n\
7181 If chars is given and not None, remove characters in chars instead.\n\
7182 If chars is a str, it will be converted to unicode before stripping");
7184 static PyObject *
7185 unicode_strip(PyUnicodeObject *self, PyObject *args)
7187 if (PyTuple_GET_SIZE(args) == 0)
7188 return do_strip(self, BOTHSTRIP); /* Common case */
7189 else
7190 return do_argstrip(self, BOTHSTRIP, args);
7194 PyDoc_STRVAR(lstrip__doc__,
7195 "S.lstrip([chars]) -> unicode\n\
7197 Return a copy of the string S with leading whitespace removed.\n\
7198 If chars is given and not None, remove characters in chars instead.\n\
7199 If chars is a str, it will be converted to unicode before stripping");
7201 static PyObject *
7202 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7204 if (PyTuple_GET_SIZE(args) == 0)
7205 return do_strip(self, LEFTSTRIP); /* Common case */
7206 else
7207 return do_argstrip(self, LEFTSTRIP, args);
7211 PyDoc_STRVAR(rstrip__doc__,
7212 "S.rstrip([chars]) -> unicode\n\
7214 Return a copy of the string S with trailing whitespace removed.\n\
7215 If chars is given and not None, remove characters in chars instead.\n\
7216 If chars is a str, it will be converted to unicode before stripping");
7218 static PyObject *
7219 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7221 if (PyTuple_GET_SIZE(args) == 0)
7222 return do_strip(self, RIGHTSTRIP); /* Common case */
7223 else
7224 return do_argstrip(self, RIGHTSTRIP, args);
7228 static PyObject*
7229 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7231 PyUnicodeObject *u;
7232 Py_UNICODE *p;
7233 Py_ssize_t nchars;
7234 size_t nbytes;
7236 if (len < 0)
7237 len = 0;
7239 if (len == 1 && PyUnicode_CheckExact(str)) {
7240 /* no repeat, return original string */
7241 Py_INCREF(str);
7242 return (PyObject*) str;
7245 /* ensure # of chars needed doesn't overflow int and # of bytes
7246 * needed doesn't overflow size_t
7248 nchars = len * str->length;
7249 if (len && nchars / len != str->length) {
7250 PyErr_SetString(PyExc_OverflowError,
7251 "repeated string is too long");
7252 return NULL;
7254 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7255 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7256 PyErr_SetString(PyExc_OverflowError,
7257 "repeated string is too long");
7258 return NULL;
7260 u = _PyUnicode_New(nchars);
7261 if (!u)
7262 return NULL;
7264 p = u->str;
7266 if (str->length == 1 && len > 0) {
7267 Py_UNICODE_FILL(p, str->str[0], len);
7268 } else {
7269 Py_ssize_t done = 0; /* number of characters copied this far */
7270 if (done < nchars) {
7271 Py_UNICODE_COPY(p, str->str, str->length);
7272 done = str->length;
7274 while (done < nchars) {
7275 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7276 Py_UNICODE_COPY(p+done, p, n);
7277 done += n;
7281 return (PyObject*) u;
7284 PyObject *PyUnicode_Replace(PyObject *obj,
7285 PyObject *subobj,
7286 PyObject *replobj,
7287 Py_ssize_t maxcount)
7289 PyObject *self;
7290 PyObject *str1;
7291 PyObject *str2;
7292 PyObject *result;
7294 self = PyUnicode_FromObject(obj);
7295 if (self == NULL)
7296 return NULL;
7297 str1 = PyUnicode_FromObject(subobj);
7298 if (str1 == NULL) {
7299 Py_DECREF(self);
7300 return NULL;
7302 str2 = PyUnicode_FromObject(replobj);
7303 if (str2 == NULL) {
7304 Py_DECREF(self);
7305 Py_DECREF(str1);
7306 return NULL;
7308 result = replace((PyUnicodeObject *)self,
7309 (PyUnicodeObject *)str1,
7310 (PyUnicodeObject *)str2,
7311 maxcount);
7312 Py_DECREF(self);
7313 Py_DECREF(str1);
7314 Py_DECREF(str2);
7315 return result;
7318 PyDoc_STRVAR(replace__doc__,
7319 "S.replace (old, new[, count]) -> unicode\n\
7321 Return a copy of S with all occurrences of substring\n\
7322 old replaced by new. If the optional argument count is\n\
7323 given, only the first count occurrences are replaced.");
7325 static PyObject*
7326 unicode_replace(PyUnicodeObject *self, PyObject *args)
7328 PyUnicodeObject *str1;
7329 PyUnicodeObject *str2;
7330 Py_ssize_t maxcount = -1;
7331 PyObject *result;
7333 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7334 return NULL;
7335 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7336 if (str1 == NULL)
7337 return NULL;
7338 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7339 if (str2 == NULL) {
7340 Py_DECREF(str1);
7341 return NULL;
7344 result = replace(self, str1, str2, maxcount);
7346 Py_DECREF(str1);
7347 Py_DECREF(str2);
7348 return result;
/* repr() slot: passes the object's character buffer and size to
   unicodeescape_string() and returns its result.
   NOTE(review): the final argument of the call and the closing lines
   are not visible in this extract -- confirm against the full file. */
7351 static
7352 PyObject *unicode_repr(PyObject *unicode)
7354 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7355 PyUnicode_GET_SIZE(unicode),
7359 PyDoc_STRVAR(rfind__doc__,
7360 "S.rfind(sub [,start [,end]]) -> int\n\
7362 Return the highest index in S where substring sub is found,\n\
7363 such that sub is contained within s[start:end]. Optional\n\
7364 arguments start and end are interpreted as in slice notation.\n\
7366 Return -1 on failure.");
7368 static PyObject *
7369 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7371 PyObject *substring;
7372 Py_ssize_t start;
7373 Py_ssize_t end;
7374 Py_ssize_t result;
7376 if (!_ParseTupleFinds(args, &substring, &start, &end))
7377 return NULL;
7379 result = stringlib_rfind_slice(
7380 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7381 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7382 start, end
7385 Py_DECREF(substring);
7387 return PyInt_FromSsize_t(result);
7390 PyDoc_STRVAR(rindex__doc__,
7391 "S.rindex(sub [,start [,end]]) -> int\n\
7393 Like S.rfind() but raise ValueError when the substring is not found.");
7395 static PyObject *
7396 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7398 PyObject *substring;
7399 Py_ssize_t start;
7400 Py_ssize_t end;
7401 Py_ssize_t result;
7403 if (!_ParseTupleFinds(args, &substring, &start, &end))
7404 return NULL;
7406 result = stringlib_rfind_slice(
7407 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7408 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7409 start, end
7412 Py_DECREF(substring);
7414 if (result < 0) {
7415 PyErr_SetString(PyExc_ValueError, "substring not found");
7416 return NULL;
7418 return PyInt_FromSsize_t(result);
7421 PyDoc_STRVAR(rjust__doc__,
7422 "S.rjust(width[, fillchar]) -> unicode\n\
7424 Return S right justified in a Unicode string of length width. Padding is\n\
7425 done using the specified fill character (default is a space).");
7427 static PyObject *
7428 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7430 Py_ssize_t width;
7431 Py_UNICODE fillchar = ' ';
7433 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7434 return NULL;
7436 if (self->length >= width && PyUnicode_CheckExact(self)) {
7437 Py_INCREF(self);
7438 return (PyObject*) self;
7441 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7444 static PyObject*
7445 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7447 /* standard clamping */
7448 if (start < 0)
7449 start = 0;
7450 if (end < 0)
7451 end = 0;
7452 if (end > self->length)
7453 end = self->length;
7454 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7455 /* full slice, return original string */
7456 Py_INCREF(self);
7457 return (PyObject*) self;
7459 if (start > end)
7460 start = end;
7461 /* copy slice */
7462 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7463 end - start);
7466 PyObject *PyUnicode_Split(PyObject *s,
7467 PyObject *sep,
7468 Py_ssize_t maxsplit)
7470 PyObject *result;
7472 s = PyUnicode_FromObject(s);
7473 if (s == NULL)
7474 return NULL;
7475 if (sep != NULL) {
7476 sep = PyUnicode_FromObject(sep);
7477 if (sep == NULL) {
7478 Py_DECREF(s);
7479 return NULL;
7483 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7485 Py_DECREF(s);
7486 Py_XDECREF(sep);
7487 return result;
7490 PyDoc_STRVAR(split__doc__,
7491 "S.split([sep [,maxsplit]]) -> list of strings\n\
7493 Return a list of the words in S, using sep as the\n\
7494 delimiter string. If maxsplit is given, at most maxsplit\n\
7495 splits are done. If sep is not specified or is None, any\n\
7496 whitespace string is a separator and empty strings are\n\
7497 removed from the result.");
7499 static PyObject*
7500 unicode_split(PyUnicodeObject *self, PyObject *args)
7502 PyObject *substring = Py_None;
7503 Py_ssize_t maxcount = -1;
7505 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7506 return NULL;
7508 if (substring == Py_None)
7509 return split(self, NULL, maxcount);
7510 else if (PyUnicode_Check(substring))
7511 return split(self, (PyUnicodeObject *)substring, maxcount);
7512 else
7513 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7516 PyObject *
7517 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7519 PyObject* str_obj;
7520 PyObject* sep_obj;
7521 PyObject* out;
7523 str_obj = PyUnicode_FromObject(str_in);
7524 if (!str_obj)
7525 return NULL;
7526 sep_obj = PyUnicode_FromObject(sep_in);
7527 if (!sep_obj) {
7528 Py_DECREF(str_obj);
7529 return NULL;
7532 out = stringlib_partition(
7533 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7534 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7537 Py_DECREF(sep_obj);
7538 Py_DECREF(str_obj);
7540 return out;
7544 PyObject *
7545 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7547 PyObject* str_obj;
7548 PyObject* sep_obj;
7549 PyObject* out;
7551 str_obj = PyUnicode_FromObject(str_in);
7552 if (!str_obj)
7553 return NULL;
7554 sep_obj = PyUnicode_FromObject(sep_in);
7555 if (!sep_obj) {
7556 Py_DECREF(str_obj);
7557 return NULL;
7560 out = stringlib_rpartition(
7561 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7562 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7565 Py_DECREF(sep_obj);
7566 Py_DECREF(str_obj);
7568 return out;
7571 PyDoc_STRVAR(partition__doc__,
7572 "S.partition(sep) -> (head, sep, tail)\n\
7574 Searches for the separator sep in S, and returns the part before it,\n\
7575 the separator itself, and the part after it. If the separator is not\n\
7576 found, returns S and two empty strings.");
7578 static PyObject*
7579 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7581 return PyUnicode_Partition((PyObject *)self, separator);
7584 PyDoc_STRVAR(rpartition__doc__,
7585 "S.rpartition(sep) -> (tail, sep, head)\n\
7587 Searches for the separator sep in S, starting at the end of S, and returns\n\
7588 the part before it, the separator itself, and the part after it. If the\n\
7589 separator is not found, returns two empty strings and S.");
7591 static PyObject*
7592 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7594 return PyUnicode_RPartition((PyObject *)self, separator);
7597 PyObject *PyUnicode_RSplit(PyObject *s,
7598 PyObject *sep,
7599 Py_ssize_t maxsplit)
7601 PyObject *result;
7603 s = PyUnicode_FromObject(s);
7604 if (s == NULL)
7605 return NULL;
7606 if (sep != NULL) {
7607 sep = PyUnicode_FromObject(sep);
7608 if (sep == NULL) {
7609 Py_DECREF(s);
7610 return NULL;
7614 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7616 Py_DECREF(s);
7617 Py_XDECREF(sep);
7618 return result;
7621 PyDoc_STRVAR(rsplit__doc__,
7622 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7624 Return a list of the words in S, using sep as the\n\
7625 delimiter string, starting at the end of the string and\n\
7626 working to the front. If maxsplit is given, at most maxsplit\n\
7627 splits are done. If sep is not specified, any whitespace string\n\
7628 is a separator.");
7630 static PyObject*
7631 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7633 PyObject *substring = Py_None;
7634 Py_ssize_t maxcount = -1;
7636 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7637 return NULL;
7639 if (substring == Py_None)
7640 return rsplit(self, NULL, maxcount);
7641 else if (PyUnicode_Check(substring))
7642 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7643 else
7644 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7647 PyDoc_STRVAR(splitlines__doc__,
7648 "S.splitlines([keepends]]) -> list of strings\n\
7650 Return a list of the lines in S, breaking at line boundaries.\n\
7651 Line breaks are not included in the resulting list unless keepends\n\
7652 is given and true.");
7654 static PyObject*
7655 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7657 int keepends = 0;
7659 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7660 return NULL;
7662 return PyUnicode_Splitlines((PyObject *)self, keepends);
7665 static
7666 PyObject *unicode_str(PyUnicodeObject *self)
7668 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7671 PyDoc_STRVAR(swapcase__doc__,
7672 "S.swapcase() -> unicode\n\
7674 Return a copy of S with uppercase characters converted to lowercase\n\
7675 and vice versa.");
7677 static PyObject*
7678 unicode_swapcase(PyUnicodeObject *self)
7680 return fixup(self, fixswapcase);
7683 PyDoc_STRVAR(translate__doc__,
7684 "S.translate(table) -> unicode\n\
7686 Return a copy of the string S, where all characters have been mapped\n\
7687 through the given translation table, which must be a mapping of\n\
7688 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7689 Unmapped characters are left untouched. Characters mapped to None\n\
7690 are deleted.");
7692 static PyObject*
7693 unicode_translate(PyUnicodeObject *self, PyObject *table)
7695 return PyUnicode_TranslateCharmap(self->str,
7696 self->length,
7697 table,
7698 "ignore");
7701 PyDoc_STRVAR(upper__doc__,
7702 "S.upper() -> unicode\n\
7704 Return a copy of S converted to uppercase.");
7706 static PyObject*
7707 unicode_upper(PyUnicodeObject *self)
7709 return fixup(self, fixupper);
7712 PyDoc_STRVAR(zfill__doc__,
7713 "S.zfill(width) -> unicode\n\
7715 Pad a numeric string x with zeros on the left, to fill a field\n\
7716 of the specified width. The string x is never truncated.");
7718 static PyObject *
7719 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7721 Py_ssize_t fill;
7722 PyUnicodeObject *u;
7724 Py_ssize_t width;
7725 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7726 return NULL;
7728 if (self->length >= width) {
7729 if (PyUnicode_CheckExact(self)) {
7730 Py_INCREF(self);
7731 return (PyObject*) self;
7733 else
7734 return PyUnicode_FromUnicode(
7735 PyUnicode_AS_UNICODE(self),
7736 PyUnicode_GET_SIZE(self)
7740 fill = width - self->length;
7742 u = pad(self, fill, 0, '0');
7744 if (u == NULL)
7745 return NULL;
7747 if (u->str[fill] == '+' || u->str[fill] == '-') {
7748 /* move sign to beginning of string */
7749 u->str[0] = u->str[fill];
7750 u->str[fill] = '0';
7753 return (PyObject*) u;
#if 0
/* Debugging aid (compiled out): return numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7764 PyDoc_STRVAR(startswith__doc__,
7765 "S.startswith(prefix[, start[, end]]) -> bool\n\
7767 Return True if S starts with the specified prefix, False otherwise.\n\
7768 With optional start, test S beginning at that position.\n\
7769 With optional end, stop comparing S at that position.\n\
7770 prefix can also be a tuple of strings to try.");
7772 static PyObject *
7773 unicode_startswith(PyUnicodeObject *self,
7774 PyObject *args)
7776 PyObject *subobj;
7777 PyUnicodeObject *substring;
7778 Py_ssize_t start = 0;
7779 Py_ssize_t end = PY_SSIZE_T_MAX;
7780 int result;
7782 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7783 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7784 return NULL;
7785 if (PyTuple_Check(subobj)) {
7786 Py_ssize_t i;
7787 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7788 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7789 PyTuple_GET_ITEM(subobj, i));
7790 if (substring == NULL)
7791 return NULL;
7792 result = tailmatch(self, substring, start, end, -1);
7793 Py_DECREF(substring);
7794 if (result) {
7795 Py_RETURN_TRUE;
7798 /* nothing matched */
7799 Py_RETURN_FALSE;
7801 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7802 if (substring == NULL)
7803 return NULL;
7804 result = tailmatch(self, substring, start, end, -1);
7805 Py_DECREF(substring);
7806 return PyBool_FromLong(result);
7810 PyDoc_STRVAR(endswith__doc__,
7811 "S.endswith(suffix[, start[, end]]) -> bool\n\
7813 Return True if S ends with the specified suffix, False otherwise.\n\
7814 With optional start, test S beginning at that position.\n\
7815 With optional end, stop comparing S at that position.\n\
7816 suffix can also be a tuple of strings to try.");
7818 static PyObject *
7819 unicode_endswith(PyUnicodeObject *self,
7820 PyObject *args)
7822 PyObject *subobj;
7823 PyUnicodeObject *substring;
7824 Py_ssize_t start = 0;
7825 Py_ssize_t end = PY_SSIZE_T_MAX;
7826 int result;
7828 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7829 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7830 return NULL;
7831 if (PyTuple_Check(subobj)) {
7832 Py_ssize_t i;
7833 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7834 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7835 PyTuple_GET_ITEM(subobj, i));
7836 if (substring == NULL)
7837 return NULL;
7838 result = tailmatch(self, substring, start, end, +1);
7839 Py_DECREF(substring);
7840 if (result) {
7841 Py_RETURN_TRUE;
7844 Py_RETURN_FALSE;
7846 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7847 if (substring == NULL)
7848 return NULL;
7850 result = tailmatch(self, substring, start, end, +1);
7851 Py_DECREF(substring);
7852 return PyBool_FromLong(result);
7856 /* Implements do_string_format, which is unicode because of stringlib */
7857 #include "stringlib/string_format.h"
7859 PyDoc_STRVAR(format__doc__,
7860 "S.format(*args, **kwargs) -> unicode\n\
7864 static PyObject *
7865 unicode__format__(PyObject *self, PyObject *args)
7867 PyObject *format_spec;
7868 PyObject *result = NULL;
7869 PyObject *tmp = NULL;
7871 /* If 2.x, convert format_spec to the same type as value */
7872 /* This is to allow things like u''.format('') */
7873 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7874 goto done;
7875 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7876 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7877 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7878 goto done;
7880 tmp = PyObject_Unicode(format_spec);
7881 if (tmp == NULL)
7882 goto done;
7883 format_spec = tmp;
7885 result = _PyUnicode_FormatAdvanced(self,
7886 PyUnicode_AS_UNICODE(format_spec),
7887 PyUnicode_GET_SIZE(format_spec));
7888 done:
7889 Py_XDECREF(tmp);
7890 return result;
7893 PyDoc_STRVAR(p_format__doc__,
7894 "S.__format__(format_spec) -> unicode\n\
7898 static PyObject *
7899 unicode__sizeof__(PyUnicodeObject *v)
7901 PyObject *res = NULL, *defsize = NULL;
7903 res = PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7904 sizeof(Py_UNICODE) * (v->length + 1));
7905 if (v->defenc) {
7906 defsize = PyObject_CallMethod(v->defenc, "__sizeof__", NULL);
7907 if (defsize == NULL) {
7908 Py_DECREF(res);
7909 return NULL;
7911 res = PyNumber_Add(res, defsize);
7912 Py_DECREF(defsize);
7914 return res;
7917 PyDoc_STRVAR(sizeof__doc__,
7918 "S.__sizeof__() -> size of S in memory, in bytes\n\
7922 static PyObject *
7923 unicode_getnewargs(PyUnicodeObject *v)
7925 return Py_BuildValue("(u#)", v->str, v->length);
/* Method table of the unicode type.  Each entry gives the Python-visible
   method name, the C function implementing it, the calling-convention flags
   (METH_NOARGS, METH_O, METH_VARARGS, optionally METH_KEYWORDS) and, where
   present, the docstring.  The table ends with a {NULL, NULL} sentinel.
   Entries inside "#if 0" are compiled out. */
7929 static PyMethodDef unicode_methods[] = {
7931 /* Order is according to common usage: often used methods should
7932 appear first, since lookup is done sequentially. */
7934 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7935 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7936 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7937 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7938 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7939 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7940 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7941 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7942 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7943 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7944 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7945 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7946 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7947 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7948 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7949 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7950 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7951 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7952 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7953 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7954 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7955 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7956 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7957 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7958 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7959 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7960 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7961 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7962 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7963 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7964 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7965 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7966 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7967 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7968 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7969 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7970 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7971 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7972 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7973 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7974 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7975 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7976 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7977 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7978 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7979 #if 0
7980 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7981 #endif
7983 #if 0
7984 /* This one is just used for debugging the implementation. */
7985 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7986 #endif
7988 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7989 {NULL, NULL}
7992 static PyObject *
7993 unicode_mod(PyObject *v, PyObject *w)
7995 if (!PyUnicode_Check(v)) {
7996 Py_INCREF(Py_NotImplemented);
7997 return Py_NotImplemented;
7999 return PyUnicode_Format(v, w);
/* Number protocol slots of the unicode type.  Of the slots listed here only
   nb_remainder is filled in; it points at unicode_mod, the others are 0. */
8002 static PyNumberMethods unicode_as_number = {
8003 0, /*nb_add*/
8004 0, /*nb_subtract*/
8005 0, /*nb_multiply*/
8006 0, /*nb_divide*/
8007 unicode_mod, /*nb_remainder*/
/* Sequence protocol slots of the unicode type.  Length, concatenation,
   repetition, item access, slicing and containment are provided; the two
   assignment slots (sq_ass_item, sq_ass_slice) are left as 0. */
8010 static PySequenceMethods unicode_as_sequence = {
8011 (lenfunc) unicode_length, /* sq_length */
8012 PyUnicode_Concat, /* sq_concat */
8013 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8014 (ssizeargfunc) unicode_getitem, /* sq_item */
8015 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8016 0, /* sq_ass_item */
8017 0, /* sq_ass_slice */
8018 PyUnicode_Contains, /* sq_contains */
8021 static PyObject*
8022 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8024 if (PyIndex_Check(item)) {
8025 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8026 if (i == -1 && PyErr_Occurred())
8027 return NULL;
8028 if (i < 0)
8029 i += PyUnicode_GET_SIZE(self);
8030 return unicode_getitem(self, i);
8031 } else if (PySlice_Check(item)) {
8032 Py_ssize_t start, stop, step, slicelength, cur, i;
8033 Py_UNICODE* source_buf;
8034 Py_UNICODE* result_buf;
8035 PyObject* result;
8037 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8038 &start, &stop, &step, &slicelength) < 0) {
8039 return NULL;
8042 if (slicelength <= 0) {
8043 return PyUnicode_FromUnicode(NULL, 0);
8044 } else if (start == 0 && step == 1 && slicelength == self->length &&
8045 PyUnicode_CheckExact(self)) {
8046 Py_INCREF(self);
8047 return (PyObject *)self;
8048 } else if (step == 1) {
8049 return PyUnicode_FromUnicode(self->str + start, slicelength);
8050 } else {
8051 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8052 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8053 sizeof(Py_UNICODE));
8055 if (result_buf == NULL)
8056 return PyErr_NoMemory();
8058 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8059 result_buf[i] = source_buf[cur];
8062 result = PyUnicode_FromUnicode(result_buf, slicelength);
8063 PyObject_FREE(result_buf);
8064 return result;
8066 } else {
8067 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8068 return NULL;
/* Mapping protocol slots of the unicode type: length and subscripting
   (unicode_subscript) are provided; subscript assignment is left as 0. */
8072 static PyMappingMethods unicode_as_mapping = {
8073 (lenfunc)unicode_length, /* mp_length */
8074 (binaryfunc)unicode_subscript, /* mp_subscript */
8075 (objobjargproc)0, /* mp_ass_subscript */
8078 static Py_ssize_t
8079 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8080 Py_ssize_t index,
8081 const void **ptr)
8083 if (index != 0) {
8084 PyErr_SetString(PyExc_SystemError,
8085 "accessing non-existent unicode segment");
8086 return -1;
8088 *ptr = (void *) self->str;
8089 return PyUnicode_GET_DATA_SIZE(self);
8092 static Py_ssize_t
8093 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8094 const void **ptr)
8096 PyErr_SetString(PyExc_TypeError,
8097 "cannot use unicode as modifiable buffer");
8098 return -1;
8101 static int
8102 unicode_buffer_getsegcount(PyUnicodeObject *self,
8103 Py_ssize_t *lenp)
8105 if (lenp)
8106 *lenp = PyUnicode_GET_DATA_SIZE(self);
8107 return 1;
8110 static Py_ssize_t
8111 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8112 Py_ssize_t index,
8113 const void **ptr)
8115 PyObject *str;
8117 if (index != 0) {
8118 PyErr_SetString(PyExc_SystemError,
8119 "accessing non-existent unicode segment");
8120 return -1;
8122 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8123 if (str == NULL)
8124 return -1;
8125 *ptr = (void *) PyString_AS_STRING(str);
8126 return PyString_GET_SIZE(str);
8129 /* Helpers for PyUnicode_Format() */
8131 static PyObject *
8132 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8134 Py_ssize_t argidx = *p_argidx;
8135 if (argidx < arglen) {
8136 (*p_argidx)++;
8137 if (arglen < 0)
8138 return args;
8139 else
8140 return PyTuple_GetItem(args, argidx);
8142 PyErr_SetString(PyExc_TypeError,
8143 "not enough arguments for format string");
8144 return NULL;
/* Conversion flag bits collected by PyUnicode_Format() while it parses a
   format specifier: '-' sets F_LJUST (as does a negative '*' width),
   '+' sets F_SIGN, ' ' sets F_BLANK, '#' sets F_ALT and '0' sets F_ZERO. */
8147 #define F_LJUST (1<<0)
8148 #define F_SIGN (1<<1)
8149 #define F_BLANK (1<<2)
8150 #define F_ALT (1<<3)
8151 #define F_ZERO (1<<4)
8153 static Py_ssize_t
8154 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8156 register Py_ssize_t i;
8157 Py_ssize_t len = strlen(charbuffer);
8158 for (i = len - 1; i >= 0; i--)
8159 buffer[i] = (Py_UNICODE) charbuffer[i];
8161 return len;
8164 static int
8165 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8167 Py_ssize_t result;
8169 PyOS_ascii_formatd((char *)buffer, len, format, x);
8170 result = strtounicode(buffer, (char *)buffer);
8171 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8174 static int
8175 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8177 Py_ssize_t result;
8179 PyOS_snprintf((char *)buffer, len, format, x);
8180 result = strtounicode(buffer, (char *)buffer);
8181 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8184 /* XXX To save some code duplication, formatfloat/long/int could have been
8185 shared with stringobject.c, converting from 8-bit to Unicode after the
8186 formatting is done. */
8188 static int
8189 formatfloat(Py_UNICODE *buf,
8190 size_t buflen,
8191 int flags,
8192 int prec,
8193 int type,
8194 PyObject *v)
8196 /* fmt = '%#.' + `prec` + `type`
8197 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8198 char fmt[20];
8199 double x;
8201 x = PyFloat_AsDouble(v);
8202 if (x == -1.0 && PyErr_Occurred())
8203 return -1;
8204 if (prec < 0)
8205 prec = 6;
8206 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8207 type = 'g';
8208 /* Worst case length calc to ensure no buffer overrun:
8210 'g' formats:
8211 fmt = %#.<prec>g
8212 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8213 for any double rep.)
8214 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8216 'f' formats:
8217 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8218 len = 1 + 50 + 1 + prec = 52 + prec
8220 If prec=0 the effective precision is 1 (the leading digit is
8221 always given), therefore increase the length by one.
8224 if (((type == 'g' || type == 'G') &&
8225 buflen <= (size_t)10 + (size_t)prec) ||
8226 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8227 PyErr_SetString(PyExc_OverflowError,
8228 "formatted float is too long (precision too large?)");
8229 return -1;
8231 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8232 (flags&F_ALT) ? "#" : "",
8233 prec, type);
8234 return doubletounicode(buf, buflen, fmt, x);
8237 static PyObject*
8238 formatlong(PyObject *val, int flags, int prec, int type)
8240 char *buf;
8241 int i, len;
8242 PyObject *str; /* temporary string object. */
8243 PyUnicodeObject *result;
8245 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8246 if (!str)
8247 return NULL;
8248 result = _PyUnicode_New(len);
8249 if (!result) {
8250 Py_DECREF(str);
8251 return NULL;
8253 for (i = 0; i < len; i++)
8254 result->str[i] = buf[i];
8255 result->str[len] = 0;
8256 Py_DECREF(str);
8257 return (PyObject*)result;
8260 static int
8261 formatint(Py_UNICODE *buf,
8262 size_t buflen,
8263 int flags,
8264 int prec,
8265 int type,
8266 PyObject *v)
8268 /* fmt = '%#.' + `prec` + 'l' + `type`
8269 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8270 * + 1 + 1
8271 * = 24
8273 char fmt[64]; /* plenty big enough! */
8274 char *sign;
8275 long x;
8277 x = PyInt_AsLong(v);
8278 if (x == -1 && PyErr_Occurred())
8279 return -1;
8280 if (x < 0 && type == 'u') {
8281 type = 'd';
8283 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8284 sign = "-";
8285 else
8286 sign = "";
8287 if (prec < 0)
8288 prec = 1;
8290 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8291 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8293 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8294 PyErr_SetString(PyExc_OverflowError,
8295 "formatted integer is too long (precision too large?)");
8296 return -1;
8299 if ((flags & F_ALT) &&
8300 (type == 'x' || type == 'X')) {
8301 /* When converting under %#x or %#X, there are a number
8302 * of issues that cause pain:
8303 * - when 0 is being converted, the C standard leaves off
8304 * the '0x' or '0X', which is inconsistent with other
8305 * %#x/%#X conversions and inconsistent with Python's
8306 * hex() function
8307 * - there are platforms that violate the standard and
8308 * convert 0 with the '0x' or '0X'
8309 * (Metrowerks, Compaq Tru64)
8310 * - there are platforms that give '0x' when converting
8311 * under %#X, but convert 0 in accordance with the
8312 * standard (OS/2 EMX)
8314 * We can achieve the desired consistency by inserting our
8315 * own '0x' or '0X' prefix, and substituting %x/%X in place
8316 * of %#x/%#X.
8318 * Note that this is the same approach as used in
8319 * formatint() in stringobject.c
8321 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8322 sign, type, prec, type);
8324 else {
8325 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8326 sign, (flags&F_ALT) ? "#" : "",
8327 prec, type);
8329 if (sign[0])
8330 return longtounicode(buf, buflen, fmt, -x);
8331 else
8332 return longtounicode(buf, buflen, fmt, x);
8335 static int
8336 formatchar(Py_UNICODE *buf,
8337 size_t buflen,
8338 PyObject *v)
8340 /* presume that the buffer is at least 2 characters long */
8341 if (PyUnicode_Check(v)) {
8342 if (PyUnicode_GET_SIZE(v) != 1)
8343 goto onError;
8344 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8347 else if (PyString_Check(v)) {
8348 if (PyString_GET_SIZE(v) != 1)
8349 goto onError;
8350 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8353 else {
8354 /* Integer input truncated to a character */
8355 long x;
8356 x = PyInt_AsLong(v);
8357 if (x == -1 && PyErr_Occurred())
8358 goto onError;
8359 #ifdef Py_UNICODE_WIDE
8360 if (x < 0 || x > 0x10ffff) {
8361 PyErr_SetString(PyExc_OverflowError,
8362 "%c arg not in range(0x110000) "
8363 "(wide Python build)");
8364 return -1;
8366 #else
8367 if (x < 0 || x > 0xffff) {
8368 PyErr_SetString(PyExc_OverflowError,
8369 "%c arg not in range(0x10000) "
8370 "(narrow Python build)");
8371 return -1;
8373 #endif
8374 buf[0] = (Py_UNICODE) x;
8376 buf[1] = '\0';
8377 return 1;
8379 onError:
8380 PyErr_SetString(PyExc_TypeError,
8381 "%c requires int or char");
8382 return -1;
8385 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8387 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8388 chars are formatted. XXX This is a magic number. Each formatting
8389 routine does bounds checking to ensure no overflow, but a better
8390 solution may be to malloc a buffer of appropriate size for each
8391 format. For now, the current solution is sufficient.
8393 #define FORMATBUFLEN (size_t)120
8395 PyObject *PyUnicode_Format(PyObject *format,
8396 PyObject *args)
8398 Py_UNICODE *fmt, *res;
8399 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8400 int args_owned = 0;
8401 PyUnicodeObject *result = NULL;
8402 PyObject *dict = NULL;
8403 PyObject *uformat;
8405 if (format == NULL || args == NULL) {
8406 PyErr_BadInternalCall();
8407 return NULL;
8409 uformat = PyUnicode_FromObject(format);
8410 if (uformat == NULL)
8411 return NULL;
8412 fmt = PyUnicode_AS_UNICODE(uformat);
8413 fmtcnt = PyUnicode_GET_SIZE(uformat);
8415 reslen = rescnt = fmtcnt + 100;
8416 result = _PyUnicode_New(reslen);
8417 if (result == NULL)
8418 goto onError;
8419 res = PyUnicode_AS_UNICODE(result);
8421 if (PyTuple_Check(args)) {
8422 arglen = PyTuple_Size(args);
8423 argidx = 0;
8425 else {
8426 arglen = -1;
8427 argidx = -2;
8429 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8430 !PyObject_TypeCheck(args, &PyBaseString_Type))
8431 dict = args;
8433 while (--fmtcnt >= 0) {
8434 if (*fmt != '%') {
8435 if (--rescnt < 0) {
8436 rescnt = fmtcnt + 100;
8437 reslen += rescnt;
8438 if (_PyUnicode_Resize(&result, reslen) < 0)
8439 goto onError;
8440 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8441 --rescnt;
8443 *res++ = *fmt++;
8445 else {
8446 /* Got a format specifier */
8447 int flags = 0;
8448 Py_ssize_t width = -1;
8449 int prec = -1;
8450 Py_UNICODE c = '\0';
8451 Py_UNICODE fill;
8452 int isnumok;
8453 PyObject *v = NULL;
8454 PyObject *temp = NULL;
8455 Py_UNICODE *pbuf;
8456 Py_UNICODE sign;
8457 Py_ssize_t len;
8458 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8460 fmt++;
8461 if (*fmt == '(') {
8462 Py_UNICODE *keystart;
8463 Py_ssize_t keylen;
8464 PyObject *key;
8465 int pcount = 1;
8467 if (dict == NULL) {
8468 PyErr_SetString(PyExc_TypeError,
8469 "format requires a mapping");
8470 goto onError;
8472 ++fmt;
8473 --fmtcnt;
8474 keystart = fmt;
8475 /* Skip over balanced parentheses */
8476 while (pcount > 0 && --fmtcnt >= 0) {
8477 if (*fmt == ')')
8478 --pcount;
8479 else if (*fmt == '(')
8480 ++pcount;
8481 fmt++;
8483 keylen = fmt - keystart - 1;
8484 if (fmtcnt < 0 || pcount > 0) {
8485 PyErr_SetString(PyExc_ValueError,
8486 "incomplete format key");
8487 goto onError;
8489 #if 0
8490 /* keys are converted to strings using UTF-8 and
8491 then looked up since Python uses strings to hold
8492 variables names etc. in its namespaces and we
8493 wouldn't want to break common idioms. */
8494 key = PyUnicode_EncodeUTF8(keystart,
8495 keylen,
8496 NULL);
8497 #else
8498 key = PyUnicode_FromUnicode(keystart, keylen);
8499 #endif
8500 if (key == NULL)
8501 goto onError;
8502 if (args_owned) {
8503 Py_DECREF(args);
8504 args_owned = 0;
8506 args = PyObject_GetItem(dict, key);
8507 Py_DECREF(key);
8508 if (args == NULL) {
8509 goto onError;
8511 args_owned = 1;
8512 arglen = -1;
8513 argidx = -2;
8515 while (--fmtcnt >= 0) {
8516 switch (c = *fmt++) {
8517 case '-': flags |= F_LJUST; continue;
8518 case '+': flags |= F_SIGN; continue;
8519 case ' ': flags |= F_BLANK; continue;
8520 case '#': flags |= F_ALT; continue;
8521 case '0': flags |= F_ZERO; continue;
8523 break;
8525 if (c == '*') {
8526 v = getnextarg(args, arglen, &argidx);
8527 if (v == NULL)
8528 goto onError;
8529 if (!PyInt_Check(v)) {
8530 PyErr_SetString(PyExc_TypeError,
8531 "* wants int");
8532 goto onError;
8534 width = PyInt_AsLong(v);
8535 if (width < 0) {
8536 flags |= F_LJUST;
8537 width = -width;
8539 if (--fmtcnt >= 0)
8540 c = *fmt++;
8542 else if (c >= '0' && c <= '9') {
8543 width = c - '0';
8544 while (--fmtcnt >= 0) {
8545 c = *fmt++;
8546 if (c < '0' || c > '9')
8547 break;
8548 if ((width*10) / 10 != width) {
8549 PyErr_SetString(PyExc_ValueError,
8550 "width too big");
8551 goto onError;
8553 width = width*10 + (c - '0');
8556 if (c == '.') {
8557 prec = 0;
8558 if (--fmtcnt >= 0)
8559 c = *fmt++;
8560 if (c == '*') {
8561 v = getnextarg(args, arglen, &argidx);
8562 if (v == NULL)
8563 goto onError;
8564 if (!PyInt_Check(v)) {
8565 PyErr_SetString(PyExc_TypeError,
8566 "* wants int");
8567 goto onError;
8569 prec = PyInt_AsLong(v);
8570 if (prec < 0)
8571 prec = 0;
8572 if (--fmtcnt >= 0)
8573 c = *fmt++;
8575 else if (c >= '0' && c <= '9') {
8576 prec = c - '0';
8577 while (--fmtcnt >= 0) {
8578 c = Py_CHARMASK(*fmt++);
8579 if (c < '0' || c > '9')
8580 break;
8581 if ((prec*10) / 10 != prec) {
8582 PyErr_SetString(PyExc_ValueError,
8583 "prec too big");
8584 goto onError;
8586 prec = prec*10 + (c - '0');
8589 } /* prec */
8590 if (fmtcnt >= 0) {
8591 if (c == 'h' || c == 'l' || c == 'L') {
8592 if (--fmtcnt >= 0)
8593 c = *fmt++;
8596 if (fmtcnt < 0) {
8597 PyErr_SetString(PyExc_ValueError,
8598 "incomplete format");
8599 goto onError;
8601 if (c != '%') {
8602 v = getnextarg(args, arglen, &argidx);
8603 if (v == NULL)
8604 goto onError;
8606 sign = 0;
8607 fill = ' ';
8608 switch (c) {
8610 case '%':
8611 pbuf = formatbuf;
8612 /* presume that buffer length is at least 1 */
8613 pbuf[0] = '%';
8614 len = 1;
8615 break;
8617 case 's':
8618 case 'r':
8619 if (PyUnicode_Check(v) && c == 's') {
8620 temp = v;
8621 Py_INCREF(temp);
8623 else {
8624 PyObject *unicode;
8625 if (c == 's')
8626 temp = PyObject_Unicode(v);
8627 else
8628 temp = PyObject_Repr(v);
8629 if (temp == NULL)
8630 goto onError;
8631 if (PyUnicode_Check(temp))
8632 /* nothing to do */;
8633 else if (PyString_Check(temp)) {
8634 /* convert to string to Unicode */
8635 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8636 PyString_GET_SIZE(temp),
8637 NULL,
8638 "strict");
8639 Py_DECREF(temp);
8640 temp = unicode;
8641 if (temp == NULL)
8642 goto onError;
8644 else {
8645 Py_DECREF(temp);
8646 PyErr_SetString(PyExc_TypeError,
8647 "%s argument has non-string str()");
8648 goto onError;
8651 pbuf = PyUnicode_AS_UNICODE(temp);
8652 len = PyUnicode_GET_SIZE(temp);
8653 if (prec >= 0 && len > prec)
8654 len = prec;
8655 break;
8657 case 'i':
8658 case 'd':
8659 case 'u':
8660 case 'o':
8661 case 'x':
8662 case 'X':
8663 if (c == 'i')
8664 c = 'd';
8665 isnumok = 0;
8666 if (PyNumber_Check(v)) {
8667 PyObject *iobj=NULL;
8669 if (PyInt_Check(v) || (PyLong_Check(v))) {
8670 iobj = v;
8671 Py_INCREF(iobj);
8673 else {
8674 iobj = PyNumber_Int(v);
8675 if (iobj==NULL) iobj = PyNumber_Long(v);
8677 if (iobj!=NULL) {
8678 if (PyInt_Check(iobj)) {
8679 isnumok = 1;
8680 pbuf = formatbuf;
8681 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8682 flags, prec, c, iobj);
8683 Py_DECREF(iobj);
8684 if (len < 0)
8685 goto onError;
8686 sign = 1;
8688 else if (PyLong_Check(iobj)) {
8689 isnumok = 1;
8690 temp = formatlong(iobj, flags, prec, c);
8691 Py_DECREF(iobj);
8692 if (!temp)
8693 goto onError;
8694 pbuf = PyUnicode_AS_UNICODE(temp);
8695 len = PyUnicode_GET_SIZE(temp);
8696 sign = 1;
8698 else {
8699 Py_DECREF(iobj);
8703 if (!isnumok) {
8704 PyErr_Format(PyExc_TypeError,
8705 "%%%c format: a number is required, "
8706 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8707 goto onError;
8709 if (flags & F_ZERO)
8710 fill = '0';
8711 break;
8713 case 'e':
8714 case 'E':
8715 case 'f':
8716 case 'F':
8717 case 'g':
8718 case 'G':
8719 if (c == 'F')
8720 c = 'f';
8721 pbuf = formatbuf;
8722 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8723 flags, prec, c, v);
8724 if (len < 0)
8725 goto onError;
8726 sign = 1;
8727 if (flags & F_ZERO)
8728 fill = '0';
8729 break;
8731 case 'c':
8732 pbuf = formatbuf;
8733 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8734 if (len < 0)
8735 goto onError;
8736 break;
8738 default:
8739 PyErr_Format(PyExc_ValueError,
8740 "unsupported format character '%c' (0x%x) "
8741 "at index %zd",
8742 (31<=c && c<=126) ? (char)c : '?',
8743 (int)c,
8744 (Py_ssize_t)(fmt - 1 -
8745 PyUnicode_AS_UNICODE(uformat)));
8746 goto onError;
8748 if (sign) {
8749 if (*pbuf == '-' || *pbuf == '+') {
8750 sign = *pbuf++;
8751 len--;
8753 else if (flags & F_SIGN)
8754 sign = '+';
8755 else if (flags & F_BLANK)
8756 sign = ' ';
8757 else
8758 sign = 0;
8760 if (width < len)
8761 width = len;
8762 if (rescnt - (sign != 0) < width) {
8763 reslen -= rescnt;
8764 rescnt = width + fmtcnt + 100;
8765 reslen += rescnt;
8766 if (reslen < 0) {
8767 Py_XDECREF(temp);
8768 PyErr_NoMemory();
8769 goto onError;
8771 if (_PyUnicode_Resize(&result, reslen) < 0) {
8772 Py_XDECREF(temp);
8773 goto onError;
8775 res = PyUnicode_AS_UNICODE(result)
8776 + reslen - rescnt;
8778 if (sign) {
8779 if (fill != ' ')
8780 *res++ = sign;
8781 rescnt--;
8782 if (width > len)
8783 width--;
8785 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8786 assert(pbuf[0] == '0');
8787 assert(pbuf[1] == c);
8788 if (fill != ' ') {
8789 *res++ = *pbuf++;
8790 *res++ = *pbuf++;
8792 rescnt -= 2;
8793 width -= 2;
8794 if (width < 0)
8795 width = 0;
8796 len -= 2;
8798 if (width > len && !(flags & F_LJUST)) {
8799 do {
8800 --rescnt;
8801 *res++ = fill;
8802 } while (--width > len);
8804 if (fill == ' ') {
8805 if (sign)
8806 *res++ = sign;
8807 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8808 assert(pbuf[0] == '0');
8809 assert(pbuf[1] == c);
8810 *res++ = *pbuf++;
8811 *res++ = *pbuf++;
8814 Py_UNICODE_COPY(res, pbuf, len);
8815 res += len;
8816 rescnt -= len;
8817 while (--width >= len) {
8818 --rescnt;
8819 *res++ = ' ';
8821 if (dict && (argidx < arglen) && c != '%') {
8822 PyErr_SetString(PyExc_TypeError,
8823 "not all arguments converted during string formatting");
8824 Py_XDECREF(temp);
8825 goto onError;
8827 Py_XDECREF(temp);
8828 } /* '%' */
8829 } /* until end */
8830 if (argidx < arglen && !dict) {
8831 PyErr_SetString(PyExc_TypeError,
8832 "not all arguments converted during string formatting");
8833 goto onError;
8836 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8837 goto onError;
8838 if (args_owned) {
8839 Py_DECREF(args);
8841 Py_DECREF(uformat);
8842 return (PyObject *)result;
8844 onError:
8845 Py_XDECREF(result);
8846 Py_DECREF(uformat);
8847 if (args_owned) {
8848 Py_DECREF(args);
8850 return NULL;
8853 static PyBufferProcs unicode_as_buffer = {
8854 (readbufferproc) unicode_buffer_getreadbuf,
8855 (writebufferproc) unicode_buffer_getwritebuf,
8856 (segcountproc) unicode_buffer_getsegcount,
8857 (charbufferproc) unicode_buffer_getcharbuf,
8860 static PyObject *
8861 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8863 static PyObject *
8864 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8866 PyObject *x = NULL;
8867 static char *kwlist[] = {"string", "encoding", "errors", 0};
8868 char *encoding = NULL;
8869 char *errors = NULL;
8871 if (type != &PyUnicode_Type)
8872 return unicode_subtype_new(type, args, kwds);
8873 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8874 kwlist, &x, &encoding, &errors))
8875 return NULL;
8876 if (x == NULL)
8877 return (PyObject *)_PyUnicode_New(0);
8878 if (encoding == NULL && errors == NULL)
8879 return PyObject_Unicode(x);
8880 else
8881 return PyUnicode_FromEncodedObject(x, encoding, errors);
8884 static PyObject *
8885 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8887 PyUnicodeObject *tmp, *pnew;
8888 Py_ssize_t n;
8890 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8891 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8892 if (tmp == NULL)
8893 return NULL;
8894 assert(PyUnicode_Check(tmp));
8895 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8896 if (pnew == NULL) {
8897 Py_DECREF(tmp);
8898 return NULL;
8900 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8901 if (pnew->str == NULL) {
8902 _Py_ForgetReference((PyObject *)pnew);
8903 PyObject_Del(pnew);
8904 Py_DECREF(tmp);
8905 return PyErr_NoMemory();
8907 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8908 pnew->length = n;
8909 pnew->hash = tmp->hash;
8910 Py_DECREF(tmp);
8911 return (PyObject *)pnew;
8914 PyDoc_STRVAR(unicode_doc,
8915 "unicode(string [, encoding[, errors]]) -> object\n\
8917 Create a new Unicode object from the given encoded string.\n\
8918 encoding defaults to the current default string encoding.\n\
8919 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8921 PyTypeObject PyUnicode_Type = {
8922 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8923 "unicode", /* tp_name */
8924 sizeof(PyUnicodeObject), /* tp_size */
8925 0, /* tp_itemsize */
8926 /* Slots */
8927 (destructor)unicode_dealloc, /* tp_dealloc */
8928 0, /* tp_print */
8929 0, /* tp_getattr */
8930 0, /* tp_setattr */
8931 0, /* tp_compare */
8932 unicode_repr, /* tp_repr */
8933 &unicode_as_number, /* tp_as_number */
8934 &unicode_as_sequence, /* tp_as_sequence */
8935 &unicode_as_mapping, /* tp_as_mapping */
8936 (hashfunc) unicode_hash, /* tp_hash*/
8937 0, /* tp_call*/
8938 (reprfunc) unicode_str, /* tp_str */
8939 PyObject_GenericGetAttr, /* tp_getattro */
8940 0, /* tp_setattro */
8941 &unicode_as_buffer, /* tp_as_buffer */
8942 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8943 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8944 unicode_doc, /* tp_doc */
8945 0, /* tp_traverse */
8946 0, /* tp_clear */
8947 PyUnicode_RichCompare, /* tp_richcompare */
8948 0, /* tp_weaklistoffset */
8949 0, /* tp_iter */
8950 0, /* tp_iternext */
8951 unicode_methods, /* tp_methods */
8952 0, /* tp_members */
8953 0, /* tp_getset */
8954 &PyBaseString_Type, /* tp_base */
8955 0, /* tp_dict */
8956 0, /* tp_descr_get */
8957 0, /* tp_descr_set */
8958 0, /* tp_dictoffset */
8959 0, /* tp_init */
8960 0, /* tp_alloc */
8961 unicode_new, /* tp_new */
8962 PyObject_Del, /* tp_free */
8965 /* Initialize the Unicode implementation */
8967 void _PyUnicode_Init(void)
8969 int i;
8971 /* XXX - move this array to unicodectype.c ? */
8972 Py_UNICODE linebreak[] = {
8973 0x000A, /* LINE FEED */
8974 0x000D, /* CARRIAGE RETURN */
8975 0x001C, /* FILE SEPARATOR */
8976 0x001D, /* GROUP SEPARATOR */
8977 0x001E, /* RECORD SEPARATOR */
8978 0x0085, /* NEXT LINE */
8979 0x2028, /* LINE SEPARATOR */
8980 0x2029, /* PARAGRAPH SEPARATOR */
8983 /* Init the implementation */
8984 free_list = NULL;
8985 numfree = 0;
8986 unicode_empty = _PyUnicode_New(0);
8987 if (!unicode_empty)
8988 return;
8990 strcpy(unicode_default_encoding, "ascii");
8991 for (i = 0; i < 256; i++)
8992 unicode_latin1[i] = NULL;
8993 if (PyType_Ready(&PyUnicode_Type) < 0)
8994 Py_FatalError("Can't initialize 'unicode'");
8996 /* initialize the linebreak bloom filter */
8997 bloom_linebreak = make_bloom_mask(
8998 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9001 PyType_Ready(&EncodingMapType);
9004 /* Finalize the Unicode implementation */
9007 PyUnicode_ClearFreeList(void)
9009 int freelist_size = numfree;
9010 PyUnicodeObject *u;
9012 for (u = free_list; u != NULL;) {
9013 PyUnicodeObject *v = u;
9014 u = *(PyUnicodeObject **)u;
9015 if (v->str)
9016 PyObject_DEL(v->str);
9017 Py_XDECREF(v->defenc);
9018 PyObject_Del(v);
9019 numfree--;
9021 free_list = NULL;
9022 assert(numfree == 0);
9023 return freelist_size;
9026 void
9027 _PyUnicode_Fini(void)
9029 int i;
9031 Py_XDECREF(unicode_empty);
9032 unicode_empty = NULL;
9034 for (i = 0; i < 256; i++) {
9035 if (unicode_latin1[i]) {
9036 Py_DECREF(unicode_latin1[i]);
9037 unicode_latin1[i] = NULL;
9040 (void)PyUnicode_ClearFreeList();
9043 #ifdef __cplusplus
9045 #endif
9049 Local variables:
9050 c-basic-offset: 4
9051 indent-tabs-mode: nil
9052 End: