Add better error reporting for MemoryErrors caused by str->float conversions.
[python.git] / Objects / unicodeobject.c
blob3bb797438e7a5d3cbd3c26c4357b8f8fd2041a92
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
/* Upper bound on the number of Unicode objects kept on the free list. */
#define PyUnicode_MAXFREELIST       1024

/* Keep-alive threshold for objects placed on the free list.

   unicode_dealloc() releases the character buffer of a recycled object
   only when the object's length is at least this limit; shorter buffers
   stay attached so that _PyUnicode_New() can reuse them without another
   malloc().

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of idle memory.

   A limit of 0 effectively switches the optimization off.

   Note: the feature is experimental.  If Unicode objects cause core
   dumps, turn it off.
*/
#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined. */
#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
/* Lookup table for the ASCII range: entry c is 1 when c is one of the
   whitespace characters listed next to its row, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
/* Lookup table for the ASCII range: entry c is 1 when c is one of the
   line break characters listed next to its row, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
185 /* --- Bloom Filters ----------------------------------------------------- */
187 /* stuff to implement simple "bloom filters" for Unicode characters.
188 to keep things simple, we use a single bitmask, using the least 5
189 bits from each unicode characters as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
193 #if LONG_BIT >= 128
194 #define BLOOM_WIDTH 128
195 #elif LONG_BIT >= 64
196 #define BLOOM_WIDTH 64
197 #elif LONG_BIT >= 32
198 #define BLOOM_WIDTH 32
199 #else
200 #error "LONG_BIT is smaller than 32"
201 #endif
203 #define BLOOM_MASK unsigned long
205 static BLOOM_MASK bloom_linebreak;
207 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
208 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210 #define BLOOM_LINEBREAK(ch) \
211 ((ch) < 128U ? ascii_linebreak[(ch)] : \
212 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
214 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
216 /* calculate simple bloom-style bitmask for a given unicode string */
218 BLOOM_MASK mask;
219 Py_ssize_t i;
221 mask = 0;
222 for (i = 0; i < len; i++)
223 BLOOM_ADD(mask, ptr[i]);
225 return mask;
228 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
230 Py_ssize_t i;
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
236 return 0;
/* Membership test for chr in set[0..setlen-1]: the bloom mask is checked
   first so that unicode_member() only runs for characters that may match.
   The expansion is parenthesized as a whole so that it can be negated or
   combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
242 /* --- Unicode Object ----------------------------------------------------- */
244 static
245 int unicode_resize(register PyUnicodeObject *unicode,
246 Py_ssize_t length)
248 void *oldstr;
250 /* Shortcut if there's nothing much to do. */
251 if (unicode->length == length)
252 goto reset;
254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
261 unicode_latin1[unicode->str[0]] == unicode)) {
262 PyErr_SetString(PyExc_SystemError,
263 "can't resize shared unicode objects");
264 return -1;
267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
272 oldstr = unicode->str;
273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
275 if (!unicode->str) {
276 unicode->str = (Py_UNICODE *)oldstr;
277 PyErr_NoMemory();
278 return -1;
280 unicode->str[length] = 0;
281 unicode->length = length;
283 reset:
284 /* Reset the object caches */
285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
289 unicode->hash = -1;
291 return 0;
294 /* We allocate one more byte to make sure the string is
295 Ux0000 terminated -- XXX is this needed ?
297 XXX This allocator could further be enhanced by assuring that the
298 free list never reduces its size below 1.
302 static
303 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305 register PyUnicodeObject *unicode;
307 /* Optimization for empty strings */
308 if (length == 0 && unicode_empty != NULL) {
309 Py_INCREF(unicode_empty);
310 return unicode_empty;
313 /* Ensure we won't overflow the size. */
314 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
315 return (PyUnicodeObject *)PyErr_NoMemory();
318 /* Unicode freelist & memory allocation */
319 if (free_list) {
320 unicode = free_list;
321 free_list = *(PyUnicodeObject **)unicode;
322 numfree--;
323 if (unicode->str) {
324 /* Keep-Alive optimization: we only upsize the buffer,
325 never downsize it. */
326 if ((unicode->length < length) &&
327 unicode_resize(unicode, length) < 0) {
328 PyObject_DEL(unicode->str);
329 unicode->str = NULL;
332 else {
333 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
334 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 PyObject_INIT(unicode, &PyUnicode_Type);
338 else {
339 size_t new_size;
340 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
341 if (unicode == NULL)
342 return NULL;
343 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
344 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
347 if (!unicode->str) {
348 PyErr_NoMemory();
349 goto onError;
351 /* Initialize the first element to guard against cases where
352 * the caller fails before initializing str -- unicode_resize()
353 * reads str[0], and the Keep-Alive optimization can keep memory
354 * allocated for str alive across a call to unicode_dealloc(unicode).
355 * We don't want unicode_resize to read uninitialized memory in
356 * that case.
358 unicode->str[0] = 0;
359 unicode->str[length] = 0;
360 unicode->length = length;
361 unicode->hash = -1;
362 unicode->defenc = NULL;
363 return unicode;
365 onError:
366 /* XXX UNREF/NEWREF interface should be more symmetrical */
367 _Py_DEC_REFTOTAL;
368 _Py_ForgetReference((PyObject *)unicode);
369 PyObject_Del(unicode);
370 return NULL;
373 static
374 void unicode_dealloc(register PyUnicodeObject *unicode)
376 if (PyUnicode_CheckExact(unicode) &&
377 numfree < PyUnicode_MAXFREELIST) {
378 /* Keep-Alive optimization */
379 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
380 PyObject_DEL(unicode->str);
381 unicode->str = NULL;
382 unicode->length = 0;
384 if (unicode->defenc) {
385 Py_DECREF(unicode->defenc);
386 unicode->defenc = NULL;
388 /* Add to free list */
389 *(PyUnicodeObject **)unicode = free_list;
390 free_list = unicode;
391 numfree++;
393 else {
394 PyObject_DEL(unicode->str);
395 Py_XDECREF(unicode->defenc);
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
400 static
401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
403 register PyUnicodeObject *v;
405 /* Argument checks */
406 if (unicode == NULL) {
407 PyErr_BadInternalCall();
408 return -1;
410 v = *unicode;
411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
412 PyErr_BadInternalCall();
413 return -1;
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
419 if (v->length != length &&
420 (v == unicode_empty || v->length == 1)) {
421 PyUnicodeObject *w = _PyUnicode_New(length);
422 if (w == NULL)
423 return -1;
424 Py_UNICODE_COPY(w->str, v->str,
425 length < v->length ? length : v->length);
426 Py_DECREF(*unicode);
427 *unicode = w;
428 return 0;
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v, length);
436 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
442 Py_ssize_t size)
444 PyUnicodeObject *unicode;
446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
448 if (u != NULL) {
450 /* Optimization for empty strings */
451 if (size == 0 && unicode_empty != NULL) {
452 Py_INCREF(unicode_empty);
453 return (PyObject *)unicode_empty;
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size == 1 && *u < 256) {
459 unicode = unicode_latin1[*u];
460 if (!unicode) {
461 unicode = _PyUnicode_New(1);
462 if (!unicode)
463 return NULL;
464 unicode->str[0] = *u;
465 unicode_latin1[*u] = unicode;
467 Py_INCREF(unicode);
468 return (PyObject *)unicode;
472 unicode = _PyUnicode_New(size);
473 if (!unicode)
474 return NULL;
476 /* Copy the Unicode data into the new object */
477 if (u != NULL)
478 Py_UNICODE_COPY(unicode->str, u, size);
480 return (PyObject *)unicode;
483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
485 PyUnicodeObject *unicode;
487 if (size < 0) {
488 PyErr_SetString(PyExc_SystemError,
489 "Negative size passed to PyUnicode_FromStringAndSize");
490 return NULL;
493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
497 if (u != NULL) {
499 /* Optimization for empty strings */
500 if (size == 0 && unicode_empty != NULL) {
501 Py_INCREF(unicode_empty);
502 return (PyObject *)unicode_empty;
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size == 1 && Py_CHARMASK(*u) < 128) {
508 unicode = unicode_latin1[Py_CHARMASK(*u)];
509 if (!unicode) {
510 unicode = _PyUnicode_New(1);
511 if (!unicode)
512 return NULL;
513 unicode->str[0] = Py_CHARMASK(*u);
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;
516 Py_INCREF(unicode);
517 return (PyObject *)unicode;
520 return PyUnicode_DecodeUTF8(u, size, NULL);
523 unicode = _PyUnicode_New(size);
524 if (!unicode)
525 return NULL;
527 return (PyObject *)unicode;
530 PyObject *PyUnicode_FromString(const char *u)
532 size_t size = strlen(u);
533 if (size > PY_SSIZE_T_MAX) {
534 PyErr_SetString(PyExc_OverflowError, "input too long");
535 return NULL;
538 return PyUnicode_FromStringAndSize(u, size);
541 #ifdef HAVE_WCHAR_H
543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544 # define CONVERT_WCHAR_TO_SURROGATES
545 #endif
547 #ifdef CONVERT_WCHAR_TO_SURROGATES
549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size)
555 PyUnicodeObject *unicode;
556 register Py_ssize_t i;
557 Py_ssize_t alloc;
558 const wchar_t *orig_w;
560 if (w == NULL) {
561 PyErr_BadInternalCall();
562 return NULL;
565 alloc = size;
566 orig_w = w;
567 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF)
569 alloc++;
570 w++;
572 w = orig_w;
573 unicode = _PyUnicode_New(alloc);
574 if (!unicode)
575 return NULL;
577 /* Copy the wchar_t data into the new object */
579 register Py_UNICODE *u;
580 u = PyUnicode_AS_UNICODE(unicode);
581 for (i = size; i > 0; i--) {
582 if (*w > 0xFFFF) {
583 wchar_t ordinal = *w++;
584 ordinal -= 0x10000;
585 *u++ = 0xD800 | (ordinal >> 10);
586 *u++ = 0xDC00 | (ordinal & 0x3FF);
588 else
589 *u++ = *w++;
592 return (PyObject *)unicode;
595 #else
597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
600 PyUnicodeObject *unicode;
602 if (w == NULL) {
603 PyErr_BadInternalCall();
604 return NULL;
607 unicode = _PyUnicode_New(size);
608 if (!unicode)
609 return NULL;
611 /* Copy the wchar_t data into the new object */
612 #ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t));
614 #else
616 register Py_UNICODE *u;
617 register Py_ssize_t i;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--)
620 *u++ = *w++;
622 #endif
624 return (PyObject *)unicode;
627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
629 #undef CONVERT_WCHAR_TO_SURROGATES
631 static void
632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
634 *fmt++ = '%';
635 if (width) {
636 if (zeropad)
637 *fmt++ = '0';
638 fmt += sprintf(fmt, "%d", width);
640 if (precision)
641 fmt += sprintf(fmt, ".%d", precision);
642 if (longflag)
643 *fmt++ = 'l';
644 else if (size_tflag) {
645 char *f = PY_FORMAT_SIZE_T;
646 while (*f)
647 *fmt++ = *f++;
649 *fmt++ = c;
650 *fmt = '\0';
653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
655 PyObject *
656 PyUnicode_FromFormatV(const char *format, va_list vargs)
658 va_list count;
659 Py_ssize_t callcount = 0;
660 PyObject **callresults = NULL;
661 PyObject **callresult = NULL;
662 Py_ssize_t n = 0;
663 int width = 0;
664 int precision = 0;
665 int zeropad;
666 const char* f;
667 Py_UNICODE *s;
668 PyObject *string;
669 /* used by sprintf */
670 char buffer[21];
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer = NULL;
674 char *realbuffer;
675 Py_ssize_t abuffersize = 0;
676 char fmt[60]; /* should be enough for %0width.precisionld */
677 const char *copy;
679 #ifdef VA_LIST_IS_ARRAY
680 Py_MEMCPY(count, vargs, sizeof(va_list));
681 #else
682 #ifdef __va_copy
683 __va_copy(count, vargs);
684 #else
685 count = vargs;
686 #endif
687 #endif
688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
691 for (f = format; *f; f++) {
692 if (*f == '%') {
693 if (*(f+1)=='%')
694 continue;
695 if (*(f+1)=='S' || *(f+1)=='R')
696 ++callcount;
697 while (isdigit((unsigned)*f))
698 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))
701 if (*f == 's')
702 ++callcount;
705 /* step 2: allocate memory for the results of
706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
707 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) {
710 PyErr_NoMemory();
711 return NULL;
713 callresult = callresults;
715 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f;
719 width = 0;
720 while (isdigit((unsigned)*f))
721 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))
725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
728 if ((*f == 'l' || *f == 'z') &&
729 (f[1] == 'd' || f[1] == 'u'))
730 ++f;
732 switch (*f) {
733 case 'c':
734 (void)va_arg(count, int);
735 /* fall through... */
736 case '%':
737 n++;
738 break;
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
746 if (width < 20)
747 width = 20;
748 n += width;
749 if (abuffersize < width)
750 abuffersize = width;
751 break;
752 case 's':
754 /* UTF-8 */
755 const char *s = va_arg(count, const char*);
756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757 if (!str)
758 goto fail;
759 n += PyUnicode_GET_SIZE(str);
760 /* Remember the str and switch to the next slot */
761 *callresult++ = str;
762 break;
764 case 'U':
766 PyObject *obj = va_arg(count, PyObject *);
767 assert(obj && PyUnicode_Check(obj));
768 n += PyUnicode_GET_SIZE(obj);
769 break;
771 case 'V':
773 PyObject *obj = va_arg(count, PyObject *);
774 const char *str = va_arg(count, const char *);
775 assert(obj || str);
776 assert(!obj || PyUnicode_Check(obj));
777 if (obj)
778 n += PyUnicode_GET_SIZE(obj);
779 else
780 n += strlen(str);
781 break;
783 case 'S':
785 PyObject *obj = va_arg(count, PyObject *);
786 PyObject *str;
787 assert(obj);
788 str = PyObject_Str(obj);
789 if (!str)
790 goto fail;
791 n += PyUnicode_GET_SIZE(str);
792 /* Remember the str and switch to the next slot */
793 *callresult++ = str;
794 break;
796 case 'R':
798 PyObject *obj = va_arg(count, PyObject *);
799 PyObject *repr;
800 assert(obj);
801 repr = PyObject_Repr(obj);
802 if (!repr)
803 goto fail;
804 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr;
807 break;
809 case 'p':
810 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
816 n += 19;
817 break;
818 default:
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
825 n += strlen(p);
826 goto expand;
828 } else
829 n++;
831 expand:
832 if (abuffersize > 20) {
833 abuffer = PyObject_Malloc(abuffersize);
834 if (!abuffer) {
835 PyErr_NoMemory();
836 goto fail;
838 realbuffer = abuffer;
840 else
841 realbuffer = buffer;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string = PyUnicode_FromUnicode(NULL, n);
847 if (!string)
848 goto fail;
850 s = PyUnicode_AS_UNICODE(string);
851 callresult = callresults;
853 for (f = format; *f; f++) {
854 if (*f == '%') {
855 const char* p = f++;
856 int longflag = 0;
857 int size_tflag = 0;
858 zeropad = (*f == '0');
859 /* parse the width.precision part */
860 width = 0;
861 while (isdigit((unsigned)*f))
862 width = (width*10) + *f++ - '0';
863 precision = 0;
864 if (*f == '.') {
865 f++;
866 while (isdigit((unsigned)*f))
867 precision = (precision*10) + *f++ - '0';
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1;
873 ++f;
875 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877 size_tflag = 1;
878 ++f;
881 switch (*f) {
882 case 'c':
883 *s++ = va_arg(vargs, int);
884 break;
885 case 'd':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, int));
893 appendstring(realbuffer);
894 break;
895 case 'u':
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897 if (longflag)
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899 else if (size_tflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901 else
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903 appendstring(realbuffer);
904 break;
905 case 'i':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 'x':
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912 sprintf(realbuffer, fmt, va_arg(vargs, int));
913 appendstring(realbuffer);
914 break;
915 case 's':
917 /* unused, since we already have the result */
918 (void) va_arg(vargs, char *);
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920 PyUnicode_GET_SIZE(*callresult));
921 s += PyUnicode_GET_SIZE(*callresult);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult);
924 /* switch to next unicode()/repr() result */
925 ++callresult;
926 break;
928 case 'U':
930 PyObject *obj = va_arg(vargs, PyObject *);
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 break;
936 case 'V':
938 PyObject *obj = va_arg(vargs, PyObject *);
939 const char *str = va_arg(vargs, const char *);
940 if (obj) {
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943 s += size;
944 } else {
945 appendstring(str);
947 break;
949 case 'S':
950 case 'R':
952 Py_UNICODE *ucopy;
953 Py_ssize_t usize;
954 Py_ssize_t upos;
955 /* unused, since we already have the result */
956 (void) va_arg(vargs, PyObject *);
957 ucopy = PyUnicode_AS_UNICODE(*callresult);
958 usize = PyUnicode_GET_SIZE(*callresult);
959 for (upos = 0; upos<usize;)
960 *s++ = ucopy[upos++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult);
963 /* switch to next unicode()/repr() result */
964 ++callresult;
965 break;
967 case 'p':
968 sprintf(buffer, "%p", va_arg(vargs, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer[1] == 'X')
971 buffer[1] = 'x';
972 else if (buffer[1] != 'x') {
973 memmove(buffer+2, buffer, strlen(buffer)+1);
974 buffer[0] = '0';
975 buffer[1] = 'x';
977 appendstring(buffer);
978 break;
979 case '%':
980 *s++ = '%';
981 break;
982 default:
983 appendstring(p);
984 goto end;
986 } else
987 *s++ = *f;
990 end:
991 if (callresults)
992 PyObject_Free(callresults);
993 if (abuffer)
994 PyObject_Free(abuffer);
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996 return string;
997 fail:
998 if (callresults) {
999 PyObject **callresult2 = callresults;
1000 while (callresult2 < callresult) {
1001 Py_DECREF(*callresult2);
1002 ++callresult2;
1004 PyObject_Free(callresults);
1006 if (abuffer)
1007 PyObject_Free(abuffer);
1008 return NULL;
1011 #undef appendstring
1013 PyObject *
1014 PyUnicode_FromFormat(const char *format, ...)
1016 PyObject* ret;
1017 va_list vargs;
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020 va_start(vargs, format);
1021 #else
1022 va_start(vargs);
1023 #endif
1024 ret = PyUnicode_FromFormatV(format, vargs);
1025 va_end(vargs);
1026 return ret;
1029 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1030 wchar_t *w,
1031 Py_ssize_t size)
1033 if (unicode == NULL) {
1034 PyErr_BadInternalCall();
1035 return -1;
1038 /* If possible, try to copy the 0-termination as well */
1039 if (size > PyUnicode_GET_SIZE(unicode))
1040 size = PyUnicode_GET_SIZE(unicode) + 1;
1042 #ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));
1044 #else
1046 register Py_UNICODE *u;
1047 register Py_ssize_t i;
1048 u = PyUnicode_AS_UNICODE(unicode);
1049 for (i = size; i > 0; i--)
1050 *w++ = *u++;
1052 #endif
1054 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode);
1056 else
1057 return size;
1060 #endif
1062 PyObject *PyUnicode_FromOrdinal(int ordinal)
1064 Py_UNICODE s[1];
1066 #ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) {
1068 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1071 return NULL;
1073 #else
1074 if (ordinal < 0 || ordinal > 0xffff) {
1075 PyErr_SetString(PyExc_ValueError,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1078 return NULL;
1080 #endif
1082 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1);
1086 PyObject *PyUnicode_FromObject(register PyObject *obj)
1088 /* XXX Perhaps we should make this API an alias of
1089 PyObject_Unicode() instead ?! */
1090 if (PyUnicode_CheckExact(obj)) {
1091 Py_INCREF(obj);
1092 return obj;
1094 if (PyUnicode_Check(obj)) {
1095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj));
1100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104 const char *encoding,
1105 const char *errors)
1107 const char *s = NULL;
1108 Py_ssize_t len;
1109 PyObject *v;
1111 if (obj == NULL) {
1112 PyErr_BadInternalCall();
1113 return NULL;
1116 #if 0
1117 /* For b/w compatibility we also accept Unicode objects provided
1118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1120 Unicode subclasses.
1122 NOTE: This API should really only be used for object which
1123 represent *encoded* Unicode !
1126 if (PyUnicode_Check(obj)) {
1127 if (encoding) {
1128 PyErr_SetString(PyExc_TypeError,
1129 "decoding Unicode is not supported");
1130 return NULL;
1132 return PyObject_Unicode(obj);
1134 #else
1135 if (PyUnicode_Check(obj)) {
1136 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported");
1138 return NULL;
1140 #endif
1142 /* Coerce object */
1143 if (PyString_Check(obj)) {
1144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
1147 else if (PyByteArray_Check(obj)) {
1148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported");
1151 return NULL;
1153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
1164 /* Convert to Unicode */
1165 if (len == 0) {
1166 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty;
1169 else
1170 v = PyUnicode_Decode(s, len, encoding, errors);
1172 return v;
1174 onError:
1175 return NULL;
1178 PyObject *PyUnicode_Decode(const char *s,
1179 Py_ssize_t size,
1180 const char *encoding,
1181 const char *errors)
1183 PyObject *buffer = NULL, *unicode;
1185 if (encoding == NULL)
1186 encoding = PyUnicode_GetDefaultEncoding();
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0)
1190 return PyUnicode_DecodeUTF8(s, size, errors);
1191 else if (strcmp(encoding, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s, size, errors);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors);
1196 #endif
1197 else if (strcmp(encoding, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors);
1200 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size);
1202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
1209 "decoder did not return an unicode object (type=%.400s)",
1210 Py_TYPE(unicode)->tp_name);
1211 Py_DECREF(unicode);
1212 goto onError;
1214 Py_DECREF(buffer);
1215 return unicode;
1217 onError:
1218 Py_XDECREF(buffer);
1219 return NULL;
1222 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1226 PyObject *v;
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1233 if (encoding == NULL)
1234 encoding = PyUnicode_GetDefaultEncoding();
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1242 onError:
1243 return NULL;
1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247 Py_ssize_t size,
1248 const char *encoding,
1249 const char *errors)
1251 PyObject *v, *unicode;
1253 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL)
1255 return NULL;
1256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257 Py_DECREF(unicode);
1258 return v;
1261 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262 const char *encoding,
1263 const char *errors)
1265 PyObject *v;
1267 if (!PyUnicode_Check(unicode)) {
1268 PyErr_BadArgument();
1269 goto onError;
1272 if (encoding == NULL)
1273 encoding = PyUnicode_GetDefaultEncoding();
1275 /* Encode via the codec registry */
1276 v = PyCodec_Encode(unicode, encoding, errors);
1277 if (v == NULL)
1278 goto onError;
1279 return v;
1281 onError:
1282 return NULL;
1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding,
1287 const char *errors)
1289 PyObject *v;
1291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1296 if (encoding == NULL)
1297 encoding = PyUnicode_GetDefaultEncoding();
1299 /* Shortcuts for common default encodings */
1300 if (errors == NULL) {
1301 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode);
1308 #endif
1309 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode);
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
1317 if (!PyString_Check(v)) {
1318 PyErr_Format(PyExc_TypeError,
1319 "encoder did not return a string object (type=%.400s)",
1320 Py_TYPE(v)->tp_name);
1321 Py_DECREF(v);
1322 goto onError;
1324 return v;
1326 onError:
1327 return NULL;
1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1331 const char *errors)
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1335 if (v)
1336 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338 if (v && errors == NULL)
1339 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v;
1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1349 return PyUnicode_AS_UNICODE(unicode);
1351 onError:
1352 return NULL;
1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1361 return PyUnicode_GET_SIZE(unicode);
1363 onError:
1364 return -1;
1367 const char *PyUnicode_GetDefaultEncoding(void)
1369 return unicode_default_encoding;
1372 int PyUnicode_SetDefaultEncoding(const char *encoding)
1374 PyObject *v;
1376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v = _PyCodec_Lookup(encoding);
1379 if (v == NULL)
1380 goto onError;
1381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
1383 encoding,
1384 sizeof(unicode_default_encoding));
1385 return 0;
1387 onError:
1388 return -1;
1391 /* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
1393 if no exception occurred, copy the replacement to the output
1394 and adjust various state variables.
1395 return 0 on success, -1 on error
1398 static
1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1400 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1407 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL;
1409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos;
1412 Py_UNICODE *repptr;
1413 Py_ssize_t repsize;
1414 int res = -1;
1416 if (*errorHandler == NULL) {
1417 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL)
1419 goto onError;
1422 if (*exceptionObject == NULL) {
1423 *exceptionObject = PyUnicodeDecodeError_Create(
1424 encoding, input, insize, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL)
1426 goto onError;
1428 else {
1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError;
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438 if (restuple == NULL)
1439 goto onError;
1440 if (!PyTuple_Check(restuple)) {
1441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442 goto onError;
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1445 goto onError;
1446 if (newpos<0)
1447 newpos = insize+newpos;
1448 if (newpos<0 || newpos>insize) {
1449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450 goto onError;
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) {
1461 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1467 *endinpos = newpos;
1468 *inptr = input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize;
1471 *outpos += repsize;
1472 /* we made it! */
1473 res = 0;
1475 onError:
1476 Py_XDECREF(restuple);
1477 return res;
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1482 /* See RFC2152 for details. We encode conservatively and decode liberally. */
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * Spelled out as explicit ASCII ranges rather than isalnum(): isalnum()
 * depends on the current C locale, so bytes in 0x80..0xFF could be
 * classified as base-64 characters (and then be mapped to 63 by
 * FROM_BASE64) under some locales. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
/* Classification of the 128 ASCII codes for the UTF-7 encoder
 * (see RFC2152):
 *
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special       must be base64 encoded: everything else, i.e.
 *                     +\~ and non-printing codes 0-8 11-12 14-31 127
 */

static
char utf7_category[128] = {
    /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: may this character be written to the output as
 * itself?  Set D always may; Set O only when directO is true and
 * whitespace only when directWS is true, because RFC2152 leaves both
 * choices to the application.  NUL and anything outside ASCII never
 * qualify; the range test comes first so the table is only indexed
 * with values 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
1558 PyObject *PyUnicode_DecodeUTF7(const char *s,
1559 Py_ssize_t size,
1560 const char *errors)
1562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565 /* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1570 * surrogate). */
/* Decode `size` bytes of UTF-7 starting at `s`.
 *
 * Malformed input is handed to unicode_decode_call_errorhandler() under
 * the codec name "utf7".  If `consumed` is NULL, input that ends inside
 * an inconsistent shift sequence is reported as "unterminated shift
 * sequence".  If `consumed` is non-NULL and the input ends inside a
 * shift sequence, the sequence is backed out instead: the output is
 * rewound to shiftOutStart and *consumed is set to the offset of the
 * '+' that opened it; otherwise *consumed is the number of bytes read.
 * Returns the decoded object, or NULL (after releasing it) on error. */
1572 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1573 Py_ssize_t size,
1574 const char *errors,
1575 Py_ssize_t *consumed)
1577 const char *starts = s;
1578 Py_ssize_t startinpos;
1579 Py_ssize_t endinpos;
1580 Py_ssize_t outpos;
1581 const char *e;
1582 PyUnicodeObject *unicode;
1583 Py_UNICODE *p;
1584 const char *errmsg = "";
1585 int inShift = 0;
1586 Py_UNICODE *shiftOutStart;
1587 unsigned int base64bits = 0;
1588 unsigned long base64buffer = 0;
1589 Py_UNICODE surrogate = 0;
1590 PyObject *errorHandler = NULL;
1591 PyObject *exc = NULL;
/* The output is sized from the input length and trimmed to the
   number of units actually written just before returning. */
1593 unicode = _PyUnicode_New(size);
1594 if (!unicode)
1595 return NULL;
1596 if (size == 0) {
1597 if (consumed)
1598 *consumed = 0;
1599 return (PyObject *)unicode;
1602 p = unicode->str;
1603 shiftOutStart = p;
1604 e = s + size;
1606 while (s < e) {
1607 Py_UNICODE ch = (unsigned char) *s;
1609 if (inShift) { /* in a base-64 section */
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */
/* base64buffer accumulates 6 bits per character; base64bits counts
   how many of its low bits are still pending. */
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612 base64bits += 6;
1613 s++;
1614 if (base64bits >= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh = (Py_UNICODE)
1617 (base64buffer >> (base64bits-16));
1618 base64bits -= 16;
1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620 if (surrogate) {
1621 /* expecting a second surrogate */
1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623 #ifdef Py_UNICODE_WIDE
1624 *p++ = (((surrogate & 0x3FF)<<10)
1625 | (outCh & 0x3FF)) + 0x10000;
1626 #else
1627 *p++ = surrogate;
1628 *p++ = outCh;
1629 #endif
1630 surrogate = 0;
1632 else {
1633 surrogate = 0;
1634 errmsg = "second surrogate missing";
1635 goto utf7Error;
1638 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639 /* first surrogate */
1640 surrogate = outCh;
1642 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1643 errmsg = "unexpected second surrogate";
1644 goto utf7Error;
1646 else {
1647 *p++ = outCh;
1651 else { /* now leaving a base-64 section */
1652 inShift = 0;
1653 s++;
1654 if (surrogate) {
1655 errmsg = "second surrogate missing at end of shift sequence";
1656 goto utf7Error;
1658 if (base64bits > 0) { /* left-over bits */
1659 if (base64bits >= 6) {
1660 /* We've seen at least one base-64 character */
1661 errmsg = "partial character in shift sequence";
1662 goto utf7Error;
1664 else {
1665 /* Some bits remain; they should be zero */
1666 if (base64buffer != 0) {
1667 errmsg = "non-zero padding bits in shift sequence";
1668 goto utf7Error;
1672 if (ch != '-') {
1673 /* '-' is absorbed; other terminating
1674 characters are preserved */
1675 *p++ = ch;
1679 else if ( ch == '+' ) {
/* Remember where this '+' sits: the offset is used both for error
   reporting and for *consumed when the sequence is backed out. */
1680 startinpos = s-starts;
1681 s++; /* consume '+' */
1682 if (s < e && *s == '-') { /* '+-' encodes '+' */
1683 s++;
1684 *p++ = '+';
1686 else { /* begin base64-encoded section */
1687 inShift = 1;
1688 shiftOutStart = p;
1689 base64bits = 0;
1692 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1693 *p++ = ch;
1694 s++;
1696 else {
1697 startinpos = s-starts;
1698 s++;
1699 errmsg = "unexpected special character";
1700 goto utf7Error;
1702 continue;
/* Shared error exit of the loop body: the handler receives pointers
   to s, p, outpos and unicode so it can update them. */
1703 utf7Error:
1704 outpos = p-PyUnicode_AS_UNICODE(unicode);
1705 endinpos = s-starts;
1706 if (unicode_decode_call_errorhandler(
1707 errors, &errorHandler,
1708 "utf7", errmsg,
1709 starts, size, &startinpos, &endinpos, &exc, &s,
1710 &unicode, &outpos, &p))
1711 goto onError;
1714 /* end of string */
1716 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717 /* if we're in an inconsistent state, that's an error */
1718 if (surrogate ||
1719 (base64bits >= 6) ||
1720 (base64bits > 0 && base64buffer != 0)) {
1721 outpos = p-PyUnicode_AS_UNICODE(unicode);
1722 endinpos = size;
1723 if (unicode_decode_call_errorhandler(
1724 errors, &errorHandler,
1725 "utf7", "unterminated shift sequence",
1726 starts, size, &startinpos, &endinpos, &exc, &s,
1727 &unicode, &outpos, &p))
1728 goto onError;
1732 /* return state */
1733 if (consumed) {
1734 if (inShift) {
1735 p = shiftOutStart; /* back off output */
1736 *consumed = startinpos;
1738 else {
1739 *consumed = s-starts;
/* Trim the result to the number of units actually written. */
1743 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1744 goto onError;
1746 Py_XDECREF(errorHandler);
1747 Py_XDECREF(exc);
1748 return (PyObject *)unicode;
1750 onError:
1751 Py_XDECREF(errorHandler);
1752 Py_XDECREF(exc);
1753 Py_DECREF(unicode);
1754 return NULL;
1758 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1759 Py_ssize_t size,
1760 int base64SetO,
1761 int base64WhiteSpace,
1762 const char *errors)
1764 PyObject *v;
1765 /* It might be possible to tighten this worst case */
1766 Py_ssize_t allocated = 8 * size;
1767 int inShift = 0;
1768 Py_ssize_t i = 0;
1769 unsigned int base64bits = 0;
1770 unsigned long base64buffer = 0;
1771 char * out;
1772 char * start;
1774 if (allocated / 8 != size)
1775 return PyErr_NoMemory();
1777 if (size == 0)
1778 return PyString_FromStringAndSize(NULL, 0);
1780 v = PyString_FromStringAndSize(NULL, allocated);
1781 if (v == NULL)
1782 return NULL;
1784 start = out = PyString_AS_STRING(v);
1785 for (;i < size; ++i) {
1786 Py_UNICODE ch = s[i];
1788 if (inShift) {
1789 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790 /* shifting out */
1791 if (base64bits) { /* output remaining bits */
1792 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793 base64buffer = 0;
1794 base64bits = 0;
1796 inShift = 0;
1797 /* Characters not in the BASE64 set implicitly unshift the sequence
1798 so no '-' is required, except if the character is itself a '-' */
1799 if (IS_BASE64(ch) || ch == '-') {
1800 *out++ = '-';
1802 *out++ = (char) ch;
1804 else {
1805 goto encode_char;
1808 else { /* not in a shift sequence */
1809 if (ch == '+') {
1810 *out++ = '+';
1811 *out++ = '-';
1813 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1814 *out++ = (char) ch;
1816 else {
1817 *out++ = '+';
1818 inShift = 1;
1819 goto encode_char;
1822 continue;
1823 encode_char:
1824 #ifdef Py_UNICODE_WIDE
1825 if (ch >= 0x10000) {
1826 /* code first surrogate */
1827 base64bits += 16;
1828 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1829 while (base64bits >= 6) {
1830 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831 base64bits -= 6;
1833 /* prepare second surrogate */
1834 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1836 #endif
1837 base64bits += 16;
1838 base64buffer = (base64buffer << 16) | ch;
1839 while (base64bits >= 6) {
1840 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841 base64bits -= 6;
1844 if (base64bits)
1845 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846 if (inShift)
1847 *out++ = '-';
1849 _PyString_Resize(&v, out - start);
1850 return v;
1853 #undef IS_BASE64
1854 #undef FROM_BASE64
1855 #undef TO_BASE64
1856 #undef DECODE_DIRECT
1857 #undef ENCODE_DIRECT
1859 /* --- UTF-8 Codec -------------------------------------------------------- */
/* UTF-8 lead byte -> total sequence length in bytes (see RFC 2279).
   A zero entry marks a byte that cannot start a sequence. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six bytes; 0xFE and 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1883 PyObject *PyUnicode_DecodeUTF8(const char *s,
1884 Py_ssize_t size,
1885 const char *errors)
1887 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
/* Decode `size` bytes of UTF-8 starting at `s`.
 *
 * Bytes below 0x80 are copied straight through; for any other lead byte
 * the sequence length is taken from utf8_code_length[].  Invalid
 * sequences are handed to unicode_decode_call_errorhandler() under the
 * codec name "utf8".  When `consumed` is non-NULL, a sequence cut off
 * by the end of the input stops decoding without an error and
 * *consumed receives the number of bytes processed; when it is NULL
 * the same situation is reported as "unexpected end of data".
 * Returns the decoded object, or NULL (after releasing it) on error. */
1890 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1891 Py_ssize_t size,
1892 const char *errors,
1893 Py_ssize_t *consumed)
1895 const char *starts = s;
1896 int n;
1897 Py_ssize_t startinpos;
1898 Py_ssize_t endinpos;
1899 Py_ssize_t outpos;
1900 const char *e;
1901 PyUnicodeObject *unicode;
1902 Py_UNICODE *p;
1903 const char *errmsg = "";
1904 PyObject *errorHandler = NULL;
1905 PyObject *exc = NULL;
1907 /* Note: size will always be longer than the resulting Unicode
1908 character count */
1909 unicode = _PyUnicode_New(size);
1910 if (!unicode)
1911 return NULL;
1912 if (size == 0) {
1913 if (consumed)
1914 *consumed = 0;
1915 return (PyObject *)unicode;
1918 /* Unpack UTF-8 encoded data */
1919 p = unicode->str;
1920 e = s + size;
1922 while (s < e) {
1923 Py_UCS4 ch = (unsigned char)*s;
/* Fast path: a single-byte (ASCII) character. */
1925 if (ch < 0x80) {
1926 *p++ = (Py_UNICODE)ch;
1927 s++;
1928 continue;
/* Sequence length announced by the lead byte; 0 means the byte is
   not a valid lead byte. */
1931 n = utf8_code_length[ch];
/* The announced sequence would run past the end of the input. */
1933 if (s + n > e) {
1934 if (consumed)
1935 break;
1936 else {
1937 errmsg = "unexpected end of data";
1938 startinpos = s-starts;
1939 endinpos = size;
1940 goto utf8Error;
1944 switch (n) {
1946 case 0:
1947 errmsg = "unexpected code byte";
1948 startinpos = s-starts;
1949 endinpos = startinpos+1;
1950 goto utf8Error;
/* Length 1 only occurs in utf8_code_length[] for bytes below 0x80,
   which were already handled by the fast path above. */
1952 case 1:
1953 errmsg = "internal error";
1954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
1958 case 2:
1959 if ((s[1] & 0xc0) != 0x80) {
1960 errmsg = "invalid data";
1961 startinpos = s-starts;
1962 endinpos = startinpos+2;
1963 goto utf8Error;
1965 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
/* A two-byte form of a value below 0x80 is rejected. */
1966 if (ch < 0x80) {
1967 startinpos = s-starts;
1968 endinpos = startinpos+2;
1969 errmsg = "illegal encoding";
1970 goto utf8Error;
1972 else
1973 *p++ = (Py_UNICODE)ch;
1974 break;
1976 case 3:
1977 if ((s[1] & 0xc0) != 0x80 ||
1978 (s[2] & 0xc0) != 0x80) {
1979 errmsg = "invalid data";
1980 startinpos = s-starts;
1981 endinpos = startinpos+3;
1982 goto utf8Error;
1984 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1985 if (ch < 0x0800) {
1986 /* Note: UTF-8 encodings of surrogates are considered
1987 legal UTF-8 sequences;
1989 XXX For wide builds (UCS-4) we should probably try
1990 to recombine the surrogates into a single code
1991 unit.
1993 errmsg = "illegal encoding";
1994 startinpos = s-starts;
1995 endinpos = startinpos+3;
1996 goto utf8Error;
1998 else
1999 *p++ = (Py_UNICODE)ch;
2000 break;
2002 case 4:
2003 if ((s[1] & 0xc0) != 0x80 ||
2004 (s[2] & 0xc0) != 0x80 ||
2005 (s[3] & 0xc0) != 0x80) {
2006 errmsg = "invalid data";
2007 startinpos = s-starts;
2008 endinpos = startinpos+4;
2009 goto utf8Error;
2011 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2012 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2013 /* validate and convert to UTF-16 */
2014 if ((ch < 0x10000) /* minimum value allowed for 4
2015 byte encoding */
2016 || (ch > 0x10ffff)) /* maximum value allowed for
2017 UTF-16 */
2019 errmsg = "illegal encoding";
2020 startinpos = s-starts;
2021 endinpos = startinpos+4;
2022 goto utf8Error;
2024 #ifdef Py_UNICODE_WIDE
2025 *p++ = (Py_UNICODE)ch;
2026 #else
2027 /* compute and append the two surrogates: */
2029 /* translate from 10000..10FFFF to 0..FFFF */
2030 ch -= 0x10000;
2032 /* high surrogate = top 10 bits added to D800 */
2033 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2035 /* low surrogate = bottom 10 bits added to DC00 */
2036 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2037 #endif
2038 break;
2040 default:
2041 /* Other sizes are only needed for UCS-4 */
2042 errmsg = "unsupported Unicode code range";
2043 startinpos = s-starts;
2044 endinpos = startinpos+n;
2045 goto utf8Error;
/* Sequence accepted: step over all of its bytes. */
2047 s += n;
2048 continue;
/* Shared error exit of the loop body: the handler receives pointers
   to s, p, outpos and unicode so it can update them. */
2050 utf8Error:
2051 outpos = p-PyUnicode_AS_UNICODE(unicode);
2052 if (unicode_decode_call_errorhandler(
2053 errors, &errorHandler,
2054 "utf8", errmsg,
2055 starts, size, &startinpos, &endinpos, &exc, &s,
2056 &unicode, &outpos, &p))
2057 goto onError;
2059 if (consumed)
2060 *consumed = s-starts;
2062 /* Adjust length */
2063 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2064 goto onError;
2066 Py_XDECREF(errorHandler);
2067 Py_XDECREF(exc);
2068 return (PyObject *)unicode;
2070 onError:
2071 Py_XDECREF(errorHandler);
2072 Py_XDECREF(exc);
2073 Py_DECREF(unicode);
2074 return NULL;
2077 /* Allocation strategy: if the string is short, convert into a stack buffer
2078 and allocate exactly as much space needed at the end. Else allocate the
2079 maximum possible needed (4 result bytes per Unicode character), and return
2080 the excess memory at the end.
2082 PyObject *
2083 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2084 Py_ssize_t size,
2085 const char *errors)
2087 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2089 Py_ssize_t i; /* index into s of next input byte */
2090 PyObject *v; /* result string object */
2091 char *p; /* next free byte in output buffer */
2092 Py_ssize_t nallocated; /* number of result bytes allocated */
2093 Py_ssize_t nneeded; /* number of result bytes needed */
2094 char stackbuf[MAX_SHORT_UNICHARS * 4];
2096 assert(s != NULL);
2097 assert(size >= 0);
2099 if (size <= MAX_SHORT_UNICHARS) {
2100 /* Write into the stack buffer; nallocated can't overflow.
2101 * At the end, we'll allocate exactly as much heap space as it
2102 * turns out we need.
2104 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2105 v = NULL; /* will allocate after we're done */
2106 p = stackbuf;
2108 else {
2109 /* Overallocate on the heap, and give the excess back at the end. */
2110 nallocated = size * 4;
2111 if (nallocated / 4 != size) /* overflow! */
2112 return PyErr_NoMemory();
2113 v = PyString_FromStringAndSize(NULL, nallocated);
2114 if (v == NULL)
2115 return NULL;
2116 p = PyString_AS_STRING(v);
2119 for (i = 0; i < size;) {
2120 Py_UCS4 ch = s[i++];
2122 if (ch < 0x80)
2123 /* Encode ASCII */
2124 *p++ = (char) ch;
2126 else if (ch < 0x0800) {
2127 /* Encode Latin-1 */
2128 *p++ = (char)(0xc0 | (ch >> 6));
2129 *p++ = (char)(0x80 | (ch & 0x3f));
2131 else {
2132 /* Encode UCS2 Unicode ordinals */
2133 if (ch < 0x10000) {
2134 /* Special case: check for high surrogate */
2135 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2136 Py_UCS4 ch2 = s[i];
2137 /* Check for low surrogate and combine the two to
2138 form a UCS4 value */
2139 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2140 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2141 i++;
2142 goto encodeUCS4;
2144 /* Fall through: handles isolated high surrogates */
2146 *p++ = (char)(0xe0 | (ch >> 12));
2147 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2148 *p++ = (char)(0x80 | (ch & 0x3f));
2149 continue;
2151 encodeUCS4:
2152 /* Encode UCS4 Unicode ordinals */
2153 *p++ = (char)(0xf0 | (ch >> 18));
2154 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2155 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2156 *p++ = (char)(0x80 | (ch & 0x3f));
2160 if (v == NULL) {
2161 /* This was stack allocated. */
2162 nneeded = p - stackbuf;
2163 assert(nneeded <= nallocated);
2164 v = PyString_FromStringAndSize(stackbuf, nneeded);
2166 else {
2167 /* Cut back to size actually needed. */
2168 nneeded = p - PyString_AS_STRING(v);
2169 assert(nneeded <= nallocated);
2170 _PyString_Resize(&v, nneeded);
2172 return v;
2174 #undef MAX_SHORT_UNICHARS
2177 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2179 if (!PyUnicode_Check(unicode)) {
2180 PyErr_BadArgument();
2181 return NULL;
2183 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2184 PyUnicode_GET_SIZE(unicode),
2185 NULL);
2188 /* --- UTF-32 Codec ------------------------------------------------------- */
2190 PyObject *
2191 PyUnicode_DecodeUTF32(const char *s,
2192 Py_ssize_t size,
2193 const char *errors,
2194 int *byteorder)
2196 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2199 PyObject *
2200 PyUnicode_DecodeUTF32Stateful(const char *s,
2201 Py_ssize_t size,
2202 const char *errors,
2203 int *byteorder,
2204 Py_ssize_t *consumed)
2206 const char *starts = s;
2207 Py_ssize_t startinpos;
2208 Py_ssize_t endinpos;
2209 Py_ssize_t outpos;
2210 PyUnicodeObject *unicode;
2211 Py_UNICODE *p;
2212 #ifndef Py_UNICODE_WIDE
2213 int i, pairs;
2214 #else
2215 const int pairs = 0;
2216 #endif
2217 const unsigned char *q, *e;
2218 int bo = 0; /* assume native ordering by default */
2219 const char *errmsg = "";
2220 /* Offsets from q for retrieving bytes in the right order. */
2221 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2222 int iorder[] = {0, 1, 2, 3};
2223 #else
2224 int iorder[] = {3, 2, 1, 0};
2225 #endif
2226 PyObject *errorHandler = NULL;
2227 PyObject *exc = NULL;
2228 /* On narrow builds we split characters outside the BMP into two
2229 codepoints => count how much extra space we need. */
2230 #ifndef Py_UNICODE_WIDE
2231 for (i = pairs = 0; i < size/4; i++)
2232 if (((Py_UCS4 *)s)[i] >= 0x10000)
2233 pairs++;
2234 #endif
2236 /* This might be one to much, because of a BOM */
2237 unicode = _PyUnicode_New((size+3)/4+pairs);
2238 if (!unicode)
2239 return NULL;
2240 if (size == 0)
2241 return (PyObject *)unicode;
2243 /* Unpack UTF-32 encoded data */
2244 p = unicode->str;
2245 q = (unsigned char *)s;
2246 e = q + size;
2248 if (byteorder)
2249 bo = *byteorder;
2251 /* Check for BOM marks (U+FEFF) in the input and adjust current
2252 byte order setting accordingly. In native mode, the leading BOM
2253 mark is skipped, in all other modes, it is copied to the output
2254 stream as-is (giving a ZWNBSP character). */
2255 if (bo == 0) {
2256 if (size >= 4) {
2257 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2258 (q[iorder[1]] << 8) | q[iorder[0]];
2259 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2260 if (bom == 0x0000FEFF) {
2261 q += 4;
2262 bo = -1;
2264 else if (bom == 0xFFFE0000) {
2265 q += 4;
2266 bo = 1;
2268 #else
2269 if (bom == 0x0000FEFF) {
2270 q += 4;
2271 bo = 1;
2273 else if (bom == 0xFFFE0000) {
2274 q += 4;
2275 bo = -1;
2277 #endif
2281 if (bo == -1) {
2282 /* force LE */
2283 iorder[0] = 0;
2284 iorder[1] = 1;
2285 iorder[2] = 2;
2286 iorder[3] = 3;
2288 else if (bo == 1) {
2289 /* force BE */
2290 iorder[0] = 3;
2291 iorder[1] = 2;
2292 iorder[2] = 1;
2293 iorder[3] = 0;
2296 while (q < e) {
2297 Py_UCS4 ch;
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2299 if (e-q<4) {
2300 if (consumed)
2301 break;
2302 errmsg = "truncated data";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = ((const char *)e)-starts;
2305 goto utf32Error;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310 (q[iorder[1]] << 8) | q[iorder[0]];
2312 if (ch >= 0x110000)
2314 errmsg = "codepoint not in range(0x110000)";
2315 startinpos = ((const char *)q)-starts;
2316 endinpos = startinpos+4;
2317 goto utf32Error;
2319 #ifndef Py_UNICODE_WIDE
2320 if (ch >= 0x10000)
2322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2325 else
2326 #endif
2327 *p++ = ch;
2328 q += 4;
2329 continue;
2330 utf32Error:
2331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "utf32", errmsg,
2335 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2336 &unicode, &outpos, &p))
2337 goto onError;
2340 if (byteorder)
2341 *byteorder = bo;
2343 if (consumed)
2344 *consumed = (const char *)q-starts;
2346 /* Adjust length */
2347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348 goto onError;
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)unicode;
2354 onError:
2355 Py_DECREF(unicode);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2361 PyObject *
2362 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2363 Py_ssize_t size,
2364 const char *errors,
2365 int byteorder)
2367 PyObject *v;
2368 unsigned char *p;
2369 Py_ssize_t nsize, bytesize;
2370 #ifndef Py_UNICODE_WIDE
2371 Py_ssize_t i, pairs;
2372 #else
2373 const int pairs = 0;
2374 #endif
2375 /* Offsets from p for storing byte pairs in the right order. */
2376 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder[] = {0, 1, 2, 3};
2378 #else
2379 int iorder[] = {3, 2, 1, 0};
2380 #endif
2382 #define STORECHAR(CH) \
2383 do { \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2388 p += 4; \
2389 } while(0)
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393 #ifndef Py_UNICODE_WIDE
2394 for (i = pairs = 0; i < size-1; i++)
2395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397 pairs++;
2398 #endif
2399 nsize = (size - pairs + (byteorder == 0));
2400 bytesize = nsize * 4;
2401 if (bytesize / 4 != nsize)
2402 return PyErr_NoMemory();
2403 v = PyString_FromStringAndSize(NULL, bytesize);
2404 if (v == NULL)
2405 return NULL;
2407 p = (unsigned char *)PyString_AS_STRING(v);
2408 if (byteorder == 0)
2409 STORECHAR(0xFEFF);
2410 if (size == 0)
2411 return v;
2413 if (byteorder == -1) {
2414 /* force LE */
2415 iorder[0] = 0;
2416 iorder[1] = 1;
2417 iorder[2] = 2;
2418 iorder[3] = 3;
2420 else if (byteorder == 1) {
2421 /* force BE */
2422 iorder[0] = 3;
2423 iorder[1] = 2;
2424 iorder[2] = 1;
2425 iorder[3] = 0;
2428 while (size-- > 0) {
2429 Py_UCS4 ch = *s++;
2430 #ifndef Py_UNICODE_WIDE
2431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432 Py_UCS4 ch2 = *s;
2433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435 s++;
2436 size--;
2439 #endif
2440 STORECHAR(ch);
2442 return v;
2443 #undef STORECHAR
/* Encode a Unicode object as UTF-32 by passing its buffer and length to
   PyUnicode_EncodeUTF32() with NULL `errors`.  Returns NULL after
   PyErr_BadArgument() when the argument is not a Unicode object.
   NOTE(review): the final (byteorder) argument and the closing lines of
   this call are not visible in this listing -- confirm them against the
   original file. */
2446 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2448 if (!PyUnicode_Check(unicode)) {
2449 PyErr_BadArgument();
2450 return NULL;
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2453 PyUnicode_GET_SIZE(unicode),
2454 NULL,
2458 /* --- UTF-16 Codec ------------------------------------------------------- */
2460 PyObject *
2461 PyUnicode_DecodeUTF16(const char *s,
2462 Py_ssize_t size,
2463 const char *errors,
2464 int *byteorder)
2466 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2469 PyObject *
2470 PyUnicode_DecodeUTF16Stateful(const char *s,
2471 Py_ssize_t size,
2472 const char *errors,
2473 int *byteorder,
2474 Py_ssize_t *consumed)
2476 const char *starts = s;
2477 Py_ssize_t startinpos;
2478 Py_ssize_t endinpos;
2479 Py_ssize_t outpos;
2480 PyUnicodeObject *unicode;
2481 Py_UNICODE *p;
2482 const unsigned char *q, *e;
2483 int bo = 0; /* assume native ordering by default */
2484 const char *errmsg = "";
2485 /* Offsets from q for retrieving byte pairs in the right order. */
2486 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi = 1, ilo = 0;
2488 #else
2489 int ihi = 0, ilo = 1;
2490 #endif
2491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
2494 /* Note: size will always be longer than the resulting Unicode
2495 character count */
2496 unicode = _PyUnicode_New(size);
2497 if (!unicode)
2498 return NULL;
2499 if (size == 0)
2500 return (PyObject *)unicode;
2502 /* Unpack UTF-16 encoded data */
2503 p = unicode->str;
2504 q = (unsigned char *)s;
2505 e = q + size;
2507 if (byteorder)
2508 bo = *byteorder;
2510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2514 if (bo == 0) {
2515 if (size >= 2) {
2516 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2517 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2518 if (bom == 0xFEFF) {
2519 q += 2;
2520 bo = -1;
2522 else if (bom == 0xFFFE) {
2523 q += 2;
2524 bo = 1;
2526 #else
2527 if (bom == 0xFEFF) {
2528 q += 2;
2529 bo = 1;
2531 else if (bom == 0xFFFE) {
2532 q += 2;
2533 bo = -1;
2535 #endif
2539 if (bo == -1) {
2540 /* force LE */
2541 ihi = 1;
2542 ilo = 0;
2544 else if (bo == 1) {
2545 /* force BE */
2546 ihi = 0;
2547 ilo = 1;
2550 while (q < e) {
2551 Py_UNICODE ch;
2552 /* remaining bytes at the end? (size should be even) */
2553 if (e-q<2) {
2554 if (consumed)
2555 break;
2556 errmsg = "truncated data";
2557 startinpos = ((const char *)q)-starts;
2558 endinpos = ((const char *)e)-starts;
2559 goto utf16Error;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2563 ch = (q[ihi] << 8) | q[ilo];
2565 q += 2;
2567 if (ch < 0xD800 || ch > 0xDFFF) {
2568 *p++ = ch;
2569 continue;
2572 /* UTF-16 code pair: */
2573 if (q >= e) {
2574 errmsg = "unexpected end of data";
2575 startinpos = (((const char *)q)-2)-starts;
2576 endinpos = ((const char *)e)-starts;
2577 goto utf16Error;
2579 if (0xD800 <= ch && ch <= 0xDBFF) {
2580 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2581 q += 2;
2582 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2583 #ifndef Py_UNICODE_WIDE
2584 *p++ = ch;
2585 *p++ = ch2;
2586 #else
2587 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2588 #endif
2589 continue;
2591 else {
2592 errmsg = "illegal UTF-16 surrogate";
2593 startinpos = (((const char *)q)-4)-starts;
2594 endinpos = startinpos+2;
2595 goto utf16Error;
2599 errmsg = "illegal encoding";
2600 startinpos = (((const char *)q)-2)-starts;
2601 endinpos = startinpos+2;
2602 /* Fall through to report the error */
2604 utf16Error:
2605 outpos = p-PyUnicode_AS_UNICODE(unicode);
2606 if (unicode_decode_call_errorhandler(
2607 errors, &errorHandler,
2608 "utf16", errmsg,
2609 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2610 &unicode, &outpos, &p))
2611 goto onError;
2614 if (byteorder)
2615 *byteorder = bo;
2617 if (consumed)
2618 *consumed = (const char *)q-starts;
2620 /* Adjust length */
2621 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2622 goto onError;
2624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
2626 return (PyObject *)unicode;
2628 onError:
2629 Py_DECREF(unicode);
2630 Py_XDECREF(errorHandler);
2631 Py_XDECREF(exc);
2632 return NULL;
2635 PyObject *
2636 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2637 Py_ssize_t size,
2638 const char *errors,
2639 int byteorder)
2641 PyObject *v;
2642 unsigned char *p;
2643 Py_ssize_t nsize, bytesize;
2644 #ifdef Py_UNICODE_WIDE
2645 Py_ssize_t i, pairs;
2646 #else
2647 const int pairs = 0;
2648 #endif
2649 /* Offsets from p for storing byte pairs in the right order. */
2650 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651 int ihi = 1, ilo = 0;
2652 #else
2653 int ihi = 0, ilo = 1;
2654 #endif
2656 #define STORECHAR(CH) \
2657 do { \
2658 p[ihi] = ((CH) >> 8) & 0xff; \
2659 p[ilo] = (CH) & 0xff; \
2660 p += 2; \
2661 } while(0)
2663 #ifdef Py_UNICODE_WIDE
2664 for (i = pairs = 0; i < size; i++)
2665 if (s[i] >= 0x10000)
2666 pairs++;
2667 #endif
2668 /* 2 * (size + pairs + (byteorder == 0)) */
2669 if (size > PY_SSIZE_T_MAX ||
2670 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2671 return PyErr_NoMemory();
2672 nsize = size + pairs + (byteorder == 0);
2673 bytesize = nsize * 2;
2674 if (bytesize / 2 != nsize)
2675 return PyErr_NoMemory();
2676 v = PyString_FromStringAndSize(NULL, bytesize);
2677 if (v == NULL)
2678 return NULL;
2680 p = (unsigned char *)PyString_AS_STRING(v);
2681 if (byteorder == 0)
2682 STORECHAR(0xFEFF);
2683 if (size == 0)
2684 return v;
2686 if (byteorder == -1) {
2687 /* force LE */
2688 ihi = 1;
2689 ilo = 0;
2691 else if (byteorder == 1) {
2692 /* force BE */
2693 ihi = 0;
2694 ilo = 1;
2697 while (size-- > 0) {
2698 Py_UNICODE ch = *s++;
2699 Py_UNICODE ch2 = 0;
2700 #ifdef Py_UNICODE_WIDE
2701 if (ch >= 0x10000) {
2702 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2703 ch = 0xD800 | ((ch-0x10000) >> 10);
2705 #endif
2706 STORECHAR(ch);
2707 if (ch2)
2708 STORECHAR(ch2);
2710 return v;
2711 #undef STORECHAR
2714 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2716 if (!PyUnicode_Check(unicode)) {
2717 PyErr_BadArgument();
2718 return NULL;
2720 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2721 PyUnicode_GET_SIZE(unicode),
2722 NULL,
2726 /* --- Unicode Escape Codec ----------------------------------------------- */
2728 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2730 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2731 Py_ssize_t size,
2732 const char *errors)
2734 const char *starts = s;
2735 Py_ssize_t startinpos;
2736 Py_ssize_t endinpos;
2737 Py_ssize_t outpos;
2738 int i;
2739 PyUnicodeObject *v;
2740 Py_UNICODE *p;
2741 const char *end;
2742 char* message;
2743 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2744 PyObject *errorHandler = NULL;
2745 PyObject *exc = NULL;
2747 /* Escaped strings will always be longer than the resulting
2748 Unicode string, so we start with size here and then reduce the
2749 length after conversion to the true value.
2750 (but if the error callback returns a long replacement string
2751 we'll have to allocate more space) */
2752 v = _PyUnicode_New(size);
2753 if (v == NULL)
2754 goto onError;
2755 if (size == 0)
2756 return (PyObject *)v;
2758 p = PyUnicode_AS_UNICODE(v);
2759 end = s + size;
2761 while (s < end) {
2762 unsigned char c;
2763 Py_UNICODE x;
2764 int digits;
2766 /* Non-escape characters are interpreted as Unicode ordinals */
2767 if (*s != '\\') {
2768 *p++ = (unsigned char) *s++;
2769 continue;
2772 startinpos = s-starts;
2773 /* \ - Escapes */
2774 s++;
2775 c = *s++;
2776 if (s > end)
2777 c = '\0'; /* Invalid after \ */
2778 switch (c) {
2780 /* \x escapes */
2781 case '\n': break;
2782 case '\\': *p++ = '\\'; break;
2783 case '\'': *p++ = '\''; break;
2784 case '\"': *p++ = '\"'; break;
2785 case 'b': *p++ = '\b'; break;
2786 case 'f': *p++ = '\014'; break; /* FF */
2787 case 't': *p++ = '\t'; break;
2788 case 'n': *p++ = '\n'; break;
2789 case 'r': *p++ = '\r'; break;
2790 case 'v': *p++ = '\013'; break; /* VT */
2791 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2793 /* \OOO (octal) escapes */
2794 case '0': case '1': case '2': case '3':
2795 case '4': case '5': case '6': case '7':
2796 x = s[-1] - '0';
2797 if (s < end && '0' <= *s && *s <= '7') {
2798 x = (x<<3) + *s++ - '0';
2799 if (s < end && '0' <= *s && *s <= '7')
2800 x = (x<<3) + *s++ - '0';
2802 *p++ = x;
2803 break;
2805 /* hex escapes */
2806 /* \xXX */
2807 case 'x':
2808 digits = 2;
2809 message = "truncated \\xXX escape";
2810 goto hexescape;
2812 /* \uXXXX */
2813 case 'u':
2814 digits = 4;
2815 message = "truncated \\uXXXX escape";
2816 goto hexescape;
2818 /* \UXXXXXXXX */
2819 case 'U':
2820 digits = 8;
2821 message = "truncated \\UXXXXXXXX escape";
2822 hexescape:
2823 chr = 0;
2824 outpos = p-PyUnicode_AS_UNICODE(v);
2825 if (s+digits>end) {
2826 endinpos = size;
2827 if (unicode_decode_call_errorhandler(
2828 errors, &errorHandler,
2829 "unicodeescape", "end of string in escape sequence",
2830 starts, size, &startinpos, &endinpos, &exc, &s,
2831 &v, &outpos, &p))
2832 goto onError;
2833 goto nextByte;
2835 for (i = 0; i < digits; ++i) {
2836 c = (unsigned char) s[i];
2837 if (!isxdigit(c)) {
2838 endinpos = (s+i+1)-starts;
2839 if (unicode_decode_call_errorhandler(
2840 errors, &errorHandler,
2841 "unicodeescape", message,
2842 starts, size, &startinpos, &endinpos, &exc, &s,
2843 &v, &outpos, &p))
2844 goto onError;
2845 goto nextByte;
2847 chr = (chr<<4) & ~0xF;
2848 if (c >= '0' && c <= '9')
2849 chr += c - '0';
2850 else if (c >= 'a' && c <= 'f')
2851 chr += 10 + c - 'a';
2852 else
2853 chr += 10 + c - 'A';
2855 s += i;
2856 if (chr == 0xffffffff && PyErr_Occurred())
2857 /* _decoding_error will have already written into the
2858 target buffer. */
2859 break;
2860 store:
2861 /* when we get here, chr is a 32-bit unicode character */
2862 if (chr <= 0xffff)
2863 /* UCS-2 character */
2864 *p++ = (Py_UNICODE) chr;
2865 else if (chr <= 0x10ffff) {
2866 /* UCS-4 character. Either store directly, or as
2867 surrogate pair. */
2868 #ifdef Py_UNICODE_WIDE
2869 *p++ = chr;
2870 #else
2871 chr -= 0x10000L;
2872 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2873 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2874 #endif
2875 } else {
2876 endinpos = s-starts;
2877 outpos = p-PyUnicode_AS_UNICODE(v);
2878 if (unicode_decode_call_errorhandler(
2879 errors, &errorHandler,
2880 "unicodeescape", "illegal Unicode character",
2881 starts, size, &startinpos, &endinpos, &exc, &s,
2882 &v, &outpos, &p))
2883 goto onError;
2885 break;
2887 /* \N{name} */
2888 case 'N':
2889 message = "malformed \\N character escape";
2890 if (ucnhash_CAPI == NULL) {
2891 /* load the unicode data module */
2892 PyObject *m, *api;
2893 m = PyImport_ImportModuleNoBlock("unicodedata");
2894 if (m == NULL)
2895 goto ucnhashError;
2896 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2897 Py_DECREF(m);
2898 if (api == NULL)
2899 goto ucnhashError;
2900 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2901 Py_DECREF(api);
2902 if (ucnhash_CAPI == NULL)
2903 goto ucnhashError;
2905 if (*s == '{') {
2906 const char *start = s+1;
2907 /* look for the closing brace */
2908 while (*s != '}' && s < end)
2909 s++;
2910 if (s > start && s < end && *s == '}') {
2911 /* found a name. look it up in the unicode database */
2912 message = "unknown Unicode character name";
2913 s++;
2914 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2915 goto store;
2918 endinpos = s-starts;
2919 outpos = p-PyUnicode_AS_UNICODE(v);
2920 if (unicode_decode_call_errorhandler(
2921 errors, &errorHandler,
2922 "unicodeescape", message,
2923 starts, size, &startinpos, &endinpos, &exc, &s,
2924 &v, &outpos, &p))
2925 goto onError;
2926 break;
2928 default:
2929 if (s > end) {
2930 message = "\\ at end of string";
2931 s--;
2932 endinpos = s-starts;
2933 outpos = p-PyUnicode_AS_UNICODE(v);
2934 if (unicode_decode_call_errorhandler(
2935 errors, &errorHandler,
2936 "unicodeescape", message,
2937 starts, size, &startinpos, &endinpos, &exc, &s,
2938 &v, &outpos, &p))
2939 goto onError;
2941 else {
2942 *p++ = '\\';
2943 *p++ = (unsigned char)s[-1];
2945 break;
2947 nextByte:
2950 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2951 goto onError;
2952 Py_XDECREF(errorHandler);
2953 Py_XDECREF(exc);
2954 return (PyObject *)v;
2956 ucnhashError:
2957 PyErr_SetString(
2958 PyExc_UnicodeError,
2959 "\\N escapes not supported (can't load unicodedata module)"
2961 Py_XDECREF(v);
2962 Py_XDECREF(errorHandler);
2963 Py_XDECREF(exc);
2964 return NULL;
2966 onError:
2967 Py_XDECREF(v);
2968 Py_XDECREF(errorHandler);
2969 Py_XDECREF(exc);
2970 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/

2980 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2981 Py_ssize_t size,
2982 Py_UNICODE ch)
2984 /* like wcschr, but doesn't stop at NULL characters */
2986 while (size-- > 0) {
2987 if (*s == ch)
2988 return s;
2989 s++;
2992 return NULL;
2995 static
2996 PyObject *unicodeescape_string(const Py_UNICODE *s,
2997 Py_ssize_t size,
2998 int quotes)
3000 PyObject *repr;
3001 char *p;
3003 static const char *hexdigit = "0123456789abcdef";
3004 #ifdef Py_UNICODE_WIDE
3005 const Py_ssize_t expandsize = 10;
3006 #else
3007 const Py_ssize_t expandsize = 6;
3008 #endif
3010 /* XXX(nnorwitz): rather than over-allocating, it would be
3011 better to choose a different scheme. Perhaps scan the
3012 first N-chars of the string and allocate based on that size.
3014 /* Initial allocation is based on the longest-possible unichr
3015 escape.
3017 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3018 unichr, so in this case it's the longest unichr escape. In
3019 narrow (UTF-16) builds this is five chars per source unichr
3020 since there are two unichrs in the surrogate pair, so in narrow
3021 (UTF-16) builds it's not the longest unichr escape.
3023 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3024 so in the narrow (UTF-16) build case it's the longest unichr
3025 escape.
3028 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3029 return PyErr_NoMemory();
3031 repr = PyString_FromStringAndSize(NULL,
3033 + expandsize*size
3034 + 1);
3035 if (repr == NULL)
3036 return NULL;
3038 p = PyString_AS_STRING(repr);
3040 if (quotes) {
3041 *p++ = 'u';
3042 *p++ = (findchar(s, size, '\'') &&
3043 !findchar(s, size, '"')) ? '"' : '\'';
3045 while (size-- > 0) {
3046 Py_UNICODE ch = *s++;
3048 /* Escape quotes and backslashes */
3049 if ((quotes &&
3050 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3051 *p++ = '\\';
3052 *p++ = (char) ch;
3053 continue;
3056 #ifdef Py_UNICODE_WIDE
3057 /* Map 21-bit characters to '\U00xxxxxx' */
3058 else if (ch >= 0x10000) {
3059 *p++ = '\\';
3060 *p++ = 'U';
3061 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3062 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3063 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3064 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3065 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3066 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3067 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3068 *p++ = hexdigit[ch & 0x0000000F];
3069 continue;
3071 #else
3072 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3073 else if (ch >= 0xD800 && ch < 0xDC00) {
3074 Py_UNICODE ch2;
3075 Py_UCS4 ucs;
3077 ch2 = *s++;
3078 size--;
3079 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3080 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3081 *p++ = '\\';
3082 *p++ = 'U';
3083 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3084 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3085 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3086 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3087 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3088 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3089 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3090 *p++ = hexdigit[ucs & 0x0000000F];
3091 continue;
3093 /* Fall through: isolated surrogates are copied as-is */
3094 s--;
3095 size++;
3097 #endif
3099 /* Map 16-bit characters to '\uxxxx' */
3100 if (ch >= 256) {
3101 *p++ = '\\';
3102 *p++ = 'u';
3103 *p++ = hexdigit[(ch >> 12) & 0x000F];
3104 *p++ = hexdigit[(ch >> 8) & 0x000F];
3105 *p++ = hexdigit[(ch >> 4) & 0x000F];
3106 *p++ = hexdigit[ch & 0x000F];
3109 /* Map special whitespace to '\t', \n', '\r' */
3110 else if (ch == '\t') {
3111 *p++ = '\\';
3112 *p++ = 't';
3114 else if (ch == '\n') {
3115 *p++ = '\\';
3116 *p++ = 'n';
3118 else if (ch == '\r') {
3119 *p++ = '\\';
3120 *p++ = 'r';
3123 /* Map non-printable US ASCII to '\xhh' */
3124 else if (ch < ' ' || ch >= 0x7F) {
3125 *p++ = '\\';
3126 *p++ = 'x';
3127 *p++ = hexdigit[(ch >> 4) & 0x000F];
3128 *p++ = hexdigit[ch & 0x000F];
3131 /* Copy everything else as-is */
3132 else
3133 *p++ = (char) ch;
3135 if (quotes)
3136 *p++ = PyString_AS_STRING(repr)[1];
3138 *p = '\0';
3139 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3140 return repr;
3143 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3144 Py_ssize_t size)
3146 return unicodeescape_string(s, size, 0);
3149 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3151 if (!PyUnicode_Check(unicode)) {
3152 PyErr_BadArgument();
3153 return NULL;
3155 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3156 PyUnicode_GET_SIZE(unicode));
3159 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3161 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3162 Py_ssize_t size,
3163 const char *errors)
3165 const char *starts = s;
3166 Py_ssize_t startinpos;
3167 Py_ssize_t endinpos;
3168 Py_ssize_t outpos;
3169 PyUnicodeObject *v;
3170 Py_UNICODE *p;
3171 const char *end;
3172 const char *bs;
3173 PyObject *errorHandler = NULL;
3174 PyObject *exc = NULL;
3176 /* Escaped strings will always be longer than the resulting
3177 Unicode string, so we start with size here and then reduce the
3178 length after conversion to the true value. (But decoding error
3179 handler might have to resize the string) */
3180 v = _PyUnicode_New(size);
3181 if (v == NULL)
3182 goto onError;
3183 if (size == 0)
3184 return (PyObject *)v;
3185 p = PyUnicode_AS_UNICODE(v);
3186 end = s + size;
3187 while (s < end) {
3188 unsigned char c;
3189 Py_UCS4 x;
3190 int i;
3191 int count;
3193 /* Non-escape characters are interpreted as Unicode ordinals */
3194 if (*s != '\\') {
3195 *p++ = (unsigned char)*s++;
3196 continue;
3198 startinpos = s-starts;
3200 /* \u-escapes are only interpreted iff the number of leading
3201 backslashes if odd */
3202 bs = s;
3203 for (;s < end;) {
3204 if (*s != '\\')
3205 break;
3206 *p++ = (unsigned char)*s++;
3208 if (((s - bs) & 1) == 0 ||
3209 s >= end ||
3210 (*s != 'u' && *s != 'U')) {
3211 continue;
3213 p--;
3214 count = *s=='u' ? 4 : 8;
3215 s++;
3217 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3218 outpos = p-PyUnicode_AS_UNICODE(v);
3219 for (x = 0, i = 0; i < count; ++i, ++s) {
3220 c = (unsigned char)*s;
3221 if (!isxdigit(c)) {
3222 endinpos = s-starts;
3223 if (unicode_decode_call_errorhandler(
3224 errors, &errorHandler,
3225 "rawunicodeescape", "truncated \\uXXXX",
3226 starts, size, &startinpos, &endinpos, &exc, &s,
3227 &v, &outpos, &p))
3228 goto onError;
3229 goto nextByte;
3231 x = (x<<4) & ~0xF;
3232 if (c >= '0' && c <= '9')
3233 x += c - '0';
3234 else if (c >= 'a' && c <= 'f')
3235 x += 10 + c - 'a';
3236 else
3237 x += 10 + c - 'A';
3239 if (x <= 0xffff)
3240 /* UCS-2 character */
3241 *p++ = (Py_UNICODE) x;
3242 else if (x <= 0x10ffff) {
3243 /* UCS-4 character. Either store directly, or as
3244 surrogate pair. */
3245 #ifdef Py_UNICODE_WIDE
3246 *p++ = (Py_UNICODE) x;
3247 #else
3248 x -= 0x10000L;
3249 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3250 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3251 #endif
3252 } else {
3253 endinpos = s-starts;
3254 outpos = p-PyUnicode_AS_UNICODE(v);
3255 if (unicode_decode_call_errorhandler(
3256 errors, &errorHandler,
3257 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3258 starts, size, &startinpos, &endinpos, &exc, &s,
3259 &v, &outpos, &p))
3260 goto onError;
3262 nextByte:
3265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3266 goto onError;
3267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
3269 return (PyObject *)v;
3271 onError:
3272 Py_XDECREF(v);
3273 Py_XDECREF(errorHandler);
3274 Py_XDECREF(exc);
3275 return NULL;
3278 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3279 Py_ssize_t size)
3281 PyObject *repr;
3282 char *p;
3283 char *q;
3285 static const char *hexdigit = "0123456789abcdef";
3286 #ifdef Py_UNICODE_WIDE
3287 const Py_ssize_t expandsize = 10;
3288 #else
3289 const Py_ssize_t expandsize = 6;
3290 #endif
3292 if (size > PY_SSIZE_T_MAX / expandsize)
3293 return PyErr_NoMemory();
3295 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3296 if (repr == NULL)
3297 return NULL;
3298 if (size == 0)
3299 return repr;
3301 p = q = PyString_AS_STRING(repr);
3302 while (size-- > 0) {
3303 Py_UNICODE ch = *s++;
3304 #ifdef Py_UNICODE_WIDE
3305 /* Map 32-bit characters to '\Uxxxxxxxx' */
3306 if (ch >= 0x10000) {
3307 *p++ = '\\';
3308 *p++ = 'U';
3309 *p++ = hexdigit[(ch >> 28) & 0xf];
3310 *p++ = hexdigit[(ch >> 24) & 0xf];
3311 *p++ = hexdigit[(ch >> 20) & 0xf];
3312 *p++ = hexdigit[(ch >> 16) & 0xf];
3313 *p++ = hexdigit[(ch >> 12) & 0xf];
3314 *p++ = hexdigit[(ch >> 8) & 0xf];
3315 *p++ = hexdigit[(ch >> 4) & 0xf];
3316 *p++ = hexdigit[ch & 15];
3318 else
3319 #else
3320 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3321 if (ch >= 0xD800 && ch < 0xDC00) {
3322 Py_UNICODE ch2;
3323 Py_UCS4 ucs;
3325 ch2 = *s++;
3326 size--;
3327 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3328 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3329 *p++ = '\\';
3330 *p++ = 'U';
3331 *p++ = hexdigit[(ucs >> 28) & 0xf];
3332 *p++ = hexdigit[(ucs >> 24) & 0xf];
3333 *p++ = hexdigit[(ucs >> 20) & 0xf];
3334 *p++ = hexdigit[(ucs >> 16) & 0xf];
3335 *p++ = hexdigit[(ucs >> 12) & 0xf];
3336 *p++ = hexdigit[(ucs >> 8) & 0xf];
3337 *p++ = hexdigit[(ucs >> 4) & 0xf];
3338 *p++ = hexdigit[ucs & 0xf];
3339 continue;
3341 /* Fall through: isolated surrogates are copied as-is */
3342 s--;
3343 size++;
3345 #endif
3346 /* Map 16-bit characters to '\uxxxx' */
3347 if (ch >= 256) {
3348 *p++ = '\\';
3349 *p++ = 'u';
3350 *p++ = hexdigit[(ch >> 12) & 0xf];
3351 *p++ = hexdigit[(ch >> 8) & 0xf];
3352 *p++ = hexdigit[(ch >> 4) & 0xf];
3353 *p++ = hexdigit[ch & 15];
3355 /* Copy everything else as-is */
3356 else
3357 *p++ = (char) ch;
3359 *p = '\0';
3360 _PyString_Resize(&repr, p - q);
3361 return repr;
3364 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3366 if (!PyUnicode_Check(unicode)) {
3367 PyErr_BadArgument();
3368 return NULL;
3370 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3371 PyUnicode_GET_SIZE(unicode));
3374 /* --- Unicode Internal Codec ------------------------------------------- */
3376 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3377 Py_ssize_t size,
3378 const char *errors)
3380 const char *starts = s;
3381 Py_ssize_t startinpos;
3382 Py_ssize_t endinpos;
3383 Py_ssize_t outpos;
3384 PyUnicodeObject *v;
3385 Py_UNICODE *p;
3386 const char *end;
3387 const char *reason;
3388 PyObject *errorHandler = NULL;
3389 PyObject *exc = NULL;
3391 #ifdef Py_UNICODE_WIDE
3392 Py_UNICODE unimax = PyUnicode_GetMax();
3393 #endif
3395 /* XXX overflow detection missing */
3396 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3397 if (v == NULL)
3398 goto onError;
3399 if (PyUnicode_GetSize((PyObject *)v) == 0)
3400 return (PyObject *)v;
3401 p = PyUnicode_AS_UNICODE(v);
3402 end = s + size;
3404 while (s < end) {
3405 memcpy(p, s, sizeof(Py_UNICODE));
3406 /* We have to sanity check the raw data, otherwise doom looms for
3407 some malformed UCS-4 data. */
3408 if (
3409 #ifdef Py_UNICODE_WIDE
3410 *p > unimax || *p < 0 ||
3411 #endif
3412 end-s < Py_UNICODE_SIZE
3415 startinpos = s - starts;
3416 if (end-s < Py_UNICODE_SIZE) {
3417 endinpos = end-starts;
3418 reason = "truncated input";
3420 else {
3421 endinpos = s - starts + Py_UNICODE_SIZE;
3422 reason = "illegal code point (> 0x10FFFF)";
3424 outpos = p - PyUnicode_AS_UNICODE(v);
3425 if (unicode_decode_call_errorhandler(
3426 errors, &errorHandler,
3427 "unicode_internal", reason,
3428 starts, size, &startinpos, &endinpos, &exc, &s,
3429 &v, &outpos, &p)) {
3430 goto onError;
3433 else {
3434 p++;
3435 s += Py_UNICODE_SIZE;
3439 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3440 goto onError;
3441 Py_XDECREF(errorHandler);
3442 Py_XDECREF(exc);
3443 return (PyObject *)v;
3445 onError:
3446 Py_XDECREF(v);
3447 Py_XDECREF(errorHandler);
3448 Py_XDECREF(exc);
3449 return NULL;
3452 /* --- Latin-1 Codec ------------------------------------------------------ */
3454 PyObject *PyUnicode_DecodeLatin1(const char *s,
3455 Py_ssize_t size,
3456 const char *errors)
3458 PyUnicodeObject *v;
3459 Py_UNICODE *p;
3461 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3462 if (size == 1) {
3463 Py_UNICODE r = *(unsigned char*)s;
3464 return PyUnicode_FromUnicode(&r, 1);
3467 v = _PyUnicode_New(size);
3468 if (v == NULL)
3469 goto onError;
3470 if (size == 0)
3471 return (PyObject *)v;
3472 p = PyUnicode_AS_UNICODE(v);
3473 while (size-- > 0)
3474 *p++ = (unsigned char)*s++;
3475 return (PyObject *)v;
3477 onError:
3478 Py_XDECREF(v);
3479 return NULL;
3482 /* create or adjust a UnicodeEncodeError */
3483 static void make_encode_exception(PyObject **exceptionObject,
3484 const char *encoding,
3485 const Py_UNICODE *unicode, Py_ssize_t size,
3486 Py_ssize_t startpos, Py_ssize_t endpos,
3487 const char *reason)
3489 if (*exceptionObject == NULL) {
3490 *exceptionObject = PyUnicodeEncodeError_Create(
3491 encoding, unicode, size, startpos, endpos, reason);
3493 else {
3494 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3495 goto onError;
3496 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3497 goto onError;
3498 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3499 goto onError;
3500 return;
3501 onError:
3502 Py_DECREF(*exceptionObject);
3503 *exceptionObject = NULL;
3507 /* raises a UnicodeEncodeError */
3508 static void raise_encode_exception(PyObject **exceptionObject,
3509 const char *encoding,
3510 const Py_UNICODE *unicode, Py_ssize_t size,
3511 Py_ssize_t startpos, Py_ssize_t endpos,
3512 const char *reason)
3514 make_encode_exception(exceptionObject,
3515 encoding, unicode, size, startpos, endpos, reason);
3516 if (*exceptionObject != NULL)
3517 PyCodec_StrictErrors(*exceptionObject);
3520 /* error handling callback helper:
3521 build arguments, call the callback and check the arguments,
3522 put the result into newpos and return the replacement string, which
3523 has to be freed by the caller */
3524 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3525 PyObject **errorHandler,
3526 const char *encoding, const char *reason,
3527 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3528 Py_ssize_t startpos, Py_ssize_t endpos,
3529 Py_ssize_t *newpos)
3531 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3533 PyObject *restuple;
3534 PyObject *resunicode;
3536 if (*errorHandler == NULL) {
3537 *errorHandler = PyCodec_LookupError(errors);
3538 if (*errorHandler == NULL)
3539 return NULL;
3542 make_encode_exception(exceptionObject,
3543 encoding, unicode, size, startpos, endpos, reason);
3544 if (*exceptionObject == NULL)
3545 return NULL;
3547 restuple = PyObject_CallFunctionObjArgs(
3548 *errorHandler, *exceptionObject, NULL);
3549 if (restuple == NULL)
3550 return NULL;
3551 if (!PyTuple_Check(restuple)) {
3552 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3553 Py_DECREF(restuple);
3554 return NULL;
3556 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3557 &resunicode, newpos)) {
3558 Py_DECREF(restuple);
3559 return NULL;
3561 if (*newpos<0)
3562 *newpos = size+*newpos;
3563 if (*newpos<0 || *newpos>size) {
3564 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3565 Py_DECREF(restuple);
3566 return NULL;
3568 Py_INCREF(resunicode);
3569 Py_DECREF(restuple);
3570 return resunicode;
3573 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3574 Py_ssize_t size,
3575 const char *errors,
3576 int limit)
3578 /* output object */
3579 PyObject *res;
3580 /* pointers to the beginning and end+1 of input */
3581 const Py_UNICODE *startp = p;
3582 const Py_UNICODE *endp = p + size;
3583 /* pointer to the beginning of the unencodable characters */
3584 /* const Py_UNICODE *badp = NULL; */
3585 /* pointer into the output */
3586 char *str;
3587 /* current output position */
3588 Py_ssize_t respos = 0;
3589 Py_ssize_t ressize;
3590 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3591 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3592 PyObject *errorHandler = NULL;
3593 PyObject *exc = NULL;
3594 /* the following variable is used for caching string comparisons
3595 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3596 int known_errorHandler = -1;
3598 /* allocate enough for a simple encoding without
3599 replacements, if we need more, we'll resize */
3600 res = PyString_FromStringAndSize(NULL, size);
3601 if (res == NULL)
3602 goto onError;
3603 if (size == 0)
3604 return res;
3605 str = PyString_AS_STRING(res);
3606 ressize = size;
3608 while (p<endp) {
3609 Py_UNICODE c = *p;
3611 /* can we encode this? */
3612 if (c<limit) {
3613 /* no overflow check, because we know that the space is enough */
3614 *str++ = (char)c;
3615 ++p;
3617 else {
3618 Py_ssize_t unicodepos = p-startp;
3619 Py_ssize_t requiredsize;
3620 PyObject *repunicode;
3621 Py_ssize_t repsize;
3622 Py_ssize_t newpos;
3623 Py_ssize_t respos;
3624 Py_UNICODE *uni2;
3625 /* startpos for collecting unencodable chars */
3626 const Py_UNICODE *collstart = p;
3627 const Py_UNICODE *collend = p;
3628 /* find all unecodable characters */
3629 while ((collend < endp) && ((*collend)>=limit))
3630 ++collend;
3631 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3632 if (known_errorHandler==-1) {
3633 if ((errors==NULL) || (!strcmp(errors, "strict")))
3634 known_errorHandler = 1;
3635 else if (!strcmp(errors, "replace"))
3636 known_errorHandler = 2;
3637 else if (!strcmp(errors, "ignore"))
3638 known_errorHandler = 3;
3639 else if (!strcmp(errors, "xmlcharrefreplace"))
3640 known_errorHandler = 4;
3641 else
3642 known_errorHandler = 0;
3644 switch (known_errorHandler) {
3645 case 1: /* strict */
3646 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3647 goto onError;
3648 case 2: /* replace */
3649 while (collstart++<collend)
3650 *str++ = '?'; /* fall through */
3651 case 3: /* ignore */
3652 p = collend;
3653 break;
3654 case 4: /* xmlcharrefreplace */
3655 respos = str-PyString_AS_STRING(res);
3656 /* determine replacement size (temporarily (mis)uses p) */
3657 for (p = collstart, repsize = 0; p < collend; ++p) {
3658 if (*p<10)
3659 repsize += 2+1+1;
3660 else if (*p<100)
3661 repsize += 2+2+1;
3662 else if (*p<1000)
3663 repsize += 2+3+1;
3664 else if (*p<10000)
3665 repsize += 2+4+1;
3666 #ifndef Py_UNICODE_WIDE
3667 else
3668 repsize += 2+5+1;
3669 #else
3670 else if (*p<100000)
3671 repsize += 2+5+1;
3672 else if (*p<1000000)
3673 repsize += 2+6+1;
3674 else
3675 repsize += 2+7+1;
3676 #endif
3678 requiredsize = respos+repsize+(endp-collend);
3679 if (requiredsize > ressize) {
3680 if (requiredsize<2*ressize)
3681 requiredsize = 2*ressize;
3682 if (_PyString_Resize(&res, requiredsize))
3683 goto onError;
3684 str = PyString_AS_STRING(res) + respos;
3685 ressize = requiredsize;
3687 /* generate replacement (temporarily (mis)uses p) */
3688 for (p = collstart; p < collend; ++p) {
3689 str += sprintf(str, "&#%d;", (int)*p);
3691 p = collend;
3692 break;
3693 default:
3694 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3695 encoding, reason, startp, size, &exc,
3696 collstart-startp, collend-startp, &newpos);
3697 if (repunicode == NULL)
3698 goto onError;
3699 /* need more space? (at least enough for what we have+the
3700 replacement+the rest of the string, so we won't have to
3701 check space for encodable characters) */
3702 respos = str-PyString_AS_STRING(res);
3703 repsize = PyUnicode_GET_SIZE(repunicode);
3704 requiredsize = respos+repsize+(endp-collend);
3705 if (requiredsize > ressize) {
3706 if (requiredsize<2*ressize)
3707 requiredsize = 2*ressize;
3708 if (_PyString_Resize(&res, requiredsize)) {
3709 Py_DECREF(repunicode);
3710 goto onError;
3712 str = PyString_AS_STRING(res) + respos;
3713 ressize = requiredsize;
3715 /* check if there is anything unencodable in the replacement
3716 and copy it to the output */
3717 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3718 c = *uni2;
3719 if (c >= limit) {
3720 raise_encode_exception(&exc, encoding, startp, size,
3721 unicodepos, unicodepos+1, reason);
3722 Py_DECREF(repunicode);
3723 goto onError;
3725 *str = (char)c;
3727 p = startp + newpos;
3728 Py_DECREF(repunicode);
3732 /* Resize if we allocated too much */
3733 respos = str-PyString_AS_STRING(res);
3734 if (respos<ressize)
3735 /* If this fails, res will be NULL */
3736 _PyString_Resize(&res, respos);
3737 Py_XDECREF(errorHandler);
3738 Py_XDECREF(exc);
3739 return res;
3741 onError:
3742 Py_XDECREF(res);
3743 Py_XDECREF(errorHandler);
3744 Py_XDECREF(exc);
3745 return NULL;
3748 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3749 Py_ssize_t size,
3750 const char *errors)
3752 return unicode_encode_ucs1(p, size, errors, 256);
3755 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3757 if (!PyUnicode_Check(unicode)) {
3758 PyErr_BadArgument();
3759 return NULL;
3761 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3762 PyUnicode_GET_SIZE(unicode),
3763 NULL);
3766 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3768 PyObject *PyUnicode_DecodeASCII(const char *s,
3769 Py_ssize_t size,
3770 const char *errors)
3772 const char *starts = s;
3773 PyUnicodeObject *v;
3774 Py_UNICODE *p;
3775 Py_ssize_t startinpos;
3776 Py_ssize_t endinpos;
3777 Py_ssize_t outpos;
3778 const char *e;
3779 PyObject *errorHandler = NULL;
3780 PyObject *exc = NULL;
3782 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3783 if (size == 1 && *(unsigned char*)s < 128) {
3784 Py_UNICODE r = *(unsigned char*)s;
3785 return PyUnicode_FromUnicode(&r, 1);
3788 v = _PyUnicode_New(size);
3789 if (v == NULL)
3790 goto onError;
3791 if (size == 0)
3792 return (PyObject *)v;
3793 p = PyUnicode_AS_UNICODE(v);
3794 e = s + size;
3795 while (s < e) {
3796 register unsigned char c = (unsigned char)*s;
3797 if (c < 128) {
3798 *p++ = c;
3799 ++s;
3801 else {
3802 startinpos = s-starts;
3803 endinpos = startinpos + 1;
3804 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3805 if (unicode_decode_call_errorhandler(
3806 errors, &errorHandler,
3807 "ascii", "ordinal not in range(128)",
3808 starts, size, &startinpos, &endinpos, &exc, &s,
3809 &v, &outpos, &p))
3810 goto onError;
3813 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3814 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3815 goto onError;
3816 Py_XDECREF(errorHandler);
3817 Py_XDECREF(exc);
3818 return (PyObject *)v;
3820 onError:
3821 Py_XDECREF(v);
3822 Py_XDECREF(errorHandler);
3823 Py_XDECREF(exc);
3824 return NULL;
3827 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3828 Py_ssize_t size,
3829 const char *errors)
3831 return unicode_encode_ucs1(p, size, errors, 128);
3834 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3836 if (!PyUnicode_Check(unicode)) {
3837 PyErr_BadArgument();
3838 return NULL;
3840 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3841 PyUnicode_GET_SIZE(unicode),
3842 NULL);
3845 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3847 /* --- MBCS codecs for Windows -------------------------------------------- */
3849 #if SIZEOF_INT < SIZEOF_SIZE_T
3850 #define NEED_RETRY
3851 #endif
3853 /* XXX This code is limited to "true" double-byte encodings, as
3854 a) it assumes an incomplete character consists of a single byte, and
3855 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3856 encodings, see IsDBCSLeadByteEx documentation. */
/* Return 1 if IsDBCSLeadByte() accepts the byte at s[offset] and one of
   the following holds: CharPrev() cannot step back from it, the position
   CharPrev() returns is not itself a lead byte, or that position is
   exactly two bytes earlier.  Return 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
3870 * Decode MBCS string into unicode object. If 'final' is set, converts
3871 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3873 static int decode_mbcs(PyUnicodeObject **v,
3874 const char *s, /* MBCS string */
3875 int size, /* sizeof MBCS string */
3876 int final)
3878 Py_UNICODE *p;
3879 Py_ssize_t n = 0;
3880 int usize = 0;
3882 assert(size >= 0);
3884 /* Skip trailing lead-byte unless 'final' is set */
3885 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3886 --size;
3888 /* First get the size of the result */
3889 if (size > 0) {
3890 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3891 if (usize == 0) {
3892 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3893 return -1;
3897 if (*v == NULL) {
3898 /* Create unicode object */
3899 *v = _PyUnicode_New(usize);
3900 if (*v == NULL)
3901 return -1;
3903 else {
3904 /* Extend unicode object */
3905 n = PyUnicode_GET_SIZE(*v);
3906 if (_PyUnicode_Resize(v, n + usize) < 0)
3907 return -1;
3910 /* Do the conversion */
3911 if (size > 0) {
3912 p = PyUnicode_AS_UNICODE(*v) + n;
3913 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3914 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3915 return -1;
3919 return size;
3922 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3923 Py_ssize_t size,
3924 const char *errors,
3925 Py_ssize_t *consumed)
3927 PyUnicodeObject *v = NULL;
3928 int done;
3930 if (consumed)
3931 *consumed = 0;
3933 #ifdef NEED_RETRY
3934 retry:
3935 if (size > INT_MAX)
3936 done = decode_mbcs(&v, s, INT_MAX, 0);
3937 else
3938 #endif
3939 done = decode_mbcs(&v, s, (int)size, !consumed);
3941 if (done < 0) {
3942 Py_XDECREF(v);
3943 return NULL;
3946 if (consumed)
3947 *consumed += done;
3949 #ifdef NEED_RETRY
3950 if (size > INT_MAX) {
3951 s += done;
3952 size -= done;
3953 goto retry;
3955 #endif
3957 return (PyObject *)v;
3960 PyObject *PyUnicode_DecodeMBCS(const char *s,
3961 Py_ssize_t size,
3962 const char *errors)
3964 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3968 * Convert unicode into string object (MBCS).
3969 * Returns 0 if succeed, -1 otherwise.
3971 static int encode_mbcs(PyObject **repr,
3972 const Py_UNICODE *p, /* unicode */
3973 int size) /* size of unicode */
3975 int mbcssize = 0;
3976 Py_ssize_t n = 0;
3978 assert(size >= 0);
3980 /* First get the size of the result */
3981 if (size > 0) {
3982 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3983 if (mbcssize == 0) {
3984 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3985 return -1;
3989 if (*repr == NULL) {
3990 /* Create string object */
3991 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3992 if (*repr == NULL)
3993 return -1;
3995 else {
3996 /* Extend string object */
3997 n = PyString_Size(*repr);
3998 if (_PyString_Resize(repr, n + mbcssize) < 0)
3999 return -1;
4002 /* Do the conversion */
4003 if (size > 0) {
4004 char *s = PyString_AS_STRING(*repr) + n;
4005 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4006 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4007 return -1;
4011 return 0;
4014 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4015 Py_ssize_t size,
4016 const char *errors)
4018 PyObject *repr = NULL;
4019 int ret;
4021 #ifdef NEED_RETRY
4022 retry:
4023 if (size > INT_MAX)
4024 ret = encode_mbcs(&repr, p, INT_MAX);
4025 else
4026 #endif
4027 ret = encode_mbcs(&repr, p, (int)size);
4029 if (ret < 0) {
4030 Py_XDECREF(repr);
4031 return NULL;
4034 #ifdef NEED_RETRY
4035 if (size > INT_MAX) {
4036 p += INT_MAX;
4037 size -= INT_MAX;
4038 goto retry;
4040 #endif
4042 return repr;
4045 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4047 if (!PyUnicode_Check(unicode)) {
4048 PyErr_BadArgument();
4049 return NULL;
4051 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4052 PyUnicode_GET_SIZE(unicode),
4053 NULL);
4056 #undef NEED_RETRY
4058 #endif /* MS_WINDOWS */
4060 /* --- Character Mapping Codec -------------------------------------------- */
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL        -> plain Latin-1 decoding.
   mapping is exactly a
   unicode object         -> each byte indexes the string; an index past
                             its end, or the value 0xfffe, is "undefined".
   any other mapping      -> mapping[int(byte)] is looked up; a LookupError
                             is treated like None.  The value may be
                               an int in range(65536): that code unit,
                               None: "undefined",
                               a unicode string: zero, one or several
                                 characters,
                             anything else raises TypeError.

   "Undefined" bytes are passed to unicode_decode_call_errorhandler()
   with the reason "character maps to <undefined>".

   Returns a new reference, or NULL on failure. */
4062 PyObject *PyUnicode_DecodeCharmap(const char *s,
4063 Py_ssize_t size,
4064 PyObject *mapping,
4065 const char *errors)
4067 const char *starts = s;
4068 Py_ssize_t startinpos;
4069 Py_ssize_t endinpos;
4070 Py_ssize_t outpos;
4071 const char *e;
4072 PyUnicodeObject *v;
4073 Py_UNICODE *p;
/* extrachars: spare output slots beyond one-per-remaining-input-byte,
   accumulated when a 1-n mapping forces the result to grow */
4074 Py_ssize_t extrachars = 0;
4075 PyObject *errorHandler = NULL;
4076 PyObject *exc = NULL;
4077 Py_UNICODE *mapstring = NULL;
4078 Py_ssize_t maplen = 0;
4080 /* Default to Latin-1 */
4081 if (mapping == NULL)
4082 return PyUnicode_DecodeLatin1(s, size, errors);
4084 v = _PyUnicode_New(size);
4085 if (v == NULL)
4086 goto onError;
4087 if (size == 0)
4088 return (PyObject *)v;
4089 p = PyUnicode_AS_UNICODE(v);
4090 e = s + size;
/* Fast path: the mapping is a plain unicode string used as a table. */
4091 if (PyUnicode_CheckExact(mapping)) {
4092 mapstring = PyUnicode_AS_UNICODE(mapping);
4093 maplen = PyUnicode_GET_SIZE(mapping);
4094 while (s < e) {
4095 unsigned char ch = *s;
4096 Py_UNICODE x = 0xfffe; /* illegal value */
4098 if (ch < maplen)
4099 x = mapstring[ch];
4101 if (x == 0xfffe) {
4102 /* undefined mapping */
4103 outpos = p-PyUnicode_AS_UNICODE(v);
4104 startinpos = s-starts;
4105 endinpos = startinpos+1;
4106 if (unicode_decode_call_errorhandler(
4107 errors, &errorHandler,
4108 "charmap", "character maps to <undefined>",
4109 starts, size, &startinpos, &endinpos, &exc, &s,
4110 &v, &outpos, &p)) {
4111 goto onError;
/* s is not advanced here: the handler was given &s and positions it */
4113 continue;
4115 *p++ = x;
4116 ++s;
/* General path: look each byte up with PyObject_GetItem(). */
4119 else {
4120 while (s < e) {
4121 unsigned char ch = *s;
4122 PyObject *w, *x;
4124 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4125 w = PyInt_FromLong((long)ch);
4126 if (w == NULL)
4127 goto onError;
4128 x = PyObject_GetItem(mapping, w);
4129 Py_DECREF(w);
4130 if (x == NULL) {
4131 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4132 /* No mapping found means: mapping is undefined. */
4133 PyErr_Clear();
4134 x = Py_None;
4135 Py_INCREF(x);
4136 } else
4137 goto onError;
4140 /* Apply mapping */
4141 if (PyInt_Check(x)) {
4142 long value = PyInt_AS_LONG(x);
4143 if (value < 0 || value > 65535) {
4144 PyErr_SetString(PyExc_TypeError,
4145 "character mapping must be in range(65536)");
4146 Py_DECREF(x);
4147 goto onError;
4149 *p++ = (Py_UNICODE)value;
4151 else if (x == Py_None) {
4152 /* undefined mapping */
4153 outpos = p-PyUnicode_AS_UNICODE(v);
4154 startinpos = s-starts;
4155 endinpos = startinpos+1;
4156 if (unicode_decode_call_errorhandler(
4157 errors, &errorHandler,
4158 "charmap", "character maps to <undefined>",
4159 starts, size, &startinpos, &endinpos, &exc, &s,
4160 &v, &outpos, &p)) {
4161 Py_DECREF(x);
4162 goto onError;
4164 Py_DECREF(x);
4165 continue;
4167 else if (PyUnicode_Check(x)) {
4168 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4170 if (targetsize == 1)
4171 /* 1-1 mapping */
4172 *p++ = *PyUnicode_AS_UNICODE(x);
4174 else if (targetsize > 1) {
4175 /* 1-n mapping */
4176 if (targetsize > extrachars) {
4177 /* resize first */
/* oldpos is saved as an offset and p is recomputed from it after the resize */
4178 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4179 Py_ssize_t needed = (targetsize - extrachars) + \
4180 (targetsize << 2);
4181 extrachars += needed;
4182 /* XXX overflow detection missing */
4183 if (_PyUnicode_Resize(&v,
4184 PyUnicode_GET_SIZE(v) + needed) < 0) {
4185 Py_DECREF(x);
4186 goto onError;
4188 p = PyUnicode_AS_UNICODE(v) + oldpos;
4190 Py_UNICODE_COPY(p,
4191 PyUnicode_AS_UNICODE(x),
4192 targetsize);
4193 p += targetsize;
4194 extrachars -= targetsize;
4196 /* 1-0 mapping: skip the character */
4198 else {
4199 /* wrong return value */
4200 PyErr_SetString(PyExc_TypeError,
4201 "character mapping must return integer, None or unicode");
4202 Py_DECREF(x);
4203 goto onError;
4205 Py_DECREF(x);
4206 ++s;
/* Trim the result to the number of characters actually written. */
4209 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4210 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4211 goto onError;
4212 Py_XDECREF(errorHandler);
4213 Py_XDECREF(exc);
4214 return (PyObject *)v;
4216 onError:
4217 Py_XDECREF(errorHandler);
4218 Py_XDECREF(exc);
4219 Py_XDECREF(v);
4220 return NULL;
4223 /* Charmap encoding: the lookup table */
4225 struct encoding_map{
4226 PyObject_HEAD
4227 unsigned char level1[32];
4228 int count2, count3;
4229 unsigned char level23[1];
4232 static PyObject*
4233 encoding_map_size(PyObject *obj, PyObject* args)
4235 struct encoding_map *map = (struct encoding_map*)obj;
4236 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4237 128*map->count3);
4240 static PyMethodDef encoding_map_methods[] = {
4241 {"size", encoding_map_size, METH_NOARGS,
4242 PyDoc_STR("Return the size (in bytes) of this object") },
4243 { 0 }
4246 static void
4247 encoding_map_dealloc(PyObject* o)
4249 PyObject_FREE(o);
/* Type object for the lookup table that PyUnicode_BuildEncodingMap()
   initialises with PyObject_Init().  Only tp_name, tp_basicsize,
   tp_dealloc, tp_flags and tp_methods are filled in; every other slot
   is left as 0. */
4252 static PyTypeObject EncodingMapType = {
4253 PyVarObject_HEAD_INIT(NULL, 0)
4254 "EncodingMap", /*tp_name*/
4255 sizeof(struct encoding_map), /*tp_basicsize*/
4256 0, /*tp_itemsize*/
4257 /* methods */
4258 encoding_map_dealloc, /*tp_dealloc*/
4259 0, /*tp_print*/
4260 0, /*tp_getattr*/
4261 0, /*tp_setattr*/
4262 0, /*tp_compare*/
4263 0, /*tp_repr*/
4264 0, /*tp_as_number*/
4265 0, /*tp_as_sequence*/
4266 0, /*tp_as_mapping*/
4267 0, /*tp_hash*/
4268 0, /*tp_call*/
4269 0, /*tp_str*/
4270 0, /*tp_getattro*/
4271 0, /*tp_setattro*/
4272 0, /*tp_as_buffer*/
4273 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4274 0, /*tp_doc*/
4275 0, /*tp_traverse*/
4276 0, /*tp_clear*/
4277 0, /*tp_richcompare*/
4278 0, /*tp_weaklistoffset*/
4279 0, /*tp_iter*/
4280 0, /*tp_iternext*/
4281 encoding_map_methods, /*tp_methods*/
4282 0, /*tp_members*/
4283 0, /*tp_getset*/
4284 0, /*tp_base*/
4285 0, /*tp_dict*/
4286 0, /*tp_descr_get*/
4287 0, /*tp_descr_set*/
4288 0, /*tp_dictoffset*/
4289 0, /*tp_init*/
4290 0, /*tp_alloc*/
4291 0, /*tp_new*/
4292 0, /*tp_free*/
4293 0, /*tp_is_gc*/
/* Build an encoding lookup object from a 256-character decoding table.

   `string` must be a unicode object of length 256 (otherwise
   PyErr_BadArgument() is raised); string[i] is the character that byte i
   decodes to, with 0xFFFE meaning "byte i is unmapped".

   Returns either
     - a dict mapping each code point to its byte value, when the table
       cannot be represented by the trie (see need_dict below), or
     - an EncodingMap object holding a three-level trie.
   Returns NULL on failure. */
4296 PyObject*
4297 PyUnicode_BuildEncodingMap(PyObject* string)
4299 Py_UNICODE *decode;
4300 PyObject *result;
4301 struct encoding_map *mresult;
4302 int i;
4303 int need_dict = 0;
/* level1 / level2: scratch tables recording which level-2 and level-3
   blocks are in use; 0xFF means "not allocated yet" */
4304 unsigned char level1[32];
4305 unsigned char level2[512];
4306 unsigned char *mlevel1, *mlevel2, *mlevel3;
4307 int count2 = 0, count3 = 0;
4309 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4310 PyErr_BadArgument();
4311 return NULL;
4313 decode = PyUnicode_AS_UNICODE(string);
4314 memset(level1, 0xFF, sizeof level1);
4315 memset(level2, 0xFF, sizeof level2);
4317 /* If there isn't a one-to-one mapping of NULL to \0,
4318 or if there are non-BMP characters, we need to use
4319 a mapping dictionary. */
4320 if (decode[0] != 0)
4321 need_dict = 1;
/* First pass: count how many level-2 and level-3 blocks are needed. */
4322 for (i = 1; i < 256; i++) {
4323 int l1, l2;
4324 if (decode[i] == 0
4325 #ifdef Py_UNICODE_WIDE
4326 || decode[i] > 0xFFFF
4327 #endif
4329 need_dict = 1;
4330 break;
4332 if (decode[i] == 0xFFFE)
4333 /* unmapped character */
4334 continue;
4335 l1 = decode[i] >> 11;
4336 l2 = decode[i] >> 7;
4337 if (level1[l1] == 0xFF)
4338 level1[l1] = count2++;
4339 if (level2[l2] == 0xFF)
4340 level2[l2] = count3++;
/* Block numbers are stored in unsigned chars and 0xFF is the "unused"
   marker, so the trie cannot hold 0xFF or more blocks at either level. */
4343 if (count2 >= 0xFF || count3 >= 0xFF)
4344 need_dict = 1;
4346 if (need_dict) {
4347 PyObject *result = PyDict_New();
4348 PyObject *key, *value;
4349 if (!result)
4350 return NULL;
4351 for (i = 0; i < 256; i++) {
4352 key = value = NULL;
4353 key = PyInt_FromLong(decode[i]);
4354 value = PyInt_FromLong(i);
4355 if (!key || !value)
4356 goto failed1;
4357 if (PyDict_SetItem(result, key, value) == -1)
4358 goto failed1;
4359 Py_DECREF(key);
4360 Py_DECREF(value);
4362 return result;
4363 failed1:
4364 Py_XDECREF(key);
4365 Py_XDECREF(value);
4366 Py_DECREF(result);
4367 return NULL;
4370 /* Create a three-level trie */
/* The allocation covers the struct plus 16 bytes per level-2 block and
   128 bytes per level-3 block; the -1 accounts for the one-element
   level23 placeholder already inside the struct. */
4371 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4372 16*count2 + 128*count3 - 1);
4373 if (!result)
4374 return PyErr_NoMemory();
4375 PyObject_Init(result, &EncodingMapType);
4376 mresult = (struct encoding_map*)result;
4377 mresult->count2 = count2;
4378 mresult->count3 = count3;
4379 mlevel1 = mresult->level1;
4380 mlevel2 = mresult->level23;
4381 mlevel3 = mresult->level23 + 16*count2;
4382 memcpy(mlevel1, level1, 32);
4383 memset(mlevel2, 0xFF, 16*count2);
4384 memset(mlevel3, 0, 128*count3);
/* Second pass: count3 is reused as the next free level-3 block number
   while the level-2 and level-3 tables are filled in. */
4385 count3 = 0;
4386 for (i = 1; i < 256; i++) {
4387 int o1, o2, o3, i2, i3;
4388 if (decode[i] == 0xFFFE)
4389 /* unmapped character */
4390 continue;
4391 o1 = decode[i]>>11;
4392 o2 = (decode[i]>>7) & 0xF;
4393 i2 = 16*mlevel1[o1] + o2;
4394 if (mlevel2[i2] == 0xFF)
4395 mlevel2[i2] = count3++;
4396 o3 = decode[i] & 0x7F;
4397 i3 = 128*mlevel2[i2] + o3;
4398 mlevel3[i3] = i;
4400 return result;
4403 static int
4404 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4406 struct encoding_map *map = (struct encoding_map*)mapping;
4407 int l1 = c>>11;
4408 int l2 = (c>>7) & 0xF;
4409 int l3 = c & 0x7F;
4410 int i;
4412 #ifdef Py_UNICODE_WIDE
4413 if (c > 0xFFFF) {
4414 return -1;
4416 #endif
4417 if (c == 0)
4418 return 0;
4419 /* level 1*/
4420 i = map->level1[l1];
4421 if (i == 0xFF) {
4422 return -1;
4424 /* level 2*/
4425 i = map->level23[16*i+l2];
4426 if (i == 0xFF) {
4427 return -1;
4429 /* level 3 */
4430 i = map->level23[16*map->count2 + 128*i + l3];
4431 if (i == 0) {
4432 return -1;
4434 return i;
4437 /* Lookup the character ch in the mapping. If the character
4438 can't be found, Py_None is returned (or NULL, if another
4439 error occurred). */
4440 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4442 PyObject *w = PyInt_FromLong((long)c);
4443 PyObject *x;
4445 if (w == NULL)
4446 return NULL;
4447 x = PyObject_GetItem(mapping, w);
4448 Py_DECREF(w);
4449 if (x == NULL) {
4450 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4451 /* No mapping found means: mapping is undefined. */
4452 PyErr_Clear();
4453 x = Py_None;
4454 Py_INCREF(x);
4455 return x;
4456 } else
4457 return NULL;
4459 else if (x == Py_None)
4460 return x;
4461 else if (PyInt_Check(x)) {
4462 long value = PyInt_AS_LONG(x);
4463 if (value < 0 || value > 255) {
4464 PyErr_SetString(PyExc_TypeError,
4465 "character mapping must be in range(256)");
4466 Py_DECREF(x);
4467 return NULL;
4469 return x;
4471 else if (PyString_Check(x))
4472 return x;
4473 else {
4474 /* wrong return value */
4475 PyErr_SetString(PyExc_TypeError,
4476 "character mapping must return integer, None or str");
4477 Py_DECREF(x);
4478 return NULL;
4482 static int
4483 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4485 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4486 /* exponentially overallocate to minimize reallocations */
4487 if (requiredsize < 2*outsize)
4488 requiredsize = 2*outsize;
4489 if (_PyString_Resize(outobj, requiredsize)) {
4490 return 0;
4492 return 1;
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
4498 /* lookup the character, put the result in the output string and adjust
4499 various state variables. Reallocate the output string if not enough
4500 space is available. Return a new reference to the object that
4501 was put in the output buffer, or Py_None, if the mapping was undefined
4502 (in which case no character was written) or NULL, if a
4503 reallocation error occurred. The caller must decref the result */
4504 static
4505 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4506 PyObject **outobj, Py_ssize_t *outpos)
4508 PyObject *rep;
4509 char *outstart;
4510 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4512 if (Py_TYPE(mapping) == &EncodingMapType) {
4513 int res = encoding_map_lookup(c, mapping);
4514 Py_ssize_t requiredsize = *outpos+1;
4515 if (res == -1)
4516 return enc_FAILED;
4517 if (outsize<requiredsize)
4518 if (!charmapencode_resize(outobj, outpos, requiredsize))
4519 return enc_EXCEPTION;
4520 outstart = PyString_AS_STRING(*outobj);
4521 outstart[(*outpos)++] = (char)res;
4522 return enc_SUCCESS;
4525 rep = charmapencode_lookup(c, mapping);
4526 if (rep==NULL)
4527 return enc_EXCEPTION;
4528 else if (rep==Py_None) {
4529 Py_DECREF(rep);
4530 return enc_FAILED;
4531 } else {
4532 if (PyInt_Check(rep)) {
4533 Py_ssize_t requiredsize = *outpos+1;
4534 if (outsize<requiredsize)
4535 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4536 Py_DECREF(rep);
4537 return enc_EXCEPTION;
4539 outstart = PyString_AS_STRING(*outobj);
4540 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4542 else {
4543 const char *repchars = PyString_AS_STRING(rep);
4544 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4545 Py_ssize_t requiredsize = *outpos+repsize;
4546 if (outsize<requiredsize)
4547 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4548 Py_DECREF(rep);
4549 return enc_EXCEPTION;
4551 outstart = PyString_AS_STRING(*outobj);
4552 memcpy(outstart + *outpos, repchars, repsize);
4553 *outpos += repsize;
4556 Py_DECREF(rep);
4557 return enc_SUCCESS;
4560 /* handle an error in PyUnicode_EncodeCharmap
4561 Return 0 on success, -1 on error */
4562 static
4563 int charmap_encoding_error(
4564 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4565 PyObject **exceptionObject,
4566 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4567 PyObject **res, Py_ssize_t *respos)
4569 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4570 Py_ssize_t repsize;
4571 Py_ssize_t newpos;
4572 Py_UNICODE *uni2;
4573 /* startpos for collecting unencodable chars */
4574 Py_ssize_t collstartpos = *inpos;
4575 Py_ssize_t collendpos = *inpos+1;
4576 Py_ssize_t collpos;
4577 char *encoding = "charmap";
4578 char *reason = "character maps to <undefined>";
4579 charmapencode_result x;
4581 /* find all unencodable characters */
4582 while (collendpos < size) {
4583 PyObject *rep;
4584 if (Py_TYPE(mapping) == &EncodingMapType) {
4585 int res = encoding_map_lookup(p[collendpos], mapping);
4586 if (res != -1)
4587 break;
4588 ++collendpos;
4589 continue;
4592 rep = charmapencode_lookup(p[collendpos], mapping);
4593 if (rep==NULL)
4594 return -1;
4595 else if (rep!=Py_None) {
4596 Py_DECREF(rep);
4597 break;
4599 Py_DECREF(rep);
4600 ++collendpos;
4602 /* cache callback name lookup
4603 * (if not done yet, i.e. it's the first error) */
4604 if (*known_errorHandler==-1) {
4605 if ((errors==NULL) || (!strcmp(errors, "strict")))
4606 *known_errorHandler = 1;
4607 else if (!strcmp(errors, "replace"))
4608 *known_errorHandler = 2;
4609 else if (!strcmp(errors, "ignore"))
4610 *known_errorHandler = 3;
4611 else if (!strcmp(errors, "xmlcharrefreplace"))
4612 *known_errorHandler = 4;
4613 else
4614 *known_errorHandler = 0;
4616 switch (*known_errorHandler) {
4617 case 1: /* strict */
4618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619 return -1;
4620 case 2: /* replace */
4621 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4622 x = charmapencode_output('?', mapping, res, respos);
4623 if (x==enc_EXCEPTION) {
4624 return -1;
4626 else if (x==enc_FAILED) {
4627 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4628 return -1;
4631 /* fall through */
4632 case 3: /* ignore */
4633 *inpos = collendpos;
4634 break;
4635 case 4: /* xmlcharrefreplace */
4636 /* generate replacement (temporarily (mis)uses p) */
4637 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4638 char buffer[2+29+1+1];
4639 char *cp;
4640 sprintf(buffer, "&#%d;", (int)p[collpos]);
4641 for (cp = buffer; *cp; ++cp) {
4642 x = charmapencode_output(*cp, mapping, res, respos);
4643 if (x==enc_EXCEPTION)
4644 return -1;
4645 else if (x==enc_FAILED) {
4646 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4647 return -1;
4651 *inpos = collendpos;
4652 break;
4653 default:
4654 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4655 encoding, reason, p, size, exceptionObject,
4656 collstartpos, collendpos, &newpos);
4657 if (repunicode == NULL)
4658 return -1;
4659 /* generate replacement */
4660 repsize = PyUnicode_GET_SIZE(repunicode);
4661 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4662 x = charmapencode_output(*uni2, mapping, res, respos);
4663 if (x==enc_EXCEPTION) {
4664 return -1;
4666 else if (x==enc_FAILED) {
4667 Py_DECREF(repunicode);
4668 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4669 return -1;
4672 *inpos = newpos;
4673 Py_DECREF(repunicode);
4675 return 0;
4678 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4679 Py_ssize_t size,
4680 PyObject *mapping,
4681 const char *errors)
4683 /* output object */
4684 PyObject *res = NULL;
4685 /* current input position */
4686 Py_ssize_t inpos = 0;
4687 /* current output position */
4688 Py_ssize_t respos = 0;
4689 PyObject *errorHandler = NULL;
4690 PyObject *exc = NULL;
4691 /* the following variable is used for caching string comparisons
4692 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4693 * 3=ignore, 4=xmlcharrefreplace */
4694 int known_errorHandler = -1;
4696 /* Default to Latin-1 */
4697 if (mapping == NULL)
4698 return PyUnicode_EncodeLatin1(p, size, errors);
4700 /* allocate enough for a simple encoding without
4701 replacements, if we need more, we'll resize */
4702 res = PyString_FromStringAndSize(NULL, size);
4703 if (res == NULL)
4704 goto onError;
4705 if (size == 0)
4706 return res;
4708 while (inpos<size) {
4709 /* try to encode it */
4710 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4711 if (x==enc_EXCEPTION) /* error */
4712 goto onError;
4713 if (x==enc_FAILED) { /* unencodable character */
4714 if (charmap_encoding_error(p, size, &inpos, mapping,
4715 &exc,
4716 &known_errorHandler, &errorHandler, errors,
4717 &res, &respos)) {
4718 goto onError;
4721 else
4722 /* done with this character => adjust input position */
4723 ++inpos;
4726 /* Resize if we allocated to much */
4727 if (respos<PyString_GET_SIZE(res)) {
4728 if (_PyString_Resize(&res, respos))
4729 goto onError;
4731 Py_XDECREF(exc);
4732 Py_XDECREF(errorHandler);
4733 return res;
4735 onError:
4736 Py_XDECREF(res);
4737 Py_XDECREF(exc);
4738 Py_XDECREF(errorHandler);
4739 return NULL;
4742 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4743 PyObject *mapping)
4745 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4746 PyErr_BadArgument();
4747 return NULL;
4749 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4750 PyUnicode_GET_SIZE(unicode),
4751 mapping,
4752 NULL);
4755 /* create or adjust a UnicodeTranslateError */
4756 static void make_translate_exception(PyObject **exceptionObject,
4757 const Py_UNICODE *unicode, Py_ssize_t size,
4758 Py_ssize_t startpos, Py_ssize_t endpos,
4759 const char *reason)
4761 if (*exceptionObject == NULL) {
4762 *exceptionObject = PyUnicodeTranslateError_Create(
4763 unicode, size, startpos, endpos, reason);
4765 else {
4766 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4767 goto onError;
4768 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4769 goto onError;
4770 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4771 goto onError;
4772 return;
4773 onError:
4774 Py_DECREF(*exceptionObject);
4775 *exceptionObject = NULL;
4779 /* raises a UnicodeTranslateError */
4780 static void raise_translate_exception(PyObject **exceptionObject,
4781 const Py_UNICODE *unicode, Py_ssize_t size,
4782 Py_ssize_t startpos, Py_ssize_t endpos,
4783 const char *reason)
4785 make_translate_exception(exceptionObject,
4786 unicode, size, startpos, endpos, reason);
4787 if (*exceptionObject != NULL)
4788 PyCodec_StrictErrors(*exceptionObject);
4791 /* error handling callback helper:
4792 build arguments, call the callback and check the arguments,
4793 put the result into newpos and return the replacement string, which
4794 has to be freed by the caller */
4795 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4796 PyObject **errorHandler,
4797 const char *reason,
4798 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4799 Py_ssize_t startpos, Py_ssize_t endpos,
4800 Py_ssize_t *newpos)
4802 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4804 Py_ssize_t i_newpos;
4805 PyObject *restuple;
4806 PyObject *resunicode;
4808 if (*errorHandler == NULL) {
4809 *errorHandler = PyCodec_LookupError(errors);
4810 if (*errorHandler == NULL)
4811 return NULL;
4814 make_translate_exception(exceptionObject,
4815 unicode, size, startpos, endpos, reason);
4816 if (*exceptionObject == NULL)
4817 return NULL;
4819 restuple = PyObject_CallFunctionObjArgs(
4820 *errorHandler, *exceptionObject, NULL);
4821 if (restuple == NULL)
4822 return NULL;
4823 if (!PyTuple_Check(restuple)) {
4824 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4825 Py_DECREF(restuple);
4826 return NULL;
4828 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4829 &resunicode, &i_newpos)) {
4830 Py_DECREF(restuple);
4831 return NULL;
4833 if (i_newpos<0)
4834 *newpos = size+i_newpos;
4835 else
4836 *newpos = i_newpos;
4837 if (*newpos<0 || *newpos>size) {
4838 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4839 Py_DECREF(restuple);
4840 return NULL;
4842 Py_INCREF(resunicode);
4843 Py_DECREF(restuple);
4844 return resunicode;
4847 /* Lookup the character ch in the mapping and put the result in result,
4848 which must be decrefed by the caller.
4849 Return 0 on success, -1 on error */
4850 static
4851 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4853 PyObject *w = PyInt_FromLong((long)c);
4854 PyObject *x;
4856 if (w == NULL)
4857 return -1;
4858 x = PyObject_GetItem(mapping, w);
4859 Py_DECREF(w);
4860 if (x == NULL) {
4861 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4862 /* No mapping found means: use 1:1 mapping. */
4863 PyErr_Clear();
4864 *result = NULL;
4865 return 0;
4866 } else
4867 return -1;
4869 else if (x == Py_None) {
4870 *result = x;
4871 return 0;
4873 else if (PyInt_Check(x)) {
4874 long value = PyInt_AS_LONG(x);
4875 long max = PyUnicode_GetMax();
4876 if (value < 0 || value > max) {
4877 PyErr_Format(PyExc_TypeError,
4878 "character mapping must be in range(0x%lx)", max+1);
4879 Py_DECREF(x);
4880 return -1;
4882 *result = x;
4883 return 0;
4885 else if (PyUnicode_Check(x)) {
4886 *result = x;
4887 return 0;
4889 else {
4890 /* wrong return value */
4891 PyErr_SetString(PyExc_TypeError,
4892 "character mapping must return integer, None or unicode");
4893 Py_DECREF(x);
4894 return -1;
4897 /* ensure that *outobj is at least requiredsize characters long,
4898 if not reallocate and adjust various state variables.
4899 Return 0 on success, -1 on error */
4900 static
4901 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4902 Py_ssize_t requiredsize)
4904 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4905 if (requiredsize > oldsize) {
4906 /* remember old output position */
4907 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4908 /* exponentially overallocate to minimize reallocations */
4909 if (requiredsize < 2 * oldsize)
4910 requiredsize = 2 * oldsize;
4911 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4912 return -1;
4913 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4915 return 0;
4917 /* lookup the character, put the result in the output string and adjust
4918 various state variables. Return a new reference to the object that
4919 was put in the output buffer in *result, or Py_None, if the mapping was
4920 undefined (in which case no character was written).
4921 The called must decref result.
4922 Return 0 on success, -1 on error. */
4923 static
4924 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4925 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4926 PyObject **res)
4928 if (charmaptranslate_lookup(*curinp, mapping, res))
4929 return -1;
4930 if (*res==NULL) {
4931 /* not found => default to 1:1 mapping */
4932 *(*outp)++ = *curinp;
4934 else if (*res==Py_None)
4936 else if (PyInt_Check(*res)) {
4937 /* no overflow check, because we know that the space is enough */
4938 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4940 else if (PyUnicode_Check(*res)) {
4941 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4942 if (repsize==1) {
4943 /* no overflow check, because we know that the space is enough */
4944 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4946 else if (repsize!=0) {
4947 /* more than one character */
4948 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4949 (insize - (curinp-startinp)) +
4950 repsize - 1;
4951 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4952 return -1;
4953 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4954 *outp += repsize;
4957 else
4958 return -1;
4959 return 0;
/* Translate the `size` characters starting at `p` through `mapping` and
   return the result as a new Unicode object, or NULL on error.

   Each character is looked up via charmaptranslate_output().  A character
   that maps to None is "untranslatable"; the run of consecutive
   untranslatable characters is then handled according to `errors`:
     NULL / "strict"       raise via raise_translate_exception()
     "replace"             write one '?' per character
     "ignore"              write nothing
     "xmlcharrefreplace"   write "&#N;" per character
     anything else         call the handler through
                           unicode_translate_call_errorhandler(), write its
                           replacement and resume at the position it
                           returned.
   `mapping` must not be NULL (PyErr_BadArgument() otherwise). */
4962 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4963 Py_ssize_t size,
4964 PyObject *mapping,
4965 const char *errors)
4967 /* output object */
4968 PyObject *res = NULL;
4969 /* pointers to the beginning and end+1 of input */
4970 const Py_UNICODE *startp = p;
4971 const Py_UNICODE *endp = p + size;
4972 /* pointer into the output */
4973 Py_UNICODE *str;
4974 /* current output position */
4975 Py_ssize_t respos = 0;
4976 char *reason = "character maps to <undefined>";
4977 PyObject *errorHandler = NULL;
4978 PyObject *exc = NULL;
4979 /* the following variable is used for caching string comparisons
4980 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4981 * 3=ignore, 4=xmlcharrefreplace */
4982 int known_errorHandler = -1;
4984 if (mapping == NULL) {
4985 PyErr_BadArgument();
4986 return NULL;
4989 /* allocate enough for a simple 1:1 translation without
4990 replacements, if we need more, we'll resize */
4991 res = PyUnicode_FromUnicode(NULL, size);
4992 if (res == NULL)
4993 goto onError;
/* empty input: nothing to translate, return the new object as is */
4994 if (size == 0)
4995 return res;
4996 str = PyUnicode_AS_UNICODE(res);
4998 while (p<endp) {
4999 /* try to encode it */
5000 PyObject *x = NULL;
5001 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5002 Py_XDECREF(x);
5003 goto onError;
/* x is released here; afterwards it is only compared by identity
   against Py_None, never dereferenced */
5005 Py_XDECREF(x);
5006 if (x!=Py_None) /* it worked => adjust input pointer */
5007 ++p;
5008 else { /* untranslatable character */
5009 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5010 Py_ssize_t repsize;
5011 Py_ssize_t newpos;
5012 Py_UNICODE *uni2;
5013 /* startpos for collecting untranslatable chars */
5014 const Py_UNICODE *collstart = p;
5015 const Py_UNICODE *collend = p+1;
5016 const Py_UNICODE *coll;
5018 /* find all untranslatable characters */
5019 while (collend < endp) {
5020 if (charmaptranslate_lookup(*collend, mapping, &x))
5021 goto onError;
5022 Py_XDECREF(x);
5023 if (x!=Py_None)
5024 break;
5025 ++collend;
5027 /* cache callback name lookup
5028 * (if not done yet, i.e. it's the first error) */
5029 if (known_errorHandler==-1) {
5030 if ((errors==NULL) || (!strcmp(errors, "strict")))
5031 known_errorHandler = 1;
5032 else if (!strcmp(errors, "replace"))
5033 known_errorHandler = 2;
5034 else if (!strcmp(errors, "ignore"))
5035 known_errorHandler = 3;
5036 else if (!strcmp(errors, "xmlcharrefreplace"))
5037 known_errorHandler = 4;
5038 else
5039 known_errorHandler = 0;
5041 switch (known_errorHandler) {
5042 case 1: /* strict */
5043 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5044 goto onError;
5045 case 2: /* replace */
5046 /* No need to check for space, this is a 1:1 replacement */
5047 for (coll = collstart; coll<collend; ++coll)
5048 *str++ = '?';
5049 /* fall through */
5050 case 3: /* ignore */
5051 p = collend;
5052 break;
5053 case 4: /* xmlcharrefreplace */
5054 /* generate replacement (temporarily (mis)uses p) */
5055 for (p = collstart; p < collend; ++p) {
5056 char buffer[2+29+1+1];
5057 char *cp;
5058 sprintf(buffer, "&#%d;", (int)*p);
/* required size = output written so far + this reference
   + the input remaining after the untranslatable run */
5059 if (charmaptranslate_makespace(&res, &str,
5060 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5061 goto onError;
5062 for (cp = buffer; *cp; ++cp)
5063 *str++ = *cp;
5065 p = collend;
5066 break;
5067 default:
5068 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5069 reason, startp, size, &exc,
5070 collstart-startp, collend-startp, &newpos);
5071 if (repunicode == NULL)
5072 goto onError;
5073 /* generate replacement */
5074 repsize = PyUnicode_GET_SIZE(repunicode);
5075 if (charmaptranslate_makespace(&res, &str,
5076 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5077 Py_DECREF(repunicode);
5078 goto onError;
5080 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5081 *str++ = *uni2;
/* resume at the position chosen by the error handler */
5082 p = startp + newpos;
5083 Py_DECREF(repunicode);
5087 /* Resize if we allocated too much */
5088 respos = str-PyUnicode_AS_UNICODE(res);
5089 if (respos<PyUnicode_GET_SIZE(res)) {
5090 if (PyUnicode_Resize(&res, respos) < 0)
5091 goto onError;
5093 Py_XDECREF(exc);
5094 Py_XDECREF(errorHandler);
5095 return res;
/* common error exit: drop the partial result and the cached
   exception / error handler objects */
5097 onError:
5098 Py_XDECREF(res);
5099 Py_XDECREF(exc);
5100 Py_XDECREF(errorHandler);
5101 return NULL;
5104 PyObject *PyUnicode_Translate(PyObject *str,
5105 PyObject *mapping,
5106 const char *errors)
5108 PyObject *result;
5110 str = PyUnicode_FromObject(str);
5111 if (str == NULL)
5112 goto onError;
5113 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5114 PyUnicode_GET_SIZE(str),
5115 mapping,
5116 errors);
5117 Py_DECREF(str);
5118 return result;
5120 onError:
5121 Py_XDECREF(str);
5122 return NULL;
5125 /* --- Decimal Encoder ---------------------------------------------------- */
5127 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5128 Py_ssize_t length,
5129 char *output,
5130 const char *errors)
5132 Py_UNICODE *p, *end;
5133 PyObject *errorHandler = NULL;
5134 PyObject *exc = NULL;
5135 const char *encoding = "decimal";
5136 const char *reason = "invalid decimal Unicode string";
5137 /* the following variable is used for caching string comparisons
5138 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5139 int known_errorHandler = -1;
5141 if (output == NULL) {
5142 PyErr_BadArgument();
5143 return -1;
5146 p = s;
5147 end = s + length;
5148 while (p < end) {
5149 register Py_UNICODE ch = *p;
5150 int decimal;
5151 PyObject *repunicode;
5152 Py_ssize_t repsize;
5153 Py_ssize_t newpos;
5154 Py_UNICODE *uni2;
5155 Py_UNICODE *collstart;
5156 Py_UNICODE *collend;
5158 if (Py_UNICODE_ISSPACE(ch)) {
5159 *output++ = ' ';
5160 ++p;
5161 continue;
5163 decimal = Py_UNICODE_TODECIMAL(ch);
5164 if (decimal >= 0) {
5165 *output++ = '0' + decimal;
5166 ++p;
5167 continue;
5169 if (0 < ch && ch < 256) {
5170 *output++ = (char)ch;
5171 ++p;
5172 continue;
5174 /* All other characters are considered unencodable */
5175 collstart = p;
5176 collend = p+1;
5177 while (collend < end) {
5178 if ((0 < *collend && *collend < 256) ||
5179 !Py_UNICODE_ISSPACE(*collend) ||
5180 Py_UNICODE_TODECIMAL(*collend))
5181 break;
5183 /* cache callback name lookup
5184 * (if not done yet, i.e. it's the first error) */
5185 if (known_errorHandler==-1) {
5186 if ((errors==NULL) || (!strcmp(errors, "strict")))
5187 known_errorHandler = 1;
5188 else if (!strcmp(errors, "replace"))
5189 known_errorHandler = 2;
5190 else if (!strcmp(errors, "ignore"))
5191 known_errorHandler = 3;
5192 else if (!strcmp(errors, "xmlcharrefreplace"))
5193 known_errorHandler = 4;
5194 else
5195 known_errorHandler = 0;
5197 switch (known_errorHandler) {
5198 case 1: /* strict */
5199 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5200 goto onError;
5201 case 2: /* replace */
5202 for (p = collstart; p < collend; ++p)
5203 *output++ = '?';
5204 /* fall through */
5205 case 3: /* ignore */
5206 p = collend;
5207 break;
5208 case 4: /* xmlcharrefreplace */
5209 /* generate replacement (temporarily (mis)uses p) */
5210 for (p = collstart; p < collend; ++p)
5211 output += sprintf(output, "&#%d;", (int)*p);
5212 p = collend;
5213 break;
5214 default:
5215 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5216 encoding, reason, s, length, &exc,
5217 collstart-s, collend-s, &newpos);
5218 if (repunicode == NULL)
5219 goto onError;
5220 /* generate replacement */
5221 repsize = PyUnicode_GET_SIZE(repunicode);
5222 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5223 Py_UNICODE ch = *uni2;
5224 if (Py_UNICODE_ISSPACE(ch))
5225 *output++ = ' ';
5226 else {
5227 decimal = Py_UNICODE_TODECIMAL(ch);
5228 if (decimal >= 0)
5229 *output++ = '0' + decimal;
5230 else if (0 < ch && ch < 256)
5231 *output++ = (char)ch;
5232 else {
5233 Py_DECREF(repunicode);
5234 raise_encode_exception(&exc, encoding,
5235 s, length, collstart-s, collend-s, reason);
5236 goto onError;
5240 p = s + newpos;
5241 Py_DECREF(repunicode);
5244 /* 0-terminate the output string */
5245 *output++ = '\0';
5246 Py_XDECREF(exc);
5247 Py_XDECREF(errorHandler);
5248 return 0;
5250 onError:
5251 Py_XDECREF(exc);
5252 Py_XDECREF(errorHandler);
5253 return -1;
5256 /* --- Helpers ------------------------------------------------------------ */
5258 #include "stringlib/unicodedefs.h"
5259 #include "stringlib/fastsearch.h"
5261 #include "stringlib/count.h"
5262 #include "stringlib/find.h"
5263 #include "stringlib/partition.h"
5264 #include "stringlib/split.h"
/* Helper macro to fix up start/end slice values in place, given the
   sequence length len:
     - end greater than len is clipped to len;
     - a negative end or start is taken relative to len and then clipped
       to 0 if it is still negative.
   start and end must be modifiable lvalues; they are evaluated more than
   once.  The body is wrapped in do { } while (0) so that the macro acts
   as a single statement (safe inside an unbraced if/else); invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5281 Py_ssize_t PyUnicode_Count(PyObject *str,
5282 PyObject *substr,
5283 Py_ssize_t start,
5284 Py_ssize_t end)
5286 Py_ssize_t result;
5287 PyUnicodeObject* str_obj;
5288 PyUnicodeObject* sub_obj;
5290 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5291 if (!str_obj)
5292 return -1;
5293 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5294 if (!sub_obj) {
5295 Py_DECREF(str_obj);
5296 return -1;
5299 ADJUST_INDICES(start, end, str_obj->length);
5300 result = stringlib_count(
5301 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5302 PY_SSIZE_T_MAX
5305 Py_DECREF(sub_obj);
5306 Py_DECREF(str_obj);
5308 return result;
5311 Py_ssize_t PyUnicode_Find(PyObject *str,
5312 PyObject *sub,
5313 Py_ssize_t start,
5314 Py_ssize_t end,
5315 int direction)
5317 Py_ssize_t result;
5319 str = PyUnicode_FromObject(str);
5320 if (!str)
5321 return -2;
5322 sub = PyUnicode_FromObject(sub);
5323 if (!sub) {
5324 Py_DECREF(str);
5325 return -2;
5328 if (direction > 0)
5329 result = stringlib_find_slice(
5330 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5331 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5332 start, end
5334 else
5335 result = stringlib_rfind_slice(
5336 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5337 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5338 start, end
5341 Py_DECREF(str);
5342 Py_DECREF(sub);
5344 return result;
5347 static
5348 int tailmatch(PyUnicodeObject *self,
5349 PyUnicodeObject *substring,
5350 Py_ssize_t start,
5351 Py_ssize_t end,
5352 int direction)
5354 if (substring->length == 0)
5355 return 1;
5357 ADJUST_INDICES(start, end, self->length);
5358 end -= substring->length;
5359 if (end < start)
5360 return 0;
5362 if (direction > 0) {
5363 if (Py_UNICODE_MATCH(self, end, substring))
5364 return 1;
5365 } else {
5366 if (Py_UNICODE_MATCH(self, start, substring))
5367 return 1;
5370 return 0;
5373 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5374 PyObject *substr,
5375 Py_ssize_t start,
5376 Py_ssize_t end,
5377 int direction)
5379 Py_ssize_t result;
5381 str = PyUnicode_FromObject(str);
5382 if (str == NULL)
5383 return -1;
5384 substr = PyUnicode_FromObject(substr);
5385 if (substr == NULL) {
5386 Py_DECREF(str);
5387 return -1;
5390 result = tailmatch((PyUnicodeObject *)str,
5391 (PyUnicodeObject *)substr,
5392 start, end, direction);
5393 Py_DECREF(str);
5394 Py_DECREF(substr);
5395 return result;
5398 /* Apply fixfct filter to the Unicode object self and return a
5399 reference to the modified object */
5401 static
5402 PyObject *fixup(PyUnicodeObject *self,
5403 int (*fixfct)(PyUnicodeObject *s))
5406 PyUnicodeObject *u;
5408 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5409 if (u == NULL)
5410 return NULL;
5412 Py_UNICODE_COPY(u->str, self->str, self->length);
5414 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5415 /* fixfct should return TRUE if it modified the buffer. If
5416 FALSE, return a reference to the original buffer instead
5417 (to save space, not time) */
5418 Py_INCREF(self);
5419 Py_DECREF(u);
5420 return (PyObject*) self;
5422 return (PyObject*) u;
5425 static
5426 int fixupper(PyUnicodeObject *self)
5428 Py_ssize_t len = self->length;
5429 Py_UNICODE *s = self->str;
5430 int status = 0;
5432 while (len-- > 0) {
5433 register Py_UNICODE ch;
5435 ch = Py_UNICODE_TOUPPER(*s);
5436 if (ch != *s) {
5437 status = 1;
5438 *s = ch;
5440 s++;
5443 return status;
5446 static
5447 int fixlower(PyUnicodeObject *self)
5449 Py_ssize_t len = self->length;
5450 Py_UNICODE *s = self->str;
5451 int status = 0;
5453 while (len-- > 0) {
5454 register Py_UNICODE ch;
5456 ch = Py_UNICODE_TOLOWER(*s);
5457 if (ch != *s) {
5458 status = 1;
5459 *s = ch;
5461 s++;
5464 return status;
5467 static
5468 int fixswapcase(PyUnicodeObject *self)
5470 Py_ssize_t len = self->length;
5471 Py_UNICODE *s = self->str;
5472 int status = 0;
5474 while (len-- > 0) {
5475 if (Py_UNICODE_ISUPPER(*s)) {
5476 *s = Py_UNICODE_TOLOWER(*s);
5477 status = 1;
5478 } else if (Py_UNICODE_ISLOWER(*s)) {
5479 *s = Py_UNICODE_TOUPPER(*s);
5480 status = 1;
5482 s++;
5485 return status;
5488 static
5489 int fixcapitalize(PyUnicodeObject *self)
5491 Py_ssize_t len = self->length;
5492 Py_UNICODE *s = self->str;
5493 int status = 0;
5495 if (len == 0)
5496 return 0;
5497 if (Py_UNICODE_ISLOWER(*s)) {
5498 *s = Py_UNICODE_TOUPPER(*s);
5499 status = 1;
5501 s++;
5502 while (--len > 0) {
5503 if (Py_UNICODE_ISUPPER(*s)) {
5504 *s = Py_UNICODE_TOLOWER(*s);
5505 status = 1;
5507 s++;
5509 return status;
5512 static
5513 int fixtitle(PyUnicodeObject *self)
5515 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5516 register Py_UNICODE *e;
5517 int previous_is_cased;
5519 /* Shortcut for single character strings */
5520 if (PyUnicode_GET_SIZE(self) == 1) {
5521 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5522 if (*p != ch) {
5523 *p = ch;
5524 return 1;
5526 else
5527 return 0;
5530 e = p + PyUnicode_GET_SIZE(self);
5531 previous_is_cased = 0;
5532 for (; p < e; p++) {
5533 register const Py_UNICODE ch = *p;
5535 if (previous_is_cased)
5536 *p = Py_UNICODE_TOLOWER(ch);
5537 else
5538 *p = Py_UNICODE_TOTITLE(ch);
5540 if (Py_UNICODE_ISLOWER(ch) ||
5541 Py_UNICODE_ISUPPER(ch) ||
5542 Py_UNICODE_ISTITLE(ch))
5543 previous_is_cased = 1;
5544 else
5545 previous_is_cased = 0;
5547 return 1;
5550 PyObject *
5551 PyUnicode_Join(PyObject *separator, PyObject *seq)
5553 PyObject *internal_separator = NULL;
5554 const Py_UNICODE blank = ' ';
5555 const Py_UNICODE *sep = &blank;
5556 Py_ssize_t seplen = 1;
5557 PyUnicodeObject *res = NULL; /* the result */
5558 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5559 Py_ssize_t res_used; /* # used bytes */
5560 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5561 PyObject *fseq; /* PySequence_Fast(seq) */
5562 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5563 PyObject *item;
5564 Py_ssize_t i;
5566 fseq = PySequence_Fast(seq, "");
5567 if (fseq == NULL) {
5568 return NULL;
5571 /* Grrrr. A codec may be invoked to convert str objects to
5572 * Unicode, and so it's possible to call back into Python code
5573 * during PyUnicode_FromObject(), and so it's possible for a sick
5574 * codec to change the size of fseq (if seq is a list). Therefore
5575 * we have to keep refetching the size -- can't assume seqlen
5576 * is invariant.
5578 seqlen = PySequence_Fast_GET_SIZE(fseq);
5579 /* If empty sequence, return u"". */
5580 if (seqlen == 0) {
5581 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5582 goto Done;
5584 /* If singleton sequence with an exact Unicode, return that. */
5585 if (seqlen == 1) {
5586 item = PySequence_Fast_GET_ITEM(fseq, 0);
5587 if (PyUnicode_CheckExact(item)) {
5588 Py_INCREF(item);
5589 res = (PyUnicodeObject *)item;
5590 goto Done;
5594 /* At least two items to join, or one that isn't exact Unicode. */
5595 if (seqlen > 1) {
5596 /* Set up sep and seplen -- they're needed. */
5597 if (separator == NULL) {
5598 sep = &blank;
5599 seplen = 1;
5601 else {
5602 internal_separator = PyUnicode_FromObject(separator);
5603 if (internal_separator == NULL)
5604 goto onError;
5605 sep = PyUnicode_AS_UNICODE(internal_separator);
5606 seplen = PyUnicode_GET_SIZE(internal_separator);
5607 /* In case PyUnicode_FromObject() mutated seq. */
5608 seqlen = PySequence_Fast_GET_SIZE(fseq);
5612 /* Get space. */
5613 res = _PyUnicode_New(res_alloc);
5614 if (res == NULL)
5615 goto onError;
5616 res_p = PyUnicode_AS_UNICODE(res);
5617 res_used = 0;
5619 for (i = 0; i < seqlen; ++i) {
5620 Py_ssize_t itemlen;
5621 Py_ssize_t new_res_used;
5623 item = PySequence_Fast_GET_ITEM(fseq, i);
5624 /* Convert item to Unicode. */
5625 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5626 PyErr_Format(PyExc_TypeError,
5627 "sequence item %zd: expected string or Unicode,"
5628 " %.80s found",
5629 i, Py_TYPE(item)->tp_name);
5630 goto onError;
5632 item = PyUnicode_FromObject(item);
5633 if (item == NULL)
5634 goto onError;
5635 /* We own a reference to item from here on. */
5637 /* In case PyUnicode_FromObject() mutated seq. */
5638 seqlen = PySequence_Fast_GET_SIZE(fseq);
5640 /* Make sure we have enough space for the separator and the item. */
5641 itemlen = PyUnicode_GET_SIZE(item);
5642 new_res_used = res_used + itemlen;
5643 if (new_res_used < 0)
5644 goto Overflow;
5645 if (i < seqlen - 1) {
5646 new_res_used += seplen;
5647 if (new_res_used < 0)
5648 goto Overflow;
5650 if (new_res_used > res_alloc) {
5651 /* double allocated size until it's big enough */
5652 do {
5653 res_alloc += res_alloc;
5654 if (res_alloc <= 0)
5655 goto Overflow;
5656 } while (new_res_used > res_alloc);
5657 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5658 Py_DECREF(item);
5659 goto onError;
5661 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5664 /* Copy item, and maybe the separator. */
5665 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5666 res_p += itemlen;
5667 if (i < seqlen - 1) {
5668 Py_UNICODE_COPY(res_p, sep, seplen);
5669 res_p += seplen;
5671 Py_DECREF(item);
5672 res_used = new_res_used;
5675 /* Shrink res to match the used area; this probably can't fail,
5676 * but it's cheap to check.
5678 if (_PyUnicode_Resize(&res, res_used) < 0)
5679 goto onError;
5681 Done:
5682 Py_XDECREF(internal_separator);
5683 Py_DECREF(fseq);
5684 return (PyObject *)res;
5686 Overflow:
5687 PyErr_SetString(PyExc_OverflowError,
5688 "join() result is too long for a Python string");
5689 Py_DECREF(item);
5690 /* fall through */
5692 onError:
5693 Py_XDECREF(internal_separator);
5694 Py_DECREF(fseq);
5695 Py_XDECREF(res);
5696 return NULL;
5699 static
5700 PyUnicodeObject *pad(PyUnicodeObject *self,
5701 Py_ssize_t left,
5702 Py_ssize_t right,
5703 Py_UNICODE fill)
5705 PyUnicodeObject *u;
5707 if (left < 0)
5708 left = 0;
5709 if (right < 0)
5710 right = 0;
5712 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5713 Py_INCREF(self);
5714 return self;
5717 if (left > PY_SSIZE_T_MAX - self->length ||
5718 right > PY_SSIZE_T_MAX - (left + self->length)) {
5719 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5720 return NULL;
5722 u = _PyUnicode_New(left + self->length + right);
5723 if (u) {
5724 if (left)
5725 Py_UNICODE_FILL(u->str, fill, left);
5726 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5727 if (right)
5728 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5731 return u;
5734 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5736 PyObject *list;
5738 string = PyUnicode_FromObject(string);
5739 if (string == NULL)
5740 return NULL;
5742 list = stringlib_splitlines(
5743 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5744 PyUnicode_GET_SIZE(string), keepends);
5746 Py_DECREF(string);
5747 return list;
5750 static
5751 PyObject *split(PyUnicodeObject *self,
5752 PyUnicodeObject *substring,
5753 Py_ssize_t maxcount)
5755 if (maxcount < 0)
5756 maxcount = PY_SSIZE_T_MAX;
5758 if (substring == NULL)
5759 return stringlib_split_whitespace(
5760 (PyObject*) self, self->str, self->length, maxcount
5763 return stringlib_split(
5764 (PyObject*) self, self->str, self->length,
5765 substring->str, substring->length,
5766 maxcount
5770 static
5771 PyObject *rsplit(PyUnicodeObject *self,
5772 PyUnicodeObject *substring,
5773 Py_ssize_t maxcount)
5775 if (maxcount < 0)
5776 maxcount = PY_SSIZE_T_MAX;
5778 if (substring == NULL)
5779 return stringlib_rsplit_whitespace(
5780 (PyObject*) self, self->str, self->length, maxcount
5783 return stringlib_rsplit(
5784 (PyObject*) self, self->str, self->length,
5785 substring->str, substring->length,
5786 maxcount
5790 static
5791 PyObject *replace(PyUnicodeObject *self,
5792 PyUnicodeObject *str1,
5793 PyUnicodeObject *str2,
5794 Py_ssize_t maxcount)
5796 PyUnicodeObject *u;
5798 if (maxcount < 0)
5799 maxcount = PY_SSIZE_T_MAX;
5800 else if (maxcount == 0 || self->length == 0)
5801 goto nothing;
5803 if (str1->length == str2->length) {
5804 Py_ssize_t i;
5805 /* same length */
5806 if (str1->length == 0)
5807 goto nothing;
5808 if (str1->length == 1) {
5809 /* replace characters */
5810 Py_UNICODE u1, u2;
5811 if (!findchar(self->str, self->length, str1->str[0]))
5812 goto nothing;
5813 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5814 if (!u)
5815 return NULL;
5816 Py_UNICODE_COPY(u->str, self->str, self->length);
5817 u1 = str1->str[0];
5818 u2 = str2->str[0];
5819 for (i = 0; i < u->length; i++)
5820 if (u->str[i] == u1) {
5821 if (--maxcount < 0)
5822 break;
5823 u->str[i] = u2;
5825 } else {
5826 i = stringlib_find(
5827 self->str, self->length, str1->str, str1->length, 0
5829 if (i < 0)
5830 goto nothing;
5831 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5832 if (!u)
5833 return NULL;
5834 Py_UNICODE_COPY(u->str, self->str, self->length);
5836 /* change everything in-place, starting with this one */
5837 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5838 i += str1->length;
5840 while ( --maxcount > 0) {
5841 i = stringlib_find(self->str+i, self->length-i,
5842 str1->str, str1->length,
5844 if (i == -1)
5845 break;
5846 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5847 i += str1->length;
5850 } else {
5852 Py_ssize_t n, i, j, e;
5853 Py_ssize_t product, new_size, delta;
5854 Py_UNICODE *p;
5856 /* replace strings */
5857 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5858 maxcount);
5859 if (n == 0)
5860 goto nothing;
5861 /* new_size = self->length + n * (str2->length - str1->length)); */
5862 delta = (str2->length - str1->length);
5863 if (delta == 0) {
5864 new_size = self->length;
5865 } else {
5866 product = n * (str2->length - str1->length);
5867 if ((product / (str2->length - str1->length)) != n) {
5868 PyErr_SetString(PyExc_OverflowError,
5869 "replace string is too long");
5870 return NULL;
5872 new_size = self->length + product;
5873 if (new_size < 0) {
5874 PyErr_SetString(PyExc_OverflowError,
5875 "replace string is too long");
5876 return NULL;
5879 u = _PyUnicode_New(new_size);
5880 if (!u)
5881 return NULL;
5882 i = 0;
5883 p = u->str;
5884 e = self->length - str1->length;
5885 if (str1->length > 0) {
5886 while (n-- > 0) {
5887 /* look for next match */
5888 j = stringlib_find(self->str+i, self->length-i,
5889 str1->str, str1->length,
5891 if (j == -1)
5892 break;
5893 else if (j > i) {
5894 /* copy unchanged part [i:j] */
5895 Py_UNICODE_COPY(p, self->str+i, j-i);
5896 p += j - i;
5898 /* copy substitution string */
5899 if (str2->length > 0) {
5900 Py_UNICODE_COPY(p, str2->str, str2->length);
5901 p += str2->length;
5903 i = j + str1->length;
5905 if (i < self->length)
5906 /* copy tail [i:] */
5907 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5908 } else {
5909 /* interleave */
5910 while (n > 0) {
5911 Py_UNICODE_COPY(p, str2->str, str2->length);
5912 p += str2->length;
5913 if (--n <= 0)
5914 break;
5915 *p++ = self->str[i++];
5917 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5920 return (PyObject *) u;
5922 nothing:
5923 /* nothing to replace; return original string (when possible) */
5924 if (PyUnicode_CheckExact(self)) {
5925 Py_INCREF(self);
5926 return (PyObject *) self;
5928 return PyUnicode_FromUnicode(self->str, self->length);
5931 /* --- Unicode Object Methods --------------------------------------------- */
5933 PyDoc_STRVAR(title__doc__,
5934 "S.title() -> unicode\n\
5936 Return a titlecased version of S, i.e. words start with title case\n\
5937 characters, all remaining cased characters have lower case.");
5939 static PyObject*
5940 unicode_title(PyUnicodeObject *self)
5942 return fixup(self, fixtitle);
5945 PyDoc_STRVAR(capitalize__doc__,
5946 "S.capitalize() -> unicode\n\
5948 Return a capitalized version of S, i.e. make the first character\n\
5949 have upper case.");
5951 static PyObject*
5952 unicode_capitalize(PyUnicodeObject *self)
5954 return fixup(self, fixcapitalize);
/* Disabled code: everything between "#if 0" and "#endif" is excluded from
   compilation.  It sketches a capwords() method that splits self on
   whitespace, capitalizes every word with fixup(..., fixcapitalize) and
   joins the words again with PyUnicode_Join(NULL, list), i.e. with a
   single blank as separator. */
5957 #if 0
5958 PyDoc_STRVAR(capwords__doc__,
5959 "S.capwords() -> unicode\n\
5961 Apply .capitalize() to all words in S and return the result with\n\
5962 normalized whitespace (all whitespace strings are replaced by ' ').");
5964 static PyObject*
5965 unicode_capwords(PyUnicodeObject *self)
5967 PyObject *list;
5968 PyObject *item;
5969 Py_ssize_t i;
5971 /* Split into words */
5972 list = split(self, NULL, -1);
5973 if (!list)
5974 return NULL;
5976 /* Capitalize each word */
5977 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5978 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5979 fixcapitalize);
/* on failure item is NULL, which is what gets returned at onError */
5980 if (item == NULL)
5981 goto onError;
/* replace the list entry: release the old word, store the new one */
5982 Py_DECREF(PyList_GET_ITEM(list, i));
5983 PyList_SET_ITEM(list, i, item);
5986 /* Join the words to form a new string */
5987 item = PyUnicode_Join(NULL, list);
/* reached on success as well as on error; item holds the result or NULL */
5989 onError:
5990 Py_DECREF(list);
5991 return (PyObject *)item;
5993 #endif
5995 /* Argument converter. Coerces to a single unicode character */
5997 static int
5998 convert_uc(PyObject *obj, void *addr)
6000 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6001 PyObject *uniobj;
6002 Py_UNICODE *unistr;
6004 uniobj = PyUnicode_FromObject(obj);
6005 if (uniobj == NULL) {
6006 PyErr_SetString(PyExc_TypeError,
6007 "The fill character cannot be converted to Unicode");
6008 return 0;
6010 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6011 PyErr_SetString(PyExc_TypeError,
6012 "The fill character must be exactly one character long");
6013 Py_DECREF(uniobj);
6014 return 0;
6016 unistr = PyUnicode_AS_UNICODE(uniobj);
6017 *fillcharloc = unistr[0];
6018 Py_DECREF(uniobj);
6019 return 1;
6022 PyDoc_STRVAR(center__doc__,
6023 "S.center(width[, fillchar]) -> unicode\n\
6025 Return S centered in a Unicode string of length width. Padding is\n\
6026 done using the specified fill character (default is a space)");
6028 static PyObject *
6029 unicode_center(PyUnicodeObject *self, PyObject *args)
6031 Py_ssize_t marg, left;
6032 Py_ssize_t width;
6033 Py_UNICODE fillchar = ' ';
6035 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6036 return NULL;
6038 if (self->length >= width && PyUnicode_CheckExact(self)) {
6039 Py_INCREF(self);
6040 return (PyObject*) self;
6043 marg = width - self->length;
6044 left = marg / 2 + (marg & width & 1);
6046 return (PyObject*) pad(self, left, marg - left, fillchar);
/* Disabled alternative implementation of unicode_compare().  This "#if 0"
   branch is not compiled; the implementation in the "#else" branch below
   is the one in use. */
6049 #if 0
6051 /* This code should go into some future Unicode collation support
6052 module. The basic comparison should compare ordinals on a naive
6053 basis (this is what Java does and thus Jython too). */
6055 /* speedy UTF-16 code point order comparison */
6056 /* gleaned from: */
6057 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
/* Adjustment table indexed by the top five bits (value >> 11) of a
   16-bit code unit; only the last five entries are non-zero. */
6059 static short utf16Fixup[32] =
6061 0, 0, 0, 0, 0, 0, 0, 0,
6062 0, 0, 0, 0, 0, 0, 0, 0,
6063 0, 0, 0, 0, 0, 0, 0, 0,
6064 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
/* Compare str1 and str2 element by element after applying the table
   adjustment; returns -1, 0 or 1. */
6067 static int
6068 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6070 Py_ssize_t len1, len2;
6072 Py_UNICODE *s1 = str1->str;
6073 Py_UNICODE *s2 = str2->str;
6075 len1 = str1->length;
6076 len2 = str2->length;
6078 while (len1 > 0 && len2 > 0) {
6079 Py_UNICODE c1, c2;
6081 c1 = *s1++;
6082 c2 = *s2++;
/* (1<<11) * 26 == 0xD000: only values above it are adjusted */
6084 if (c1 > (1<<11) * 26)
6085 c1 += utf16Fixup[c1>>11];
6086 if (c2 > (1<<11) * 26)
6087 c2 += utf16Fixup[c2>>11];
6088 /* now c1 and c2 are in UTF-32-compatible order */
6090 if (c1 != c2)
6091 return (c1 < c2) ? -1 : 1;
6093 len1--; len2--;
/* common prefix is equal: the shorter string sorts first */
6096 return (len1 < len2) ? -1 : (len1 != len2);
6099 #else
6101 static int
6102 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6104 register Py_ssize_t len1, len2;
6106 Py_UNICODE *s1 = str1->str;
6107 Py_UNICODE *s2 = str2->str;
6109 len1 = str1->length;
6110 len2 = str2->length;
6112 while (len1 > 0 && len2 > 0) {
6113 Py_UNICODE c1, c2;
6115 c1 = *s1++;
6116 c2 = *s2++;
6118 if (c1 != c2)
6119 return (c1 < c2) ? -1 : 1;
6121 len1--; len2--;
6124 return (len1 < len2) ? -1 : (len1 != len2);
6127 #endif
6129 int PyUnicode_Compare(PyObject *left,
6130 PyObject *right)
6132 PyUnicodeObject *u = NULL, *v = NULL;
6133 int result;
6135 /* Coerce the two arguments */
6136 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6137 if (u == NULL)
6138 goto onError;
6139 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6140 if (v == NULL)
6141 goto onError;
6143 /* Shortcut for empty or interned objects */
6144 if (v == u) {
6145 Py_DECREF(u);
6146 Py_DECREF(v);
6147 return 0;
6150 result = unicode_compare(u, v);
6152 Py_DECREF(u);
6153 Py_DECREF(v);
6154 return result;
6156 onError:
6157 Py_XDECREF(u);
6158 Py_XDECREF(v);
6159 return -1;
6162 PyObject *PyUnicode_RichCompare(PyObject *left,
6163 PyObject *right,
6164 int op)
6166 int result;
6168 result = PyUnicode_Compare(left, right);
6169 if (result == -1 && PyErr_Occurred())
6170 goto onError;
6172 /* Convert the return value to a Boolean */
6173 switch (op) {
6174 case Py_EQ:
6175 result = (result == 0);
6176 break;
6177 case Py_NE:
6178 result = (result != 0);
6179 break;
6180 case Py_LE:
6181 result = (result <= 0);
6182 break;
6183 case Py_GE:
6184 result = (result >= 0);
6185 break;
6186 case Py_LT:
6187 result = (result == -1);
6188 break;
6189 case Py_GT:
6190 result = (result == 1);
6191 break;
6193 return PyBool_FromLong(result);
6195 onError:
6197 /* Standard case
6199 Type errors mean that PyUnicode_FromObject() could not convert
6200 one of the arguments (usually the right hand side) to Unicode,
6201 ie. we can't handle the comparison request. However, it is
6202 possible that the other object knows a comparison method, which
6203 is why we return Py_NotImplemented to give the other object a
6204 chance.
6207 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6208 PyErr_Clear();
6209 Py_INCREF(Py_NotImplemented);
6210 return Py_NotImplemented;
6212 if (op != Py_EQ && op != Py_NE)
6213 return NULL;
6215 /* Equality comparison.
6217 This is a special case: we silence any PyExc_UnicodeDecodeError
6218 and instead turn it into a PyErr_UnicodeWarning.
6221 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6222 return NULL;
6223 PyErr_Clear();
6224 if (PyErr_Warn(PyExc_UnicodeWarning,
6225 (op == Py_EQ) ?
6226 "Unicode equal comparison "
6227 "failed to convert both arguments to Unicode - "
6228 "interpreting them as being unequal" :
6229 "Unicode unequal comparison "
6230 "failed to convert both arguments to Unicode - "
6231 "interpreting them as being unequal"
6232 ) < 0)
6233 return NULL;
6234 result = (op == Py_NE);
6235 return PyBool_FromLong(result);
6238 int PyUnicode_Contains(PyObject *container,
6239 PyObject *element)
6241 PyObject *str, *sub;
6242 int result;
6244 /* Coerce the two arguments */
6245 sub = PyUnicode_FromObject(element);
6246 if (!sub) {
6247 return -1;
6250 str = PyUnicode_FromObject(container);
6251 if (!str) {
6252 Py_DECREF(sub);
6253 return -1;
6256 result = stringlib_contains_obj(str, sub);
6258 Py_DECREF(str);
6259 Py_DECREF(sub);
6261 return result;
6264 /* Concat to string or Unicode object giving a new Unicode object. */
6266 PyObject *PyUnicode_Concat(PyObject *left,
6267 PyObject *right)
6269 PyUnicodeObject *u = NULL, *v = NULL, *w;
6271 /* Coerce the two arguments */
6272 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6273 if (u == NULL)
6274 goto onError;
6275 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6276 if (v == NULL)
6277 goto onError;
6279 /* Shortcuts */
6280 if (v == unicode_empty) {
6281 Py_DECREF(v);
6282 return (PyObject *)u;
6284 if (u == unicode_empty) {
6285 Py_DECREF(u);
6286 return (PyObject *)v;
6289 /* Concat the two Unicode strings */
6290 w = _PyUnicode_New(u->length + v->length);
6291 if (w == NULL)
6292 goto onError;
6293 Py_UNICODE_COPY(w->str, u->str, u->length);
6294 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6296 Py_DECREF(u);
6297 Py_DECREF(v);
6298 return (PyObject *)w;
6300 onError:
6301 Py_XDECREF(u);
6302 Py_XDECREF(v);
6303 return NULL;
6306 PyDoc_STRVAR(count__doc__,
6307 "S.count(sub[, start[, end]]) -> int\n\
6309 Return the number of non-overlapping occurrences of substring sub in\n\
6310 Unicode string S[start:end]. Optional arguments start and end are\n\
6311 interpreted as in slice notation.");
6313 static PyObject *
6314 unicode_count(PyUnicodeObject *self, PyObject *args)
6316 PyUnicodeObject *substring;
6317 Py_ssize_t start = 0;
6318 Py_ssize_t end = PY_SSIZE_T_MAX;
6319 PyObject *result;
6321 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6322 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6323 return NULL;
6325 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6326 (PyObject *)substring);
6327 if (substring == NULL)
6328 return NULL;
6330 ADJUST_INDICES(start, end, self->length);
6331 result = PyInt_FromSsize_t(
6332 stringlib_count(self->str + start, end - start,
6333 substring->str, substring->length,
6334 PY_SSIZE_T_MAX)
6337 Py_DECREF(substring);
6339 return result;
6342 PyDoc_STRVAR(encode__doc__,
6343 "S.encode([encoding[,errors]]) -> string or unicode\n\
6345 Encodes S using the codec registered for encoding. encoding defaults\n\
6346 to the default encoding. errors may be given to set a different error\n\
6347 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6348 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6349 'xmlcharrefreplace' as well as any other name registered with\n\
6350 codecs.register_error that can handle UnicodeEncodeErrors.");
6352 static PyObject *
6353 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6355 static char *kwlist[] = {"encoding", "errors", 0};
6356 char *encoding = NULL;
6357 char *errors = NULL;
6358 PyObject *v;
6360 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6361 kwlist, &encoding, &errors))
6362 return NULL;
6363 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6364 if (v == NULL)
6365 goto onError;
6366 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6367 PyErr_Format(PyExc_TypeError,
6368 "encoder did not return a string/unicode object "
6369 "(type=%.400s)",
6370 Py_TYPE(v)->tp_name);
6371 Py_DECREF(v);
6372 return NULL;
6374 return v;
6376 onError:
6377 return NULL;
6380 PyDoc_STRVAR(decode__doc__,
6381 "S.decode([encoding[,errors]]) -> string or unicode\n\
6383 Decodes S using the codec registered for encoding. encoding defaults\n\
6384 to the default encoding. errors may be given to set a different error\n\
6385 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6386 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6387 as well as any other name registerd with codecs.register_error that is\n\
6388 able to handle UnicodeDecodeErrors.");
6390 static PyObject *
6391 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6393 static char *kwlist[] = {"encoding", "errors", 0};
6394 char *encoding = NULL;
6395 char *errors = NULL;
6396 PyObject *v;
6398 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6399 kwlist, &encoding, &errors))
6400 return NULL;
6401 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6402 if (v == NULL)
6403 goto onError;
6404 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6405 PyErr_Format(PyExc_TypeError,
6406 "decoder did not return a string/unicode object "
6407 "(type=%.400s)",
6408 Py_TYPE(v)->tp_name);
6409 Py_DECREF(v);
6410 return NULL;
6412 return v;
6414 onError:
6415 return NULL;
6418 PyDoc_STRVAR(expandtabs__doc__,
6419 "S.expandtabs([tabsize]) -> unicode\n\
6421 Return a copy of S where all tab characters are expanded using spaces.\n\
6422 If tabsize is not given, a tab size of 8 characters is assumed.");
6424 static PyObject*
6425 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6427 Py_UNICODE *e;
6428 Py_UNICODE *p;
6429 Py_UNICODE *q;
6430 Py_UNICODE *qe;
6431 Py_ssize_t i, j, incr;
6432 PyUnicodeObject *u;
6433 int tabsize = 8;
6435 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6436 return NULL;
6438 /* First pass: determine size of output string */
6439 i = 0; /* chars up to and including most recent \n or \r */
6440 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6441 e = self->str + self->length; /* end of input */
6442 for (p = self->str; p < e; p++)
6443 if (*p == '\t') {
6444 if (tabsize > 0) {
6445 incr = tabsize - (j % tabsize); /* cannot overflow */
6446 if (j > PY_SSIZE_T_MAX - incr)
6447 goto overflow1;
6448 j += incr;
6451 else {
6452 if (j > PY_SSIZE_T_MAX - 1)
6453 goto overflow1;
6454 j++;
6455 if (*p == '\n' || *p == '\r') {
6456 if (i > PY_SSIZE_T_MAX - j)
6457 goto overflow1;
6458 i += j;
6459 j = 0;
6463 if (i > PY_SSIZE_T_MAX - j)
6464 goto overflow1;
6466 /* Second pass: create output string and fill it */
6467 u = _PyUnicode_New(i + j);
6468 if (!u)
6469 return NULL;
6471 j = 0; /* same as in first pass */
6472 q = u->str; /* next output char */
6473 qe = u->str + u->length; /* end of output */
6475 for (p = self->str; p < e; p++)
6476 if (*p == '\t') {
6477 if (tabsize > 0) {
6478 i = tabsize - (j % tabsize);
6479 j += i;
6480 while (i--) {
6481 if (q >= qe)
6482 goto overflow2;
6483 *q++ = ' ';
6487 else {
6488 if (q >= qe)
6489 goto overflow2;
6490 *q++ = *p;
6491 j++;
6492 if (*p == '\n' || *p == '\r')
6493 j = 0;
6496 return (PyObject*) u;
6498 overflow2:
6499 Py_DECREF(u);
6500 overflow1:
6501 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6502 return NULL;
6505 PyDoc_STRVAR(find__doc__,
6506 "S.find(sub [,start [,end]]) -> int\n\
6508 Return the lowest index in S where substring sub is found,\n\
6509 such that sub is contained within s[start:end]. Optional\n\
6510 arguments start and end are interpreted as in slice notation.\n\
6512 Return -1 on failure.");
6514 static PyObject *
6515 unicode_find(PyUnicodeObject *self, PyObject *args)
6517 PyObject *substring;
6518 Py_ssize_t start;
6519 Py_ssize_t end;
6520 Py_ssize_t result;
6522 if (!_ParseTupleFinds(args, &substring, &start, &end))
6523 return NULL;
6525 result = stringlib_find_slice(
6526 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6527 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6528 start, end
6531 Py_DECREF(substring);
6533 return PyInt_FromSsize_t(result);
6536 static PyObject *
6537 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6539 if (index < 0 || index >= self->length) {
6540 PyErr_SetString(PyExc_IndexError, "string index out of range");
6541 return NULL;
6544 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6547 static long
6548 unicode_hash(PyUnicodeObject *self)
6550 /* Since Unicode objects compare equal to their ASCII string
6551 counterparts, they should use the individual character values
6552 as basis for their hash value. This is needed to assure that
6553 strings and Unicode objects behave in the same way as
6554 dictionary keys. */
6556 register Py_ssize_t len;
6557 register Py_UNICODE *p;
6558 register long x;
6560 if (self->hash != -1)
6561 return self->hash;
6562 len = PyUnicode_GET_SIZE(self);
6563 p = PyUnicode_AS_UNICODE(self);
6564 x = *p << 7;
6565 while (--len >= 0)
6566 x = (1000003*x) ^ *p++;
6567 x ^= PyUnicode_GET_SIZE(self);
6568 if (x == -1)
6569 x = -2;
6570 self->hash = x;
6571 return x;
6574 PyDoc_STRVAR(index__doc__,
6575 "S.index(sub [,start [,end]]) -> int\n\
6577 Like S.find() but raise ValueError when the substring is not found.");
6579 static PyObject *
6580 unicode_index(PyUnicodeObject *self, PyObject *args)
6582 Py_ssize_t result;
6583 PyObject *substring;
6584 Py_ssize_t start;
6585 Py_ssize_t end;
6587 if (!_ParseTupleFinds(args, &substring, &start, &end))
6588 return NULL;
6590 result = stringlib_find_slice(
6591 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6592 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6593 start, end
6596 Py_DECREF(substring);
6598 if (result < 0) {
6599 PyErr_SetString(PyExc_ValueError, "substring not found");
6600 return NULL;
6603 return PyInt_FromSsize_t(result);
6606 PyDoc_STRVAR(islower__doc__,
6607 "S.islower() -> bool\n\
6609 Return True if all cased characters in S are lowercase and there is\n\
6610 at least one cased character in S, False otherwise.");
6612 static PyObject*
6613 unicode_islower(PyUnicodeObject *self)
6615 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6616 register const Py_UNICODE *e;
6617 int cased;
6619 /* Shortcut for single character strings */
6620 if (PyUnicode_GET_SIZE(self) == 1)
6621 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6623 /* Special case for empty strings */
6624 if (PyUnicode_GET_SIZE(self) == 0)
6625 return PyBool_FromLong(0);
6627 e = p + PyUnicode_GET_SIZE(self);
6628 cased = 0;
6629 for (; p < e; p++) {
6630 register const Py_UNICODE ch = *p;
6632 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6633 return PyBool_FromLong(0);
6634 else if (!cased && Py_UNICODE_ISLOWER(ch))
6635 cased = 1;
6637 return PyBool_FromLong(cased);
6640 PyDoc_STRVAR(isupper__doc__,
6641 "S.isupper() -> bool\n\
6643 Return True if all cased characters in S are uppercase and there is\n\
6644 at least one cased character in S, False otherwise.");
6646 static PyObject*
6647 unicode_isupper(PyUnicodeObject *self)
6649 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6650 register const Py_UNICODE *e;
6651 int cased;
6653 /* Shortcut for single character strings */
6654 if (PyUnicode_GET_SIZE(self) == 1)
6655 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6657 /* Special case for empty strings */
6658 if (PyUnicode_GET_SIZE(self) == 0)
6659 return PyBool_FromLong(0);
6661 e = p + PyUnicode_GET_SIZE(self);
6662 cased = 0;
6663 for (; p < e; p++) {
6664 register const Py_UNICODE ch = *p;
6666 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6667 return PyBool_FromLong(0);
6668 else if (!cased && Py_UNICODE_ISUPPER(ch))
6669 cased = 1;
6671 return PyBool_FromLong(cased);
6674 PyDoc_STRVAR(istitle__doc__,
6675 "S.istitle() -> bool\n\
6677 Return True if S is a titlecased string and there is at least one\n\
6678 character in S, i.e. upper- and titlecase characters may only\n\
6679 follow uncased characters and lowercase characters only cased ones.\n\
6680 Return False otherwise.");
6682 static PyObject*
6683 unicode_istitle(PyUnicodeObject *self)
6685 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6686 register const Py_UNICODE *e;
6687 int cased, previous_is_cased;
6689 /* Shortcut for single character strings */
6690 if (PyUnicode_GET_SIZE(self) == 1)
6691 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6692 (Py_UNICODE_ISUPPER(*p) != 0));
6694 /* Special case for empty strings */
6695 if (PyUnicode_GET_SIZE(self) == 0)
6696 return PyBool_FromLong(0);
6698 e = p + PyUnicode_GET_SIZE(self);
6699 cased = 0;
6700 previous_is_cased = 0;
6701 for (; p < e; p++) {
6702 register const Py_UNICODE ch = *p;
6704 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6705 if (previous_is_cased)
6706 return PyBool_FromLong(0);
6707 previous_is_cased = 1;
6708 cased = 1;
6710 else if (Py_UNICODE_ISLOWER(ch)) {
6711 if (!previous_is_cased)
6712 return PyBool_FromLong(0);
6713 previous_is_cased = 1;
6714 cased = 1;
6716 else
6717 previous_is_cased = 0;
6719 return PyBool_FromLong(cased);
6722 PyDoc_STRVAR(isspace__doc__,
6723 "S.isspace() -> bool\n\
6725 Return True if all characters in S are whitespace\n\
6726 and there is at least one character in S, False otherwise.");
6728 static PyObject*
6729 unicode_isspace(PyUnicodeObject *self)
6731 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6732 register const Py_UNICODE *e;
6734 /* Shortcut for single character strings */
6735 if (PyUnicode_GET_SIZE(self) == 1 &&
6736 Py_UNICODE_ISSPACE(*p))
6737 return PyBool_FromLong(1);
6739 /* Special case for empty strings */
6740 if (PyUnicode_GET_SIZE(self) == 0)
6741 return PyBool_FromLong(0);
6743 e = p + PyUnicode_GET_SIZE(self);
6744 for (; p < e; p++) {
6745 if (!Py_UNICODE_ISSPACE(*p))
6746 return PyBool_FromLong(0);
6748 return PyBool_FromLong(1);
6751 PyDoc_STRVAR(isalpha__doc__,
6752 "S.isalpha() -> bool\n\
6754 Return True if all characters in S are alphabetic\n\
6755 and there is at least one character in S, False otherwise.");
6757 static PyObject*
6758 unicode_isalpha(PyUnicodeObject *self)
6760 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6761 register const Py_UNICODE *e;
6763 /* Shortcut for single character strings */
6764 if (PyUnicode_GET_SIZE(self) == 1 &&
6765 Py_UNICODE_ISALPHA(*p))
6766 return PyBool_FromLong(1);
6768 /* Special case for empty strings */
6769 if (PyUnicode_GET_SIZE(self) == 0)
6770 return PyBool_FromLong(0);
6772 e = p + PyUnicode_GET_SIZE(self);
6773 for (; p < e; p++) {
6774 if (!Py_UNICODE_ISALPHA(*p))
6775 return PyBool_FromLong(0);
6777 return PyBool_FromLong(1);
6780 PyDoc_STRVAR(isalnum__doc__,
6781 "S.isalnum() -> bool\n\
6783 Return True if all characters in S are alphanumeric\n\
6784 and there is at least one character in S, False otherwise.");
6786 static PyObject*
6787 unicode_isalnum(PyUnicodeObject *self)
6789 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6790 register const Py_UNICODE *e;
6792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self) == 1 &&
6794 Py_UNICODE_ISALNUM(*p))
6795 return PyBool_FromLong(1);
6797 /* Special case for empty strings */
6798 if (PyUnicode_GET_SIZE(self) == 0)
6799 return PyBool_FromLong(0);
6801 e = p + PyUnicode_GET_SIZE(self);
6802 for (; p < e; p++) {
6803 if (!Py_UNICODE_ISALNUM(*p))
6804 return PyBool_FromLong(0);
6806 return PyBool_FromLong(1);
6809 PyDoc_STRVAR(isdecimal__doc__,
6810 "S.isdecimal() -> bool\n\
6812 Return True if there are only decimal characters in S,\n\
6813 False otherwise.");
6815 static PyObject*
6816 unicode_isdecimal(PyUnicodeObject *self)
6818 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6819 register const Py_UNICODE *e;
6821 /* Shortcut for single character strings */
6822 if (PyUnicode_GET_SIZE(self) == 1 &&
6823 Py_UNICODE_ISDECIMAL(*p))
6824 return PyBool_FromLong(1);
6826 /* Special case for empty strings */
6827 if (PyUnicode_GET_SIZE(self) == 0)
6828 return PyBool_FromLong(0);
6830 e = p + PyUnicode_GET_SIZE(self);
6831 for (; p < e; p++) {
6832 if (!Py_UNICODE_ISDECIMAL(*p))
6833 return PyBool_FromLong(0);
6835 return PyBool_FromLong(1);
6838 PyDoc_STRVAR(isdigit__doc__,
6839 "S.isdigit() -> bool\n\
6841 Return True if all characters in S are digits\n\
6842 and there is at least one character in S, False otherwise.");
6844 static PyObject*
6845 unicode_isdigit(PyUnicodeObject *self)
6847 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6848 register const Py_UNICODE *e;
6850 /* Shortcut for single character strings */
6851 if (PyUnicode_GET_SIZE(self) == 1 &&
6852 Py_UNICODE_ISDIGIT(*p))
6853 return PyBool_FromLong(1);
6855 /* Special case for empty strings */
6856 if (PyUnicode_GET_SIZE(self) == 0)
6857 return PyBool_FromLong(0);
6859 e = p + PyUnicode_GET_SIZE(self);
6860 for (; p < e; p++) {
6861 if (!Py_UNICODE_ISDIGIT(*p))
6862 return PyBool_FromLong(0);
6864 return PyBool_FromLong(1);
6867 PyDoc_STRVAR(isnumeric__doc__,
6868 "S.isnumeric() -> bool\n\
6870 Return True if there are only numeric characters in S,\n\
6871 False otherwise.");
6873 static PyObject*
6874 unicode_isnumeric(PyUnicodeObject *self)
6876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6877 register const Py_UNICODE *e;
6879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self) == 1 &&
6881 Py_UNICODE_ISNUMERIC(*p))
6882 return PyBool_FromLong(1);
6884 /* Special case for empty strings */
6885 if (PyUnicode_GET_SIZE(self) == 0)
6886 return PyBool_FromLong(0);
6888 e = p + PyUnicode_GET_SIZE(self);
6889 for (; p < e; p++) {
6890 if (!Py_UNICODE_ISNUMERIC(*p))
6891 return PyBool_FromLong(0);
6893 return PyBool_FromLong(1);
6896 PyDoc_STRVAR(join__doc__,
6897 "S.join(iterable) -> unicode\n\
6899 Return a string which is the concatenation of the strings in the\n\
6900 iterable. The separator between elements is S.");
6902 static PyObject*
6903 unicode_join(PyObject *self, PyObject *data)
6905 return PyUnicode_Join(self, data);
6908 static Py_ssize_t
6909 unicode_length(PyUnicodeObject *self)
6911 return self->length;
6914 PyDoc_STRVAR(ljust__doc__,
6915 "S.ljust(width[, fillchar]) -> int\n\
6917 Return S left-justified in a Unicode string of length width. Padding is\n\
6918 done using the specified fill character (default is a space).");
6920 static PyObject *
6921 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6923 Py_ssize_t width;
6924 Py_UNICODE fillchar = ' ';
6926 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6927 return NULL;
6929 if (self->length >= width && PyUnicode_CheckExact(self)) {
6930 Py_INCREF(self);
6931 return (PyObject*) self;
6934 return (PyObject*) pad(self, 0, width - self->length, fillchar);
6937 PyDoc_STRVAR(lower__doc__,
6938 "S.lower() -> unicode\n\
6940 Return a copy of the string S converted to lowercase.");
6942 static PyObject*
6943 unicode_lower(PyUnicodeObject *self)
6945 return fixup(self, fixlower);
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip type above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip type: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6957 /* externally visible for str.strip(unicode) */
6958 PyObject *
6959 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6961 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6962 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6963 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6964 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6965 Py_ssize_t i, j;
6967 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6969 i = 0;
6970 if (striptype != RIGHTSTRIP) {
6971 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6972 i++;
6976 j = len;
6977 if (striptype != LEFTSTRIP) {
6978 do {
6979 j--;
6980 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6981 j++;
6984 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6985 Py_INCREF(self);
6986 return (PyObject*)self;
6988 else
6989 return PyUnicode_FromUnicode(s+i, j-i);
6993 static PyObject *
6994 do_strip(PyUnicodeObject *self, int striptype)
6996 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6997 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6999 i = 0;
7000 if (striptype != RIGHTSTRIP) {
7001 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7002 i++;
7006 j = len;
7007 if (striptype != LEFTSTRIP) {
7008 do {
7009 j--;
7010 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7011 j++;
7014 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7015 Py_INCREF(self);
7016 return (PyObject*)self;
7018 else
7019 return PyUnicode_FromUnicode(s+i, j-i);
7023 static PyObject *
7024 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7026 PyObject *sep = NULL;
7028 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7029 return NULL;
7031 if (sep != NULL && sep != Py_None) {
7032 if (PyUnicode_Check(sep))
7033 return _PyUnicode_XStrip(self, striptype, sep);
7034 else if (PyString_Check(sep)) {
7035 PyObject *res;
7036 sep = PyUnicode_FromObject(sep);
7037 if (sep==NULL)
7038 return NULL;
7039 res = _PyUnicode_XStrip(self, striptype, sep);
7040 Py_DECREF(sep);
7041 return res;
7043 else {
7044 PyErr_Format(PyExc_TypeError,
7045 "%s arg must be None, unicode or str",
7046 STRIPNAME(striptype));
7047 return NULL;
7051 return do_strip(self, striptype);
7055 PyDoc_STRVAR(strip__doc__,
7056 "S.strip([chars]) -> unicode\n\
7058 Return a copy of the string S with leading and trailing\n\
7059 whitespace removed.\n\
7060 If chars is given and not None, remove characters in chars instead.\n\
7061 If chars is a str, it will be converted to unicode before stripping");
7063 static PyObject *
7064 unicode_strip(PyUnicodeObject *self, PyObject *args)
7066 if (PyTuple_GET_SIZE(args) == 0)
7067 return do_strip(self, BOTHSTRIP); /* Common case */
7068 else
7069 return do_argstrip(self, BOTHSTRIP, args);
7073 PyDoc_STRVAR(lstrip__doc__,
7074 "S.lstrip([chars]) -> unicode\n\
7076 Return a copy of the string S with leading whitespace removed.\n\
7077 If chars is given and not None, remove characters in chars instead.\n\
7078 If chars is a str, it will be converted to unicode before stripping");
7080 static PyObject *
7081 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7083 if (PyTuple_GET_SIZE(args) == 0)
7084 return do_strip(self, LEFTSTRIP); /* Common case */
7085 else
7086 return do_argstrip(self, LEFTSTRIP, args);
7090 PyDoc_STRVAR(rstrip__doc__,
7091 "S.rstrip([chars]) -> unicode\n\
7093 Return a copy of the string S with trailing whitespace removed.\n\
7094 If chars is given and not None, remove characters in chars instead.\n\
7095 If chars is a str, it will be converted to unicode before stripping");
7097 static PyObject *
7098 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7100 if (PyTuple_GET_SIZE(args) == 0)
7101 return do_strip(self, RIGHTSTRIP); /* Common case */
7102 else
7103 return do_argstrip(self, RIGHTSTRIP, args);
7107 static PyObject*
7108 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7110 PyUnicodeObject *u;
7111 Py_UNICODE *p;
7112 Py_ssize_t nchars;
7113 size_t nbytes;
7115 if (len < 0)
7116 len = 0;
7118 if (len == 1 && PyUnicode_CheckExact(str)) {
7119 /* no repeat, return original string */
7120 Py_INCREF(str);
7121 return (PyObject*) str;
7124 /* ensure # of chars needed doesn't overflow int and # of bytes
7125 * needed doesn't overflow size_t
7127 nchars = len * str->length;
7128 if (len && nchars / len != str->length) {
7129 PyErr_SetString(PyExc_OverflowError,
7130 "repeated string is too long");
7131 return NULL;
7133 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7134 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7135 PyErr_SetString(PyExc_OverflowError,
7136 "repeated string is too long");
7137 return NULL;
7139 u = _PyUnicode_New(nchars);
7140 if (!u)
7141 return NULL;
7143 p = u->str;
7145 if (str->length == 1 && len > 0) {
7146 Py_UNICODE_FILL(p, str->str[0], len);
7147 } else {
7148 Py_ssize_t done = 0; /* number of characters copied this far */
7149 if (done < nchars) {
7150 Py_UNICODE_COPY(p, str->str, str->length);
7151 done = str->length;
7153 while (done < nchars) {
7154 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7155 Py_UNICODE_COPY(p+done, p, n);
7156 done += n;
7160 return (PyObject*) u;
7163 PyObject *PyUnicode_Replace(PyObject *obj,
7164 PyObject *subobj,
7165 PyObject *replobj,
7166 Py_ssize_t maxcount)
7168 PyObject *self;
7169 PyObject *str1;
7170 PyObject *str2;
7171 PyObject *result;
7173 self = PyUnicode_FromObject(obj);
7174 if (self == NULL)
7175 return NULL;
7176 str1 = PyUnicode_FromObject(subobj);
7177 if (str1 == NULL) {
7178 Py_DECREF(self);
7179 return NULL;
7181 str2 = PyUnicode_FromObject(replobj);
7182 if (str2 == NULL) {
7183 Py_DECREF(self);
7184 Py_DECREF(str1);
7185 return NULL;
7187 result = replace((PyUnicodeObject *)self,
7188 (PyUnicodeObject *)str1,
7189 (PyUnicodeObject *)str2,
7190 maxcount);
7191 Py_DECREF(self);
7192 Py_DECREF(str1);
7193 Py_DECREF(str2);
7194 return result;
7197 PyDoc_STRVAR(replace__doc__,
7198 "S.replace (old, new[, count]) -> unicode\n\
7200 Return a copy of S with all occurrences of substring\n\
7201 old replaced by new. If the optional argument count is\n\
7202 given, only the first count occurrences are replaced.");
7204 static PyObject*
7205 unicode_replace(PyUnicodeObject *self, PyObject *args)
7207 PyUnicodeObject *str1;
7208 PyUnicodeObject *str2;
7209 Py_ssize_t maxcount = -1;
7210 PyObject *result;
7212 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7213 return NULL;
7214 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7215 if (str1 == NULL)
7216 return NULL;
7217 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7218 if (str2 == NULL) {
7219 Py_DECREF(str1);
7220 return NULL;
7223 result = replace(self, str1, str2, maxcount);
7225 Py_DECREF(str1);
7226 Py_DECREF(str2);
7227 return result;
7230 static
7231 PyObject *unicode_repr(PyObject *unicode)
7233 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7234 PyUnicode_GET_SIZE(unicode),
7238 PyDoc_STRVAR(rfind__doc__,
7239 "S.rfind(sub [,start [,end]]) -> int\n\
7241 Return the highest index in S where substring sub is found,\n\
7242 such that sub is contained within s[start:end]. Optional\n\
7243 arguments start and end are interpreted as in slice notation.\n\
7245 Return -1 on failure.");
7247 static PyObject *
7248 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7250 PyObject *substring;
7251 Py_ssize_t start;
7252 Py_ssize_t end;
7253 Py_ssize_t result;
7255 if (!_ParseTupleFinds(args, &substring, &start, &end))
7256 return NULL;
7258 result = stringlib_rfind_slice(
7259 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7260 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7261 start, end
7264 Py_DECREF(substring);
7266 return PyInt_FromSsize_t(result);
7269 PyDoc_STRVAR(rindex__doc__,
7270 "S.rindex(sub [,start [,end]]) -> int\n\
7272 Like S.rfind() but raise ValueError when the substring is not found.");
7274 static PyObject *
7275 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7277 PyObject *substring;
7278 Py_ssize_t start;
7279 Py_ssize_t end;
7280 Py_ssize_t result;
7282 if (!_ParseTupleFinds(args, &substring, &start, &end))
7283 return NULL;
7285 result = stringlib_rfind_slice(
7286 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7287 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7288 start, end
7291 Py_DECREF(substring);
7293 if (result < 0) {
7294 PyErr_SetString(PyExc_ValueError, "substring not found");
7295 return NULL;
7297 return PyInt_FromSsize_t(result);
7300 PyDoc_STRVAR(rjust__doc__,
7301 "S.rjust(width[, fillchar]) -> unicode\n\
7303 Return S right-justified in a Unicode string of length width. Padding is\n\
7304 done using the specified fill character (default is a space).");
7306 static PyObject *
7307 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7309 Py_ssize_t width;
7310 Py_UNICODE fillchar = ' ';
7312 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7313 return NULL;
7315 if (self->length >= width && PyUnicode_CheckExact(self)) {
7316 Py_INCREF(self);
7317 return (PyObject*) self;
7320 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7323 static PyObject*
7324 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7326 /* standard clamping */
7327 if (start < 0)
7328 start = 0;
7329 if (end < 0)
7330 end = 0;
7331 if (end > self->length)
7332 end = self->length;
7333 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7334 /* full slice, return original string */
7335 Py_INCREF(self);
7336 return (PyObject*) self;
7338 if (start > end)
7339 start = end;
7340 /* copy slice */
7341 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7342 end - start);
7345 PyObject *PyUnicode_Split(PyObject *s,
7346 PyObject *sep,
7347 Py_ssize_t maxsplit)
7349 PyObject *result;
7351 s = PyUnicode_FromObject(s);
7352 if (s == NULL)
7353 return NULL;
7354 if (sep != NULL) {
7355 sep = PyUnicode_FromObject(sep);
7356 if (sep == NULL) {
7357 Py_DECREF(s);
7358 return NULL;
7362 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7364 Py_DECREF(s);
7365 Py_XDECREF(sep);
7366 return result;
7369 PyDoc_STRVAR(split__doc__,
7370 "S.split([sep [,maxsplit]]) -> list of strings\n\
7372 Return a list of the words in S, using sep as the\n\
7373 delimiter string. If maxsplit is given, at most maxsplit\n\
7374 splits are done. If sep is not specified or is None, any\n\
7375 whitespace string is a separator and empty strings are\n\
7376 removed from the result.");
7378 static PyObject*
7379 unicode_split(PyUnicodeObject *self, PyObject *args)
7381 PyObject *substring = Py_None;
7382 Py_ssize_t maxcount = -1;
7384 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7385 return NULL;
7387 if (substring == Py_None)
7388 return split(self, NULL, maxcount);
7389 else if (PyUnicode_Check(substring))
7390 return split(self, (PyUnicodeObject *)substring, maxcount);
7391 else
7392 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7395 PyObject *
7396 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7398 PyObject* str_obj;
7399 PyObject* sep_obj;
7400 PyObject* out;
7402 str_obj = PyUnicode_FromObject(str_in);
7403 if (!str_obj)
7404 return NULL;
7405 sep_obj = PyUnicode_FromObject(sep_in);
7406 if (!sep_obj) {
7407 Py_DECREF(str_obj);
7408 return NULL;
7411 out = stringlib_partition(
7412 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7413 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7416 Py_DECREF(sep_obj);
7417 Py_DECREF(str_obj);
7419 return out;
7423 PyObject *
7424 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7426 PyObject* str_obj;
7427 PyObject* sep_obj;
7428 PyObject* out;
7430 str_obj = PyUnicode_FromObject(str_in);
7431 if (!str_obj)
7432 return NULL;
7433 sep_obj = PyUnicode_FromObject(sep_in);
7434 if (!sep_obj) {
7435 Py_DECREF(str_obj);
7436 return NULL;
7439 out = stringlib_rpartition(
7440 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7441 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7444 Py_DECREF(sep_obj);
7445 Py_DECREF(str_obj);
7447 return out;
7450 PyDoc_STRVAR(partition__doc__,
7451 "S.partition(sep) -> (head, sep, tail)\n\
7453 Search for the separator sep in S, and return the part before it,\n\
7454 the separator itself, and the part after it. If the separator is not\n\
7455 found, return S and two empty strings.");
7457 static PyObject*
7458 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7460 return PyUnicode_Partition((PyObject *)self, separator);
7463 PyDoc_STRVAR(rpartition__doc__,
7464 "S.rpartition(sep) -> (tail, sep, head)\n\
7466 Search for the separator sep in S, starting at the end of S, and return\n\
7467 the part before it, the separator itself, and the part after it. If the\n\
7468 separator is not found, return two empty strings and S.");
7470 static PyObject*
7471 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7473 return PyUnicode_RPartition((PyObject *)self, separator);
7476 PyObject *PyUnicode_RSplit(PyObject *s,
7477 PyObject *sep,
7478 Py_ssize_t maxsplit)
7480 PyObject *result;
7482 s = PyUnicode_FromObject(s);
7483 if (s == NULL)
7484 return NULL;
7485 if (sep != NULL) {
7486 sep = PyUnicode_FromObject(sep);
7487 if (sep == NULL) {
7488 Py_DECREF(s);
7489 return NULL;
7493 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7495 Py_DECREF(s);
7496 Py_XDECREF(sep);
7497 return result;
7500 PyDoc_STRVAR(rsplit__doc__,
7501 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7503 Return a list of the words in S, using sep as the\n\
7504 delimiter string, starting at the end of the string and\n\
7505 working to the front. If maxsplit is given, at most maxsplit\n\
7506 splits are done. If sep is not specified, any whitespace string\n\
7507 is a separator.");
7509 static PyObject*
7510 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7512 PyObject *substring = Py_None;
7513 Py_ssize_t maxcount = -1;
7515 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7516 return NULL;
7518 if (substring == Py_None)
7519 return rsplit(self, NULL, maxcount);
7520 else if (PyUnicode_Check(substring))
7521 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7522 else
7523 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7526 PyDoc_STRVAR(splitlines__doc__,
7527 "S.splitlines([keepends]) -> list of strings\n\
7529 Return a list of the lines in S, breaking at line boundaries.\n\
7530 Line breaks are not included in the resulting list unless keepends\n\
7531 is given and true.");
7533 static PyObject*
7534 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7536 int keepends = 0;
7538 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7539 return NULL;
7541 return PyUnicode_Splitlines((PyObject *)self, keepends);
7544 static
7545 PyObject *unicode_str(PyUnicodeObject *self)
7547 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7550 PyDoc_STRVAR(swapcase__doc__,
7551 "S.swapcase() -> unicode\n\
7553 Return a copy of S with uppercase characters converted to lowercase\n\
7554 and vice versa.");
7556 static PyObject*
7557 unicode_swapcase(PyUnicodeObject *self)
7559 return fixup(self, fixswapcase);
7562 PyDoc_STRVAR(translate__doc__,
7563 "S.translate(table) -> unicode\n\
7565 Return a copy of the string S, where all characters have been mapped\n\
7566 through the given translation table, which must be a mapping of\n\
7567 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7568 Unmapped characters are left untouched. Characters mapped to None\n\
7569 are deleted.");
7571 static PyObject*
7572 unicode_translate(PyUnicodeObject *self, PyObject *table)
7574 return PyUnicode_TranslateCharmap(self->str,
7575 self->length,
7576 table,
7577 "ignore");
7580 PyDoc_STRVAR(upper__doc__,
7581 "S.upper() -> unicode\n\
7583 Return a copy of S converted to uppercase.");
7585 static PyObject*
7586 unicode_upper(PyUnicodeObject *self)
7588 return fixup(self, fixupper);
7591 PyDoc_STRVAR(zfill__doc__,
7592 "S.zfill(width) -> unicode\n\
7594 Pad a numeric string S with zeros on the left, to fill a field\n\
7595 of the specified width. The string S is never truncated.");
7597 static PyObject *
7598 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7600 Py_ssize_t fill;
7601 PyUnicodeObject *u;
7603 Py_ssize_t width;
7604 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7605 return NULL;
7607 if (self->length >= width) {
7608 if (PyUnicode_CheckExact(self)) {
7609 Py_INCREF(self);
7610 return (PyObject*) self;
7612 else
7613 return PyUnicode_FromUnicode(
7614 PyUnicode_AS_UNICODE(self),
7615 PyUnicode_GET_SIZE(self)
7619 fill = width - self->length;
7621 u = pad(self, fill, 0, '0');
7623 if (u == NULL)
7624 return NULL;
7626 if (u->str[fill] == '+' || u->str[fill] == '-') {
7627 /* move sign to beginning of string */
7628 u->str[0] = u->str[fill];
7629 u->str[fill] = '0';
7632 return (PyObject*) u;
/* Compiled out by the surrounding "#if 0".  When enabled, returns the value
   of the file-level variable numfree as a Python int. */
7635 #if 0
7636 static PyObject*
7637 free_listsize(PyUnicodeObject *self)
7639 return PyInt_FromLong(numfree);
7641 #endif
7643 PyDoc_STRVAR(startswith__doc__,
7644 "S.startswith(prefix[, start[, end]]) -> bool\n\
7646 Return True if S starts with the specified prefix, False otherwise.\n\
7647 With optional start, test S beginning at that position.\n\
7648 With optional end, stop comparing S at that position.\n\
7649 prefix can also be a tuple of strings to try.");
7651 static PyObject *
7652 unicode_startswith(PyUnicodeObject *self,
7653 PyObject *args)
7655 PyObject *subobj;
7656 PyUnicodeObject *substring;
7657 Py_ssize_t start = 0;
7658 Py_ssize_t end = PY_SSIZE_T_MAX;
7659 int result;
7661 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7662 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7663 return NULL;
7664 if (PyTuple_Check(subobj)) {
7665 Py_ssize_t i;
7666 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7667 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7668 PyTuple_GET_ITEM(subobj, i));
7669 if (substring == NULL)
7670 return NULL;
7671 result = tailmatch(self, substring, start, end, -1);
7672 Py_DECREF(substring);
7673 if (result) {
7674 Py_RETURN_TRUE;
7677 /* nothing matched */
7678 Py_RETURN_FALSE;
7680 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7681 if (substring == NULL)
7682 return NULL;
7683 result = tailmatch(self, substring, start, end, -1);
7684 Py_DECREF(substring);
7685 return PyBool_FromLong(result);
7689 PyDoc_STRVAR(endswith__doc__,
7690 "S.endswith(suffix[, start[, end]]) -> bool\n\
7692 Return True if S ends with the specified suffix, False otherwise.\n\
7693 With optional start, test S beginning at that position.\n\
7694 With optional end, stop comparing S at that position.\n\
7695 suffix can also be a tuple of strings to try.");
7697 static PyObject *
7698 unicode_endswith(PyUnicodeObject *self,
7699 PyObject *args)
7701 PyObject *subobj;
7702 PyUnicodeObject *substring;
7703 Py_ssize_t start = 0;
7704 Py_ssize_t end = PY_SSIZE_T_MAX;
7705 int result;
7707 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7708 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7709 return NULL;
7710 if (PyTuple_Check(subobj)) {
7711 Py_ssize_t i;
7712 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7713 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7714 PyTuple_GET_ITEM(subobj, i));
7715 if (substring == NULL)
7716 return NULL;
7717 result = tailmatch(self, substring, start, end, +1);
7718 Py_DECREF(substring);
7719 if (result) {
7720 Py_RETURN_TRUE;
7723 Py_RETURN_FALSE;
7725 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7726 if (substring == NULL)
7727 return NULL;
7729 result = tailmatch(self, substring, start, end, +1);
7730 Py_DECREF(substring);
7731 return PyBool_FromLong(result);
7735 /* Implements do_string_format, which is unicode because of stringlib */
7736 #include "stringlib/string_format.h"
7738 PyDoc_STRVAR(format__doc__,
7739 "S.format(*args, **kwargs) -> unicode\n\
7743 static PyObject *
7744 unicode__format__(PyObject *self, PyObject *args)
7746 PyObject *format_spec;
7747 PyObject *result = NULL;
7748 PyObject *tmp = NULL;
7750 /* If 2.x, convert format_spec to the same type as value */
7751 /* This is to allow things like u''.format('') */
7752 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7753 goto done;
7754 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7755 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7756 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7757 goto done;
7759 tmp = PyObject_Unicode(format_spec);
7760 if (tmp == NULL)
7761 goto done;
7762 format_spec = tmp;
7764 result = _PyUnicode_FormatAdvanced(self,
7765 PyUnicode_AS_UNICODE(format_spec),
7766 PyUnicode_GET_SIZE(format_spec));
7767 done:
7768 Py_XDECREF(tmp);
7769 return result;
7772 PyDoc_STRVAR(p_format__doc__,
7773 "S.__format__(format_spec) -> unicode\n\
7777 static PyObject *
7778 unicode__sizeof__(PyUnicodeObject *v)
7780 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7781 sizeof(Py_UNICODE) * (v->length + 1));
7784 PyDoc_STRVAR(sizeof__doc__,
7785 "S.__sizeof__() -> size of S in memory, in bytes\n\
7789 static PyObject *
7790 unicode_getnewargs(PyUnicodeObject *v)
7792 return Py_BuildValue("(u#)", v->str, v->length);
/* Method table for the unicode type.  Each entry lists the Python-visible
   method name, the C function implementing it, the calling-convention flags
   (METH_NOARGS, METH_O, METH_VARARGS, optionally with METH_KEYWORDS) and,
   where one is given, the docstring.  The table ends with a {NULL, NULL}
   sentinel entry.  Entries inside "#if 0" are compiled out. */
7796 static PyMethodDef unicode_methods[] = {
7798 /* Order is according to common usage: often used methods should
7799 appear first, since lookup is done sequentially. */
7801 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7802 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7803 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7804 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7805 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7806 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7807 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7808 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7809 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7810 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7811 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7812 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7813 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7814 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7815 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7816 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7817 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7818 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7819 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7820 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7821 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7822 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7823 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7824 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7825 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7826 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7827 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7828 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7829 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7830 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7831 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7832 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7833 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7834 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7835 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7836 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7837 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7838 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7839 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7840 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7841 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7842 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7843 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7844 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7845 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7846 #if 0
7847 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7848 #endif
7850 #if 0
7851 /* This one is just used for debugging the implementation. */
7852 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7853 #endif
7855 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7856 {NULL, NULL}
7859 static PyObject *
7860 unicode_mod(PyObject *v, PyObject *w)
7862 if (!PyUnicode_Check(v)) {
7863 Py_INCREF(Py_NotImplemented);
7864 return Py_NotImplemented;
7866 return PyUnicode_Format(v, w);
/* Number protocol table for the unicode type.  The first four slots are
   zero; the remainder slot is filled with unicode_mod. */
7869 static PyNumberMethods unicode_as_number = {
7870 0, /*nb_add*/
7871 0, /*nb_subtract*/
7872 0, /*nb_multiply*/
7873 0, /*nb_divide*/
7874 unicode_mod, /*nb_remainder*/
/* Sequence protocol table for the unicode type.  Length, concatenation,
   repetition, item access, slicing and containment are provided; the two
   assignment slots are zero. */
7877 static PySequenceMethods unicode_as_sequence = {
7878 (lenfunc) unicode_length, /* sq_length */
7879 PyUnicode_Concat, /* sq_concat */
7880 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7881 (ssizeargfunc) unicode_getitem, /* sq_item */
7882 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7883 0, /* sq_ass_item */
7884 0, /* sq_ass_slice */
7885 PyUnicode_Contains, /* sq_contains */
7888 static PyObject*
7889 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7891 if (PyIndex_Check(item)) {
7892 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7893 if (i == -1 && PyErr_Occurred())
7894 return NULL;
7895 if (i < 0)
7896 i += PyUnicode_GET_SIZE(self);
7897 return unicode_getitem(self, i);
7898 } else if (PySlice_Check(item)) {
7899 Py_ssize_t start, stop, step, slicelength, cur, i;
7900 Py_UNICODE* source_buf;
7901 Py_UNICODE* result_buf;
7902 PyObject* result;
7904 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7905 &start, &stop, &step, &slicelength) < 0) {
7906 return NULL;
7909 if (slicelength <= 0) {
7910 return PyUnicode_FromUnicode(NULL, 0);
7911 } else if (start == 0 && step == 1 && slicelength == self->length &&
7912 PyUnicode_CheckExact(self)) {
7913 Py_INCREF(self);
7914 return (PyObject *)self;
7915 } else if (step == 1) {
7916 return PyUnicode_FromUnicode(self->str + start, slicelength);
7917 } else {
7918 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7919 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7920 sizeof(Py_UNICODE));
7922 if (result_buf == NULL)
7923 return PyErr_NoMemory();
7925 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7926 result_buf[i] = source_buf[cur];
7929 result = PyUnicode_FromUnicode(result_buf, slicelength);
7930 PyObject_FREE(result_buf);
7931 return result;
7933 } else {
7934 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7935 return NULL;
/* Mapping protocol table for the unicode type: length and subscript are
   provided; the subscript-assignment slot is zero. */
7939 static PyMappingMethods unicode_as_mapping = {
7940 (lenfunc)unicode_length, /* mp_length */
7941 (binaryfunc)unicode_subscript, /* mp_subscript */
7942 (objobjargproc)0, /* mp_ass_subscript */
7945 static Py_ssize_t
7946 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7947 Py_ssize_t index,
7948 const void **ptr)
7950 if (index != 0) {
7951 PyErr_SetString(PyExc_SystemError,
7952 "accessing non-existent unicode segment");
7953 return -1;
7955 *ptr = (void *) self->str;
7956 return PyUnicode_GET_DATA_SIZE(self);
7959 static Py_ssize_t
7960 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7961 const void **ptr)
7963 PyErr_SetString(PyExc_TypeError,
7964 "cannot use unicode as modifiable buffer");
7965 return -1;
7968 static int
7969 unicode_buffer_getsegcount(PyUnicodeObject *self,
7970 Py_ssize_t *lenp)
7972 if (lenp)
7973 *lenp = PyUnicode_GET_DATA_SIZE(self);
7974 return 1;
7977 static Py_ssize_t
7978 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7979 Py_ssize_t index,
7980 const void **ptr)
7982 PyObject *str;
7984 if (index != 0) {
7985 PyErr_SetString(PyExc_SystemError,
7986 "accessing non-existent unicode segment");
7987 return -1;
7989 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7990 if (str == NULL)
7991 return -1;
7992 *ptr = (void *) PyString_AS_STRING(str);
7993 return PyString_GET_SIZE(str);
7996 /* Helpers for PyUnicode_Format() */
7998 static PyObject *
7999 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8001 Py_ssize_t argidx = *p_argidx;
8002 if (argidx < arglen) {
8003 (*p_argidx)++;
8004 if (arglen < 0)
8005 return args;
8006 else
8007 return PyTuple_GetItem(args, argidx);
8009 PyErr_SetString(PyExc_TypeError,
8010 "not enough arguments for format string");
8011 return NULL;
/* Bit flags collected while parsing a format specifier in
   PyUnicode_Format(): set by the '-', '+', ' ', '#' and '0' flag
   characters respectively. */
8014 #define F_LJUST (1<<0)
8015 #define F_SIGN (1<<1)
8016 #define F_BLANK (1<<2)
8017 #define F_ALT (1<<3)
8018 #define F_ZERO (1<<4)
8020 static Py_ssize_t
8021 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8023 register Py_ssize_t i;
8024 Py_ssize_t len = strlen(charbuffer);
8025 for (i = len - 1; i >= 0; i--)
8026 buffer[i] = (Py_UNICODE) charbuffer[i];
8028 return len;
8031 static int
8032 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8034 Py_ssize_t result;
8036 PyOS_snprintf((char *)buffer, len, format, x);
8037 result = strtounicode(buffer, (char *)buffer);
8038 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8041 /* XXX To save some code duplication, formatfloat/long/int could have been
8042 shared with stringobject.c, converting from 8-bit to Unicode after the
8043 formatting is done. */
8045 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8047 static PyObject *
8048 formatfloat(PyObject *v, int flags, int prec, int type)
8050 char *p;
8051 PyObject *result;
8052 double x;
8054 x = PyFloat_AsDouble(v);
8055 if (x == -1.0 && PyErr_Occurred())
8056 return NULL;
8058 if (prec < 0)
8059 prec = 6;
8061 p = PyOS_double_to_string(x, type, prec,
8062 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8063 if (p == NULL)
8064 return NULL;
8065 result = PyUnicode_FromStringAndSize(p, strlen(p));
8066 PyMem_Free(p);
8067 return result;
8070 static PyObject*
8071 formatlong(PyObject *val, int flags, int prec, int type)
8073 char *buf;
8074 int i, len;
8075 PyObject *str; /* temporary string object. */
8076 PyUnicodeObject *result;
8078 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8079 if (!str)
8080 return NULL;
8081 result = _PyUnicode_New(len);
8082 if (!result) {
8083 Py_DECREF(str);
8084 return NULL;
8086 for (i = 0; i < len; i++)
8087 result->str[i] = buf[i];
8088 result->str[len] = 0;
8089 Py_DECREF(str);
8090 return (PyObject*)result;
8093 static int
8094 formatint(Py_UNICODE *buf,
8095 size_t buflen,
8096 int flags,
8097 int prec,
8098 int type,
8099 PyObject *v)
8101 /* fmt = '%#.' + `prec` + 'l' + `type`
8102 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8103 * + 1 + 1
8104 * = 24
8106 char fmt[64]; /* plenty big enough! */
8107 char *sign;
8108 long x;
8110 x = PyInt_AsLong(v);
8111 if (x == -1 && PyErr_Occurred())
8112 return -1;
8113 if (x < 0 && type == 'u') {
8114 type = 'd';
8116 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8117 sign = "-";
8118 else
8119 sign = "";
8120 if (prec < 0)
8121 prec = 1;
8123 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8124 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8126 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8127 PyErr_SetString(PyExc_OverflowError,
8128 "formatted integer is too long (precision too large?)");
8129 return -1;
8132 if ((flags & F_ALT) &&
8133 (type == 'x' || type == 'X')) {
8134 /* When converting under %#x or %#X, there are a number
8135 * of issues that cause pain:
8136 * - when 0 is being converted, the C standard leaves off
8137 * the '0x' or '0X', which is inconsistent with other
8138 * %#x/%#X conversions and inconsistent with Python's
8139 * hex() function
8140 * - there are platforms that violate the standard and
8141 * convert 0 with the '0x' or '0X'
8142 * (Metrowerks, Compaq Tru64)
8143 * - there are platforms that give '0x' when converting
8144 * under %#X, but convert 0 in accordance with the
8145 * standard (OS/2 EMX)
8147 * We can achieve the desired consistency by inserting our
8148 * own '0x' or '0X' prefix, and substituting %x/%X in place
8149 * of %#x/%#X.
8151 * Note that this is the same approach as used in
8152 * formatint() in stringobject.c
8154 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8155 sign, type, prec, type);
8157 else {
8158 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8159 sign, (flags&F_ALT) ? "#" : "",
8160 prec, type);
8162 if (sign[0])
8163 return longtounicode(buf, buflen, fmt, -x);
8164 else
8165 return longtounicode(buf, buflen, fmt, x);
8168 static int
8169 formatchar(Py_UNICODE *buf,
8170 size_t buflen,
8171 PyObject *v)
8173 /* presume that the buffer is at least 2 characters long */
8174 if (PyUnicode_Check(v)) {
8175 if (PyUnicode_GET_SIZE(v) != 1)
8176 goto onError;
8177 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8180 else if (PyString_Check(v)) {
8181 if (PyString_GET_SIZE(v) != 1)
8182 goto onError;
8183 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8186 else {
8187 /* Integer input truncated to a character */
8188 long x;
8189 x = PyInt_AsLong(v);
8190 if (x == -1 && PyErr_Occurred())
8191 goto onError;
8192 #ifdef Py_UNICODE_WIDE
8193 if (x < 0 || x > 0x10ffff) {
8194 PyErr_SetString(PyExc_OverflowError,
8195 "%c arg not in range(0x110000) "
8196 "(wide Python build)");
8197 return -1;
8199 #else
8200 if (x < 0 || x > 0xffff) {
8201 PyErr_SetString(PyExc_OverflowError,
8202 "%c arg not in range(0x10000) "
8203 "(narrow Python build)");
8204 return -1;
8206 #endif
8207 buf[0] = (Py_UNICODE) x;
8209 buf[1] = '\0';
8210 return 1;
8212 onError:
8213 PyErr_SetString(PyExc_TypeError,
8214 "%c requires int or char");
8215 return -1;
8218 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8220 FORMATBUFLEN is the length of the buffer in which the ints &
8221 chars are formatted. XXX This is a magic number. Each formatting
8222 routine does bounds checking to ensure no overflow, but a better
8223 solution may be to malloc a buffer of appropriate size for each
8224 format. For now, the current solution is sufficient.
8226 #define FORMATBUFLEN (size_t)120
8228 PyObject *PyUnicode_Format(PyObject *format,
8229 PyObject *args)
8231 Py_UNICODE *fmt, *res;
8232 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8233 int args_owned = 0;
8234 PyUnicodeObject *result = NULL;
8235 PyObject *dict = NULL;
8236 PyObject *uformat;
8238 if (format == NULL || args == NULL) {
8239 PyErr_BadInternalCall();
8240 return NULL;
8242 uformat = PyUnicode_FromObject(format);
8243 if (uformat == NULL)
8244 return NULL;
8245 fmt = PyUnicode_AS_UNICODE(uformat);
8246 fmtcnt = PyUnicode_GET_SIZE(uformat);
8248 reslen = rescnt = fmtcnt + 100;
8249 result = _PyUnicode_New(reslen);
8250 if (result == NULL)
8251 goto onError;
8252 res = PyUnicode_AS_UNICODE(result);
8254 if (PyTuple_Check(args)) {
8255 arglen = PyTuple_Size(args);
8256 argidx = 0;
8258 else {
8259 arglen = -1;
8260 argidx = -2;
8262 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8263 !PyObject_TypeCheck(args, &PyBaseString_Type))
8264 dict = args;
8266 while (--fmtcnt >= 0) {
8267 if (*fmt != '%') {
8268 if (--rescnt < 0) {
8269 rescnt = fmtcnt + 100;
8270 reslen += rescnt;
8271 if (_PyUnicode_Resize(&result, reslen) < 0)
8272 goto onError;
8273 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8274 --rescnt;
8276 *res++ = *fmt++;
8278 else {
8279 /* Got a format specifier */
8280 int flags = 0;
8281 Py_ssize_t width = -1;
8282 int prec = -1;
8283 Py_UNICODE c = '\0';
8284 Py_UNICODE fill;
8285 int isnumok;
8286 PyObject *v = NULL;
8287 PyObject *temp = NULL;
8288 Py_UNICODE *pbuf;
8289 Py_UNICODE sign;
8290 Py_ssize_t len;
8291 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8293 fmt++;
8294 if (*fmt == '(') {
8295 Py_UNICODE *keystart;
8296 Py_ssize_t keylen;
8297 PyObject *key;
8298 int pcount = 1;
8300 if (dict == NULL) {
8301 PyErr_SetString(PyExc_TypeError,
8302 "format requires a mapping");
8303 goto onError;
8305 ++fmt;
8306 --fmtcnt;
8307 keystart = fmt;
8308 /* Skip over balanced parentheses */
8309 while (pcount > 0 && --fmtcnt >= 0) {
8310 if (*fmt == ')')
8311 --pcount;
8312 else if (*fmt == '(')
8313 ++pcount;
8314 fmt++;
8316 keylen = fmt - keystart - 1;
8317 if (fmtcnt < 0 || pcount > 0) {
8318 PyErr_SetString(PyExc_ValueError,
8319 "incomplete format key");
8320 goto onError;
8322 #if 0
8323 /* keys are converted to strings using UTF-8 and
8324 then looked up since Python uses strings to hold
8325 variables names etc. in its namespaces and we
8326 wouldn't want to break common idioms. */
8327 key = PyUnicode_EncodeUTF8(keystart,
8328 keylen,
8329 NULL);
8330 #else
8331 key = PyUnicode_FromUnicode(keystart, keylen);
8332 #endif
8333 if (key == NULL)
8334 goto onError;
8335 if (args_owned) {
8336 Py_DECREF(args);
8337 args_owned = 0;
8339 args = PyObject_GetItem(dict, key);
8340 Py_DECREF(key);
8341 if (args == NULL) {
8342 goto onError;
8344 args_owned = 1;
8345 arglen = -1;
8346 argidx = -2;
8348 while (--fmtcnt >= 0) {
8349 switch (c = *fmt++) {
8350 case '-': flags |= F_LJUST; continue;
8351 case '+': flags |= F_SIGN; continue;
8352 case ' ': flags |= F_BLANK; continue;
8353 case '#': flags |= F_ALT; continue;
8354 case '0': flags |= F_ZERO; continue;
8356 break;
8358 if (c == '*') {
8359 v = getnextarg(args, arglen, &argidx);
8360 if (v == NULL)
8361 goto onError;
8362 if (!PyInt_Check(v)) {
8363 PyErr_SetString(PyExc_TypeError,
8364 "* wants int");
8365 goto onError;
8367 width = PyInt_AsLong(v);
8368 if (width < 0) {
8369 flags |= F_LJUST;
8370 width = -width;
8372 if (--fmtcnt >= 0)
8373 c = *fmt++;
8375 else if (c >= '0' && c <= '9') {
8376 width = c - '0';
8377 while (--fmtcnt >= 0) {
8378 c = *fmt++;
8379 if (c < '0' || c > '9')
8380 break;
8381 if ((width*10) / 10 != width) {
8382 PyErr_SetString(PyExc_ValueError,
8383 "width too big");
8384 goto onError;
8386 width = width*10 + (c - '0');
8389 if (c == '.') {
8390 prec = 0;
8391 if (--fmtcnt >= 0)
8392 c = *fmt++;
8393 if (c == '*') {
8394 v = getnextarg(args, arglen, &argidx);
8395 if (v == NULL)
8396 goto onError;
8397 if (!PyInt_Check(v)) {
8398 PyErr_SetString(PyExc_TypeError,
8399 "* wants int");
8400 goto onError;
8402 prec = PyInt_AsLong(v);
8403 if (prec < 0)
8404 prec = 0;
8405 if (--fmtcnt >= 0)
8406 c = *fmt++;
8408 else if (c >= '0' && c <= '9') {
8409 prec = c - '0';
8410 while (--fmtcnt >= 0) {
8411 c = Py_CHARMASK(*fmt++);
8412 if (c < '0' || c > '9')
8413 break;
8414 if ((prec*10) / 10 != prec) {
8415 PyErr_SetString(PyExc_ValueError,
8416 "prec too big");
8417 goto onError;
8419 prec = prec*10 + (c - '0');
8422 } /* prec */
8423 if (fmtcnt >= 0) {
8424 if (c == 'h' || c == 'l' || c == 'L') {
8425 if (--fmtcnt >= 0)
8426 c = *fmt++;
8429 if (fmtcnt < 0) {
8430 PyErr_SetString(PyExc_ValueError,
8431 "incomplete format");
8432 goto onError;
8434 if (c != '%') {
8435 v = getnextarg(args, arglen, &argidx);
8436 if (v == NULL)
8437 goto onError;
8439 sign = 0;
8440 fill = ' ';
8441 switch (c) {
8443 case '%':
8444 pbuf = formatbuf;
8445 /* presume that buffer length is at least 1 */
8446 pbuf[0] = '%';
8447 len = 1;
8448 break;
8450 case 's':
8451 case 'r':
8452 if (PyUnicode_Check(v) && c == 's') {
8453 temp = v;
8454 Py_INCREF(temp);
8456 else {
8457 PyObject *unicode;
8458 if (c == 's')
8459 temp = PyObject_Unicode(v);
8460 else
8461 temp = PyObject_Repr(v);
8462 if (temp == NULL)
8463 goto onError;
8464 if (PyUnicode_Check(temp))
8465 /* nothing to do */;
8466 else if (PyString_Check(temp)) {
8467 /* convert to string to Unicode */
8468 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8469 PyString_GET_SIZE(temp),
8470 NULL,
8471 "strict");
8472 Py_DECREF(temp);
8473 temp = unicode;
8474 if (temp == NULL)
8475 goto onError;
8477 else {
8478 Py_DECREF(temp);
8479 PyErr_SetString(PyExc_TypeError,
8480 "%s argument has non-string str()");
8481 goto onError;
8484 pbuf = PyUnicode_AS_UNICODE(temp);
8485 len = PyUnicode_GET_SIZE(temp);
8486 if (prec >= 0 && len > prec)
8487 len = prec;
8488 break;
8490 case 'i':
8491 case 'd':
8492 case 'u':
8493 case 'o':
8494 case 'x':
8495 case 'X':
8496 if (c == 'i')
8497 c = 'd';
8498 isnumok = 0;
8499 if (PyNumber_Check(v)) {
8500 PyObject *iobj=NULL;
8502 if (PyInt_Check(v) || (PyLong_Check(v))) {
8503 iobj = v;
8504 Py_INCREF(iobj);
8506 else {
8507 iobj = PyNumber_Int(v);
8508 if (iobj==NULL) iobj = PyNumber_Long(v);
8510 if (iobj!=NULL) {
8511 if (PyInt_Check(iobj)) {
8512 isnumok = 1;
8513 pbuf = formatbuf;
8514 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8515 flags, prec, c, iobj);
8516 Py_DECREF(iobj);
8517 if (len < 0)
8518 goto onError;
8519 sign = 1;
8521 else if (PyLong_Check(iobj)) {
8522 isnumok = 1;
8523 temp = formatlong(iobj, flags, prec, c);
8524 Py_DECREF(iobj);
8525 if (!temp)
8526 goto onError;
8527 pbuf = PyUnicode_AS_UNICODE(temp);
8528 len = PyUnicode_GET_SIZE(temp);
8529 sign = 1;
8531 else {
8532 Py_DECREF(iobj);
8536 if (!isnumok) {
8537 PyErr_Format(PyExc_TypeError,
8538 "%%%c format: a number is required, "
8539 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8540 goto onError;
8542 if (flags & F_ZERO)
8543 fill = '0';
8544 break;
8546 case 'e':
8547 case 'E':
8548 case 'f':
8549 case 'F':
8550 case 'g':
8551 case 'G':
8552 temp = formatfloat(v, flags, prec, c);
8553 if (temp == NULL)
8554 goto onError;
8555 pbuf = PyUnicode_AS_UNICODE(temp);
8556 len = PyUnicode_GET_SIZE(temp);
8557 sign = 1;
8558 if (flags & F_ZERO)
8559 fill = '0';
8560 break;
8562 case 'c':
8563 pbuf = formatbuf;
8564 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8565 if (len < 0)
8566 goto onError;
8567 break;
8569 default:
8570 PyErr_Format(PyExc_ValueError,
8571 "unsupported format character '%c' (0x%x) "
8572 "at index %zd",
8573 (31<=c && c<=126) ? (char)c : '?',
8574 (int)c,
8575 (Py_ssize_t)(fmt - 1 -
8576 PyUnicode_AS_UNICODE(uformat)));
8577 goto onError;
8579 if (sign) {
8580 if (*pbuf == '-' || *pbuf == '+') {
8581 sign = *pbuf++;
8582 len--;
8584 else if (flags & F_SIGN)
8585 sign = '+';
8586 else if (flags & F_BLANK)
8587 sign = ' ';
8588 else
8589 sign = 0;
8591 if (width < len)
8592 width = len;
8593 if (rescnt - (sign != 0) < width) {
8594 reslen -= rescnt;
8595 rescnt = width + fmtcnt + 100;
8596 reslen += rescnt;
8597 if (reslen < 0) {
8598 Py_XDECREF(temp);
8599 PyErr_NoMemory();
8600 goto onError;
8602 if (_PyUnicode_Resize(&result, reslen) < 0) {
8603 Py_XDECREF(temp);
8604 goto onError;
8606 res = PyUnicode_AS_UNICODE(result)
8607 + reslen - rescnt;
8609 if (sign) {
8610 if (fill != ' ')
8611 *res++ = sign;
8612 rescnt--;
8613 if (width > len)
8614 width--;
8616 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8617 assert(pbuf[0] == '0');
8618 assert(pbuf[1] == c);
8619 if (fill != ' ') {
8620 *res++ = *pbuf++;
8621 *res++ = *pbuf++;
8623 rescnt -= 2;
8624 width -= 2;
8625 if (width < 0)
8626 width = 0;
8627 len -= 2;
8629 if (width > len && !(flags & F_LJUST)) {
8630 do {
8631 --rescnt;
8632 *res++ = fill;
8633 } while (--width > len);
8635 if (fill == ' ') {
8636 if (sign)
8637 *res++ = sign;
8638 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8639 assert(pbuf[0] == '0');
8640 assert(pbuf[1] == c);
8641 *res++ = *pbuf++;
8642 *res++ = *pbuf++;
8645 Py_UNICODE_COPY(res, pbuf, len);
8646 res += len;
8647 rescnt -= len;
8648 while (--width >= len) {
8649 --rescnt;
8650 *res++ = ' ';
8652 if (dict && (argidx < arglen) && c != '%') {
8653 PyErr_SetString(PyExc_TypeError,
8654 "not all arguments converted during string formatting");
8655 Py_XDECREF(temp);
8656 goto onError;
8658 Py_XDECREF(temp);
8659 } /* '%' */
8660 } /* until end */
8661 if (argidx < arglen && !dict) {
8662 PyErr_SetString(PyExc_TypeError,
8663 "not all arguments converted during string formatting");
8664 goto onError;
8667 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8668 goto onError;
8669 if (args_owned) {
8670 Py_DECREF(args);
8672 Py_DECREF(uformat);
8673 return (PyObject *)result;
8675 onError:
8676 Py_XDECREF(result);
8677 Py_DECREF(uformat);
8678 if (args_owned) {
8679 Py_DECREF(args);
8681 return NULL;
8684 static PyBufferProcs unicode_as_buffer = {
8685 (readbufferproc) unicode_buffer_getreadbuf,
8686 (writebufferproc) unicode_buffer_getwritebuf,
8687 (segcountproc) unicode_buffer_getsegcount,
8688 (charbufferproc) unicode_buffer_getcharbuf,
8691 static PyObject *
8692 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8694 static PyObject *
8695 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8697 PyObject *x = NULL;
8698 static char *kwlist[] = {"string", "encoding", "errors", 0};
8699 char *encoding = NULL;
8700 char *errors = NULL;
8702 if (type != &PyUnicode_Type)
8703 return unicode_subtype_new(type, args, kwds);
8704 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8705 kwlist, &x, &encoding, &errors))
8706 return NULL;
8707 if (x == NULL)
8708 return (PyObject *)_PyUnicode_New(0);
8709 if (encoding == NULL && errors == NULL)
8710 return PyObject_Unicode(x);
8711 else
8712 return PyUnicode_FromEncodedObject(x, encoding, errors);
8715 static PyObject *
8716 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8718 PyUnicodeObject *tmp, *pnew;
8719 Py_ssize_t n;
8721 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8722 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8723 if (tmp == NULL)
8724 return NULL;
8725 assert(PyUnicode_Check(tmp));
8726 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8727 if (pnew == NULL) {
8728 Py_DECREF(tmp);
8729 return NULL;
8731 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8732 if (pnew->str == NULL) {
8733 _Py_ForgetReference((PyObject *)pnew);
8734 PyObject_Del(pnew);
8735 Py_DECREF(tmp);
8736 return PyErr_NoMemory();
8738 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8739 pnew->length = n;
8740 pnew->hash = tmp->hash;
8741 Py_DECREF(tmp);
8742 return (PyObject *)pnew;
/* Docstring for the unicode type; PyUnicode_Type uses it as tp_doc.
   NOTE(review): the embedded listing numbers jump from 8746 to 8748 inside
   this literal, so one continuation line may be absent from this copy --
   confirm the exact text against the upstream file before editing it. */
8745 PyDoc_STRVAR(unicode_doc,
8746 "unicode(string [, encoding[, errors]]) -> object\n\
8748 Create a new Unicode object from the given encoded string.\n\
8749 encoding defaults to the current default string encoding.\n\
8750 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8752 PyTypeObject PyUnicode_Type = {
8753 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8754 "unicode", /* tp_name */
8755 sizeof(PyUnicodeObject), /* tp_size */
8756 0, /* tp_itemsize */
8757 /* Slots */
8758 (destructor)unicode_dealloc, /* tp_dealloc */
8759 0, /* tp_print */
8760 0, /* tp_getattr */
8761 0, /* tp_setattr */
8762 0, /* tp_compare */
8763 unicode_repr, /* tp_repr */
8764 &unicode_as_number, /* tp_as_number */
8765 &unicode_as_sequence, /* tp_as_sequence */
8766 &unicode_as_mapping, /* tp_as_mapping */
8767 (hashfunc) unicode_hash, /* tp_hash*/
8768 0, /* tp_call*/
8769 (reprfunc) unicode_str, /* tp_str */
8770 PyObject_GenericGetAttr, /* tp_getattro */
8771 0, /* tp_setattro */
8772 &unicode_as_buffer, /* tp_as_buffer */
8773 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8774 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8775 unicode_doc, /* tp_doc */
8776 0, /* tp_traverse */
8777 0, /* tp_clear */
8778 PyUnicode_RichCompare, /* tp_richcompare */
8779 0, /* tp_weaklistoffset */
8780 0, /* tp_iter */
8781 0, /* tp_iternext */
8782 unicode_methods, /* tp_methods */
8783 0, /* tp_members */
8784 0, /* tp_getset */
8785 &PyBaseString_Type, /* tp_base */
8786 0, /* tp_dict */
8787 0, /* tp_descr_get */
8788 0, /* tp_descr_set */
8789 0, /* tp_dictoffset */
8790 0, /* tp_init */
8791 0, /* tp_alloc */
8792 unicode_new, /* tp_new */
8793 PyObject_Del, /* tp_free */
8796 /* Initialize the Unicode implementation */
8798 void _PyUnicode_Init(void)
8800 int i;
8802 /* XXX - move this array to unicodectype.c ? */
8803 Py_UNICODE linebreak[] = {
8804 0x000A, /* LINE FEED */
8805 0x000D, /* CARRIAGE RETURN */
8806 0x001C, /* FILE SEPARATOR */
8807 0x001D, /* GROUP SEPARATOR */
8808 0x001E, /* RECORD SEPARATOR */
8809 0x0085, /* NEXT LINE */
8810 0x2028, /* LINE SEPARATOR */
8811 0x2029, /* PARAGRAPH SEPARATOR */
8814 /* Init the implementation */
8815 free_list = NULL;
8816 numfree = 0;
8817 unicode_empty = _PyUnicode_New(0);
8818 if (!unicode_empty)
8819 return;
8821 strcpy(unicode_default_encoding, "ascii");
8822 for (i = 0; i < 256; i++)
8823 unicode_latin1[i] = NULL;
8824 if (PyType_Ready(&PyUnicode_Type) < 0)
8825 Py_FatalError("Can't initialize 'unicode'");
8827 /* initialize the linebreak bloom filter */
8828 bloom_linebreak = make_bloom_mask(
8829 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8832 PyType_Ready(&EncodingMapType);
8835 /* Finalize the Unicode implementation */
8838 PyUnicode_ClearFreeList(void)
8840 int freelist_size = numfree;
8841 PyUnicodeObject *u;
8843 for (u = free_list; u != NULL;) {
8844 PyUnicodeObject *v = u;
8845 u = *(PyUnicodeObject **)u;
8846 if (v->str)
8847 PyObject_DEL(v->str);
8848 Py_XDECREF(v->defenc);
8849 PyObject_Del(v);
8850 numfree--;
8852 free_list = NULL;
8853 assert(numfree == 0);
8854 return freelist_size;
8857 void
8858 _PyUnicode_Fini(void)
8860 int i;
8862 Py_XDECREF(unicode_empty);
8863 unicode_empty = NULL;
8865 for (i = 0; i < 256; i++) {
8866 if (unicode_latin1[i]) {
8867 Py_DECREF(unicode_latin1[i]);
8868 unicode_latin1[i] = NULL;
8871 (void)PyUnicode_ClearFreeList();
8874 #ifdef __cplusplus
8876 #endif