move sections
[python/dscho.git] / Objects / unicodeobject.c
blob8b3b861bee612ed0381562881ce54a5152ddafdf
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
115 /* Fast detection of the most frequent whitespace characters */
/* Lookup table for the most frequent whitespace characters: entry [c] is 1
   when the ASCII code point c (0x00-0x7F) counts as whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
146 /* Same for linebreaks */
/* Lookup table for line breaks: entry [c] is 1 when the ASCII code point c
   (0x00-0x7F) is a line break, else 0.  The table is only ever read (see
   BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
175 Py_UNICODE
176 PyUnicode_GetMax(void)
178 #ifdef Py_UNICODE_WIDE
179 return 0x10FFFF;
180 #else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184 #endif
187 /* --- Bloom Filters ----------------------------------------------------- */
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask; the bit index is taken
   from the least significant bits of each Unicode character, masked with
   BLOOM_WIDTH - 1 (so 5, 6 or 7 bits depending on the width of a long). */
193 /* the linebreak mask is set up by Unicode_Init below */
195 #if LONG_BIT >= 128
196 #define BLOOM_WIDTH 128
197 #elif LONG_BIT >= 64
198 #define BLOOM_WIDTH 64
199 #elif LONG_BIT >= 32
200 #define BLOOM_WIDTH 32
201 #else
202 #error "LONG_BIT is smaller than 32"
203 #endif
205 #define BLOOM_MASK unsigned long
207 static BLOOM_MASK bloom_linebreak;
209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
212 #define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
218 /* calculate simple bloom-style bitmask for a given unicode string */
220 BLOOM_MASK mask;
221 Py_ssize_t i;
223 mask = 0;
224 for (i = 0; i < len; i++)
225 BLOOM_ADD(mask, ptr[i]);
227 return mask;
230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
232 Py_ssize_t i;
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
238 return 0;
/* Nonzero when chr is a member of set: the bloom mask is tested first and
   unicode_member() is only called when the mask says "maybe".  The whole
   expansion is parenthesized so the macro can be negated or combined with
   other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
244 /* --- Unicode Object ----------------------------------------------------- */
246 static
247 int unicode_resize(register PyUnicodeObject *unicode,
248 Py_ssize_t length)
250 void *oldstr;
252 /* Shortcut if there's nothing much to do. */
253 if (unicode->length == length)
254 goto reset;
256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
264 PyErr_SetString(PyExc_SystemError,
265 "can't resize shared unicode objects");
266 return -1;
269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
274 oldstr = unicode->str;
275 unicode->str = PyObject_REALLOC(unicode->str,
276 sizeof(Py_UNICODE) * (length + 1));
277 if (!unicode->str) {
278 unicode->str = (Py_UNICODE *)oldstr;
279 PyErr_NoMemory();
280 return -1;
282 unicode->str[length] = 0;
283 unicode->length = length;
285 reset:
286 /* Reset the object caches */
287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
291 unicode->hash = -1;
293 return 0;
/* We allocate one more Py_UNICODE element (not just one byte) to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
304 static
305 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
307 register PyUnicodeObject *unicode;
309 /* Optimization for empty strings */
310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
315 /* Ensure we won't overflow the size. */
316 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
317 return (PyUnicodeObject *)PyErr_NoMemory();
320 /* Unicode freelist & memory allocation */
321 if (free_list) {
322 unicode = free_list;
323 free_list = *(PyUnicodeObject **)unicode;
324 numfree--;
325 if (unicode->str) {
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode->length < length) &&
329 unicode_resize(unicode, length) < 0) {
330 PyObject_DEL(unicode->str);
331 unicode->str = NULL;
334 else {
335 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
336 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
338 PyObject_INIT(unicode, &PyUnicode_Type);
340 else {
341 size_t new_size;
342 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
343 if (unicode == NULL)
344 return NULL;
345 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
349 if (!unicode->str) {
350 PyErr_NoMemory();
351 goto onError;
353 /* Initialize the first element to guard against cases where
354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
358 * that case.
360 unicode->str[0] = 0;
361 unicode->str[length] = 0;
362 unicode->length = length;
363 unicode->hash = -1;
364 unicode->defenc = NULL;
365 return unicode;
367 onError:
368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
370 _Py_ForgetReference((PyObject *)unicode);
371 PyObject_Del(unicode);
372 return NULL;
375 static
376 void unicode_dealloc(register PyUnicodeObject *unicode)
378 if (PyUnicode_CheckExact(unicode) &&
379 numfree < PyUnicode_MAXFREELIST) {
380 /* Keep-Alive optimization */
381 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
382 PyObject_DEL(unicode->str);
383 unicode->str = NULL;
384 unicode->length = 0;
386 if (unicode->defenc) {
387 Py_DECREF(unicode->defenc);
388 unicode->defenc = NULL;
390 /* Add to free list */
391 *(PyUnicodeObject **)unicode = free_list;
392 free_list = unicode;
393 numfree++;
395 else {
396 PyObject_DEL(unicode->str);
397 Py_XDECREF(unicode->defenc);
398 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
402 static
403 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
405 register PyUnicodeObject *v;
407 /* Argument checks */
408 if (unicode == NULL) {
409 PyErr_BadInternalCall();
410 return -1;
412 v = *unicode;
413 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
414 PyErr_BadInternalCall();
415 return -1;
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
421 if (v->length != length &&
422 (v == unicode_empty || v->length == 1)) {
423 PyUnicodeObject *w = _PyUnicode_New(length);
424 if (w == NULL)
425 return -1;
426 Py_UNICODE_COPY(w->str, v->str,
427 length < v->length ? length : v->length);
428 Py_DECREF(*unicode);
429 *unicode = w;
430 return 0;
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v, length);
438 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
440 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
443 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
444 Py_ssize_t size)
446 PyUnicodeObject *unicode;
448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
450 if (u != NULL) {
452 /* Optimization for empty strings */
453 if (size == 0 && unicode_empty != NULL) {
454 Py_INCREF(unicode_empty);
455 return (PyObject *)unicode_empty;
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size == 1 && *u < 256) {
461 unicode = unicode_latin1[*u];
462 if (!unicode) {
463 unicode = _PyUnicode_New(1);
464 if (!unicode)
465 return NULL;
466 unicode->str[0] = *u;
467 unicode_latin1[*u] = unicode;
469 Py_INCREF(unicode);
470 return (PyObject *)unicode;
474 unicode = _PyUnicode_New(size);
475 if (!unicode)
476 return NULL;
478 /* Copy the Unicode data into the new object */
479 if (u != NULL)
480 Py_UNICODE_COPY(unicode->str, u, size);
482 return (PyObject *)unicode;
485 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
487 PyUnicodeObject *unicode;
489 if (size < 0) {
490 PyErr_SetString(PyExc_SystemError,
491 "Negative size passed to PyUnicode_FromStringAndSize");
492 return NULL;
495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
499 if (u != NULL) {
501 /* Optimization for empty strings */
502 if (size == 0 && unicode_empty != NULL) {
503 Py_INCREF(unicode_empty);
504 return (PyObject *)unicode_empty;
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size == 1 && Py_CHARMASK(*u) < 128) {
510 unicode = unicode_latin1[Py_CHARMASK(*u)];
511 if (!unicode) {
512 unicode = _PyUnicode_New(1);
513 if (!unicode)
514 return NULL;
515 unicode->str[0] = Py_CHARMASK(*u);
516 unicode_latin1[Py_CHARMASK(*u)] = unicode;
518 Py_INCREF(unicode);
519 return (PyObject *)unicode;
522 return PyUnicode_DecodeUTF8(u, size, NULL);
525 unicode = _PyUnicode_New(size);
526 if (!unicode)
527 return NULL;
529 return (PyObject *)unicode;
532 PyObject *PyUnicode_FromString(const char *u)
534 size_t size = strlen(u);
535 if (size > PY_SSIZE_T_MAX) {
536 PyErr_SetString(PyExc_OverflowError, "input too long");
537 return NULL;
540 return PyUnicode_FromStringAndSize(u, size);
543 #ifdef HAVE_WCHAR_H
545 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546 # define CONVERT_WCHAR_TO_SURROGATES
547 #endif
549 #ifdef CONVERT_WCHAR_TO_SURROGATES
551 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
554 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
555 Py_ssize_t size)
557 PyUnicodeObject *unicode;
558 register Py_ssize_t i;
559 Py_ssize_t alloc;
560 const wchar_t *orig_w;
562 if (w == NULL) {
563 PyErr_BadInternalCall();
564 return NULL;
567 alloc = size;
568 orig_w = w;
569 for (i = size; i > 0; i--) {
570 if (*w > 0xFFFF)
571 alloc++;
572 w++;
574 w = orig_w;
575 unicode = _PyUnicode_New(alloc);
576 if (!unicode)
577 return NULL;
579 /* Copy the wchar_t data into the new object */
581 register Py_UNICODE *u;
582 u = PyUnicode_AS_UNICODE(unicode);
583 for (i = size; i > 0; i--) {
584 if (*w > 0xFFFF) {
585 wchar_t ordinal = *w++;
586 ordinal -= 0x10000;
587 *u++ = 0xD800 | (ordinal >> 10);
588 *u++ = 0xDC00 | (ordinal & 0x3FF);
590 else
591 *u++ = *w++;
594 return (PyObject *)unicode;
597 #else
599 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
600 Py_ssize_t size)
602 PyUnicodeObject *unicode;
604 if (w == NULL) {
605 PyErr_BadInternalCall();
606 return NULL;
609 unicode = _PyUnicode_New(size);
610 if (!unicode)
611 return NULL;
613 /* Copy the wchar_t data into the new object */
614 #ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode->str, w, size * sizeof(wchar_t));
616 #else
618 register Py_UNICODE *u;
619 register Py_ssize_t i;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--)
622 *u++ = *w++;
624 #endif
626 return (PyObject *)unicode;
629 #endif /* CONVERT_WCHAR_TO_SURROGATES */
631 #undef CONVERT_WCHAR_TO_SURROGATES
633 static void
634 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
636 *fmt++ = '%';
637 if (width) {
638 if (zeropad)
639 *fmt++ = '0';
640 fmt += sprintf(fmt, "%d", width);
642 if (precision)
643 fmt += sprintf(fmt, ".%d", precision);
644 if (longflag)
645 *fmt++ = 'l';
646 else if (size_tflag) {
647 char *f = PY_FORMAT_SIZE_T;
648 while (*f)
649 *fmt++ = *f++;
651 *fmt++ = c;
652 *fmt = '\0';
655 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
657 PyObject *
658 PyUnicode_FromFormatV(const char *format, va_list vargs)
660 va_list count;
661 Py_ssize_t callcount = 0;
662 PyObject **callresults = NULL;
663 PyObject **callresult = NULL;
664 Py_ssize_t n = 0;
665 int width = 0;
666 int precision = 0;
667 int zeropad;
668 const char* f;
669 Py_UNICODE *s;
670 PyObject *string;
671 /* used by sprintf */
672 char buffer[21];
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer = NULL;
676 char *realbuffer;
677 Py_ssize_t abuffersize = 0;
678 char fmt[60]; /* should be enough for %0width.precisionld */
679 const char *copy;
681 #ifdef VA_LIST_IS_ARRAY
682 Py_MEMCPY(count, vargs, sizeof(va_list));
683 #else
684 #ifdef __va_copy
685 __va_copy(count, vargs);
686 #else
687 count = vargs;
688 #endif
689 #endif
690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
693 for (f = format; *f; f++) {
694 if (*f == '%') {
695 if (*(f+1)=='%')
696 continue;
697 if (*(f+1)=='S' || *(f+1)=='R')
698 ++callcount;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
703 if (*f == 's')
704 ++callcount;
707 /* step 2: allocate memory for the results of
708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
709 if (callcount) {
710 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
711 if (!callresults) {
712 PyErr_NoMemory();
713 return NULL;
715 callresult = callresults;
717 /* step 3: figure out how large a buffer we need */
718 for (f = format; *f; f++) {
719 if (*f == '%') {
720 const char* p = f;
721 width = 0;
722 while (isdigit((unsigned)*f))
723 width = (width*10) + *f++ - '0';
724 while (*++f && *f != '%' && !isalpha((unsigned)*f))
727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
730 if ((*f == 'l' || *f == 'z') &&
731 (f[1] == 'd' || f[1] == 'u'))
732 ++f;
734 switch (*f) {
735 case 'c':
736 (void)va_arg(count, int);
737 /* fall through... */
738 case '%':
739 n++;
740 break;
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
748 if (width < 20)
749 width = 20;
750 n += width;
751 if (abuffersize < width)
752 abuffersize = width;
753 break;
754 case 's':
756 /* UTF-8 */
757 const char *s = va_arg(count, const char*);
758 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
759 if (!str)
760 goto fail;
761 n += PyUnicode_GET_SIZE(str);
762 /* Remember the str and switch to the next slot */
763 *callresult++ = str;
764 break;
766 case 'U':
768 PyObject *obj = va_arg(count, PyObject *);
769 assert(obj && PyUnicode_Check(obj));
770 n += PyUnicode_GET_SIZE(obj);
771 break;
773 case 'V':
775 PyObject *obj = va_arg(count, PyObject *);
776 const char *str = va_arg(count, const char *);
777 assert(obj || str);
778 assert(!obj || PyUnicode_Check(obj));
779 if (obj)
780 n += PyUnicode_GET_SIZE(obj);
781 else
782 n += strlen(str);
783 break;
785 case 'S':
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *str;
789 assert(obj);
790 str = PyObject_Str(obj);
791 if (!str)
792 goto fail;
793 n += PyUnicode_GET_SIZE(str);
794 /* Remember the str and switch to the next slot */
795 *callresult++ = str;
796 break;
798 case 'R':
800 PyObject *obj = va_arg(count, PyObject *);
801 PyObject *repr;
802 assert(obj);
803 repr = PyObject_Repr(obj);
804 if (!repr)
805 goto fail;
806 n += PyUnicode_GET_SIZE(repr);
807 /* Remember the repr and switch to the next slot */
808 *callresult++ = repr;
809 break;
811 case 'p':
812 (void) va_arg(count, int);
813 /* maximum 64-bit pointer representation:
814 * 0xffffffffffffffff
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
818 n += 19;
819 break;
820 default:
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
827 n += strlen(p);
828 goto expand;
830 } else
831 n++;
833 expand:
834 if (abuffersize > 20) {
835 abuffer = PyObject_Malloc(abuffersize);
836 if (!abuffer) {
837 PyErr_NoMemory();
838 goto fail;
840 realbuffer = abuffer;
842 else
843 realbuffer = buffer;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string = PyUnicode_FromUnicode(NULL, n);
849 if (!string)
850 goto fail;
852 s = PyUnicode_AS_UNICODE(string);
853 callresult = callresults;
855 for (f = format; *f; f++) {
856 if (*f == '%') {
857 const char* p = f++;
858 int longflag = 0;
859 int size_tflag = 0;
860 zeropad = (*f == '0');
861 /* parse the width.precision part */
862 width = 0;
863 while (isdigit((unsigned)*f))
864 width = (width*10) + *f++ - '0';
865 precision = 0;
866 if (*f == '.') {
867 f++;
868 while (isdigit((unsigned)*f))
869 precision = (precision*10) + *f++ - '0';
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
874 longflag = 1;
875 ++f;
877 /* handle the size_t flag. */
878 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
879 size_tflag = 1;
880 ++f;
883 switch (*f) {
884 case 'c':
885 *s++ = va_arg(vargs, int);
886 break;
887 case 'd':
888 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
889 if (longflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, long));
891 else if (size_tflag)
892 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
893 else
894 sprintf(realbuffer, fmt, va_arg(vargs, int));
895 appendstring(realbuffer);
896 break;
897 case 'u':
898 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
899 if (longflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
901 else if (size_tflag)
902 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
903 else
904 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
905 appendstring(realbuffer);
906 break;
907 case 'i':
908 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
909 sprintf(realbuffer, fmt, va_arg(vargs, int));
910 appendstring(realbuffer);
911 break;
912 case 'x':
913 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
914 sprintf(realbuffer, fmt, va_arg(vargs, int));
915 appendstring(realbuffer);
916 break;
917 case 's':
919 /* unused, since we already have the result */
920 (void) va_arg(vargs, char *);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
922 PyUnicode_GET_SIZE(*callresult));
923 s += PyUnicode_GET_SIZE(*callresult);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult);
926 /* switch to next unicode()/repr() result */
927 ++callresult;
928 break;
930 case 'U':
932 PyObject *obj = va_arg(vargs, PyObject *);
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 break;
938 case 'V':
940 PyObject *obj = va_arg(vargs, PyObject *);
941 const char *str = va_arg(vargs, const char *);
942 if (obj) {
943 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
944 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
945 s += size;
946 } else {
947 appendstring(str);
949 break;
951 case 'S':
952 case 'R':
954 Py_UNICODE *ucopy;
955 Py_ssize_t usize;
956 Py_ssize_t upos;
957 /* unused, since we already have the result */
958 (void) va_arg(vargs, PyObject *);
959 ucopy = PyUnicode_AS_UNICODE(*callresult);
960 usize = PyUnicode_GET_SIZE(*callresult);
961 for (upos = 0; upos<usize;)
962 *s++ = ucopy[upos++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult);
965 /* switch to next unicode()/repr() result */
966 ++callresult;
967 break;
969 case 'p':
970 sprintf(buffer, "%p", va_arg(vargs, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer[1] == 'X')
973 buffer[1] = 'x';
974 else if (buffer[1] != 'x') {
975 memmove(buffer+2, buffer, strlen(buffer)+1);
976 buffer[0] = '0';
977 buffer[1] = 'x';
979 appendstring(buffer);
980 break;
981 case '%':
982 *s++ = '%';
983 break;
984 default:
985 appendstring(p);
986 goto end;
988 } else
989 *s++ = *f;
992 end:
993 if (callresults)
994 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
998 return string;
999 fail:
1000 if (callresults) {
1001 PyObject **callresult2 = callresults;
1002 while (callresult2 < callresult) {
1003 Py_DECREF(*callresult2);
1004 ++callresult2;
1006 PyObject_Free(callresults);
1008 if (abuffer)
1009 PyObject_Free(abuffer);
1010 return NULL;
1013 #undef appendstring
1015 PyObject *
1016 PyUnicode_FromFormat(const char *format, ...)
1018 PyObject* ret;
1019 va_list vargs;
1021 #ifdef HAVE_STDARG_PROTOTYPES
1022 va_start(vargs, format);
1023 #else
1024 va_start(vargs);
1025 #endif
1026 ret = PyUnicode_FromFormatV(format, vargs);
1027 va_end(vargs);
1028 return ret;
1031 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1032 wchar_t *w,
1033 Py_ssize_t size)
1035 if (unicode == NULL) {
1036 PyErr_BadInternalCall();
1037 return -1;
1040 /* If possible, try to copy the 0-termination as well */
1041 if (size > PyUnicode_GET_SIZE(unicode))
1042 size = PyUnicode_GET_SIZE(unicode) + 1;
1044 #ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w, unicode->str, size * sizeof(wchar_t));
1046 #else
1048 register Py_UNICODE *u;
1049 register Py_ssize_t i;
1050 u = PyUnicode_AS_UNICODE(unicode);
1051 for (i = size; i > 0; i--)
1052 *w++ = *u++;
1054 #endif
1056 if (size > PyUnicode_GET_SIZE(unicode))
1057 return PyUnicode_GET_SIZE(unicode);
1058 else
1059 return size;
1062 #endif
1064 PyObject *PyUnicode_FromOrdinal(int ordinal)
1066 Py_UNICODE s[1];
1068 #ifdef Py_UNICODE_WIDE
1069 if (ordinal < 0 || ordinal > 0x10ffff) {
1070 PyErr_SetString(PyExc_ValueError,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1073 return NULL;
1075 #else
1076 if (ordinal < 0 || ordinal > 0xffff) {
1077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1080 return NULL;
1082 #endif
1084 s[0] = (Py_UNICODE)ordinal;
1085 return PyUnicode_FromUnicode(s, 1);
1088 PyObject *PyUnicode_FromObject(register PyObject *obj)
1090 /* XXX Perhaps we should make this API an alias of
1091 PyObject_Unicode() instead ?! */
1092 if (PyUnicode_CheckExact(obj)) {
1093 Py_INCREF(obj);
1094 return obj;
1096 if (PyUnicode_Check(obj)) {
1097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100 PyUnicode_GET_SIZE(obj));
1102 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1105 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1106 const char *encoding,
1107 const char *errors)
1109 const char *s = NULL;
1110 Py_ssize_t len;
1111 PyObject *v;
1113 if (obj == NULL) {
1114 PyErr_BadInternalCall();
1115 return NULL;
1118 #if 0
1119 /* For b/w compatibility we also accept Unicode objects provided
1120 that no encodings is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1122 Unicode subclasses.
1124 NOTE: This API should really only be used for object which
1125 represent *encoded* Unicode !
1128 if (PyUnicode_Check(obj)) {
1129 if (encoding) {
1130 PyErr_SetString(PyExc_TypeError,
1131 "decoding Unicode is not supported");
1132 return NULL;
1134 return PyObject_Unicode(obj);
1136 #else
1137 if (PyUnicode_Check(obj)) {
1138 PyErr_SetString(PyExc_TypeError,
1139 "decoding Unicode is not supported");
1140 return NULL;
1142 #endif
1144 /* Coerce object */
1145 if (PyString_Check(obj)) {
1146 s = PyString_AS_STRING(obj);
1147 len = PyString_GET_SIZE(obj);
1149 else if (PyByteArray_Check(obj)) {
1150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError,
1152 "decoding bytearray is not supported");
1153 return NULL;
1155 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError))
1159 PyErr_Format(PyExc_TypeError,
1160 "coercing to Unicode: need string or buffer, "
1161 "%.80s found",
1162 Py_TYPE(obj)->tp_name);
1163 goto onError;
1166 /* Convert to Unicode */
1167 if (len == 0) {
1168 Py_INCREF(unicode_empty);
1169 v = (PyObject *)unicode_empty;
1171 else
1172 v = PyUnicode_Decode(s, len, encoding, errors);
1174 return v;
1176 onError:
1177 return NULL;
1180 PyObject *PyUnicode_Decode(const char *s,
1181 Py_ssize_t size,
1182 const char *encoding,
1183 const char *errors)
1185 PyObject *buffer = NULL, *unicode;
1187 if (encoding == NULL)
1188 encoding = PyUnicode_GetDefaultEncoding();
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding, "utf-8") == 0)
1192 return PyUnicode_DecodeUTF8(s, size, errors);
1193 else if (strcmp(encoding, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s, size, errors);
1195 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s, size, errors);
1198 #endif
1199 else if (strcmp(encoding, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s, size, errors);
1202 /* Decode via the codec registry */
1203 buffer = PyBuffer_FromMemory((void *)s, size);
1204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
1211 "decoder did not return an unicode object (type=%.400s)",
1212 Py_TYPE(unicode)->tp_name);
1213 Py_DECREF(unicode);
1214 goto onError;
1216 Py_DECREF(buffer);
1217 return unicode;
1219 onError:
1220 Py_XDECREF(buffer);
1221 return NULL;
1224 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1228 PyObject *v;
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1235 if (encoding == NULL)
1236 encoding = PyUnicode_GetDefaultEncoding();
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1244 onError:
1245 return NULL;
1248 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1249 Py_ssize_t size,
1250 const char *encoding,
1251 const char *errors)
1253 PyObject *v, *unicode;
1255 unicode = PyUnicode_FromUnicode(s, size);
1256 if (unicode == NULL)
1257 return NULL;
1258 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259 Py_DECREF(unicode);
1260 return v;
1263 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264 const char *encoding,
1265 const char *errors)
1267 PyObject *v;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_BadArgument();
1271 goto onError;
1274 if (encoding == NULL)
1275 encoding = PyUnicode_GetDefaultEncoding();
1277 /* Encode via the codec registry */
1278 v = PyCodec_Encode(unicode, encoding, errors);
1279 if (v == NULL)
1280 goto onError;
1281 return v;
1283 onError:
1284 return NULL;
1287 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1291 PyObject *v;
1293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1298 if (encoding == NULL)
1299 encoding = PyUnicode_GetDefaultEncoding();
1301 /* Shortcuts for common default encodings */
1302 if (errors == NULL) {
1303 if (strcmp(encoding, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode);
1305 else if (strcmp(encoding, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode);
1307 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1308 else if (strcmp(encoding, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode);
1310 #endif
1311 else if (strcmp(encoding, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode);
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
1319 if (!PyString_Check(v)) {
1320 PyErr_Format(PyExc_TypeError,
1321 "encoder did not return a string object (type=%.400s)",
1322 Py_TYPE(v)->tp_name);
1323 Py_DECREF(v);
1324 goto onError;
1326 return v;
1328 onError:
1329 return NULL;
1332 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1333 const char *errors)
1335 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1337 if (v)
1338 return v;
1339 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340 if (v && errors == NULL)
1341 ((PyUnicodeObject *)unicode)->defenc = v;
1342 return v;
1345 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1351 return PyUnicode_AS_UNICODE(unicode);
1353 onError:
1354 return NULL;
1357 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 goto onError;
1363 return PyUnicode_GET_SIZE(unicode);
1365 onError:
1366 return -1;
1369 const char *PyUnicode_GetDefaultEncoding(void)
1371 return unicode_default_encoding;
1374 int PyUnicode_SetDefaultEncoding(const char *encoding)
1376 PyObject *v;
1378 /* Make sure the encoding is valid. As side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v = _PyCodec_Lookup(encoding);
1381 if (v == NULL)
1382 goto onError;
1383 Py_DECREF(v);
1384 strncpy(unicode_default_encoding,
1385 encoding,
1386 sizeof(unicode_default_encoding));
1387 return 0;
1389 onError:
1390 return -1;
1393 /* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
1395 if no exception occurred, copy the replacement to the output
1396 and adjust various state variables.
1397 return 0 on success, -1 on error
1400 static
1401 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1402 const char *encoding, const char *reason,
1403 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1407 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1409 PyObject *restuple = NULL;
1410 PyObject *repunicode = NULL;
1411 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412 Py_ssize_t requiredsize;
1413 Py_ssize_t newpos;
1414 Py_UNICODE *repptr;
1415 Py_ssize_t repsize;
1416 int res = -1;
1418 if (*errorHandler == NULL) {
1419 *errorHandler = PyCodec_LookupError(errors);
1420 if (*errorHandler == NULL)
1421 goto onError;
1424 if (*exceptionObject == NULL) {
1425 *exceptionObject = PyUnicodeDecodeError_Create(
1426 encoding, input, insize, *startinpos, *endinpos, reason);
1427 if (*exceptionObject == NULL)
1428 goto onError;
1430 else {
1431 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434 goto onError;
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436 goto onError;
1439 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440 if (restuple == NULL)
1441 goto onError;
1442 if (!PyTuple_Check(restuple)) {
1443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1444 goto onError;
1446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1447 goto onError;
1448 if (newpos<0)
1449 newpos = insize+newpos;
1450 if (newpos<0 || newpos>insize) {
1451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452 goto onError;
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr = PyUnicode_AS_UNICODE(repunicode);
1460 repsize = PyUnicode_GET_SIZE(repunicode);
1461 requiredsize = *outpos + repsize + insize-newpos;
1462 if (requiredsize > outsize) {
1463 if (requiredsize<2*outsize)
1464 requiredsize = 2*outsize;
1465 if (_PyUnicode_Resize(output, requiredsize) < 0)
1466 goto onError;
1467 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1469 *endinpos = newpos;
1470 *inptr = input + newpos;
1471 Py_UNICODE_COPY(*outptr, repptr, repsize);
1472 *outptr += repsize;
1473 *outpos += repsize;
1474 /* we made it! */
1475 res = 0;
1477 onError:
1478 Py_XDECREF(restuple);
1479 return res;
1482 /* --- UTF-7 Codec -------------------------------------------------------- */
1484 /* See RFC2152 for details. We encode conservatively and decode liberally. */
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The alphabet is spelled out as explicit ASCII ranges instead of calling
 * isalnum(): isalnum() depends on the current C locale (so bytes >= 0x80
 * can be reported as alphanumeric and would then be mapped to value 63 by
 * FROM_BASE64), and it is undefined for arguments that do not fit in an
 * unsigned char. */

#define IS_BASE64(c)                            \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first so the table is never indexed out of
 * bounds.  directO and directWS are parenthesized so that compound
 * expressions passed for them keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1560 PyObject *PyUnicode_DecodeUTF7(const char *s,
1561 Py_ssize_t size,
1562 const char *errors)
1564 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1567 /* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1572 * surrogate). */
/* Decode size bytes of UTF-7 starting at s into a new unicode object.
   errors is the error handler name passed to
   unicode_decode_call_errorhandler().  If consumed is non-NULL it receives
   the number of input bytes accounted for, and input ending inside a shift
   sequence is not an error: the output is cut back to where that sequence
   began and *consumed is set to the offset of its opening '+'.  Returns
   NULL on failure. */
1574 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1575 Py_ssize_t size,
1576 const char *errors,
1577 Py_ssize_t *consumed)
1579 const char *starts = s;
1580 Py_ssize_t startinpos;
1581 Py_ssize_t endinpos;
1582 Py_ssize_t outpos;
1583 const char *e;
1584 PyUnicodeObject *unicode;
1585 Py_UNICODE *p;
1586 const char *errmsg = "";
/* inShift: nonzero while inside a '+'-introduced base-64 section.
   shiftOutStart: output position recorded when that section was entered.
   base64bits / base64buffer: count of pending bits and the bits themselves.
   surrogate: a first surrogate waiting for its partner, 0 if none. */
1587 int inShift = 0;
1588 Py_UNICODE *shiftOutStart;
1589 unsigned int base64bits = 0;
1590 unsigned long base64buffer = 0;
1591 Py_UNICODE surrogate = 0;
1592 PyObject *errorHandler = NULL;
1593 PyObject *exc = NULL;
/* The output is allocated with one slot per input byte. */
1595 unicode = _PyUnicode_New(size);
1596 if (!unicode)
1597 return NULL;
1598 if (size == 0) {
1599 if (consumed)
1600 *consumed = 0;
1601 return (PyObject *)unicode;
1604 p = unicode->str;
1605 shiftOutStart = p;
1606 e = s + size;
1608 while (s < e) {
1609 Py_UNICODE ch = (unsigned char) *s;
1611 if (inShift) { /* in a base-64 section */
1612 if (IS_BASE64(ch)) { /* consume a base-64 character */
1613 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614 base64bits += 6;
1615 s++;
1616 if (base64bits >= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh = (Py_UNICODE)
1619 (base64buffer >> (base64bits-16));
1620 base64bits -= 16;
1621 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622 if (surrogate) {
1623 /* expecting a second surrogate */
1624 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* Wide builds store the combined code point; narrow builds store the
   two surrogates as they are. */
1625 #ifdef Py_UNICODE_WIDE
1626 *p++ = (((surrogate & 0x3FF)<<10)
1627 | (outCh & 0x3FF)) + 0x10000;
1628 #else
1629 *p++ = surrogate;
1630 *p++ = outCh;
1631 #endif
1632 surrogate = 0;
1634 else {
1635 surrogate = 0;
1636 errmsg = "second surrogate missing";
1637 goto utf7Error;
1640 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641 /* first surrogate */
1642 surrogate = outCh;
1644 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645 errmsg = "unexpected second surrogate";
1646 goto utf7Error;
1648 else {
1649 *p++ = outCh;
1653 else { /* now leaving a base-64 section */
1654 inShift = 0;
1655 s++;
1656 if (surrogate) {
1657 errmsg = "second surrogate missing at end of shift sequence";
1658 goto utf7Error;
1660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
1666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
1677 *p++ = ch;
1681 else if ( ch == '+' ) {
/* Remember where the '+' sits: this offset is the start position for
   errors raised inside the shift sequence and the value reported through
   consumed when the input ends inside it. */
1682 startinpos = s-starts;
1683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
1685 s++;
1686 *p++ = '+';
1688 else { /* begin base64-encoded section */
1689 inShift = 1;
1690 shiftOutStart = p;
1691 base64bits = 0;
1694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1695 *p++ = ch;
1696 s++;
1698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1704 continue;
/* Reached only through the gotos above; the continue skips it otherwise.
   s and p are passed by address so the handler helper can update them. */
1705 utf7Error:
1706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
1709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
1716 /* end of string */
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1734 /* return state */
1735 if (consumed) {
1736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
1738 *consumed = startinpos;
1740 else {
1741 *consumed = s-starts;
/* Trim the object to the number of characters actually written. */
1745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1746 goto onError;
1748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
1750 return (PyObject *)unicode;
1752 onError:
1753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
1755 Py_DECREF(unicode);
1756 return NULL;
1760 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1761 Py_ssize_t size,
1762 int base64SetO,
1763 int base64WhiteSpace,
1764 const char *errors)
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
1768 Py_ssize_t allocated = 8 * size;
1769 int inShift = 0;
1770 Py_ssize_t i = 0;
1771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
1773 char * out;
1774 char * start;
1776 if (allocated / 8 != size)
1777 return PyErr_NoMemory();
1779 if (size == 0)
1780 return PyString_FromStringAndSize(NULL, 0);
1782 v = PyString_FromStringAndSize(NULL, allocated);
1783 if (v == NULL)
1784 return NULL;
1786 start = out = PyString_AS_STRING(v);
1787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
1798 inShift = 0;
1799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
1804 *out++ = (char) ch;
1806 else {
1807 goto encode_char;
1810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1824 continue;
1825 encode_char:
1826 #ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1838 #endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
1849 *out++ = '-';
1851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
1853 return v;
/* The base-64 and direct-coding helper macros defined for the UTF-7 codec
   above are removed again here. */
1856 #undef IS_BASE64
1857 #undef FROM_BASE64
1858 #undef TO_BASE64
1859 #undef DECODE_DIRECT
1860 #undef ENCODE_DIRECT
1862 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1886 PyObject *PyUnicode_DecodeUTF8(const char *s,
1887 Py_ssize_t size,
1888 const char *errors)
1890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
/* Decode size bytes of UTF-8 starting at s into a new unicode object.
   errors is the error handler name passed to
   unicode_decode_call_errorhandler().  If consumed is non-NULL, a sequence
   that is cut off by the end of the input is left undecoded instead of
   being reported as an error, and *consumed receives the number of bytes
   that were processed.  Returns NULL on failure. */
1893 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
1898 const char *starts = s;
1899 int n;
1900 int k;
1901 Py_ssize_t startinpos;
1902 Py_ssize_t endinpos;
1903 Py_ssize_t outpos;
1904 const char *e;
1905 PyUnicodeObject *unicode;
1906 Py_UNICODE *p;
1907 const char *errmsg = "";
1908 PyObject *errorHandler = NULL;
1909 PyObject *exc = NULL;
1911 /* Note: size will always be longer than the resulting Unicode
1912 character count */
1913 unicode = _PyUnicode_New(size);
1914 if (!unicode)
1915 return NULL;
1916 if (size == 0) {
1917 if (consumed)
1918 *consumed = 0;
1919 return (PyObject *)unicode;
1922 /* Unpack UTF-8 encoded data */
1923 p = unicode->str;
1924 e = s + size;
1926 while (s < e) {
1927 Py_UCS4 ch = (unsigned char)*s;
/* ASCII bytes are copied straight through. */
1929 if (ch < 0x80) {
1930 *p++ = (Py_UNICODE)ch;
1931 s++;
1932 continue;
/* n is the sequence length announced by the lead byte; 0 marks a byte
   that cannot start a sequence. */
1935 n = utf8_code_length[ch];
/* The announced sequence runs past the end of the input.  With consumed
   the tail is left for a later call; otherwise the error range covers the
   lead byte plus the continuation bytes that follow it. */
1937 if (s + n > e) {
1938 if (consumed)
1939 break;
1940 else {
1941 errmsg = "unexpected end of data";
1942 startinpos = s-starts;
1943 endinpos = startinpos+1;
1944 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945 endinpos++;
1946 goto utf8Error;
1950 switch (n) {
1952 case 0:
1953 errmsg = "invalid start byte";
1954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
/* Bytes below 0x80 were already handled by the fast path above, and the
   table holds 1 only for those bytes, so this case is not expected. */
1958 case 1:
1959 errmsg = "internal error";
1960 startinpos = s-starts;
1961 endinpos = startinpos+1;
1962 goto utf8Error;
1964 case 2:
1965 if ((s[1] & 0xc0) != 0x80) {
1966 errmsg = "invalid continuation byte";
1967 startinpos = s-starts;
1968 endinpos = startinpos + 1;
1969 goto utf8Error;
1971 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1972 assert ((ch > 0x007F) && (ch <= 0x07FF));
1973 *p++ = (Py_UNICODE)ch;
1974 break;
1976 case 3:
1977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
1982 if ((s[1] & 0xc0) != 0x80 ||
1983 (s[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s[0] == 0xE0 &&
1985 (unsigned char)s[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg = "invalid continuation byte";
1989 startinpos = s-starts;
1990 endinpos = startinpos + 1;
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1995 be incremented. */
1996 if ((s[1] & 0xC0) == 0x80)
1997 endinpos++;
1998 goto utf8Error;
2000 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2001 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002 *p++ = (Py_UNICODE)ch;
2003 break;
/* Four-byte sequences: besides the continuation-byte pattern, the second
   byte is range-checked for lead bytes 0xF0 and 0xF4. */
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
2008 (s[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s[0] == 0xF0 &&
2010 (unsigned char)s[1] < 0x90) ||
2011 ((unsigned char)s[0] == 0xF4 &&
2012 (unsigned char)s[1] > 0x8F)) {
2013 errmsg = "invalid continuation byte";
2014 startinpos = s-starts;
2015 endinpos = startinpos + 1;
2016 if ((s[1] & 0xC0) == 0x80) {
2017 endinpos++;
2018 if ((s[2] & 0xC0) == 0x80)
2019 endinpos++;
2021 goto utf8Error;
2023 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2024 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2027 #ifdef Py_UNICODE_WIDE
2028 *p++ = (Py_UNICODE)ch;
2029 #else
2030 /* compute and append the two surrogates: */
2032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
2035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2038 /* low surrogate = bottom 10 bits added to DC00 */
2039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2040 #endif
2041 break;
/* A sequence was decoded successfully: step over all of its bytes. */
2043 s += n;
2044 continue;
/* Reached only through the gotos above.  s and p are passed by address so
   the handler helper can update them. */
2046 utf8Error:
2047 outpos = p-PyUnicode_AS_UNICODE(unicode);
2048 if (unicode_decode_call_errorhandler(
2049 errors, &errorHandler,
2050 "utf8", errmsg,
2051 starts, size, &startinpos, &endinpos, &exc, &s,
2052 &unicode, &outpos, &p))
2053 goto onError;
2055 if (consumed)
2056 *consumed = s-starts;
2058 /* Adjust length */
2059 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2060 goto onError;
2062 Py_XDECREF(errorHandler);
2063 Py_XDECREF(exc);
2064 return (PyObject *)unicode;
2066 onError:
2067 Py_XDECREF(errorHandler);
2068 Py_XDECREF(exc);
2069 Py_DECREF(unicode);
2070 return NULL;
2073 /* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
2078 PyObject *
2079 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2080 Py_ssize_t size,
2081 const char *errors)
2083 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2085 Py_ssize_t i; /* index into s of next input byte */
2086 PyObject *v; /* result string object */
2087 char *p; /* next free byte in output buffer */
2088 Py_ssize_t nallocated; /* number of result bytes allocated */
2089 Py_ssize_t nneeded; /* number of result bytes needed */
2090 char stackbuf[MAX_SHORT_UNICHARS * 4];
2092 assert(s != NULL);
2093 assert(size >= 0);
2095 if (size <= MAX_SHORT_UNICHARS) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2100 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101 v = NULL; /* will allocate after we're done */
2102 p = stackbuf;
2104 else {
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated = size * 4;
2107 if (nallocated / 4 != size) /* overflow! */
2108 return PyErr_NoMemory();
2109 v = PyString_FromStringAndSize(NULL, nallocated);
2110 if (v == NULL)
2111 return NULL;
2112 p = PyString_AS_STRING(v);
2115 for (i = 0; i < size;) {
2116 Py_UCS4 ch = s[i++];
2118 if (ch < 0x80)
2119 /* Encode ASCII */
2120 *p++ = (char) ch;
2122 else if (ch < 0x0800) {
2123 /* Encode Latin-1 */
2124 *p++ = (char)(0xc0 | (ch >> 6));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
2127 else {
2128 /* Encode UCS2 Unicode ordinals */
2129 if (ch < 0x10000) {
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132 Py_UCS4 ch2 = s[i];
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2136 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2137 i++;
2138 goto encodeUCS4;
2140 /* Fall through: handles isolated high surrogates */
2142 *p++ = (char)(0xe0 | (ch >> 12));
2143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 continue;
2147 encodeUCS4:
2148 /* Encode UCS4 Unicode ordinals */
2149 *p++ = (char)(0xf0 | (ch >> 18));
2150 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152 *p++ = (char)(0x80 | (ch & 0x3f));
2156 if (v == NULL) {
2157 /* This was stack allocated. */
2158 nneeded = p - stackbuf;
2159 assert(nneeded <= nallocated);
2160 v = PyString_FromStringAndSize(stackbuf, nneeded);
2162 else {
2163 /* Cut back to size actually needed. */
2164 nneeded = p - PyString_AS_STRING(v);
2165 assert(nneeded <= nallocated);
2166 if (_PyString_Resize(&v, nneeded))
2167 return NULL;
2169 return v;
2171 #undef MAX_SHORT_UNICHARS
2174 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2181 PyUnicode_GET_SIZE(unicode),
2182 NULL);
2185 /* --- UTF-32 Codec ------------------------------------------------------- */
2187 PyObject *
2188 PyUnicode_DecodeUTF32(const char *s,
2189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder)
2193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2196 PyObject *
2197 PyUnicode_DecodeUTF32Stateful(const char *s,
2198 Py_ssize_t size,
2199 const char *errors,
2200 int *byteorder,
2201 Py_ssize_t *consumed)
2203 const char *starts = s;
2204 Py_ssize_t startinpos;
2205 Py_ssize_t endinpos;
2206 Py_ssize_t outpos;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209 #ifndef Py_UNICODE_WIDE
2210 int pairs = 0;
2211 const unsigned char *qq;
2212 #else
2213 const int pairs = 0;
2214 #endif
2215 const unsigned char *q, *e;
2216 int bo = 0; /* assume native ordering by default */
2217 const char *errmsg = "";
2218 /* Offsets from q for retrieving bytes in the right order. */
2219 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2220 int iorder[] = {0, 1, 2, 3};
2221 #else
2222 int iorder[] = {3, 2, 1, 0};
2223 #endif
2224 PyObject *errorHandler = NULL;
2225 PyObject *exc = NULL;
2227 q = (unsigned char *)s;
2228 e = q + size;
2230 if (byteorder)
2231 bo = *byteorder;
2233 /* Check for BOM marks (U+FEFF) in the input and adjust current
2234 byte order setting accordingly. In native mode, the leading BOM
2235 mark is skipped, in all other modes, it is copied to the output
2236 stream as-is (giving a ZWNBSP character). */
2237 if (bo == 0) {
2238 if (size >= 4) {
2239 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2240 (q[iorder[1]] << 8) | q[iorder[0]];
2241 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2242 if (bom == 0x0000FEFF) {
2243 q += 4;
2244 bo = -1;
2246 else if (bom == 0xFFFE0000) {
2247 q += 4;
2248 bo = 1;
2250 #else
2251 if (bom == 0x0000FEFF) {
2252 q += 4;
2253 bo = 1;
2255 else if (bom == 0xFFFE0000) {
2256 q += 4;
2257 bo = -1;
2259 #endif
2263 if (bo == -1) {
2264 /* force LE */
2265 iorder[0] = 0;
2266 iorder[1] = 1;
2267 iorder[2] = 2;
2268 iorder[3] = 3;
2270 else if (bo == 1) {
2271 /* force BE */
2272 iorder[0] = 3;
2273 iorder[1] = 2;
2274 iorder[2] = 1;
2275 iorder[3] = 0;
2278 /* On narrow builds we split characters outside the BMP into two
2279 codepoints => count how much extra space we need. */
2280 #ifndef Py_UNICODE_WIDE
2281 for (qq = q; qq < e; qq += 4)
2282 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2283 pairs++;
2284 #endif
2286 /* This might be one to much, because of a BOM */
2287 unicode = _PyUnicode_New((size+3)/4+pairs);
2288 if (!unicode)
2289 return NULL;
2290 if (size == 0)
2291 return (PyObject *)unicode;
2293 /* Unpack UTF-32 encoded data */
2294 p = unicode->str;
2296 while (q < e) {
2297 Py_UCS4 ch;
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2299 if (e-q<4) {
2300 if (consumed)
2301 break;
2302 errmsg = "truncated data";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = ((const char *)e)-starts;
2305 goto utf32Error;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310 (q[iorder[1]] << 8) | q[iorder[0]];
2312 if (ch >= 0x110000)
2314 errmsg = "codepoint not in range(0x110000)";
2315 startinpos = ((const char *)q)-starts;
2316 endinpos = startinpos+4;
2317 goto utf32Error;
2319 #ifndef Py_UNICODE_WIDE
2320 if (ch >= 0x10000)
2322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2325 else
2326 #endif
2327 *p++ = ch;
2328 q += 4;
2329 continue;
2330 utf32Error:
2331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "utf32", errmsg,
2335 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2336 &unicode, &outpos, &p))
2337 goto onError;
2340 if (byteorder)
2341 *byteorder = bo;
2343 if (consumed)
2344 *consumed = (const char *)q-starts;
2346 /* Adjust length */
2347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348 goto onError;
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)unicode;
2354 onError:
2355 Py_DECREF(unicode);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2361 PyObject *
2362 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2363 Py_ssize_t size,
2364 const char *errors,
2365 int byteorder)
2367 PyObject *v;
2368 unsigned char *p;
2369 Py_ssize_t nsize, bytesize;
2370 #ifndef Py_UNICODE_WIDE
2371 Py_ssize_t i, pairs;
2372 #else
2373 const int pairs = 0;
2374 #endif
2375 /* Offsets from p for storing byte pairs in the right order. */
2376 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder[] = {0, 1, 2, 3};
2378 #else
2379 int iorder[] = {3, 2, 1, 0};
2380 #endif
2382 #define STORECHAR(CH) \
2383 do { \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2388 p += 4; \
2389 } while(0)
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393 #ifndef Py_UNICODE_WIDE
2394 for (i = pairs = 0; i < size-1; i++)
2395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397 pairs++;
2398 #endif
2399 nsize = (size - pairs + (byteorder == 0));
2400 bytesize = nsize * 4;
2401 if (bytesize / 4 != nsize)
2402 return PyErr_NoMemory();
2403 v = PyString_FromStringAndSize(NULL, bytesize);
2404 if (v == NULL)
2405 return NULL;
2407 p = (unsigned char *)PyString_AS_STRING(v);
2408 if (byteorder == 0)
2409 STORECHAR(0xFEFF);
2410 if (size == 0)
2411 return v;
2413 if (byteorder == -1) {
2414 /* force LE */
2415 iorder[0] = 0;
2416 iorder[1] = 1;
2417 iorder[2] = 2;
2418 iorder[3] = 3;
2420 else if (byteorder == 1) {
2421 /* force BE */
2422 iorder[0] = 3;
2423 iorder[1] = 2;
2424 iorder[2] = 1;
2425 iorder[3] = 0;
2428 while (size-- > 0) {
2429 Py_UCS4 ch = *s++;
2430 #ifndef Py_UNICODE_WIDE
2431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432 Py_UCS4 ch2 = *s;
2433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435 s++;
2436 size--;
2439 #endif
2440 STORECHAR(ch);
2442 return v;
2443 #undef STORECHAR
/* Encode a unicode object as UTF-32.  If the argument fails
   PyUnicode_Check, PyErr_BadArgument() is called and NULL is returned;
   otherwise the object's buffer and length are forwarded to
   PyUnicode_EncodeUTF32 with errors == NULL.
   NOTE(review): the final (byteorder) argument and the closing lines of
   this call are not visible in this extract -- confirm against the
   original file before relying on the byte order used. */
2446 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2448 if (!PyUnicode_Check(unicode)) {
2449 PyErr_BadArgument();
2450 return NULL;
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2453 PyUnicode_GET_SIZE(unicode),
2454 NULL,
2458 /* --- UTF-16 Codec ------------------------------------------------------- */
2460 PyObject *
2461 PyUnicode_DecodeUTF16(const char *s,
2462 Py_ssize_t size,
2463 const char *errors,
2464 int *byteorder)
2466 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2469 PyObject *
2470 PyUnicode_DecodeUTF16Stateful(const char *s,
2471 Py_ssize_t size,
2472 const char *errors,
2473 int *byteorder,
2474 Py_ssize_t *consumed)
2476 const char *starts = s;
2477 Py_ssize_t startinpos;
2478 Py_ssize_t endinpos;
2479 Py_ssize_t outpos;
2480 PyUnicodeObject *unicode;
2481 Py_UNICODE *p;
2482 const unsigned char *q, *e;
2483 int bo = 0; /* assume native ordering by default */
2484 const char *errmsg = "";
2485 /* Offsets from q for retrieving byte pairs in the right order. */
2486 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi = 1, ilo = 0;
2488 #else
2489 int ihi = 0, ilo = 1;
2490 #endif
2491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
2494 /* Note: size will always be longer than the resulting Unicode
2495 character count */
2496 unicode = _PyUnicode_New(size);
2497 if (!unicode)
2498 return NULL;
2499 if (size == 0)
2500 return (PyObject *)unicode;
2502 /* Unpack UTF-16 encoded data */
2503 p = unicode->str;
2504 q = (unsigned char *)s;
2505 e = q + size;
2507 if (byteorder)
2508 bo = *byteorder;
2510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2514 if (bo == 0) {
2515 if (size >= 2) {
2516 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2517 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2518 if (bom == 0xFEFF) {
2519 q += 2;
2520 bo = -1;
2522 else if (bom == 0xFFFE) {
2523 q += 2;
2524 bo = 1;
2526 #else
2527 if (bom == 0xFEFF) {
2528 q += 2;
2529 bo = 1;
2531 else if (bom == 0xFFFE) {
2532 q += 2;
2533 bo = -1;
2535 #endif
2539 if (bo == -1) {
2540 /* force LE */
2541 ihi = 1;
2542 ilo = 0;
2544 else if (bo == 1) {
2545 /* force BE */
2546 ihi = 0;
2547 ilo = 1;
2550 while (q < e) {
2551 Py_UNICODE ch;
2552 /* remaining bytes at the end? (size should be even) */
2553 if (e-q<2) {
2554 if (consumed)
2555 break;
2556 errmsg = "truncated data";
2557 startinpos = ((const char *)q)-starts;
2558 endinpos = ((const char *)e)-starts;
2559 goto utf16Error;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2563 ch = (q[ihi] << 8) | q[ilo];
2565 q += 2;
2567 if (ch < 0xD800 || ch > 0xDFFF) {
2568 *p++ = ch;
2569 continue;
2572 /* UTF-16 code pair: */
2573 if (q >= e) {
2574 errmsg = "unexpected end of data";
2575 startinpos = (((const char *)q)-2)-starts;
2576 endinpos = ((const char *)e)-starts;
2577 goto utf16Error;
2579 if (0xD800 <= ch && ch <= 0xDBFF) {
2580 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2581 q += 2;
2582 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2583 #ifndef Py_UNICODE_WIDE
2584 *p++ = ch;
2585 *p++ = ch2;
2586 #else
2587 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2588 #endif
2589 continue;
2591 else {
2592 errmsg = "illegal UTF-16 surrogate";
2593 startinpos = (((const char *)q)-4)-starts;
2594 endinpos = startinpos+2;
2595 goto utf16Error;
2599 errmsg = "illegal encoding";
2600 startinpos = (((const char *)q)-2)-starts;
2601 endinpos = startinpos+2;
2602 /* Fall through to report the error */
2604 utf16Error:
2605 outpos = p-PyUnicode_AS_UNICODE(unicode);
2606 if (unicode_decode_call_errorhandler(
2607 errors, &errorHandler,
2608 "utf16", errmsg,
2609 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2610 &unicode, &outpos, &p))
2611 goto onError;
2614 if (byteorder)
2615 *byteorder = bo;
2617 if (consumed)
2618 *consumed = (const char *)q-starts;
2620 /* Adjust length */
2621 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2622 goto onError;
2624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
2626 return (PyObject *)unicode;
2628 onError:
2629 Py_DECREF(unicode);
2630 Py_XDECREF(errorHandler);
2631 Py_XDECREF(exc);
2632 return NULL;
2635 PyObject *
2636 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2637 Py_ssize_t size,
2638 const char *errors,
2639 int byteorder)
2641 PyObject *v;
2642 unsigned char *p;
2643 Py_ssize_t nsize, bytesize;
2644 #ifdef Py_UNICODE_WIDE
2645 Py_ssize_t i, pairs;
2646 #else
2647 const int pairs = 0;
2648 #endif
2649 /* Offsets from p for storing byte pairs in the right order. */
2650 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651 int ihi = 1, ilo = 0;
2652 #else
2653 int ihi = 0, ilo = 1;
2654 #endif
2656 #define STORECHAR(CH) \
2657 do { \
2658 p[ihi] = ((CH) >> 8) & 0xff; \
2659 p[ilo] = (CH) & 0xff; \
2660 p += 2; \
2661 } while(0)
2663 #ifdef Py_UNICODE_WIDE
2664 for (i = pairs = 0; i < size; i++)
2665 if (s[i] >= 0x10000)
2666 pairs++;
2667 #endif
2668 /* 2 * (size + pairs + (byteorder == 0)) */
2669 if (size > PY_SSIZE_T_MAX ||
2670 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2671 return PyErr_NoMemory();
2672 nsize = size + pairs + (byteorder == 0);
2673 bytesize = nsize * 2;
2674 if (bytesize / 2 != nsize)
2675 return PyErr_NoMemory();
2676 v = PyString_FromStringAndSize(NULL, bytesize);
2677 if (v == NULL)
2678 return NULL;
2680 p = (unsigned char *)PyString_AS_STRING(v);
2681 if (byteorder == 0)
2682 STORECHAR(0xFEFF);
2683 if (size == 0)
2684 return v;
2686 if (byteorder == -1) {
2687 /* force LE */
2688 ihi = 1;
2689 ilo = 0;
2691 else if (byteorder == 1) {
2692 /* force BE */
2693 ihi = 0;
2694 ilo = 1;
2697 while (size-- > 0) {
2698 Py_UNICODE ch = *s++;
2699 Py_UNICODE ch2 = 0;
2700 #ifdef Py_UNICODE_WIDE
2701 if (ch >= 0x10000) {
2702 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2703 ch = 0xD800 | ((ch-0x10000) >> 10);
2705 #endif
2706 STORECHAR(ch);
2707 if (ch2)
2708 STORECHAR(ch2);
2710 return v;
2711 #undef STORECHAR
/* Encode a unicode object as UTF-16.  If the argument fails
   PyUnicode_Check, PyErr_BadArgument() is called and NULL is returned;
   otherwise the object's buffer and length are forwarded to
   PyUnicode_EncodeUTF16 with errors == NULL.
   NOTE(review): the final (byteorder) argument and the closing lines of
   this call are not visible in this extract -- confirm against the
   original file before relying on the byte order used. */
2714 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2716 if (!PyUnicode_Check(unicode)) {
2717 PyErr_BadArgument();
2718 return NULL;
2720 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2721 PyUnicode_GET_SIZE(unicode),
2722 NULL,
2726 /* --- Unicode Escape Codec ----------------------------------------------- */
2728 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2730 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2731 Py_ssize_t size,
2732 const char *errors)
2734 const char *starts = s;
2735 Py_ssize_t startinpos;
2736 Py_ssize_t endinpos;
2737 Py_ssize_t outpos;
2738 int i;
2739 PyUnicodeObject *v;
2740 Py_UNICODE *p;
2741 const char *end;
2742 char* message;
2743 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2744 PyObject *errorHandler = NULL;
2745 PyObject *exc = NULL;
2747 /* Escaped strings will always be longer than the resulting
2748 Unicode string, so we start with size here and then reduce the
2749 length after conversion to the true value.
2750 (but if the error callback returns a long replacement string
2751 we'll have to allocate more space) */
2752 v = _PyUnicode_New(size);
2753 if (v == NULL)
2754 goto onError;
2755 if (size == 0)
2756 return (PyObject *)v;
2758 p = PyUnicode_AS_UNICODE(v);
2759 end = s + size;
2761 while (s < end) {
2762 unsigned char c;
2763 Py_UNICODE x;
2764 int digits;
2766 /* Non-escape characters are interpreted as Unicode ordinals */
2767 if (*s != '\\') {
2768 *p++ = (unsigned char) *s++;
2769 continue;
2772 startinpos = s-starts;
2773 /* \ - Escapes */
2774 s++;
2775 c = *s++;
2776 if (s > end)
2777 c = '\0'; /* Invalid after \ */
2778 switch (c) {
2780 /* \x escapes */
2781 case '\n': break;
2782 case '\\': *p++ = '\\'; break;
2783 case '\'': *p++ = '\''; break;
2784 case '\"': *p++ = '\"'; break;
2785 case 'b': *p++ = '\b'; break;
2786 case 'f': *p++ = '\014'; break; /* FF */
2787 case 't': *p++ = '\t'; break;
2788 case 'n': *p++ = '\n'; break;
2789 case 'r': *p++ = '\r'; break;
2790 case 'v': *p++ = '\013'; break; /* VT */
2791 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2793 /* \OOO (octal) escapes */
2794 case '0': case '1': case '2': case '3':
2795 case '4': case '5': case '6': case '7':
2796 x = s[-1] - '0';
2797 if (s < end && '0' <= *s && *s <= '7') {
2798 x = (x<<3) + *s++ - '0';
2799 if (s < end && '0' <= *s && *s <= '7')
2800 x = (x<<3) + *s++ - '0';
2802 *p++ = x;
2803 break;
2805 /* hex escapes */
2806 /* \xXX */
2807 case 'x':
2808 digits = 2;
2809 message = "truncated \\xXX escape";
2810 goto hexescape;
2812 /* \uXXXX */
2813 case 'u':
2814 digits = 4;
2815 message = "truncated \\uXXXX escape";
2816 goto hexescape;
2818 /* \UXXXXXXXX */
2819 case 'U':
2820 digits = 8;
2821 message = "truncated \\UXXXXXXXX escape";
2822 hexescape:
2823 chr = 0;
2824 outpos = p-PyUnicode_AS_UNICODE(v);
2825 if (s+digits>end) {
2826 endinpos = size;
2827 if (unicode_decode_call_errorhandler(
2828 errors, &errorHandler,
2829 "unicodeescape", "end of string in escape sequence",
2830 starts, size, &startinpos, &endinpos, &exc, &s,
2831 &v, &outpos, &p))
2832 goto onError;
2833 goto nextByte;
2835 for (i = 0; i < digits; ++i) {
2836 c = (unsigned char) s[i];
2837 if (!isxdigit(c)) {
2838 endinpos = (s+i+1)-starts;
2839 if (unicode_decode_call_errorhandler(
2840 errors, &errorHandler,
2841 "unicodeescape", message,
2842 starts, size, &startinpos, &endinpos, &exc, &s,
2843 &v, &outpos, &p))
2844 goto onError;
2845 goto nextByte;
2847 chr = (chr<<4) & ~0xF;
2848 if (c >= '0' && c <= '9')
2849 chr += c - '0';
2850 else if (c >= 'a' && c <= 'f')
2851 chr += 10 + c - 'a';
2852 else
2853 chr += 10 + c - 'A';
2855 s += i;
2856 if (chr == 0xffffffff && PyErr_Occurred())
2857 /* _decoding_error will have already written into the
2858 target buffer. */
2859 break;
2860 store:
2861 /* when we get here, chr is a 32-bit unicode character */
2862 if (chr <= 0xffff)
2863 /* UCS-2 character */
2864 *p++ = (Py_UNICODE) chr;
2865 else if (chr <= 0x10ffff) {
2866 /* UCS-4 character. Either store directly, or as
2867 surrogate pair. */
2868 #ifdef Py_UNICODE_WIDE
2869 *p++ = chr;
2870 #else
2871 chr -= 0x10000L;
2872 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2873 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2874 #endif
2875 } else {
2876 endinpos = s-starts;
2877 outpos = p-PyUnicode_AS_UNICODE(v);
2878 if (unicode_decode_call_errorhandler(
2879 errors, &errorHandler,
2880 "unicodeescape", "illegal Unicode character",
2881 starts, size, &startinpos, &endinpos, &exc, &s,
2882 &v, &outpos, &p))
2883 goto onError;
2885 break;
2887 /* \N{name} */
2888 case 'N':
2889 message = "malformed \\N character escape";
2890 if (ucnhash_CAPI == NULL) {
2891 /* load the unicode data module */
2892 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2893 if (ucnhash_CAPI == NULL)
2894 goto ucnhashError;
2896 if (*s == '{') {
2897 const char *start = s+1;
2898 /* look for the closing brace */
2899 while (*s != '}' && s < end)
2900 s++;
2901 if (s > start && s < end && *s == '}') {
2902 /* found a name. look it up in the unicode database */
2903 message = "unknown Unicode character name";
2904 s++;
2905 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2906 goto store;
2909 endinpos = s-starts;
2910 outpos = p-PyUnicode_AS_UNICODE(v);
2911 if (unicode_decode_call_errorhandler(
2912 errors, &errorHandler,
2913 "unicodeescape", message,
2914 starts, size, &startinpos, &endinpos, &exc, &s,
2915 &v, &outpos, &p))
2916 goto onError;
2917 break;
2919 default:
2920 if (s > end) {
2921 message = "\\ at end of string";
2922 s--;
2923 endinpos = s-starts;
2924 outpos = p-PyUnicode_AS_UNICODE(v);
2925 if (unicode_decode_call_errorhandler(
2926 errors, &errorHandler,
2927 "unicodeescape", message,
2928 starts, size, &startinpos, &endinpos, &exc, &s,
2929 &v, &outpos, &p))
2930 goto onError;
2932 else {
2933 *p++ = '\\';
2934 *p++ = (unsigned char)s[-1];
2936 break;
2938 nextByte:
2941 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2942 goto onError;
2943 Py_XDECREF(errorHandler);
2944 Py_XDECREF(exc);
2945 return (PyObject *)v;
2947 ucnhashError:
2948 PyErr_SetString(
2949 PyExc_UnicodeError,
2950 "\\N escapes not supported (can't load unicodedata module)"
2952 Py_XDECREF(v);
2953 Py_XDECREF(errorHandler);
2954 Py_XDECREF(exc);
2955 return NULL;
2957 onError:
2958 Py_XDECREF(v);
2959 Py_XDECREF(errorHandler);
2960 Py_XDECREF(exc);
2961 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2971 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2972 Py_ssize_t size,
2973 Py_UNICODE ch)
2975 /* like wcschr, but doesn't stop at NULL characters */
2977 while (size-- > 0) {
2978 if (*s == ch)
2979 return s;
2980 s++;
2983 return NULL;
2986 static
2987 PyObject *unicodeescape_string(const Py_UNICODE *s,
2988 Py_ssize_t size,
2989 int quotes)
2991 PyObject *repr;
2992 char *p;
2994 static const char *hexdigit = "0123456789abcdef";
2995 #ifdef Py_UNICODE_WIDE
2996 const Py_ssize_t expandsize = 10;
2997 #else
2998 const Py_ssize_t expandsize = 6;
2999 #endif
3001 /* XXX(nnorwitz): rather than over-allocating, it would be
3002 better to choose a different scheme. Perhaps scan the
3003 first N-chars of the string and allocate based on that size.
3005 /* Initial allocation is based on the longest-possible unichr
3006 escape.
3008 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3009 unichr, so in this case it's the longest unichr escape. In
3010 narrow (UTF-16) builds this is five chars per source unichr
3011 since there are two unichrs in the surrogate pair, so in narrow
3012 (UTF-16) builds it's not the longest unichr escape.
3014 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3015 so in the narrow (UTF-16) build case it's the longest unichr
3016 escape.
3019 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3020 return PyErr_NoMemory();
3022 repr = PyString_FromStringAndSize(NULL,
3024 + expandsize*size
3025 + 1);
3026 if (repr == NULL)
3027 return NULL;
3029 p = PyString_AS_STRING(repr);
3031 if (quotes) {
3032 *p++ = 'u';
3033 *p++ = (findchar(s, size, '\'') &&
3034 !findchar(s, size, '"')) ? '"' : '\'';
3036 while (size-- > 0) {
3037 Py_UNICODE ch = *s++;
3039 /* Escape quotes and backslashes */
3040 if ((quotes &&
3041 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3042 *p++ = '\\';
3043 *p++ = (char) ch;
3044 continue;
3047 #ifdef Py_UNICODE_WIDE
3048 /* Map 21-bit characters to '\U00xxxxxx' */
3049 else if (ch >= 0x10000) {
3050 *p++ = '\\';
3051 *p++ = 'U';
3052 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3058 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3059 *p++ = hexdigit[ch & 0x0000000F];
3060 continue;
3062 #else
3063 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3064 else if (ch >= 0xD800 && ch < 0xDC00) {
3065 Py_UNICODE ch2;
3066 Py_UCS4 ucs;
3068 ch2 = *s++;
3069 size--;
3070 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3071 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3072 *p++ = '\\';
3073 *p++ = 'U';
3074 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3080 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3081 *p++ = hexdigit[ucs & 0x0000000F];
3082 continue;
3084 /* Fall through: isolated surrogates are copied as-is */
3085 s--;
3086 size++;
3088 #endif
3090 /* Map 16-bit characters to '\uxxxx' */
3091 if (ch >= 256) {
3092 *p++ = '\\';
3093 *p++ = 'u';
3094 *p++ = hexdigit[(ch >> 12) & 0x000F];
3095 *p++ = hexdigit[(ch >> 8) & 0x000F];
3096 *p++ = hexdigit[(ch >> 4) & 0x000F];
3097 *p++ = hexdigit[ch & 0x000F];
3100 /* Map special whitespace to '\t', \n', '\r' */
3101 else if (ch == '\t') {
3102 *p++ = '\\';
3103 *p++ = 't';
3105 else if (ch == '\n') {
3106 *p++ = '\\';
3107 *p++ = 'n';
3109 else if (ch == '\r') {
3110 *p++ = '\\';
3111 *p++ = 'r';
3114 /* Map non-printable US ASCII to '\xhh' */
3115 else if (ch < ' ' || ch >= 0x7F) {
3116 *p++ = '\\';
3117 *p++ = 'x';
3118 *p++ = hexdigit[(ch >> 4) & 0x000F];
3119 *p++ = hexdigit[ch & 0x000F];
3122 /* Copy everything else as-is */
3123 else
3124 *p++ = (char) ch;
3126 if (quotes)
3127 *p++ = PyString_AS_STRING(repr)[1];
3129 *p = '\0';
3130 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3131 return NULL;
3132 return repr;
3135 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3136 Py_ssize_t size)
3138 return unicodeescape_string(s, size, 0);
3141 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
3145 return NULL;
3147 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3148 PyUnicode_GET_SIZE(unicode));
3151 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3153 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3154 Py_ssize_t size,
3155 const char *errors)
3157 const char *starts = s;
3158 Py_ssize_t startinpos;
3159 Py_ssize_t endinpos;
3160 Py_ssize_t outpos;
3161 PyUnicodeObject *v;
3162 Py_UNICODE *p;
3163 const char *end;
3164 const char *bs;
3165 PyObject *errorHandler = NULL;
3166 PyObject *exc = NULL;
3168 /* Escaped strings will always be longer than the resulting
3169 Unicode string, so we start with size here and then reduce the
3170 length after conversion to the true value. (But decoding error
3171 handler might have to resize the string) */
3172 v = _PyUnicode_New(size);
3173 if (v == NULL)
3174 goto onError;
3175 if (size == 0)
3176 return (PyObject *)v;
3177 p = PyUnicode_AS_UNICODE(v);
3178 end = s + size;
3179 while (s < end) {
3180 unsigned char c;
3181 Py_UCS4 x;
3182 int i;
3183 int count;
3185 /* Non-escape characters are interpreted as Unicode ordinals */
3186 if (*s != '\\') {
3187 *p++ = (unsigned char)*s++;
3188 continue;
3190 startinpos = s-starts;
3192 /* \u-escapes are only interpreted iff the number of leading
3193 backslashes if odd */
3194 bs = s;
3195 for (;s < end;) {
3196 if (*s != '\\')
3197 break;
3198 *p++ = (unsigned char)*s++;
3200 if (((s - bs) & 1) == 0 ||
3201 s >= end ||
3202 (*s != 'u' && *s != 'U')) {
3203 continue;
3205 p--;
3206 count = *s=='u' ? 4 : 8;
3207 s++;
3209 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3210 outpos = p-PyUnicode_AS_UNICODE(v);
3211 for (x = 0, i = 0; i < count; ++i, ++s) {
3212 c = (unsigned char)*s;
3213 if (!isxdigit(c)) {
3214 endinpos = s-starts;
3215 if (unicode_decode_call_errorhandler(
3216 errors, &errorHandler,
3217 "rawunicodeescape", "truncated \\uXXXX",
3218 starts, size, &startinpos, &endinpos, &exc, &s,
3219 &v, &outpos, &p))
3220 goto onError;
3221 goto nextByte;
3223 x = (x<<4) & ~0xF;
3224 if (c >= '0' && c <= '9')
3225 x += c - '0';
3226 else if (c >= 'a' && c <= 'f')
3227 x += 10 + c - 'a';
3228 else
3229 x += 10 + c - 'A';
3231 if (x <= 0xffff)
3232 /* UCS-2 character */
3233 *p++ = (Py_UNICODE) x;
3234 else if (x <= 0x10ffff) {
3235 /* UCS-4 character. Either store directly, or as
3236 surrogate pair. */
3237 #ifdef Py_UNICODE_WIDE
3238 *p++ = (Py_UNICODE) x;
3239 #else
3240 x -= 0x10000L;
3241 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3242 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3243 #endif
3244 } else {
3245 endinpos = s-starts;
3246 outpos = p-PyUnicode_AS_UNICODE(v);
3247 if (unicode_decode_call_errorhandler(
3248 errors, &errorHandler,
3249 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3250 starts, size, &startinpos, &endinpos, &exc, &s,
3251 &v, &outpos, &p))
3252 goto onError;
3254 nextByte:
3257 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3258 goto onError;
3259 Py_XDECREF(errorHandler);
3260 Py_XDECREF(exc);
3261 return (PyObject *)v;
3263 onError:
3264 Py_XDECREF(v);
3265 Py_XDECREF(errorHandler);
3266 Py_XDECREF(exc);
3267 return NULL;
3270 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3271 Py_ssize_t size)
3273 PyObject *repr;
3274 char *p;
3275 char *q;
3277 static const char *hexdigit = "0123456789abcdef";
3278 #ifdef Py_UNICODE_WIDE
3279 const Py_ssize_t expandsize = 10;
3280 #else
3281 const Py_ssize_t expandsize = 6;
3282 #endif
3284 if (size > PY_SSIZE_T_MAX / expandsize)
3285 return PyErr_NoMemory();
3287 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3288 if (repr == NULL)
3289 return NULL;
3290 if (size == 0)
3291 return repr;
3293 p = q = PyString_AS_STRING(repr);
3294 while (size-- > 0) {
3295 Py_UNICODE ch = *s++;
3296 #ifdef Py_UNICODE_WIDE
3297 /* Map 32-bit characters to '\Uxxxxxxxx' */
3298 if (ch >= 0x10000) {
3299 *p++ = '\\';
3300 *p++ = 'U';
3301 *p++ = hexdigit[(ch >> 28) & 0xf];
3302 *p++ = hexdigit[(ch >> 24) & 0xf];
3303 *p++ = hexdigit[(ch >> 20) & 0xf];
3304 *p++ = hexdigit[(ch >> 16) & 0xf];
3305 *p++ = hexdigit[(ch >> 12) & 0xf];
3306 *p++ = hexdigit[(ch >> 8) & 0xf];
3307 *p++ = hexdigit[(ch >> 4) & 0xf];
3308 *p++ = hexdigit[ch & 15];
3310 else
3311 #else
3312 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3313 if (ch >= 0xD800 && ch < 0xDC00) {
3314 Py_UNICODE ch2;
3315 Py_UCS4 ucs;
3317 ch2 = *s++;
3318 size--;
3319 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3320 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3321 *p++ = '\\';
3322 *p++ = 'U';
3323 *p++ = hexdigit[(ucs >> 28) & 0xf];
3324 *p++ = hexdigit[(ucs >> 24) & 0xf];
3325 *p++ = hexdigit[(ucs >> 20) & 0xf];
3326 *p++ = hexdigit[(ucs >> 16) & 0xf];
3327 *p++ = hexdigit[(ucs >> 12) & 0xf];
3328 *p++ = hexdigit[(ucs >> 8) & 0xf];
3329 *p++ = hexdigit[(ucs >> 4) & 0xf];
3330 *p++ = hexdigit[ucs & 0xf];
3331 continue;
3333 /* Fall through: isolated surrogates are copied as-is */
3334 s--;
3335 size++;
3337 #endif
3338 /* Map 16-bit characters to '\uxxxx' */
3339 if (ch >= 256) {
3340 *p++ = '\\';
3341 *p++ = 'u';
3342 *p++ = hexdigit[(ch >> 12) & 0xf];
3343 *p++ = hexdigit[(ch >> 8) & 0xf];
3344 *p++ = hexdigit[(ch >> 4) & 0xf];
3345 *p++ = hexdigit[ch & 15];
3347 /* Copy everything else as-is */
3348 else
3349 *p++ = (char) ch;
3351 *p = '\0';
3352 if (_PyString_Resize(&repr, p - q))
3353 return NULL;
3354 return repr;
3357 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3359 if (!PyUnicode_Check(unicode)) {
3360 PyErr_BadArgument();
3361 return NULL;
3363 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3364 PyUnicode_GET_SIZE(unicode));
3367 /* --- Unicode Internal Codec ------------------------------------------- */
3369 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3370 Py_ssize_t size,
3371 const char *errors)
3373 const char *starts = s;
3374 Py_ssize_t startinpos;
3375 Py_ssize_t endinpos;
3376 Py_ssize_t outpos;
3377 PyUnicodeObject *v;
3378 Py_UNICODE *p;
3379 const char *end;
3380 const char *reason;
3381 PyObject *errorHandler = NULL;
3382 PyObject *exc = NULL;
3384 #ifdef Py_UNICODE_WIDE
3385 Py_UNICODE unimax = PyUnicode_GetMax();
3386 #endif
3388 /* XXX overflow detection missing */
3389 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3390 if (v == NULL)
3391 goto onError;
3392 if (PyUnicode_GetSize((PyObject *)v) == 0)
3393 return (PyObject *)v;
3394 p = PyUnicode_AS_UNICODE(v);
3395 end = s + size;
3397 while (s < end) {
3398 memcpy(p, s, sizeof(Py_UNICODE));
3399 /* We have to sanity check the raw data, otherwise doom looms for
3400 some malformed UCS-4 data. */
3401 if (
3402 #ifdef Py_UNICODE_WIDE
3403 *p > unimax || *p < 0 ||
3404 #endif
3405 end-s < Py_UNICODE_SIZE
3408 startinpos = s - starts;
3409 if (end-s < Py_UNICODE_SIZE) {
3410 endinpos = end-starts;
3411 reason = "truncated input";
3413 else {
3414 endinpos = s - starts + Py_UNICODE_SIZE;
3415 reason = "illegal code point (> 0x10FFFF)";
3417 outpos = p - PyUnicode_AS_UNICODE(v);
3418 if (unicode_decode_call_errorhandler(
3419 errors, &errorHandler,
3420 "unicode_internal", reason,
3421 starts, size, &startinpos, &endinpos, &exc, &s,
3422 &v, &outpos, &p)) {
3423 goto onError;
3426 else {
3427 p++;
3428 s += Py_UNICODE_SIZE;
3432 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3433 goto onError;
3434 Py_XDECREF(errorHandler);
3435 Py_XDECREF(exc);
3436 return (PyObject *)v;
3438 onError:
3439 Py_XDECREF(v);
3440 Py_XDECREF(errorHandler);
3441 Py_XDECREF(exc);
3442 return NULL;
3445 /* --- Latin-1 Codec ------------------------------------------------------ */
3447 PyObject *PyUnicode_DecodeLatin1(const char *s,
3448 Py_ssize_t size,
3449 const char *errors)
3451 PyUnicodeObject *v;
3452 Py_UNICODE *p;
3454 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3455 if (size == 1) {
3456 Py_UNICODE r = *(unsigned char*)s;
3457 return PyUnicode_FromUnicode(&r, 1);
3460 v = _PyUnicode_New(size);
3461 if (v == NULL)
3462 goto onError;
3463 if (size == 0)
3464 return (PyObject *)v;
3465 p = PyUnicode_AS_UNICODE(v);
3466 while (size-- > 0)
3467 *p++ = (unsigned char)*s++;
3468 return (PyObject *)v;
3470 onError:
3471 Py_XDECREF(v);
3472 return NULL;
3475 /* create or adjust a UnicodeEncodeError */
3476 static void make_encode_exception(PyObject **exceptionObject,
3477 const char *encoding,
3478 const Py_UNICODE *unicode, Py_ssize_t size,
3479 Py_ssize_t startpos, Py_ssize_t endpos,
3480 const char *reason)
3482 if (*exceptionObject == NULL) {
3483 *exceptionObject = PyUnicodeEncodeError_Create(
3484 encoding, unicode, size, startpos, endpos, reason);
3486 else {
3487 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3488 goto onError;
3489 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3490 goto onError;
3491 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3492 goto onError;
3493 return;
3494 onError:
3495 Py_DECREF(*exceptionObject);
3496 *exceptionObject = NULL;
3500 /* raises a UnicodeEncodeError */
3501 static void raise_encode_exception(PyObject **exceptionObject,
3502 const char *encoding,
3503 const Py_UNICODE *unicode, Py_ssize_t size,
3504 Py_ssize_t startpos, Py_ssize_t endpos,
3505 const char *reason)
3507 make_encode_exception(exceptionObject,
3508 encoding, unicode, size, startpos, endpos, reason);
3509 if (*exceptionObject != NULL)
3510 PyCodec_StrictErrors(*exceptionObject);
3513 /* error handling callback helper:
3514 build arguments, call the callback and check the arguments,
3515 put the result into newpos and return the replacement string, which
3516 has to be freed by the caller */
3517 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3518 PyObject **errorHandler,
3519 const char *encoding, const char *reason,
3520 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3521 Py_ssize_t startpos, Py_ssize_t endpos,
3522 Py_ssize_t *newpos)
3524 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3526 PyObject *restuple;
3527 PyObject *resunicode;
3529 if (*errorHandler == NULL) {
3530 *errorHandler = PyCodec_LookupError(errors);
3531 if (*errorHandler == NULL)
3532 return NULL;
3535 make_encode_exception(exceptionObject,
3536 encoding, unicode, size, startpos, endpos, reason);
3537 if (*exceptionObject == NULL)
3538 return NULL;
3540 restuple = PyObject_CallFunctionObjArgs(
3541 *errorHandler, *exceptionObject, NULL);
3542 if (restuple == NULL)
3543 return NULL;
3544 if (!PyTuple_Check(restuple)) {
3545 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3546 Py_DECREF(restuple);
3547 return NULL;
3549 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3550 &resunicode, newpos)) {
3551 Py_DECREF(restuple);
3552 return NULL;
3554 if (*newpos<0)
3555 *newpos = size+*newpos;
3556 if (*newpos<0 || *newpos>size) {
3557 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3558 Py_DECREF(restuple);
3559 return NULL;
3561 Py_INCREF(resunicode);
3562 Py_DECREF(restuple);
3563 return resunicode;
3566 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3567 Py_ssize_t size,
3568 const char *errors,
3569 int limit)
3571 /* output object */
3572 PyObject *res;
3573 /* pointers to the beginning and end+1 of input */
3574 const Py_UNICODE *startp = p;
3575 const Py_UNICODE *endp = p + size;
3576 /* pointer to the beginning of the unencodable characters */
3577 /* const Py_UNICODE *badp = NULL; */
3578 /* pointer into the output */
3579 char *str;
3580 /* current output position */
3581 Py_ssize_t respos = 0;
3582 Py_ssize_t ressize;
3583 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3584 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3585 PyObject *errorHandler = NULL;
3586 PyObject *exc = NULL;
3587 /* the following variable is used for caching string comparisons
3588 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3589 int known_errorHandler = -1;
3591 /* allocate enough for a simple encoding without
3592 replacements, if we need more, we'll resize */
3593 res = PyString_FromStringAndSize(NULL, size);
3594 if (res == NULL)
3595 goto onError;
3596 if (size == 0)
3597 return res;
3598 str = PyString_AS_STRING(res);
3599 ressize = size;
3601 while (p<endp) {
3602 Py_UNICODE c = *p;
3604 /* can we encode this? */
3605 if (c<limit) {
3606 /* no overflow check, because we know that the space is enough */
3607 *str++ = (char)c;
3608 ++p;
3610 else {
3611 Py_ssize_t unicodepos = p-startp;
3612 Py_ssize_t requiredsize;
3613 PyObject *repunicode;
3614 Py_ssize_t repsize;
3615 Py_ssize_t newpos;
3616 Py_ssize_t respos;
3617 Py_UNICODE *uni2;
3618 /* startpos for collecting unencodable chars */
3619 const Py_UNICODE *collstart = p;
3620 const Py_UNICODE *collend = p;
3621 /* find all unecodable characters */
3622 while ((collend < endp) && ((*collend)>=limit))
3623 ++collend;
3624 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3625 if (known_errorHandler==-1) {
3626 if ((errors==NULL) || (!strcmp(errors, "strict")))
3627 known_errorHandler = 1;
3628 else if (!strcmp(errors, "replace"))
3629 known_errorHandler = 2;
3630 else if (!strcmp(errors, "ignore"))
3631 known_errorHandler = 3;
3632 else if (!strcmp(errors, "xmlcharrefreplace"))
3633 known_errorHandler = 4;
3634 else
3635 known_errorHandler = 0;
3637 switch (known_errorHandler) {
3638 case 1: /* strict */
3639 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3640 goto onError;
3641 case 2: /* replace */
3642 while (collstart++<collend)
3643 *str++ = '?'; /* fall through */
3644 case 3: /* ignore */
3645 p = collend;
3646 break;
3647 case 4: /* xmlcharrefreplace */
3648 respos = str-PyString_AS_STRING(res);
3649 /* determine replacement size (temporarily (mis)uses p) */
3650 for (p = collstart, repsize = 0; p < collend; ++p) {
3651 if (*p<10)
3652 repsize += 2+1+1;
3653 else if (*p<100)
3654 repsize += 2+2+1;
3655 else if (*p<1000)
3656 repsize += 2+3+1;
3657 else if (*p<10000)
3658 repsize += 2+4+1;
3659 #ifndef Py_UNICODE_WIDE
3660 else
3661 repsize += 2+5+1;
3662 #else
3663 else if (*p<100000)
3664 repsize += 2+5+1;
3665 else if (*p<1000000)
3666 repsize += 2+6+1;
3667 else
3668 repsize += 2+7+1;
3669 #endif
3671 requiredsize = respos+repsize+(endp-collend);
3672 if (requiredsize > ressize) {
3673 if (requiredsize<2*ressize)
3674 requiredsize = 2*ressize;
3675 if (_PyString_Resize(&res, requiredsize))
3676 goto onError;
3677 str = PyString_AS_STRING(res) + respos;
3678 ressize = requiredsize;
3680 /* generate replacement (temporarily (mis)uses p) */
3681 for (p = collstart; p < collend; ++p) {
3682 str += sprintf(str, "&#%d;", (int)*p);
3684 p = collend;
3685 break;
3686 default:
3687 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3688 encoding, reason, startp, size, &exc,
3689 collstart-startp, collend-startp, &newpos);
3690 if (repunicode == NULL)
3691 goto onError;
3692 /* need more space? (at least enough for what we have+the
3693 replacement+the rest of the string, so we won't have to
3694 check space for encodable characters) */
3695 respos = str-PyString_AS_STRING(res);
3696 repsize = PyUnicode_GET_SIZE(repunicode);
3697 requiredsize = respos+repsize+(endp-collend);
3698 if (requiredsize > ressize) {
3699 if (requiredsize<2*ressize)
3700 requiredsize = 2*ressize;
3701 if (_PyString_Resize(&res, requiredsize)) {
3702 Py_DECREF(repunicode);
3703 goto onError;
3705 str = PyString_AS_STRING(res) + respos;
3706 ressize = requiredsize;
3708 /* check if there is anything unencodable in the replacement
3709 and copy it to the output */
3710 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3711 c = *uni2;
3712 if (c >= limit) {
3713 raise_encode_exception(&exc, encoding, startp, size,
3714 unicodepos, unicodepos+1, reason);
3715 Py_DECREF(repunicode);
3716 goto onError;
3718 *str = (char)c;
3720 p = startp + newpos;
3721 Py_DECREF(repunicode);
3725 /* Resize if we allocated to much */
3726 respos = str-PyString_AS_STRING(res);
3727 if (respos<ressize)
3728 /* If this falls res will be NULL */
3729 _PyString_Resize(&res, respos);
3730 Py_XDECREF(errorHandler);
3731 Py_XDECREF(exc);
3732 return res;
3734 onError:
3735 Py_XDECREF(res);
3736 Py_XDECREF(errorHandler);
3737 Py_XDECREF(exc);
3738 return NULL;
3741 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3742 Py_ssize_t size,
3743 const char *errors)
3745 return unicode_encode_ucs1(p, size, errors, 256);
3748 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3750 if (!PyUnicode_Check(unicode)) {
3751 PyErr_BadArgument();
3752 return NULL;
3754 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3755 PyUnicode_GET_SIZE(unicode),
3756 NULL);
3759 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3761 PyObject *PyUnicode_DecodeASCII(const char *s,
3762 Py_ssize_t size,
3763 const char *errors)
3765 const char *starts = s;
3766 PyUnicodeObject *v;
3767 Py_UNICODE *p;
3768 Py_ssize_t startinpos;
3769 Py_ssize_t endinpos;
3770 Py_ssize_t outpos;
3771 const char *e;
3772 PyObject *errorHandler = NULL;
3773 PyObject *exc = NULL;
3775 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3776 if (size == 1 && *(unsigned char*)s < 128) {
3777 Py_UNICODE r = *(unsigned char*)s;
3778 return PyUnicode_FromUnicode(&r, 1);
3781 v = _PyUnicode_New(size);
3782 if (v == NULL)
3783 goto onError;
3784 if (size == 0)
3785 return (PyObject *)v;
3786 p = PyUnicode_AS_UNICODE(v);
3787 e = s + size;
3788 while (s < e) {
3789 register unsigned char c = (unsigned char)*s;
3790 if (c < 128) {
3791 *p++ = c;
3792 ++s;
3794 else {
3795 startinpos = s-starts;
3796 endinpos = startinpos + 1;
3797 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3798 if (unicode_decode_call_errorhandler(
3799 errors, &errorHandler,
3800 "ascii", "ordinal not in range(128)",
3801 starts, size, &startinpos, &endinpos, &exc, &s,
3802 &v, &outpos, &p))
3803 goto onError;
3806 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3807 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3808 goto onError;
3809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
3811 return (PyObject *)v;
3813 onError:
3814 Py_XDECREF(v);
3815 Py_XDECREF(errorHandler);
3816 Py_XDECREF(exc);
3817 return NULL;
3820 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3821 Py_ssize_t size,
3822 const char *errors)
3824 return unicode_encode_ucs1(p, size, errors, 128);
3827 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3829 if (!PyUnicode_Check(unicode)) {
3830 PyErr_BadArgument();
3831 return NULL;
3833 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3834 PyUnicode_GET_SIZE(unicode),
3835 NULL);
3838 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3840 /* --- MBCS codecs for Windows -------------------------------------------- */
3842 #if SIZEOF_INT < SIZEOF_SIZE_T
3843 #define NEED_RETRY
3844 #endif
3846 /* XXX This code is limited to "true" double-byte encodings, as
3847 a) it assumes an incomplete character consists of a single byte, and
3848 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3849 encodings, see IsDBCSLeadByteEx documentation. */
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when one of these holds for the character CharPrev() finds before it:
   there is none (CharPrev returned the same position), its first byte is
   not itself a lead byte, or it starts exactly two bytes earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3863 * Decode MBCS string into unicode object. If 'final' is set, converts
3864 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3866 static int decode_mbcs(PyUnicodeObject **v,
3867 const char *s, /* MBCS string */
3868 int size, /* sizeof MBCS string */
3869 int final)
3871 Py_UNICODE *p;
3872 Py_ssize_t n = 0;
3873 int usize = 0;
3875 assert(size >= 0);
3877 /* Skip trailing lead-byte unless 'final' is set */
3878 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3879 --size;
3881 /* First get the size of the result */
3882 if (size > 0) {
3883 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3884 if (usize == 0) {
3885 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3886 return -1;
3890 if (*v == NULL) {
3891 /* Create unicode object */
3892 *v = _PyUnicode_New(usize);
3893 if (*v == NULL)
3894 return -1;
3896 else {
3897 /* Extend unicode object */
3898 n = PyUnicode_GET_SIZE(*v);
3899 if (_PyUnicode_Resize(v, n + usize) < 0)
3900 return -1;
3903 /* Do the conversion */
3904 if (size > 0) {
3905 p = PyUnicode_AS_UNICODE(*v) + n;
3906 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3907 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3908 return -1;
3912 return size;
3915 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3916 Py_ssize_t size,
3917 const char *errors,
3918 Py_ssize_t *consumed)
3920 PyUnicodeObject *v = NULL;
3921 int done;
3923 if (consumed)
3924 *consumed = 0;
3926 #ifdef NEED_RETRY
3927 retry:
3928 if (size > INT_MAX)
3929 done = decode_mbcs(&v, s, INT_MAX, 0);
3930 else
3931 #endif
3932 done = decode_mbcs(&v, s, (int)size, !consumed);
3934 if (done < 0) {
3935 Py_XDECREF(v);
3936 return NULL;
3939 if (consumed)
3940 *consumed += done;
3942 #ifdef NEED_RETRY
3943 if (size > INT_MAX) {
3944 s += done;
3945 size -= done;
3946 goto retry;
3948 #endif
3950 return (PyObject *)v;
3953 PyObject *PyUnicode_DecodeMBCS(const char *s,
3954 Py_ssize_t size,
3955 const char *errors)
3957 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3961 * Convert unicode into string object (MBCS).
3962 * Returns 0 if succeed, -1 otherwise.
3964 static int encode_mbcs(PyObject **repr,
3965 const Py_UNICODE *p, /* unicode */
3966 int size) /* size of unicode */
3968 int mbcssize = 0;
3969 Py_ssize_t n = 0;
3971 assert(size >= 0);
3973 /* First get the size of the result */
3974 if (size > 0) {
3975 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3976 if (mbcssize == 0) {
3977 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3978 return -1;
3982 if (*repr == NULL) {
3983 /* Create string object */
3984 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3985 if (*repr == NULL)
3986 return -1;
3988 else {
3989 /* Extend string object */
3990 n = PyString_Size(*repr);
3991 if (_PyString_Resize(repr, n + mbcssize) < 0)
3992 return -1;
3995 /* Do the conversion */
3996 if (size > 0) {
3997 char *s = PyString_AS_STRING(*repr) + n;
3998 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3999 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4000 return -1;
4004 return 0;
4007 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4008 Py_ssize_t size,
4009 const char *errors)
4011 PyObject *repr = NULL;
4012 int ret;
4014 #ifdef NEED_RETRY
4015 retry:
4016 if (size > INT_MAX)
4017 ret = encode_mbcs(&repr, p, INT_MAX);
4018 else
4019 #endif
4020 ret = encode_mbcs(&repr, p, (int)size);
4022 if (ret < 0) {
4023 Py_XDECREF(repr);
4024 return NULL;
4027 #ifdef NEED_RETRY
4028 if (size > INT_MAX) {
4029 p += INT_MAX;
4030 size -= INT_MAX;
4031 goto retry;
4033 #endif
4035 return repr;
4038 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4040 if (!PyUnicode_Check(unicode)) {
4041 PyErr_BadArgument();
4042 return NULL;
4044 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4045 PyUnicode_GET_SIZE(unicode),
4046 NULL);
4049 #undef NEED_RETRY
4051 #endif /* MS_WINDOWS */
4053 /* --- Character Mapping Codec -------------------------------------------- */
4055 PyObject *PyUnicode_DecodeCharmap(const char *s,
4056 Py_ssize_t size,
4057 PyObject *mapping,
4058 const char *errors)
4060 const char *starts = s;
4061 Py_ssize_t startinpos;
4062 Py_ssize_t endinpos;
4063 Py_ssize_t outpos;
4064 const char *e;
4065 PyUnicodeObject *v;
4066 Py_UNICODE *p;
4067 Py_ssize_t extrachars = 0;
4068 PyObject *errorHandler = NULL;
4069 PyObject *exc = NULL;
4070 Py_UNICODE *mapstring = NULL;
4071 Py_ssize_t maplen = 0;
4073 /* Default to Latin-1 */
4074 if (mapping == NULL)
4075 return PyUnicode_DecodeLatin1(s, size, errors);
4077 v = _PyUnicode_New(size);
4078 if (v == NULL)
4079 goto onError;
4080 if (size == 0)
4081 return (PyObject *)v;
4082 p = PyUnicode_AS_UNICODE(v);
4083 e = s + size;
4084 if (PyUnicode_CheckExact(mapping)) {
4085 mapstring = PyUnicode_AS_UNICODE(mapping);
4086 maplen = PyUnicode_GET_SIZE(mapping);
4087 while (s < e) {
4088 unsigned char ch = *s;
4089 Py_UNICODE x = 0xfffe; /* illegal value */
4091 if (ch < maplen)
4092 x = mapstring[ch];
4094 if (x == 0xfffe) {
4095 /* undefined mapping */
4096 outpos = p-PyUnicode_AS_UNICODE(v);
4097 startinpos = s-starts;
4098 endinpos = startinpos+1;
4099 if (unicode_decode_call_errorhandler(
4100 errors, &errorHandler,
4101 "charmap", "character maps to <undefined>",
4102 starts, size, &startinpos, &endinpos, &exc, &s,
4103 &v, &outpos, &p)) {
4104 goto onError;
4106 continue;
4108 *p++ = x;
4109 ++s;
4112 else {
4113 while (s < e) {
4114 unsigned char ch = *s;
4115 PyObject *w, *x;
4117 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4118 w = PyInt_FromLong((long)ch);
4119 if (w == NULL)
4120 goto onError;
4121 x = PyObject_GetItem(mapping, w);
4122 Py_DECREF(w);
4123 if (x == NULL) {
4124 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4125 /* No mapping found means: mapping is undefined. */
4126 PyErr_Clear();
4127 x = Py_None;
4128 Py_INCREF(x);
4129 } else
4130 goto onError;
4133 /* Apply mapping */
4134 if (PyInt_Check(x)) {
4135 long value = PyInt_AS_LONG(x);
4136 if (value < 0 || value > 65535) {
4137 PyErr_SetString(PyExc_TypeError,
4138 "character mapping must be in range(65536)");
4139 Py_DECREF(x);
4140 goto onError;
4142 *p++ = (Py_UNICODE)value;
4144 else if (x == Py_None) {
4145 /* undefined mapping */
4146 outpos = p-PyUnicode_AS_UNICODE(v);
4147 startinpos = s-starts;
4148 endinpos = startinpos+1;
4149 if (unicode_decode_call_errorhandler(
4150 errors, &errorHandler,
4151 "charmap", "character maps to <undefined>",
4152 starts, size, &startinpos, &endinpos, &exc, &s,
4153 &v, &outpos, &p)) {
4154 Py_DECREF(x);
4155 goto onError;
4157 Py_DECREF(x);
4158 continue;
4160 else if (PyUnicode_Check(x)) {
4161 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4163 if (targetsize == 1)
4164 /* 1-1 mapping */
4165 *p++ = *PyUnicode_AS_UNICODE(x);
4167 else if (targetsize > 1) {
4168 /* 1-n mapping */
4169 if (targetsize > extrachars) {
4170 /* resize first */
4171 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4172 Py_ssize_t needed = (targetsize - extrachars) + \
4173 (targetsize << 2);
4174 extrachars += needed;
4175 /* XXX overflow detection missing */
4176 if (_PyUnicode_Resize(&v,
4177 PyUnicode_GET_SIZE(v) + needed) < 0) {
4178 Py_DECREF(x);
4179 goto onError;
4181 p = PyUnicode_AS_UNICODE(v) + oldpos;
4183 Py_UNICODE_COPY(p,
4184 PyUnicode_AS_UNICODE(x),
4185 targetsize);
4186 p += targetsize;
4187 extrachars -= targetsize;
4189 /* 1-0 mapping: skip the character */
4191 else {
4192 /* wrong return value */
4193 PyErr_SetString(PyExc_TypeError,
4194 "character mapping must return integer, None or unicode");
4195 Py_DECREF(x);
4196 goto onError;
4198 Py_DECREF(x);
4199 ++s;
4202 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4203 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4204 goto onError;
4205 Py_XDECREF(errorHandler);
4206 Py_XDECREF(exc);
4207 return (PyObject *)v;
4209 onError:
4210 Py_XDECREF(errorHandler);
4211 Py_XDECREF(exc);
4212 Py_XDECREF(v);
4213 return NULL;
4216 /* Charmap encoding: the lookup table */
4218 struct encoding_map{
4219 PyObject_HEAD
4220 unsigned char level1[32];
4221 int count2, count3;
4222 unsigned char level23[1];
4225 static PyObject*
4226 encoding_map_size(PyObject *obj, PyObject* args)
4228 struct encoding_map *map = (struct encoding_map*)obj;
4229 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4230 128*map->count3);
4233 static PyMethodDef encoding_map_methods[] = {
4234 {"size", encoding_map_size, METH_NOARGS,
4235 PyDoc_STR("Return the size (in bytes) of this object") },
4236 { 0 }
4239 static void
4240 encoding_map_dealloc(PyObject* o)
4242 PyObject_FREE(o);
4245 static PyTypeObject EncodingMapType = {
4246 PyVarObject_HEAD_INIT(NULL, 0)
4247 "EncodingMap", /*tp_name*/
4248 sizeof(struct encoding_map), /*tp_basicsize*/
4249 0, /*tp_itemsize*/
4250 /* methods */
4251 encoding_map_dealloc, /*tp_dealloc*/
4252 0, /*tp_print*/
4253 0, /*tp_getattr*/
4254 0, /*tp_setattr*/
4255 0, /*tp_compare*/
4256 0, /*tp_repr*/
4257 0, /*tp_as_number*/
4258 0, /*tp_as_sequence*/
4259 0, /*tp_as_mapping*/
4260 0, /*tp_hash*/
4261 0, /*tp_call*/
4262 0, /*tp_str*/
4263 0, /*tp_getattro*/
4264 0, /*tp_setattro*/
4265 0, /*tp_as_buffer*/
4266 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4267 0, /*tp_doc*/
4268 0, /*tp_traverse*/
4269 0, /*tp_clear*/
4270 0, /*tp_richcompare*/
4271 0, /*tp_weaklistoffset*/
4272 0, /*tp_iter*/
4273 0, /*tp_iternext*/
4274 encoding_map_methods, /*tp_methods*/
4275 0, /*tp_members*/
4276 0, /*tp_getset*/
4277 0, /*tp_base*/
4278 0, /*tp_dict*/
4279 0, /*tp_descr_get*/
4280 0, /*tp_descr_set*/
4281 0, /*tp_dictoffset*/
4282 0, /*tp_init*/
4283 0, /*tp_alloc*/
4284 0, /*tp_new*/
4285 0, /*tp_free*/
4286 0, /*tp_is_gc*/
4289 PyObject*
4290 PyUnicode_BuildEncodingMap(PyObject* string)
4292 Py_UNICODE *decode;
4293 PyObject *result;
4294 struct encoding_map *mresult;
4295 int i;
4296 int need_dict = 0;
4297 unsigned char level1[32];
4298 unsigned char level2[512];
4299 unsigned char *mlevel1, *mlevel2, *mlevel3;
4300 int count2 = 0, count3 = 0;
4302 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4303 PyErr_BadArgument();
4304 return NULL;
4306 decode = PyUnicode_AS_UNICODE(string);
4307 memset(level1, 0xFF, sizeof level1);
4308 memset(level2, 0xFF, sizeof level2);
4310 /* If there isn't a one-to-one mapping of NULL to \0,
4311 or if there are non-BMP characters, we need to use
4312 a mapping dictionary. */
4313 if (decode[0] != 0)
4314 need_dict = 1;
4315 for (i = 1; i < 256; i++) {
4316 int l1, l2;
4317 if (decode[i] == 0
4318 #ifdef Py_UNICODE_WIDE
4319 || decode[i] > 0xFFFF
4320 #endif
4322 need_dict = 1;
4323 break;
4325 if (decode[i] == 0xFFFE)
4326 /* unmapped character */
4327 continue;
4328 l1 = decode[i] >> 11;
4329 l2 = decode[i] >> 7;
4330 if (level1[l1] == 0xFF)
4331 level1[l1] = count2++;
4332 if (level2[l2] == 0xFF)
4333 level2[l2] = count3++;
4336 if (count2 >= 0xFF || count3 >= 0xFF)
4337 need_dict = 1;
4339 if (need_dict) {
4340 PyObject *result = PyDict_New();
4341 PyObject *key, *value;
4342 if (!result)
4343 return NULL;
4344 for (i = 0; i < 256; i++) {
4345 value = NULL;
4346 key = PyInt_FromLong(decode[i]);
4347 value = PyInt_FromLong(i);
4348 if (!key || !value)
4349 goto failed1;
4350 if (PyDict_SetItem(result, key, value) == -1)
4351 goto failed1;
4352 Py_DECREF(key);
4353 Py_DECREF(value);
4355 return result;
4356 failed1:
4357 Py_XDECREF(key);
4358 Py_XDECREF(value);
4359 Py_DECREF(result);
4360 return NULL;
4363 /* Create a three-level trie */
4364 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4365 16*count2 + 128*count3 - 1);
4366 if (!result)
4367 return PyErr_NoMemory();
4368 PyObject_Init(result, &EncodingMapType);
4369 mresult = (struct encoding_map*)result;
4370 mresult->count2 = count2;
4371 mresult->count3 = count3;
4372 mlevel1 = mresult->level1;
4373 mlevel2 = mresult->level23;
4374 mlevel3 = mresult->level23 + 16*count2;
4375 memcpy(mlevel1, level1, 32);
4376 memset(mlevel2, 0xFF, 16*count2);
4377 memset(mlevel3, 0, 128*count3);
4378 count3 = 0;
4379 for (i = 1; i < 256; i++) {
4380 int o1, o2, o3, i2, i3;
4381 if (decode[i] == 0xFFFE)
4382 /* unmapped character */
4383 continue;
4384 o1 = decode[i]>>11;
4385 o2 = (decode[i]>>7) & 0xF;
4386 i2 = 16*mlevel1[o1] + o2;
4387 if (mlevel2[i2] == 0xFF)
4388 mlevel2[i2] = count3++;
4389 o3 = decode[i] & 0x7F;
4390 i3 = 128*mlevel2[i2] + o3;
4391 mlevel3[i3] = i;
4393 return result;
4396 static int
4397 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4399 struct encoding_map *map = (struct encoding_map*)mapping;
4400 int l1 = c>>11;
4401 int l2 = (c>>7) & 0xF;
4402 int l3 = c & 0x7F;
4403 int i;
4405 #ifdef Py_UNICODE_WIDE
4406 if (c > 0xFFFF) {
4407 return -1;
4409 #endif
4410 if (c == 0)
4411 return 0;
4412 /* level 1*/
4413 i = map->level1[l1];
4414 if (i == 0xFF) {
4415 return -1;
4417 /* level 2*/
4418 i = map->level23[16*i+l2];
4419 if (i == 0xFF) {
4420 return -1;
4422 /* level 3 */
4423 i = map->level23[16*map->count2 + 128*i + l3];
4424 if (i == 0) {
4425 return -1;
4427 return i;
4430 /* Lookup the character ch in the mapping. If the character
4431 can't be found, Py_None is returned (or NULL, if another
4432 error occurred). */
4433 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4435 PyObject *w = PyInt_FromLong((long)c);
4436 PyObject *x;
4438 if (w == NULL)
4439 return NULL;
4440 x = PyObject_GetItem(mapping, w);
4441 Py_DECREF(w);
4442 if (x == NULL) {
4443 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4444 /* No mapping found means: mapping is undefined. */
4445 PyErr_Clear();
4446 x = Py_None;
4447 Py_INCREF(x);
4448 return x;
4449 } else
4450 return NULL;
4452 else if (x == Py_None)
4453 return x;
4454 else if (PyInt_Check(x)) {
4455 long value = PyInt_AS_LONG(x);
4456 if (value < 0 || value > 255) {
4457 PyErr_SetString(PyExc_TypeError,
4458 "character mapping must be in range(256)");
4459 Py_DECREF(x);
4460 return NULL;
4462 return x;
4464 else if (PyString_Check(x))
4465 return x;
4466 else {
4467 /* wrong return value */
4468 PyErr_SetString(PyExc_TypeError,
4469 "character mapping must return integer, None or str");
4470 Py_DECREF(x);
4471 return NULL;
4475 static int
4476 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4478 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4479 /* exponentially overallocate to minimize reallocations */
4480 if (requiredsize < 2*outsize)
4481 requiredsize = 2*outsize;
4482 if (_PyString_Resize(outobj, requiredsize)) {
4483 return 0;
4485 return 1;
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
4491 /* lookup the character, put the result in the output string and adjust
4492 various state variables. Reallocate the output string if not enough
4493 space is available. Return a new reference to the object that
4494 was put in the output buffer, or Py_None, if the mapping was undefined
4495 (in which case no character was written) or NULL, if a
4496 reallocation error occurred. The caller must decref the result */
4497 static
4498 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4499 PyObject **outobj, Py_ssize_t *outpos)
4501 PyObject *rep;
4502 char *outstart;
4503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4505 if (Py_TYPE(mapping) == &EncodingMapType) {
4506 int res = encoding_map_lookup(c, mapping);
4507 Py_ssize_t requiredsize = *outpos+1;
4508 if (res == -1)
4509 return enc_FAILED;
4510 if (outsize<requiredsize)
4511 if (!charmapencode_resize(outobj, outpos, requiredsize))
4512 return enc_EXCEPTION;
4513 outstart = PyString_AS_STRING(*outobj);
4514 outstart[(*outpos)++] = (char)res;
4515 return enc_SUCCESS;
4518 rep = charmapencode_lookup(c, mapping);
4519 if (rep==NULL)
4520 return enc_EXCEPTION;
4521 else if (rep==Py_None) {
4522 Py_DECREF(rep);
4523 return enc_FAILED;
4524 } else {
4525 if (PyInt_Check(rep)) {
4526 Py_ssize_t requiredsize = *outpos+1;
4527 if (outsize<requiredsize)
4528 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4529 Py_DECREF(rep);
4530 return enc_EXCEPTION;
4532 outstart = PyString_AS_STRING(*outobj);
4533 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4535 else {
4536 const char *repchars = PyString_AS_STRING(rep);
4537 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4538 Py_ssize_t requiredsize = *outpos+repsize;
4539 if (outsize<requiredsize)
4540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4541 Py_DECREF(rep);
4542 return enc_EXCEPTION;
4544 outstart = PyString_AS_STRING(*outobj);
4545 memcpy(outstart + *outpos, repchars, repsize);
4546 *outpos += repsize;
4549 Py_DECREF(rep);
4550 return enc_SUCCESS;
4553 /* handle an error in PyUnicode_EncodeCharmap
4554 Return 0 on success, -1 on error */
4555 static
4556 int charmap_encoding_error(
4557 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4558 PyObject **exceptionObject,
4559 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4560 PyObject **res, Py_ssize_t *respos)
4562 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4563 Py_ssize_t repsize;
4564 Py_ssize_t newpos;
4565 Py_UNICODE *uni2;
4566 /* startpos for collecting unencodable chars */
4567 Py_ssize_t collstartpos = *inpos;
4568 Py_ssize_t collendpos = *inpos+1;
4569 Py_ssize_t collpos;
4570 char *encoding = "charmap";
4571 char *reason = "character maps to <undefined>";
4572 charmapencode_result x;
4574 /* find all unencodable characters */
4575 while (collendpos < size) {
4576 PyObject *rep;
4577 if (Py_TYPE(mapping) == &EncodingMapType) {
4578 int res = encoding_map_lookup(p[collendpos], mapping);
4579 if (res != -1)
4580 break;
4581 ++collendpos;
4582 continue;
4585 rep = charmapencode_lookup(p[collendpos], mapping);
4586 if (rep==NULL)
4587 return -1;
4588 else if (rep!=Py_None) {
4589 Py_DECREF(rep);
4590 break;
4592 Py_DECREF(rep);
4593 ++collendpos;
4595 /* cache callback name lookup
4596 * (if not done yet, i.e. it's the first error) */
4597 if (*known_errorHandler==-1) {
4598 if ((errors==NULL) || (!strcmp(errors, "strict")))
4599 *known_errorHandler = 1;
4600 else if (!strcmp(errors, "replace"))
4601 *known_errorHandler = 2;
4602 else if (!strcmp(errors, "ignore"))
4603 *known_errorHandler = 3;
4604 else if (!strcmp(errors, "xmlcharrefreplace"))
4605 *known_errorHandler = 4;
4606 else
4607 *known_errorHandler = 0;
4609 switch (*known_errorHandler) {
4610 case 1: /* strict */
4611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4612 return -1;
4613 case 2: /* replace */
4614 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4615 x = charmapencode_output('?', mapping, res, respos);
4616 if (x==enc_EXCEPTION) {
4617 return -1;
4619 else if (x==enc_FAILED) {
4620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4621 return -1;
4624 /* fall through */
4625 case 3: /* ignore */
4626 *inpos = collendpos;
4627 break;
4628 case 4: /* xmlcharrefreplace */
4629 /* generate replacement (temporarily (mis)uses p) */
4630 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4631 char buffer[2+29+1+1];
4632 char *cp;
4633 sprintf(buffer, "&#%d;", (int)p[collpos]);
4634 for (cp = buffer; *cp; ++cp) {
4635 x = charmapencode_output(*cp, mapping, res, respos);
4636 if (x==enc_EXCEPTION)
4637 return -1;
4638 else if (x==enc_FAILED) {
4639 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4640 return -1;
4644 *inpos = collendpos;
4645 break;
4646 default:
4647 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4648 encoding, reason, p, size, exceptionObject,
4649 collstartpos, collendpos, &newpos);
4650 if (repunicode == NULL)
4651 return -1;
4652 /* generate replacement */
4653 repsize = PyUnicode_GET_SIZE(repunicode);
4654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4655 x = charmapencode_output(*uni2, mapping, res, respos);
4656 if (x==enc_EXCEPTION) {
4657 return -1;
4659 else if (x==enc_FAILED) {
4660 Py_DECREF(repunicode);
4661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4662 return -1;
4665 *inpos = newpos;
4666 Py_DECREF(repunicode);
4668 return 0;
4671 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4672 Py_ssize_t size,
4673 PyObject *mapping,
4674 const char *errors)
4676 /* output object */
4677 PyObject *res = NULL;
4678 /* current input position */
4679 Py_ssize_t inpos = 0;
4680 /* current output position */
4681 Py_ssize_t respos = 0;
4682 PyObject *errorHandler = NULL;
4683 PyObject *exc = NULL;
4684 /* the following variable is used for caching string comparisons
4685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4686 * 3=ignore, 4=xmlcharrefreplace */
4687 int known_errorHandler = -1;
4689 /* Default to Latin-1 */
4690 if (mapping == NULL)
4691 return PyUnicode_EncodeLatin1(p, size, errors);
4693 /* allocate enough for a simple encoding without
4694 replacements, if we need more, we'll resize */
4695 res = PyString_FromStringAndSize(NULL, size);
4696 if (res == NULL)
4697 goto onError;
4698 if (size == 0)
4699 return res;
4701 while (inpos<size) {
4702 /* try to encode it */
4703 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4704 if (x==enc_EXCEPTION) /* error */
4705 goto onError;
4706 if (x==enc_FAILED) { /* unencodable character */
4707 if (charmap_encoding_error(p, size, &inpos, mapping,
4708 &exc,
4709 &known_errorHandler, &errorHandler, errors,
4710 &res, &respos)) {
4711 goto onError;
4714 else
4715 /* done with this character => adjust input position */
4716 ++inpos;
4719 /* Resize if we allocated to much */
4720 if (respos<PyString_GET_SIZE(res)) {
4721 if (_PyString_Resize(&res, respos))
4722 goto onError;
4724 Py_XDECREF(exc);
4725 Py_XDECREF(errorHandler);
4726 return res;
4728 onError:
4729 Py_XDECREF(res);
4730 Py_XDECREF(exc);
4731 Py_XDECREF(errorHandler);
4732 return NULL;
4735 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4736 PyObject *mapping)
4738 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4739 PyErr_BadArgument();
4740 return NULL;
4742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4743 PyUnicode_GET_SIZE(unicode),
4744 mapping,
4745 NULL);
4748 /* create or adjust a UnicodeTranslateError */
4749 static void make_translate_exception(PyObject **exceptionObject,
4750 const Py_UNICODE *unicode, Py_ssize_t size,
4751 Py_ssize_t startpos, Py_ssize_t endpos,
4752 const char *reason)
4754 if (*exceptionObject == NULL) {
4755 *exceptionObject = PyUnicodeTranslateError_Create(
4756 unicode, size, startpos, endpos, reason);
4758 else {
4759 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4760 goto onError;
4761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4762 goto onError;
4763 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4764 goto onError;
4765 return;
4766 onError:
4767 Py_DECREF(*exceptionObject);
4768 *exceptionObject = NULL;
4772 /* raises a UnicodeTranslateError */
4773 static void raise_translate_exception(PyObject **exceptionObject,
4774 const Py_UNICODE *unicode, Py_ssize_t size,
4775 Py_ssize_t startpos, Py_ssize_t endpos,
4776 const char *reason)
4778 make_translate_exception(exceptionObject,
4779 unicode, size, startpos, endpos, reason);
4780 if (*exceptionObject != NULL)
4781 PyCodec_StrictErrors(*exceptionObject);
4784 /* error handling callback helper:
4785 build arguments, call the callback and check the arguments,
4786 put the result into newpos and return the replacement string, which
4787 has to be freed by the caller */
4788 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4789 PyObject **errorHandler,
4790 const char *reason,
4791 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4792 Py_ssize_t startpos, Py_ssize_t endpos,
4793 Py_ssize_t *newpos)
4795 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4797 Py_ssize_t i_newpos;
4798 PyObject *restuple;
4799 PyObject *resunicode;
4801 if (*errorHandler == NULL) {
4802 *errorHandler = PyCodec_LookupError(errors);
4803 if (*errorHandler == NULL)
4804 return NULL;
4807 make_translate_exception(exceptionObject,
4808 unicode, size, startpos, endpos, reason);
4809 if (*exceptionObject == NULL)
4810 return NULL;
4812 restuple = PyObject_CallFunctionObjArgs(
4813 *errorHandler, *exceptionObject, NULL);
4814 if (restuple == NULL)
4815 return NULL;
4816 if (!PyTuple_Check(restuple)) {
4817 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4818 Py_DECREF(restuple);
4819 return NULL;
4821 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4822 &resunicode, &i_newpos)) {
4823 Py_DECREF(restuple);
4824 return NULL;
4826 if (i_newpos<0)
4827 *newpos = size+i_newpos;
4828 else
4829 *newpos = i_newpos;
4830 if (*newpos<0 || *newpos>size) {
4831 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4832 Py_DECREF(restuple);
4833 return NULL;
4835 Py_INCREF(resunicode);
4836 Py_DECREF(restuple);
4837 return resunicode;
4840 /* Lookup the character ch in the mapping and put the result in result,
4841 which must be decrefed by the caller.
4842 Return 0 on success, -1 on error */
4843 static
4844 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4846 PyObject *w = PyInt_FromLong((long)c);
4847 PyObject *x;
4849 if (w == NULL)
4850 return -1;
4851 x = PyObject_GetItem(mapping, w);
4852 Py_DECREF(w);
4853 if (x == NULL) {
4854 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4855 /* No mapping found means: use 1:1 mapping. */
4856 PyErr_Clear();
4857 *result = NULL;
4858 return 0;
4859 } else
4860 return -1;
4862 else if (x == Py_None) {
4863 *result = x;
4864 return 0;
4866 else if (PyInt_Check(x)) {
4867 long value = PyInt_AS_LONG(x);
4868 long max = PyUnicode_GetMax();
4869 if (value < 0 || value > max) {
4870 PyErr_Format(PyExc_TypeError,
4871 "character mapping must be in range(0x%lx)", max+1);
4872 Py_DECREF(x);
4873 return -1;
4875 *result = x;
4876 return 0;
4878 else if (PyUnicode_Check(x)) {
4879 *result = x;
4880 return 0;
4882 else {
4883 /* wrong return value */
4884 PyErr_SetString(PyExc_TypeError,
4885 "character mapping must return integer, None or unicode");
4886 Py_DECREF(x);
4887 return -1;
4890 /* ensure that *outobj is at least requiredsize characters long,
4891 if not reallocate and adjust various state variables.
4892 Return 0 on success, -1 on error */
4893 static
4894 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4895 Py_ssize_t requiredsize)
4897 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4898 if (requiredsize > oldsize) {
4899 /* remember old output position */
4900 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4901 /* exponentially overallocate to minimize reallocations */
4902 if (requiredsize < 2 * oldsize)
4903 requiredsize = 2 * oldsize;
4904 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4905 return -1;
4906 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4908 return 0;
4910 /* lookup the character, put the result in the output string and adjust
4911 various state variables. Return a new reference to the object that
4912 was put in the output buffer in *result, or Py_None, if the mapping was
4913 undefined (in which case no character was written).
4914 The called must decref result.
4915 Return 0 on success, -1 on error. */
4916 static
4917 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4918 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4919 PyObject **res)
4921 if (charmaptranslate_lookup(*curinp, mapping, res))
4922 return -1;
4923 if (*res==NULL) {
4924 /* not found => default to 1:1 mapping */
4925 *(*outp)++ = *curinp;
4927 else if (*res==Py_None)
4929 else if (PyInt_Check(*res)) {
4930 /* no overflow check, because we know that the space is enough */
4931 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4933 else if (PyUnicode_Check(*res)) {
4934 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4935 if (repsize==1) {
4936 /* no overflow check, because we know that the space is enough */
4937 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4939 else if (repsize!=0) {
4940 /* more than one character */
4941 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4942 (insize - (curinp-startinp)) +
4943 repsize - 1;
4944 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4945 return -1;
4946 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4947 *outp += repsize;
4950 else
4951 return -1;
4952 return 0;
4955 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4956 Py_ssize_t size,
4957 PyObject *mapping,
4958 const char *errors)
4960 /* output object */
4961 PyObject *res = NULL;
4962 /* pointers to the beginning and end+1 of input */
4963 const Py_UNICODE *startp = p;
4964 const Py_UNICODE *endp = p + size;
4965 /* pointer into the output */
4966 Py_UNICODE *str;
4967 /* current output position */
4968 Py_ssize_t respos = 0;
4969 char *reason = "character maps to <undefined>";
4970 PyObject *errorHandler = NULL;
4971 PyObject *exc = NULL;
4972 /* the following variable is used for caching string comparisons
4973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4974 * 3=ignore, 4=xmlcharrefreplace */
4975 int known_errorHandler = -1;
4977 if (mapping == NULL) {
4978 PyErr_BadArgument();
4979 return NULL;
4982 /* allocate enough for a simple 1:1 translation without
4983 replacements, if we need more, we'll resize */
4984 res = PyUnicode_FromUnicode(NULL, size);
4985 if (res == NULL)
4986 goto onError;
4987 if (size == 0)
4988 return res;
4989 str = PyUnicode_AS_UNICODE(res);
4991 while (p<endp) {
4992 /* try to encode it */
4993 PyObject *x = NULL;
4994 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4995 Py_XDECREF(x);
4996 goto onError;
4998 Py_XDECREF(x);
4999 if (x!=Py_None) /* it worked => adjust input pointer */
5000 ++p;
5001 else { /* untranslatable character */
5002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5003 Py_ssize_t repsize;
5004 Py_ssize_t newpos;
5005 Py_UNICODE *uni2;
5006 /* startpos for collecting untranslatable chars */
5007 const Py_UNICODE *collstart = p;
5008 const Py_UNICODE *collend = p+1;
5009 const Py_UNICODE *coll;
5011 /* find all untranslatable characters */
5012 while (collend < endp) {
5013 if (charmaptranslate_lookup(*collend, mapping, &x))
5014 goto onError;
5015 Py_XDECREF(x);
5016 if (x!=Py_None)
5017 break;
5018 ++collend;
5020 /* cache callback name lookup
5021 * (if not done yet, i.e. it's the first error) */
5022 if (known_errorHandler==-1) {
5023 if ((errors==NULL) || (!strcmp(errors, "strict")))
5024 known_errorHandler = 1;
5025 else if (!strcmp(errors, "replace"))
5026 known_errorHandler = 2;
5027 else if (!strcmp(errors, "ignore"))
5028 known_errorHandler = 3;
5029 else if (!strcmp(errors, "xmlcharrefreplace"))
5030 known_errorHandler = 4;
5031 else
5032 known_errorHandler = 0;
5034 switch (known_errorHandler) {
5035 case 1: /* strict */
5036 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5037 goto onError;
5038 case 2: /* replace */
5039 /* No need to check for space, this is a 1:1 replacement */
5040 for (coll = collstart; coll<collend; ++coll)
5041 *str++ = '?';
5042 /* fall through */
5043 case 3: /* ignore */
5044 p = collend;
5045 break;
5046 case 4: /* xmlcharrefreplace */
5047 /* generate replacement (temporarily (mis)uses p) */
5048 for (p = collstart; p < collend; ++p) {
5049 char buffer[2+29+1+1];
5050 char *cp;
5051 sprintf(buffer, "&#%d;", (int)*p);
5052 if (charmaptranslate_makespace(&res, &str,
5053 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5054 goto onError;
5055 for (cp = buffer; *cp; ++cp)
5056 *str++ = *cp;
5058 p = collend;
5059 break;
5060 default:
5061 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5062 reason, startp, size, &exc,
5063 collstart-startp, collend-startp, &newpos);
5064 if (repunicode == NULL)
5065 goto onError;
5066 /* generate replacement */
5067 repsize = PyUnicode_GET_SIZE(repunicode);
5068 if (charmaptranslate_makespace(&res, &str,
5069 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5070 Py_DECREF(repunicode);
5071 goto onError;
5073 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5074 *str++ = *uni2;
5075 p = startp + newpos;
5076 Py_DECREF(repunicode);
5080 /* Resize if we allocated to much */
5081 respos = str-PyUnicode_AS_UNICODE(res);
5082 if (respos<PyUnicode_GET_SIZE(res)) {
5083 if (PyUnicode_Resize(&res, respos) < 0)
5084 goto onError;
5086 Py_XDECREF(exc);
5087 Py_XDECREF(errorHandler);
5088 return res;
5090 onError:
5091 Py_XDECREF(res);
5092 Py_XDECREF(exc);
5093 Py_XDECREF(errorHandler);
5094 return NULL;
5097 PyObject *PyUnicode_Translate(PyObject *str,
5098 PyObject *mapping,
5099 const char *errors)
5101 PyObject *result;
5103 str = PyUnicode_FromObject(str);
5104 if (str == NULL)
5105 goto onError;
5106 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5107 PyUnicode_GET_SIZE(str),
5108 mapping,
5109 errors);
5110 Py_DECREF(str);
5111 return result;
5113 onError:
5114 Py_XDECREF(str);
5115 return NULL;
5118 /* --- Decimal Encoder ---------------------------------------------------- */
5120 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5121 Py_ssize_t length,
5122 char *output,
5123 const char *errors)
5125 Py_UNICODE *p, *end;
5126 PyObject *errorHandler = NULL;
5127 PyObject *exc = NULL;
5128 const char *encoding = "decimal";
5129 const char *reason = "invalid decimal Unicode string";
5130 /* the following variable is used for caching string comparisons
5131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5132 int known_errorHandler = -1;
5134 if (output == NULL) {
5135 PyErr_BadArgument();
5136 return -1;
5139 p = s;
5140 end = s + length;
5141 while (p < end) {
5142 register Py_UNICODE ch = *p;
5143 int decimal;
5144 PyObject *repunicode;
5145 Py_ssize_t repsize;
5146 Py_ssize_t newpos;
5147 Py_UNICODE *uni2;
5148 Py_UNICODE *collstart;
5149 Py_UNICODE *collend;
5151 if (Py_UNICODE_ISSPACE(ch)) {
5152 *output++ = ' ';
5153 ++p;
5154 continue;
5156 decimal = Py_UNICODE_TODECIMAL(ch);
5157 if (decimal >= 0) {
5158 *output++ = '0' + decimal;
5159 ++p;
5160 continue;
5162 if (0 < ch && ch < 256) {
5163 *output++ = (char)ch;
5164 ++p;
5165 continue;
5167 /* All other characters are considered unencodable */
5168 collstart = p;
5169 collend = p+1;
5170 while (collend < end) {
5171 if ((0 < *collend && *collend < 256) ||
5172 !Py_UNICODE_ISSPACE(*collend) ||
5173 Py_UNICODE_TODECIMAL(*collend))
5174 break;
5176 /* cache callback name lookup
5177 * (if not done yet, i.e. it's the first error) */
5178 if (known_errorHandler==-1) {
5179 if ((errors==NULL) || (!strcmp(errors, "strict")))
5180 known_errorHandler = 1;
5181 else if (!strcmp(errors, "replace"))
5182 known_errorHandler = 2;
5183 else if (!strcmp(errors, "ignore"))
5184 known_errorHandler = 3;
5185 else if (!strcmp(errors, "xmlcharrefreplace"))
5186 known_errorHandler = 4;
5187 else
5188 known_errorHandler = 0;
5190 switch (known_errorHandler) {
5191 case 1: /* strict */
5192 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5193 goto onError;
5194 case 2: /* replace */
5195 for (p = collstart; p < collend; ++p)
5196 *output++ = '?';
5197 /* fall through */
5198 case 3: /* ignore */
5199 p = collend;
5200 break;
5201 case 4: /* xmlcharrefreplace */
5202 /* generate replacement (temporarily (mis)uses p) */
5203 for (p = collstart; p < collend; ++p)
5204 output += sprintf(output, "&#%d;", (int)*p);
5205 p = collend;
5206 break;
5207 default:
5208 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5209 encoding, reason, s, length, &exc,
5210 collstart-s, collend-s, &newpos);
5211 if (repunicode == NULL)
5212 goto onError;
5213 /* generate replacement */
5214 repsize = PyUnicode_GET_SIZE(repunicode);
5215 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5216 Py_UNICODE ch = *uni2;
5217 if (Py_UNICODE_ISSPACE(ch))
5218 *output++ = ' ';
5219 else {
5220 decimal = Py_UNICODE_TODECIMAL(ch);
5221 if (decimal >= 0)
5222 *output++ = '0' + decimal;
5223 else if (0 < ch && ch < 256)
5224 *output++ = (char)ch;
5225 else {
5226 Py_DECREF(repunicode);
5227 raise_encode_exception(&exc, encoding,
5228 s, length, collstart-s, collend-s, reason);
5229 goto onError;
5233 p = s + newpos;
5234 Py_DECREF(repunicode);
5237 /* 0-terminate the output string */
5238 *output++ = '\0';
5239 Py_XDECREF(exc);
5240 Py_XDECREF(errorHandler);
5241 return 0;
5243 onError:
5244 Py_XDECREF(exc);
5245 Py_XDECREF(errorHandler);
5246 return -1;
5249 /* --- Helpers ------------------------------------------------------------ */
5251 #include "stringlib/unicodedefs.h"
5252 #include "stringlib/fastsearch.h"
5254 #include "stringlib/count.h"
5255 #include "stringlib/find.h"
5256 #include "stringlib/partition.h"
5257 #include "stringlib/split.h"
/* Helper macro to fix up start/end slice values in place.

   end is clipped to len; a negative end or start is taken relative to
   len and clipped to 0.  start and end must be modifiable lvalues and
   are evaluated more than once.  The body is wrapped in do/while(0) so
   that the macro behaves as a single statement (e.g. under an unbraced
   "if"); invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5274 Py_ssize_t PyUnicode_Count(PyObject *str,
5275 PyObject *substr,
5276 Py_ssize_t start,
5277 Py_ssize_t end)
5279 Py_ssize_t result;
5280 PyUnicodeObject* str_obj;
5281 PyUnicodeObject* sub_obj;
5283 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5284 if (!str_obj)
5285 return -1;
5286 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5287 if (!sub_obj) {
5288 Py_DECREF(str_obj);
5289 return -1;
5292 ADJUST_INDICES(start, end, str_obj->length);
5293 result = stringlib_count(
5294 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5295 PY_SSIZE_T_MAX
5298 Py_DECREF(sub_obj);
5299 Py_DECREF(str_obj);
5301 return result;
5304 Py_ssize_t PyUnicode_Find(PyObject *str,
5305 PyObject *sub,
5306 Py_ssize_t start,
5307 Py_ssize_t end,
5308 int direction)
5310 Py_ssize_t result;
5312 str = PyUnicode_FromObject(str);
5313 if (!str)
5314 return -2;
5315 sub = PyUnicode_FromObject(sub);
5316 if (!sub) {
5317 Py_DECREF(str);
5318 return -2;
5321 if (direction > 0)
5322 result = stringlib_find_slice(
5323 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5324 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5325 start, end
5327 else
5328 result = stringlib_rfind_slice(
5329 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5330 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5331 start, end
5334 Py_DECREF(str);
5335 Py_DECREF(sub);
5337 return result;
5340 static
5341 int tailmatch(PyUnicodeObject *self,
5342 PyUnicodeObject *substring,
5343 Py_ssize_t start,
5344 Py_ssize_t end,
5345 int direction)
5347 if (substring->length == 0)
5348 return 1;
5350 ADJUST_INDICES(start, end, self->length);
5351 end -= substring->length;
5352 if (end < start)
5353 return 0;
5355 if (direction > 0) {
5356 if (Py_UNICODE_MATCH(self, end, substring))
5357 return 1;
5358 } else {
5359 if (Py_UNICODE_MATCH(self, start, substring))
5360 return 1;
5363 return 0;
5366 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5367 PyObject *substr,
5368 Py_ssize_t start,
5369 Py_ssize_t end,
5370 int direction)
5372 Py_ssize_t result;
5374 str = PyUnicode_FromObject(str);
5375 if (str == NULL)
5376 return -1;
5377 substr = PyUnicode_FromObject(substr);
5378 if (substr == NULL) {
5379 Py_DECREF(str);
5380 return -1;
5383 result = tailmatch((PyUnicodeObject *)str,
5384 (PyUnicodeObject *)substr,
5385 start, end, direction);
5386 Py_DECREF(str);
5387 Py_DECREF(substr);
5388 return result;
5391 /* Apply fixfct filter to the Unicode object self and return a
5392 reference to the modified object */
5394 static
5395 PyObject *fixup(PyUnicodeObject *self,
5396 int (*fixfct)(PyUnicodeObject *s))
5399 PyUnicodeObject *u;
5401 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5402 if (u == NULL)
5403 return NULL;
5405 Py_UNICODE_COPY(u->str, self->str, self->length);
5407 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5408 /* fixfct should return TRUE if it modified the buffer. If
5409 FALSE, return a reference to the original buffer instead
5410 (to save space, not time) */
5411 Py_INCREF(self);
5412 Py_DECREF(u);
5413 return (PyObject*) self;
5415 return (PyObject*) u;
5418 static
5419 int fixupper(PyUnicodeObject *self)
5421 Py_ssize_t len = self->length;
5422 Py_UNICODE *s = self->str;
5423 int status = 0;
5425 while (len-- > 0) {
5426 register Py_UNICODE ch;
5428 ch = Py_UNICODE_TOUPPER(*s);
5429 if (ch != *s) {
5430 status = 1;
5431 *s = ch;
5433 s++;
5436 return status;
5439 static
5440 int fixlower(PyUnicodeObject *self)
5442 Py_ssize_t len = self->length;
5443 Py_UNICODE *s = self->str;
5444 int status = 0;
5446 while (len-- > 0) {
5447 register Py_UNICODE ch;
5449 ch = Py_UNICODE_TOLOWER(*s);
5450 if (ch != *s) {
5451 status = 1;
5452 *s = ch;
5454 s++;
5457 return status;
5460 static
5461 int fixswapcase(PyUnicodeObject *self)
5463 Py_ssize_t len = self->length;
5464 Py_UNICODE *s = self->str;
5465 int status = 0;
5467 while (len-- > 0) {
5468 if (Py_UNICODE_ISUPPER(*s)) {
5469 *s = Py_UNICODE_TOLOWER(*s);
5470 status = 1;
5471 } else if (Py_UNICODE_ISLOWER(*s)) {
5472 *s = Py_UNICODE_TOUPPER(*s);
5473 status = 1;
5475 s++;
5478 return status;
5481 static
5482 int fixcapitalize(PyUnicodeObject *self)
5484 Py_ssize_t len = self->length;
5485 Py_UNICODE *s = self->str;
5486 int status = 0;
5488 if (len == 0)
5489 return 0;
5490 if (Py_UNICODE_ISLOWER(*s)) {
5491 *s = Py_UNICODE_TOUPPER(*s);
5492 status = 1;
5494 s++;
5495 while (--len > 0) {
5496 if (Py_UNICODE_ISUPPER(*s)) {
5497 *s = Py_UNICODE_TOLOWER(*s);
5498 status = 1;
5500 s++;
5502 return status;
5505 static
5506 int fixtitle(PyUnicodeObject *self)
5508 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5509 register Py_UNICODE *e;
5510 int previous_is_cased;
5512 /* Shortcut for single character strings */
5513 if (PyUnicode_GET_SIZE(self) == 1) {
5514 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5515 if (*p != ch) {
5516 *p = ch;
5517 return 1;
5519 else
5520 return 0;
5523 e = p + PyUnicode_GET_SIZE(self);
5524 previous_is_cased = 0;
5525 for (; p < e; p++) {
5526 register const Py_UNICODE ch = *p;
5528 if (previous_is_cased)
5529 *p = Py_UNICODE_TOLOWER(ch);
5530 else
5531 *p = Py_UNICODE_TOTITLE(ch);
5533 if (Py_UNICODE_ISLOWER(ch) ||
5534 Py_UNICODE_ISUPPER(ch) ||
5535 Py_UNICODE_ISTITLE(ch))
5536 previous_is_cased = 1;
5537 else
5538 previous_is_cased = 0;
5540 return 1;
5543 PyObject *
5544 PyUnicode_Join(PyObject *separator, PyObject *seq)
5546 PyObject *internal_separator = NULL;
5547 const Py_UNICODE blank = ' ';
5548 const Py_UNICODE *sep = &blank;
5549 Py_ssize_t seplen = 1;
5550 PyUnicodeObject *res = NULL; /* the result */
5551 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5552 Py_ssize_t res_used; /* # used bytes */
5553 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5554 PyObject *fseq; /* PySequence_Fast(seq) */
5555 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5556 PyObject *item;
5557 Py_ssize_t i;
5559 fseq = PySequence_Fast(seq, "");
5560 if (fseq == NULL) {
5561 return NULL;
5564 /* Grrrr. A codec may be invoked to convert str objects to
5565 * Unicode, and so it's possible to call back into Python code
5566 * during PyUnicode_FromObject(), and so it's possible for a sick
5567 * codec to change the size of fseq (if seq is a list). Therefore
5568 * we have to keep refetching the size -- can't assume seqlen
5569 * is invariant.
5571 seqlen = PySequence_Fast_GET_SIZE(fseq);
5572 /* If empty sequence, return u"". */
5573 if (seqlen == 0) {
5574 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5575 goto Done;
5577 /* If singleton sequence with an exact Unicode, return that. */
5578 if (seqlen == 1) {
5579 item = PySequence_Fast_GET_ITEM(fseq, 0);
5580 if (PyUnicode_CheckExact(item)) {
5581 Py_INCREF(item);
5582 res = (PyUnicodeObject *)item;
5583 goto Done;
5587 /* At least two items to join, or one that isn't exact Unicode. */
5588 if (seqlen > 1) {
5589 /* Set up sep and seplen -- they're needed. */
5590 if (separator == NULL) {
5591 sep = &blank;
5592 seplen = 1;
5594 else {
5595 internal_separator = PyUnicode_FromObject(separator);
5596 if (internal_separator == NULL)
5597 goto onError;
5598 sep = PyUnicode_AS_UNICODE(internal_separator);
5599 seplen = PyUnicode_GET_SIZE(internal_separator);
5600 /* In case PyUnicode_FromObject() mutated seq. */
5601 seqlen = PySequence_Fast_GET_SIZE(fseq);
5605 /* Get space. */
5606 res = _PyUnicode_New(res_alloc);
5607 if (res == NULL)
5608 goto onError;
5609 res_p = PyUnicode_AS_UNICODE(res);
5610 res_used = 0;
5612 for (i = 0; i < seqlen; ++i) {
5613 Py_ssize_t itemlen;
5614 Py_ssize_t new_res_used;
5616 item = PySequence_Fast_GET_ITEM(fseq, i);
5617 /* Convert item to Unicode. */
5618 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5619 PyErr_Format(PyExc_TypeError,
5620 "sequence item %zd: expected string or Unicode,"
5621 " %.80s found",
5622 i, Py_TYPE(item)->tp_name);
5623 goto onError;
5625 item = PyUnicode_FromObject(item);
5626 if (item == NULL)
5627 goto onError;
5628 /* We own a reference to item from here on. */
5630 /* In case PyUnicode_FromObject() mutated seq. */
5631 seqlen = PySequence_Fast_GET_SIZE(fseq);
5633 /* Make sure we have enough space for the separator and the item. */
5634 itemlen = PyUnicode_GET_SIZE(item);
5635 new_res_used = res_used + itemlen;
5636 if (new_res_used < 0)
5637 goto Overflow;
5638 if (i < seqlen - 1) {
5639 new_res_used += seplen;
5640 if (new_res_used < 0)
5641 goto Overflow;
5643 if (new_res_used > res_alloc) {
5644 /* double allocated size until it's big enough */
5645 do {
5646 res_alloc += res_alloc;
5647 if (res_alloc <= 0)
5648 goto Overflow;
5649 } while (new_res_used > res_alloc);
5650 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5651 Py_DECREF(item);
5652 goto onError;
5654 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5657 /* Copy item, and maybe the separator. */
5658 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5659 res_p += itemlen;
5660 if (i < seqlen - 1) {
5661 Py_UNICODE_COPY(res_p, sep, seplen);
5662 res_p += seplen;
5664 Py_DECREF(item);
5665 res_used = new_res_used;
5668 /* Shrink res to match the used area; this probably can't fail,
5669 * but it's cheap to check.
5671 if (_PyUnicode_Resize(&res, res_used) < 0)
5672 goto onError;
5674 Done:
5675 Py_XDECREF(internal_separator);
5676 Py_DECREF(fseq);
5677 return (PyObject *)res;
5679 Overflow:
5680 PyErr_SetString(PyExc_OverflowError,
5681 "join() result is too long for a Python string");
5682 Py_DECREF(item);
5683 /* fall through */
5685 onError:
5686 Py_XDECREF(internal_separator);
5687 Py_DECREF(fseq);
5688 Py_XDECREF(res);
5689 return NULL;
5692 static
5693 PyUnicodeObject *pad(PyUnicodeObject *self,
5694 Py_ssize_t left,
5695 Py_ssize_t right,
5696 Py_UNICODE fill)
5698 PyUnicodeObject *u;
5700 if (left < 0)
5701 left = 0;
5702 if (right < 0)
5703 right = 0;
5705 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5706 Py_INCREF(self);
5707 return self;
5710 if (left > PY_SSIZE_T_MAX - self->length ||
5711 right > PY_SSIZE_T_MAX - (left + self->length)) {
5712 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5713 return NULL;
5715 u = _PyUnicode_New(left + self->length + right);
5716 if (u) {
5717 if (left)
5718 Py_UNICODE_FILL(u->str, fill, left);
5719 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5720 if (right)
5721 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5724 return u;
5727 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5729 PyObject *list;
5731 string = PyUnicode_FromObject(string);
5732 if (string == NULL)
5733 return NULL;
5735 list = stringlib_splitlines(
5736 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5737 PyUnicode_GET_SIZE(string), keepends);
5739 Py_DECREF(string);
5740 return list;
5743 static
5744 PyObject *split(PyUnicodeObject *self,
5745 PyUnicodeObject *substring,
5746 Py_ssize_t maxcount)
5748 if (maxcount < 0)
5749 maxcount = PY_SSIZE_T_MAX;
5751 if (substring == NULL)
5752 return stringlib_split_whitespace(
5753 (PyObject*) self, self->str, self->length, maxcount
5756 return stringlib_split(
5757 (PyObject*) self, self->str, self->length,
5758 substring->str, substring->length,
5759 maxcount
5763 static
5764 PyObject *rsplit(PyUnicodeObject *self,
5765 PyUnicodeObject *substring,
5766 Py_ssize_t maxcount)
5768 if (maxcount < 0)
5769 maxcount = PY_SSIZE_T_MAX;
5771 if (substring == NULL)
5772 return stringlib_rsplit_whitespace(
5773 (PyObject*) self, self->str, self->length, maxcount
5776 return stringlib_rsplit(
5777 (PyObject*) self, self->str, self->length,
5778 substring->str, substring->length,
5779 maxcount
5783 static
5784 PyObject *replace(PyUnicodeObject *self,
5785 PyUnicodeObject *str1,
5786 PyUnicodeObject *str2,
5787 Py_ssize_t maxcount)
5789 PyUnicodeObject *u;
5791 if (maxcount < 0)
5792 maxcount = PY_SSIZE_T_MAX;
5793 else if (maxcount == 0 || self->length == 0)
5794 goto nothing;
5796 if (str1->length == str2->length) {
5797 Py_ssize_t i;
5798 /* same length */
5799 if (str1->length == 0)
5800 goto nothing;
5801 if (str1->length == 1) {
5802 /* replace characters */
5803 Py_UNICODE u1, u2;
5804 if (!findchar(self->str, self->length, str1->str[0]))
5805 goto nothing;
5806 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5807 if (!u)
5808 return NULL;
5809 Py_UNICODE_COPY(u->str, self->str, self->length);
5810 u1 = str1->str[0];
5811 u2 = str2->str[0];
5812 for (i = 0; i < u->length; i++)
5813 if (u->str[i] == u1) {
5814 if (--maxcount < 0)
5815 break;
5816 u->str[i] = u2;
5818 } else {
5819 i = stringlib_find(
5820 self->str, self->length, str1->str, str1->length, 0
5822 if (i < 0)
5823 goto nothing;
5824 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5825 if (!u)
5826 return NULL;
5827 Py_UNICODE_COPY(u->str, self->str, self->length);
5829 /* change everything in-place, starting with this one */
5830 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5831 i += str1->length;
5833 while ( --maxcount > 0) {
5834 i = stringlib_find(self->str+i, self->length-i,
5835 str1->str, str1->length,
5837 if (i == -1)
5838 break;
5839 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5840 i += str1->length;
5843 } else {
5845 Py_ssize_t n, i, j;
5846 Py_ssize_t product, new_size, delta;
5847 Py_UNICODE *p;
5849 /* replace strings */
5850 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5851 maxcount);
5852 if (n == 0)
5853 goto nothing;
5854 /* new_size = self->length + n * (str2->length - str1->length)); */
5855 delta = (str2->length - str1->length);
5856 if (delta == 0) {
5857 new_size = self->length;
5858 } else {
5859 product = n * (str2->length - str1->length);
5860 if ((product / (str2->length - str1->length)) != n) {
5861 PyErr_SetString(PyExc_OverflowError,
5862 "replace string is too long");
5863 return NULL;
5865 new_size = self->length + product;
5866 if (new_size < 0) {
5867 PyErr_SetString(PyExc_OverflowError,
5868 "replace string is too long");
5869 return NULL;
5872 u = _PyUnicode_New(new_size);
5873 if (!u)
5874 return NULL;
5875 i = 0;
5876 p = u->str;
5877 if (str1->length > 0) {
5878 while (n-- > 0) {
5879 /* look for next match */
5880 j = stringlib_find(self->str+i, self->length-i,
5881 str1->str, str1->length,
5883 if (j == -1)
5884 break;
5885 else if (j > i) {
5886 /* copy unchanged part [i:j] */
5887 Py_UNICODE_COPY(p, self->str+i, j-i);
5888 p += j - i;
5890 /* copy substitution string */
5891 if (str2->length > 0) {
5892 Py_UNICODE_COPY(p, str2->str, str2->length);
5893 p += str2->length;
5895 i = j + str1->length;
5897 if (i < self->length)
5898 /* copy tail [i:] */
5899 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5900 } else {
5901 /* interleave */
5902 while (n > 0) {
5903 Py_UNICODE_COPY(p, str2->str, str2->length);
5904 p += str2->length;
5905 if (--n <= 0)
5906 break;
5907 *p++ = self->str[i++];
5909 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5912 return (PyObject *) u;
5914 nothing:
5915 /* nothing to replace; return original string (when possible) */
5916 if (PyUnicode_CheckExact(self)) {
5917 Py_INCREF(self);
5918 return (PyObject *) self;
5920 return PyUnicode_FromUnicode(self->str, self->length);
5923 /* --- Unicode Object Methods --------------------------------------------- */
5925 PyDoc_STRVAR(title__doc__,
5926 "S.title() -> unicode\n\
5928 Return a titlecased version of S, i.e. words start with title case\n\
5929 characters, all remaining cased characters have lower case.");
5931 static PyObject*
5932 unicode_title(PyUnicodeObject *self)
5934 return fixup(self, fixtitle);
5937 PyDoc_STRVAR(capitalize__doc__,
5938 "S.capitalize() -> unicode\n\
5940 Return a capitalized version of S, i.e. make the first character\n\
5941 have upper case.");
5943 static PyObject*
5944 unicode_capitalize(PyUnicodeObject *self)
5946 return fixup(self, fixcapitalize);
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word in the list and join
   the words again with the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);

        item = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

  onError:
    /* item is NULL here if capitalizing or joining failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
5987 /* Argument converter. Coerces to a single unicode character */
5989 static int
5990 convert_uc(PyObject *obj, void *addr)
5992 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5993 PyObject *uniobj;
5994 Py_UNICODE *unistr;
5996 uniobj = PyUnicode_FromObject(obj);
5997 if (uniobj == NULL) {
5998 PyErr_SetString(PyExc_TypeError,
5999 "The fill character cannot be converted to Unicode");
6000 return 0;
6002 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6003 PyErr_SetString(PyExc_TypeError,
6004 "The fill character must be exactly one character long");
6005 Py_DECREF(uniobj);
6006 return 0;
6008 unistr = PyUnicode_AS_UNICODE(uniobj);
6009 *fillcharloc = unistr[0];
6010 Py_DECREF(uniobj);
6011 return 1;
6014 PyDoc_STRVAR(center__doc__,
6015 "S.center(width[, fillchar]) -> unicode\n\
6017 Return S centered in a Unicode string of length width. Padding is\n\
6018 done using the specified fill character (default is a space)");
6020 static PyObject *
6021 unicode_center(PyUnicodeObject *self, PyObject *args)
6023 Py_ssize_t marg, left;
6024 Py_ssize_t width;
6025 Py_UNICODE fillchar = ' ';
6027 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6028 return NULL;
6030 if (self->length >= width && PyUnicode_CheckExact(self)) {
6031 Py_INCREF(self);
6032 return (PyObject*) self;
6035 marg = width - self->length;
6036 left = marg / 2 + (marg & width & 1);
6038 return (PyObject*) pad(self, left, marg - left, fillchar);
6041 #if 0
6043 /* This code should go into some future Unicode collation support
6044 module. The basic comparison should compare ordinals on a naive
6045 basis (this is what Java does and thus Jython too). */
6047 /* speedy UTF-16 code point order comparison */
6048 /* gleaned from: */
6049 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6051 static short utf16Fixup[32] =
6053 0, 0, 0, 0, 0, 0, 0, 0,
6054 0, 0, 0, 0, 0, 0, 0, 0,
6055 0, 0, 0, 0, 0, 0, 0, 0,
6056 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6059 static int
6060 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6062 Py_ssize_t len1, len2;
6064 Py_UNICODE *s1 = str1->str;
6065 Py_UNICODE *s2 = str2->str;
6067 len1 = str1->length;
6068 len2 = str2->length;
6070 while (len1 > 0 && len2 > 0) {
6071 Py_UNICODE c1, c2;
6073 c1 = *s1++;
6074 c2 = *s2++;
6076 if (c1 > (1<<11) * 26)
6077 c1 += utf16Fixup[c1>>11];
6078 if (c2 > (1<<11) * 26)
6079 c2 += utf16Fixup[c2>>11];
6080 /* now c1 and c2 are in UTF-32-compatible order */
6082 if (c1 != c2)
6083 return (c1 < c2) ? -1 : 1;
6085 len1--; len2--;
6088 return (len1 < len2) ? -1 : (len1 != len2);
6091 #else
6093 static int
6094 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6096 register Py_ssize_t len1, len2;
6098 Py_UNICODE *s1 = str1->str;
6099 Py_UNICODE *s2 = str2->str;
6101 len1 = str1->length;
6102 len2 = str2->length;
6104 while (len1 > 0 && len2 > 0) {
6105 Py_UNICODE c1, c2;
6107 c1 = *s1++;
6108 c2 = *s2++;
6110 if (c1 != c2)
6111 return (c1 < c2) ? -1 : 1;
6113 len1--; len2--;
6116 return (len1 < len2) ? -1 : (len1 != len2);
6119 #endif
6121 int PyUnicode_Compare(PyObject *left,
6122 PyObject *right)
6124 PyUnicodeObject *u = NULL, *v = NULL;
6125 int result;
6127 /* Coerce the two arguments */
6128 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6129 if (u == NULL)
6130 goto onError;
6131 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6132 if (v == NULL)
6133 goto onError;
6135 /* Shortcut for empty or interned objects */
6136 if (v == u) {
6137 Py_DECREF(u);
6138 Py_DECREF(v);
6139 return 0;
6142 result = unicode_compare(u, v);
6144 Py_DECREF(u);
6145 Py_DECREF(v);
6146 return result;
6148 onError:
6149 Py_XDECREF(u);
6150 Py_XDECREF(v);
6151 return -1;
6154 PyObject *PyUnicode_RichCompare(PyObject *left,
6155 PyObject *right,
6156 int op)
6158 int result;
6160 result = PyUnicode_Compare(left, right);
6161 if (result == -1 && PyErr_Occurred())
6162 goto onError;
6164 /* Convert the return value to a Boolean */
6165 switch (op) {
6166 case Py_EQ:
6167 result = (result == 0);
6168 break;
6169 case Py_NE:
6170 result = (result != 0);
6171 break;
6172 case Py_LE:
6173 result = (result <= 0);
6174 break;
6175 case Py_GE:
6176 result = (result >= 0);
6177 break;
6178 case Py_LT:
6179 result = (result == -1);
6180 break;
6181 case Py_GT:
6182 result = (result == 1);
6183 break;
6185 return PyBool_FromLong(result);
6187 onError:
6189 /* Standard case
6191 Type errors mean that PyUnicode_FromObject() could not convert
6192 one of the arguments (usually the right hand side) to Unicode,
6193 ie. we can't handle the comparison request. However, it is
6194 possible that the other object knows a comparison method, which
6195 is why we return Py_NotImplemented to give the other object a
6196 chance.
6199 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6200 PyErr_Clear();
6201 Py_INCREF(Py_NotImplemented);
6202 return Py_NotImplemented;
6204 if (op != Py_EQ && op != Py_NE)
6205 return NULL;
6207 /* Equality comparison.
6209 This is a special case: we silence any PyExc_UnicodeDecodeError
6210 and instead turn it into a PyErr_UnicodeWarning.
6213 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6214 return NULL;
6215 PyErr_Clear();
6216 if (PyErr_Warn(PyExc_UnicodeWarning,
6217 (op == Py_EQ) ?
6218 "Unicode equal comparison "
6219 "failed to convert both arguments to Unicode - "
6220 "interpreting them as being unequal" :
6221 "Unicode unequal comparison "
6222 "failed to convert both arguments to Unicode - "
6223 "interpreting them as being unequal"
6224 ) < 0)
6225 return NULL;
6226 result = (op == Py_NE);
6227 return PyBool_FromLong(result);
6230 int PyUnicode_Contains(PyObject *container,
6231 PyObject *element)
6233 PyObject *str, *sub;
6234 int result;
6236 /* Coerce the two arguments */
6237 sub = PyUnicode_FromObject(element);
6238 if (!sub) {
6239 return -1;
6242 str = PyUnicode_FromObject(container);
6243 if (!str) {
6244 Py_DECREF(sub);
6245 return -1;
6248 result = stringlib_contains_obj(str, sub);
6250 Py_DECREF(str);
6251 Py_DECREF(sub);
6253 return result;
6256 /* Concat to string or Unicode object giving a new Unicode object. */
6258 PyObject *PyUnicode_Concat(PyObject *left,
6259 PyObject *right)
6261 PyUnicodeObject *u = NULL, *v = NULL, *w;
6263 /* Coerce the two arguments */
6264 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6265 if (u == NULL)
6266 goto onError;
6267 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6268 if (v == NULL)
6269 goto onError;
6271 /* Shortcuts */
6272 if (v == unicode_empty) {
6273 Py_DECREF(v);
6274 return (PyObject *)u;
6276 if (u == unicode_empty) {
6277 Py_DECREF(u);
6278 return (PyObject *)v;
6281 /* Concat the two Unicode strings */
6282 w = _PyUnicode_New(u->length + v->length);
6283 if (w == NULL)
6284 goto onError;
6285 Py_UNICODE_COPY(w->str, u->str, u->length);
6286 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6288 Py_DECREF(u);
6289 Py_DECREF(v);
6290 return (PyObject *)w;
6292 onError:
6293 Py_XDECREF(u);
6294 Py_XDECREF(v);
6295 return NULL;
6298 PyDoc_STRVAR(count__doc__,
6299 "S.count(sub[, start[, end]]) -> int\n\
6301 Return the number of non-overlapping occurrences of substring sub in\n\
6302 Unicode string S[start:end]. Optional arguments start and end are\n\
6303 interpreted as in slice notation.");
6305 static PyObject *
6306 unicode_count(PyUnicodeObject *self, PyObject *args)
6308 PyUnicodeObject *substring;
6309 Py_ssize_t start = 0;
6310 Py_ssize_t end = PY_SSIZE_T_MAX;
6311 PyObject *result;
6313 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6314 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6315 return NULL;
6317 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6318 (PyObject *)substring);
6319 if (substring == NULL)
6320 return NULL;
6322 ADJUST_INDICES(start, end, self->length);
6323 result = PyInt_FromSsize_t(
6324 stringlib_count(self->str + start, end - start,
6325 substring->str, substring->length,
6326 PY_SSIZE_T_MAX)
6329 Py_DECREF(substring);
6331 return result;
6334 PyDoc_STRVAR(encode__doc__,
6335 "S.encode([encoding[,errors]]) -> string or unicode\n\
6337 Encodes S using the codec registered for encoding. encoding defaults\n\
6338 to the default encoding. errors may be given to set a different error\n\
6339 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6340 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6341 'xmlcharrefreplace' as well as any other name registered with\n\
6342 codecs.register_error that can handle UnicodeEncodeErrors.");
6344 static PyObject *
6345 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6347 static char *kwlist[] = {"encoding", "errors", 0};
6348 char *encoding = NULL;
6349 char *errors = NULL;
6350 PyObject *v;
6352 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6353 kwlist, &encoding, &errors))
6354 return NULL;
6355 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6356 if (v == NULL)
6357 goto onError;
6358 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6359 PyErr_Format(PyExc_TypeError,
6360 "encoder did not return a string/unicode object "
6361 "(type=%.400s)",
6362 Py_TYPE(v)->tp_name);
6363 Py_DECREF(v);
6364 return NULL;
6366 return v;
6368 onError:
6369 return NULL;
6372 PyDoc_STRVAR(decode__doc__,
6373 "S.decode([encoding[,errors]]) -> string or unicode\n\
6375 Decodes S using the codec registered for encoding. encoding defaults\n\
6376 to the default encoding. errors may be given to set a different error\n\
6377 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6378 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6379 as well as any other name registerd with codecs.register_error that is\n\
6380 able to handle UnicodeDecodeErrors.");
6382 static PyObject *
6383 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6385 static char *kwlist[] = {"encoding", "errors", 0};
6386 char *encoding = NULL;
6387 char *errors = NULL;
6388 PyObject *v;
6390 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6391 kwlist, &encoding, &errors))
6392 return NULL;
6393 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6394 if (v == NULL)
6395 goto onError;
6396 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6397 PyErr_Format(PyExc_TypeError,
6398 "decoder did not return a string/unicode object "
6399 "(type=%.400s)",
6400 Py_TYPE(v)->tp_name);
6401 Py_DECREF(v);
6402 return NULL;
6404 return v;
6406 onError:
6407 return NULL;
6410 PyDoc_STRVAR(expandtabs__doc__,
6411 "S.expandtabs([tabsize]) -> unicode\n\
6413 Return a copy of S where all tab characters are expanded using spaces.\n\
6414 If tabsize is not given, a tab size of 8 characters is assumed.");
6416 static PyObject*
6417 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6419 Py_UNICODE *e;
6420 Py_UNICODE *p;
6421 Py_UNICODE *q;
6422 Py_UNICODE *qe;
6423 Py_ssize_t i, j, incr;
6424 PyUnicodeObject *u;
6425 int tabsize = 8;
6427 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6428 return NULL;
6430 /* First pass: determine size of output string */
6431 i = 0; /* chars up to and including most recent \n or \r */
6432 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6433 e = self->str + self->length; /* end of input */
6434 for (p = self->str; p < e; p++)
6435 if (*p == '\t') {
6436 if (tabsize > 0) {
6437 incr = tabsize - (j % tabsize); /* cannot overflow */
6438 if (j > PY_SSIZE_T_MAX - incr)
6439 goto overflow1;
6440 j += incr;
6443 else {
6444 if (j > PY_SSIZE_T_MAX - 1)
6445 goto overflow1;
6446 j++;
6447 if (*p == '\n' || *p == '\r') {
6448 if (i > PY_SSIZE_T_MAX - j)
6449 goto overflow1;
6450 i += j;
6451 j = 0;
6455 if (i > PY_SSIZE_T_MAX - j)
6456 goto overflow1;
6458 /* Second pass: create output string and fill it */
6459 u = _PyUnicode_New(i + j);
6460 if (!u)
6461 return NULL;
6463 j = 0; /* same as in first pass */
6464 q = u->str; /* next output char */
6465 qe = u->str + u->length; /* end of output */
6467 for (p = self->str; p < e; p++)
6468 if (*p == '\t') {
6469 if (tabsize > 0) {
6470 i = tabsize - (j % tabsize);
6471 j += i;
6472 while (i--) {
6473 if (q >= qe)
6474 goto overflow2;
6475 *q++ = ' ';
6479 else {
6480 if (q >= qe)
6481 goto overflow2;
6482 *q++ = *p;
6483 j++;
6484 if (*p == '\n' || *p == '\r')
6485 j = 0;
6488 return (PyObject*) u;
6490 overflow2:
6491 Py_DECREF(u);
6492 overflow1:
6493 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6494 return NULL;
6497 PyDoc_STRVAR(find__doc__,
6498 "S.find(sub [,start [,end]]) -> int\n\
6500 Return the lowest index in S where substring sub is found,\n\
6501 such that sub is contained within s[start:end]. Optional\n\
6502 arguments start and end are interpreted as in slice notation.\n\
6504 Return -1 on failure.");
6506 static PyObject *
6507 unicode_find(PyUnicodeObject *self, PyObject *args)
6509 PyObject *substring;
6510 Py_ssize_t start;
6511 Py_ssize_t end;
6512 Py_ssize_t result;
6514 if (!_ParseTupleFinds(args, &substring, &start, &end))
6515 return NULL;
6517 result = stringlib_find_slice(
6518 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6519 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6520 start, end
6523 Py_DECREF(substring);
6525 return PyInt_FromSsize_t(result);
6528 static PyObject *
6529 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6531 if (index < 0 || index >= self->length) {
6532 PyErr_SetString(PyExc_IndexError, "string index out of range");
6533 return NULL;
6536 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6539 static long
6540 unicode_hash(PyUnicodeObject *self)
6542 /* Since Unicode objects compare equal to their ASCII string
6543 counterparts, they should use the individual character values
6544 as basis for their hash value. This is needed to assure that
6545 strings and Unicode objects behave in the same way as
6546 dictionary keys. */
6548 register Py_ssize_t len;
6549 register Py_UNICODE *p;
6550 register long x;
6552 if (self->hash != -1)
6553 return self->hash;
6554 len = PyUnicode_GET_SIZE(self);
6555 p = PyUnicode_AS_UNICODE(self);
6556 x = *p << 7;
6557 while (--len >= 0)
6558 x = (1000003*x) ^ *p++;
6559 x ^= PyUnicode_GET_SIZE(self);
6560 if (x == -1)
6561 x = -2;
6562 self->hash = x;
6563 return x;
6566 PyDoc_STRVAR(index__doc__,
6567 "S.index(sub [,start [,end]]) -> int\n\
6569 Like S.find() but raise ValueError when the substring is not found.");
6571 static PyObject *
6572 unicode_index(PyUnicodeObject *self, PyObject *args)
6574 Py_ssize_t result;
6575 PyObject *substring;
6576 Py_ssize_t start;
6577 Py_ssize_t end;
6579 if (!_ParseTupleFinds(args, &substring, &start, &end))
6580 return NULL;
6582 result = stringlib_find_slice(
6583 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6584 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6585 start, end
6588 Py_DECREF(substring);
6590 if (result < 0) {
6591 PyErr_SetString(PyExc_ValueError, "substring not found");
6592 return NULL;
6595 return PyInt_FromSsize_t(result);
6598 PyDoc_STRVAR(islower__doc__,
6599 "S.islower() -> bool\n\
6601 Return True if all cased characters in S are lowercase and there is\n\
6602 at least one cased character in S, False otherwise.");
6604 static PyObject*
6605 unicode_islower(PyUnicodeObject *self)
6607 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6608 register const Py_UNICODE *e;
6609 int cased;
6611 /* Shortcut for single character strings */
6612 if (PyUnicode_GET_SIZE(self) == 1)
6613 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6615 /* Special case for empty strings */
6616 if (PyUnicode_GET_SIZE(self) == 0)
6617 return PyBool_FromLong(0);
6619 e = p + PyUnicode_GET_SIZE(self);
6620 cased = 0;
6621 for (; p < e; p++) {
6622 register const Py_UNICODE ch = *p;
6624 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6625 return PyBool_FromLong(0);
6626 else if (!cased && Py_UNICODE_ISLOWER(ch))
6627 cased = 1;
6629 return PyBool_FromLong(cased);
6632 PyDoc_STRVAR(isupper__doc__,
6633 "S.isupper() -> bool\n\
6635 Return True if all cased characters in S are uppercase and there is\n\
6636 at least one cased character in S, False otherwise.");
6638 static PyObject*
6639 unicode_isupper(PyUnicodeObject *self)
6641 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6642 register const Py_UNICODE *e;
6643 int cased;
6645 /* Shortcut for single character strings */
6646 if (PyUnicode_GET_SIZE(self) == 1)
6647 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6649 /* Special case for empty strings */
6650 if (PyUnicode_GET_SIZE(self) == 0)
6651 return PyBool_FromLong(0);
6653 e = p + PyUnicode_GET_SIZE(self);
6654 cased = 0;
6655 for (; p < e; p++) {
6656 register const Py_UNICODE ch = *p;
6658 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6659 return PyBool_FromLong(0);
6660 else if (!cased && Py_UNICODE_ISUPPER(ch))
6661 cased = 1;
6663 return PyBool_FromLong(cased);
6666 PyDoc_STRVAR(istitle__doc__,
6667 "S.istitle() -> bool\n\
6669 Return True if S is a titlecased string and there is at least one\n\
6670 character in S, i.e. upper- and titlecase characters may only\n\
6671 follow uncased characters and lowercase characters only cased ones.\n\
6672 Return False otherwise.");
6674 static PyObject*
6675 unicode_istitle(PyUnicodeObject *self)
6677 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6678 register const Py_UNICODE *e;
6679 int cased, previous_is_cased;
6681 /* Shortcut for single character strings */
6682 if (PyUnicode_GET_SIZE(self) == 1)
6683 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6684 (Py_UNICODE_ISUPPER(*p) != 0));
6686 /* Special case for empty strings */
6687 if (PyUnicode_GET_SIZE(self) == 0)
6688 return PyBool_FromLong(0);
6690 e = p + PyUnicode_GET_SIZE(self);
6691 cased = 0;
6692 previous_is_cased = 0;
6693 for (; p < e; p++) {
6694 register const Py_UNICODE ch = *p;
6696 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6697 if (previous_is_cased)
6698 return PyBool_FromLong(0);
6699 previous_is_cased = 1;
6700 cased = 1;
6702 else if (Py_UNICODE_ISLOWER(ch)) {
6703 if (!previous_is_cased)
6704 return PyBool_FromLong(0);
6705 previous_is_cased = 1;
6706 cased = 1;
6708 else
6709 previous_is_cased = 0;
6711 return PyBool_FromLong(cased);
6714 PyDoc_STRVAR(isspace__doc__,
6715 "S.isspace() -> bool\n\
6717 Return True if all characters in S are whitespace\n\
6718 and there is at least one character in S, False otherwise.");
6720 static PyObject*
6721 unicode_isspace(PyUnicodeObject *self)
6723 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6724 register const Py_UNICODE *e;
6726 /* Shortcut for single character strings */
6727 if (PyUnicode_GET_SIZE(self) == 1 &&
6728 Py_UNICODE_ISSPACE(*p))
6729 return PyBool_FromLong(1);
6731 /* Special case for empty strings */
6732 if (PyUnicode_GET_SIZE(self) == 0)
6733 return PyBool_FromLong(0);
6735 e = p + PyUnicode_GET_SIZE(self);
6736 for (; p < e; p++) {
6737 if (!Py_UNICODE_ISSPACE(*p))
6738 return PyBool_FromLong(0);
6740 return PyBool_FromLong(1);
6743 PyDoc_STRVAR(isalpha__doc__,
6744 "S.isalpha() -> bool\n\
6746 Return True if all characters in S are alphabetic\n\
6747 and there is at least one character in S, False otherwise.");
6749 static PyObject*
6750 unicode_isalpha(PyUnicodeObject *self)
6752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753 register const Py_UNICODE *e;
6755 /* Shortcut for single character strings */
6756 if (PyUnicode_GET_SIZE(self) == 1 &&
6757 Py_UNICODE_ISALPHA(*p))
6758 return PyBool_FromLong(1);
6760 /* Special case for empty strings */
6761 if (PyUnicode_GET_SIZE(self) == 0)
6762 return PyBool_FromLong(0);
6764 e = p + PyUnicode_GET_SIZE(self);
6765 for (; p < e; p++) {
6766 if (!Py_UNICODE_ISALPHA(*p))
6767 return PyBool_FromLong(0);
6769 return PyBool_FromLong(1);
6772 PyDoc_STRVAR(isalnum__doc__,
6773 "S.isalnum() -> bool\n\
6775 Return True if all characters in S are alphanumeric\n\
6776 and there is at least one character in S, False otherwise.");
6778 static PyObject*
6779 unicode_isalnum(PyUnicodeObject *self)
6781 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6782 register const Py_UNICODE *e;
6784 /* Shortcut for single character strings */
6785 if (PyUnicode_GET_SIZE(self) == 1 &&
6786 Py_UNICODE_ISALNUM(*p))
6787 return PyBool_FromLong(1);
6789 /* Special case for empty strings */
6790 if (PyUnicode_GET_SIZE(self) == 0)
6791 return PyBool_FromLong(0);
6793 e = p + PyUnicode_GET_SIZE(self);
6794 for (; p < e; p++) {
6795 if (!Py_UNICODE_ISALNUM(*p))
6796 return PyBool_FromLong(0);
6798 return PyBool_FromLong(1);
6801 PyDoc_STRVAR(isdecimal__doc__,
6802 "S.isdecimal() -> bool\n\
6804 Return True if there are only decimal characters in S,\n\
6805 False otherwise.");
6807 static PyObject*
6808 unicode_isdecimal(PyUnicodeObject *self)
6810 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6811 register const Py_UNICODE *e;
6813 /* Shortcut for single character strings */
6814 if (PyUnicode_GET_SIZE(self) == 1 &&
6815 Py_UNICODE_ISDECIMAL(*p))
6816 return PyBool_FromLong(1);
6818 /* Special case for empty strings */
6819 if (PyUnicode_GET_SIZE(self) == 0)
6820 return PyBool_FromLong(0);
6822 e = p + PyUnicode_GET_SIZE(self);
6823 for (; p < e; p++) {
6824 if (!Py_UNICODE_ISDECIMAL(*p))
6825 return PyBool_FromLong(0);
6827 return PyBool_FromLong(1);
6830 PyDoc_STRVAR(isdigit__doc__,
6831 "S.isdigit() -> bool\n\
6833 Return True if all characters in S are digits\n\
6834 and there is at least one character in S, False otherwise.");
6836 static PyObject*
6837 unicode_isdigit(PyUnicodeObject *self)
6839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6840 register const Py_UNICODE *e;
6842 /* Shortcut for single character strings */
6843 if (PyUnicode_GET_SIZE(self) == 1 &&
6844 Py_UNICODE_ISDIGIT(*p))
6845 return PyBool_FromLong(1);
6847 /* Special case for empty strings */
6848 if (PyUnicode_GET_SIZE(self) == 0)
6849 return PyBool_FromLong(0);
6851 e = p + PyUnicode_GET_SIZE(self);
6852 for (; p < e; p++) {
6853 if (!Py_UNICODE_ISDIGIT(*p))
6854 return PyBool_FromLong(0);
6856 return PyBool_FromLong(1);
6859 PyDoc_STRVAR(isnumeric__doc__,
6860 "S.isnumeric() -> bool\n\
6862 Return True if there are only numeric characters in S,\n\
6863 False otherwise.");
6865 static PyObject*
6866 unicode_isnumeric(PyUnicodeObject *self)
6868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6869 register const Py_UNICODE *e;
6871 /* Shortcut for single character strings */
6872 if (PyUnicode_GET_SIZE(self) == 1 &&
6873 Py_UNICODE_ISNUMERIC(*p))
6874 return PyBool_FromLong(1);
6876 /* Special case for empty strings */
6877 if (PyUnicode_GET_SIZE(self) == 0)
6878 return PyBool_FromLong(0);
6880 e = p + PyUnicode_GET_SIZE(self);
6881 for (; p < e; p++) {
6882 if (!Py_UNICODE_ISNUMERIC(*p))
6883 return PyBool_FromLong(0);
6885 return PyBool_FromLong(1);
6888 PyDoc_STRVAR(join__doc__,
6889 "S.join(iterable) -> unicode\n\
6891 Return a string which is the concatenation of the strings in the\n\
6892 iterable. The separator between elements is S.");
6894 static PyObject*
6895 unicode_join(PyObject *self, PyObject *data)
6897 return PyUnicode_Join(self, data);
6900 static Py_ssize_t
6901 unicode_length(PyUnicodeObject *self)
6903 return self->length;
6906 PyDoc_STRVAR(ljust__doc__,
6907 "S.ljust(width[, fillchar]) -> int\n\
6909 Return S left-justified in a Unicode string of length width. Padding is\n\
6910 done using the specified fill character (default is a space).");
6912 static PyObject *
6913 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6915 Py_ssize_t width;
6916 Py_UNICODE fillchar = ' ';
6918 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6919 return NULL;
6921 if (self->length >= width && PyUnicode_CheckExact(self)) {
6922 Py_INCREF(self);
6923 return (PyObject*) self;
6926 return (PyObject*) pad(self, 0, width - self->length, fillchar);
6929 PyDoc_STRVAR(lower__doc__,
6930 "S.lower() -> unicode\n\
6932 Return a copy of the string S converted to lowercase.");
6934 static PyObject*
6935 unicode_lower(PyUnicodeObject *self)
6937 return fixup(self, fixlower);
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string past its "|O:"
   prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6949 /* externally visible for str.strip(unicode) */
6950 PyObject *
6951 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6953 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6954 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6955 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6956 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6957 Py_ssize_t i, j;
6959 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6961 i = 0;
6962 if (striptype != RIGHTSTRIP) {
6963 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6964 i++;
6968 j = len;
6969 if (striptype != LEFTSTRIP) {
6970 do {
6971 j--;
6972 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6973 j++;
6976 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6977 Py_INCREF(self);
6978 return (PyObject*)self;
6980 else
6981 return PyUnicode_FromUnicode(s+i, j-i);
6985 static PyObject *
6986 do_strip(PyUnicodeObject *self, int striptype)
6988 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6989 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6991 i = 0;
6992 if (striptype != RIGHTSTRIP) {
6993 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6994 i++;
6998 j = len;
6999 if (striptype != LEFTSTRIP) {
7000 do {
7001 j--;
7002 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7003 j++;
7006 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7007 Py_INCREF(self);
7008 return (PyObject*)self;
7010 else
7011 return PyUnicode_FromUnicode(s+i, j-i);
7015 static PyObject *
7016 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7018 PyObject *sep = NULL;
7020 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7021 return NULL;
7023 if (sep != NULL && sep != Py_None) {
7024 if (PyUnicode_Check(sep))
7025 return _PyUnicode_XStrip(self, striptype, sep);
7026 else if (PyString_Check(sep)) {
7027 PyObject *res;
7028 sep = PyUnicode_FromObject(sep);
7029 if (sep==NULL)
7030 return NULL;
7031 res = _PyUnicode_XStrip(self, striptype, sep);
7032 Py_DECREF(sep);
7033 return res;
7035 else {
7036 PyErr_Format(PyExc_TypeError,
7037 "%s arg must be None, unicode or str",
7038 STRIPNAME(striptype));
7039 return NULL;
7043 return do_strip(self, striptype);
7047 PyDoc_STRVAR(strip__doc__,
7048 "S.strip([chars]) -> unicode\n\
7050 Return a copy of the string S with leading and trailing\n\
7051 whitespace removed.\n\
7052 If chars is given and not None, remove characters in chars instead.\n\
7053 If chars is a str, it will be converted to unicode before stripping");
7055 static PyObject *
7056 unicode_strip(PyUnicodeObject *self, PyObject *args)
7058 if (PyTuple_GET_SIZE(args) == 0)
7059 return do_strip(self, BOTHSTRIP); /* Common case */
7060 else
7061 return do_argstrip(self, BOTHSTRIP, args);
7065 PyDoc_STRVAR(lstrip__doc__,
7066 "S.lstrip([chars]) -> unicode\n\
7068 Return a copy of the string S with leading whitespace removed.\n\
7069 If chars is given and not None, remove characters in chars instead.\n\
7070 If chars is a str, it will be converted to unicode before stripping");
7072 static PyObject *
7073 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7075 if (PyTuple_GET_SIZE(args) == 0)
7076 return do_strip(self, LEFTSTRIP); /* Common case */
7077 else
7078 return do_argstrip(self, LEFTSTRIP, args);
7082 PyDoc_STRVAR(rstrip__doc__,
7083 "S.rstrip([chars]) -> unicode\n\
7085 Return a copy of the string S with trailing whitespace removed.\n\
7086 If chars is given and not None, remove characters in chars instead.\n\
7087 If chars is a str, it will be converted to unicode before stripping");
7089 static PyObject *
7090 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7092 if (PyTuple_GET_SIZE(args) == 0)
7093 return do_strip(self, RIGHTSTRIP); /* Common case */
7094 else
7095 return do_argstrip(self, RIGHTSTRIP, args);
7099 static PyObject*
7100 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7102 PyUnicodeObject *u;
7103 Py_UNICODE *p;
7104 Py_ssize_t nchars;
7105 size_t nbytes;
7107 if (len < 0)
7108 len = 0;
7110 if (len == 1 && PyUnicode_CheckExact(str)) {
7111 /* no repeat, return original string */
7112 Py_INCREF(str);
7113 return (PyObject*) str;
7116 /* ensure # of chars needed doesn't overflow int and # of bytes
7117 * needed doesn't overflow size_t
7119 nchars = len * str->length;
7120 if (len && nchars / len != str->length) {
7121 PyErr_SetString(PyExc_OverflowError,
7122 "repeated string is too long");
7123 return NULL;
7125 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7126 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7127 PyErr_SetString(PyExc_OverflowError,
7128 "repeated string is too long");
7129 return NULL;
7131 u = _PyUnicode_New(nchars);
7132 if (!u)
7133 return NULL;
7135 p = u->str;
7137 if (str->length == 1 && len > 0) {
7138 Py_UNICODE_FILL(p, str->str[0], len);
7139 } else {
7140 Py_ssize_t done = 0; /* number of characters copied this far */
7141 if (done < nchars) {
7142 Py_UNICODE_COPY(p, str->str, str->length);
7143 done = str->length;
7145 while (done < nchars) {
7146 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7147 Py_UNICODE_COPY(p+done, p, n);
7148 done += n;
7152 return (PyObject*) u;
7155 PyObject *PyUnicode_Replace(PyObject *obj,
7156 PyObject *subobj,
7157 PyObject *replobj,
7158 Py_ssize_t maxcount)
7160 PyObject *self;
7161 PyObject *str1;
7162 PyObject *str2;
7163 PyObject *result;
7165 self = PyUnicode_FromObject(obj);
7166 if (self == NULL)
7167 return NULL;
7168 str1 = PyUnicode_FromObject(subobj);
7169 if (str1 == NULL) {
7170 Py_DECREF(self);
7171 return NULL;
7173 str2 = PyUnicode_FromObject(replobj);
7174 if (str2 == NULL) {
7175 Py_DECREF(self);
7176 Py_DECREF(str1);
7177 return NULL;
7179 result = replace((PyUnicodeObject *)self,
7180 (PyUnicodeObject *)str1,
7181 (PyUnicodeObject *)str2,
7182 maxcount);
7183 Py_DECREF(self);
7184 Py_DECREF(str1);
7185 Py_DECREF(str2);
7186 return result;
7189 PyDoc_STRVAR(replace__doc__,
7190 "S.replace(old, new[, count]) -> unicode\n\
7192 Return a copy of S with all occurrences of substring\n\
7193 old replaced by new. If the optional argument count is\n\
7194 given, only the first count occurrences are replaced.");
7196 static PyObject*
7197 unicode_replace(PyUnicodeObject *self, PyObject *args)
7199 PyUnicodeObject *str1;
7200 PyUnicodeObject *str2;
7201 Py_ssize_t maxcount = -1;
7202 PyObject *result;
7204 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7205 return NULL;
7206 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7207 if (str1 == NULL)
7208 return NULL;
7209 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7210 if (str2 == NULL) {
7211 Py_DECREF(str1);
7212 return NULL;
7215 result = replace(self, str1, str2, maxcount);
7217 Py_DECREF(str1);
7218 Py_DECREF(str2);
7219 return result;
/* Builds its result by calling unicodeescape_string() with the object's
   character buffer and size.
   NOTE(review): the rest of this call's argument list and the closing
   brace are not visible in this excerpt -- confirm against the full
   file before changing anything here. */
7222 static
7223 PyObject *unicode_repr(PyObject *unicode)
7225 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7226 PyUnicode_GET_SIZE(unicode),
7230 PyDoc_STRVAR(rfind__doc__,
7231 "S.rfind(sub [,start [,end]]) -> int\n\
7233 Return the highest index in S where substring sub is found,\n\
7234 such that sub is contained within s[start:end]. Optional\n\
7235 arguments start and end are interpreted as in slice notation.\n\
7237 Return -1 on failure.");
7239 static PyObject *
7240 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7242 PyObject *substring;
7243 Py_ssize_t start;
7244 Py_ssize_t end;
7245 Py_ssize_t result;
7247 if (!_ParseTupleFinds(args, &substring, &start, &end))
7248 return NULL;
7250 result = stringlib_rfind_slice(
7251 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7252 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7253 start, end
7256 Py_DECREF(substring);
7258 return PyInt_FromSsize_t(result);
7261 PyDoc_STRVAR(rindex__doc__,
7262 "S.rindex(sub [,start [,end]]) -> int\n\
7264 Like S.rfind() but raise ValueError when the substring is not found.");
7266 static PyObject *
7267 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7269 PyObject *substring;
7270 Py_ssize_t start;
7271 Py_ssize_t end;
7272 Py_ssize_t result;
7274 if (!_ParseTupleFinds(args, &substring, &start, &end))
7275 return NULL;
7277 result = stringlib_rfind_slice(
7278 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7279 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7280 start, end
7283 Py_DECREF(substring);
7285 if (result < 0) {
7286 PyErr_SetString(PyExc_ValueError, "substring not found");
7287 return NULL;
7289 return PyInt_FromSsize_t(result);
7292 PyDoc_STRVAR(rjust__doc__,
7293 "S.rjust(width[, fillchar]) -> unicode\n\
7295 Return S right-justified in a Unicode string of length width. Padding is\n\
7296 done using the specified fill character (default is a space).");
7298 static PyObject *
7299 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7301 Py_ssize_t width;
7302 Py_UNICODE fillchar = ' ';
7304 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7305 return NULL;
7307 if (self->length >= width && PyUnicode_CheckExact(self)) {
7308 Py_INCREF(self);
7309 return (PyObject*) self;
7312 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7315 static PyObject*
7316 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7318 /* standard clamping */
7319 if (start < 0)
7320 start = 0;
7321 if (end < 0)
7322 end = 0;
7323 if (end > self->length)
7324 end = self->length;
7325 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7326 /* full slice, return original string */
7327 Py_INCREF(self);
7328 return (PyObject*) self;
7330 if (start > end)
7331 start = end;
7332 /* copy slice */
7333 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7334 end - start);
7337 PyObject *PyUnicode_Split(PyObject *s,
7338 PyObject *sep,
7339 Py_ssize_t maxsplit)
7341 PyObject *result;
7343 s = PyUnicode_FromObject(s);
7344 if (s == NULL)
7345 return NULL;
7346 if (sep != NULL) {
7347 sep = PyUnicode_FromObject(sep);
7348 if (sep == NULL) {
7349 Py_DECREF(s);
7350 return NULL;
7354 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7356 Py_DECREF(s);
7357 Py_XDECREF(sep);
7358 return result;
7361 PyDoc_STRVAR(split__doc__,
7362 "S.split([sep [,maxsplit]]) -> list of strings\n\
7364 Return a list of the words in S, using sep as the\n\
7365 delimiter string. If maxsplit is given, at most maxsplit\n\
7366 splits are done. If sep is not specified or is None, any\n\
7367 whitespace string is a separator and empty strings are\n\
7368 removed from the result.");
7370 static PyObject*
7371 unicode_split(PyUnicodeObject *self, PyObject *args)
7373 PyObject *substring = Py_None;
7374 Py_ssize_t maxcount = -1;
7376 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7377 return NULL;
7379 if (substring == Py_None)
7380 return split(self, NULL, maxcount);
7381 else if (PyUnicode_Check(substring))
7382 return split(self, (PyUnicodeObject *)substring, maxcount);
7383 else
7384 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7387 PyObject *
7388 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7390 PyObject* str_obj;
7391 PyObject* sep_obj;
7392 PyObject* out;
7394 str_obj = PyUnicode_FromObject(str_in);
7395 if (!str_obj)
7396 return NULL;
7397 sep_obj = PyUnicode_FromObject(sep_in);
7398 if (!sep_obj) {
7399 Py_DECREF(str_obj);
7400 return NULL;
7403 out = stringlib_partition(
7404 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7405 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7408 Py_DECREF(sep_obj);
7409 Py_DECREF(str_obj);
7411 return out;
7415 PyObject *
7416 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7418 PyObject* str_obj;
7419 PyObject* sep_obj;
7420 PyObject* out;
7422 str_obj = PyUnicode_FromObject(str_in);
7423 if (!str_obj)
7424 return NULL;
7425 sep_obj = PyUnicode_FromObject(sep_in);
7426 if (!sep_obj) {
7427 Py_DECREF(str_obj);
7428 return NULL;
7431 out = stringlib_rpartition(
7432 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7433 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7436 Py_DECREF(sep_obj);
7437 Py_DECREF(str_obj);
7439 return out;
7442 PyDoc_STRVAR(partition__doc__,
7443 "S.partition(sep) -> (head, sep, tail)\n\
7445 Search for the separator sep in S, and return the part before it,\n\
7446 the separator itself, and the part after it. If the separator is not\n\
7447 found, return S and two empty strings.");
7449 static PyObject*
7450 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7452 return PyUnicode_Partition((PyObject *)self, separator);
7455 PyDoc_STRVAR(rpartition__doc__,
7456 "S.rpartition(sep) -> (head, sep, tail)\n\
7458 Search for the separator sep in S, starting at the end of S, and return\n\
7459 the part before it, the separator itself, and the part after it. If the\n\
7460 separator is not found, return two empty strings and S.");
7462 static PyObject*
7463 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7465 return PyUnicode_RPartition((PyObject *)self, separator);
7468 PyObject *PyUnicode_RSplit(PyObject *s,
7469 PyObject *sep,
7470 Py_ssize_t maxsplit)
7472 PyObject *result;
7474 s = PyUnicode_FromObject(s);
7475 if (s == NULL)
7476 return NULL;
7477 if (sep != NULL) {
7478 sep = PyUnicode_FromObject(sep);
7479 if (sep == NULL) {
7480 Py_DECREF(s);
7481 return NULL;
7485 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7487 Py_DECREF(s);
7488 Py_XDECREF(sep);
7489 return result;
7492 PyDoc_STRVAR(rsplit__doc__,
7493 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7495 Return a list of the words in S, using sep as the\n\
7496 delimiter string, starting at the end of the string and\n\
7497 working to the front. If maxsplit is given, at most maxsplit\n\
7498 splits are done. If sep is not specified, any whitespace string\n\
7499 is a separator.");
7501 static PyObject*
7502 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7504 PyObject *substring = Py_None;
7505 Py_ssize_t maxcount = -1;
7507 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7508 return NULL;
7510 if (substring == Py_None)
7511 return rsplit(self, NULL, maxcount);
7512 else if (PyUnicode_Check(substring))
7513 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7514 else
7515 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7518 PyDoc_STRVAR(splitlines__doc__,
7519 "S.splitlines([keepends]) -> list of strings\n\
7521 Return a list of the lines in S, breaking at line boundaries.\n\
7522 Line breaks are not included in the resulting list unless keepends\n\
7523 is given and true.");
7525 static PyObject*
7526 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7528 int keepends = 0;
7530 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7531 return NULL;
7533 return PyUnicode_Splitlines((PyObject *)self, keepends);
7536 static
7537 PyObject *unicode_str(PyUnicodeObject *self)
7539 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7542 PyDoc_STRVAR(swapcase__doc__,
7543 "S.swapcase() -> unicode\n\
7545 Return a copy of S with uppercase characters converted to lowercase\n\
7546 and vice versa.");
7548 static PyObject*
7549 unicode_swapcase(PyUnicodeObject *self)
7551 return fixup(self, fixswapcase);
7554 PyDoc_STRVAR(translate__doc__,
7555 "S.translate(table) -> unicode\n\
7557 Return a copy of the string S, where all characters have been mapped\n\
7558 through the given translation table, which must be a mapping of\n\
7559 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7560 Unmapped characters are left untouched. Characters mapped to None\n\
7561 are deleted.");
7563 static PyObject*
7564 unicode_translate(PyUnicodeObject *self, PyObject *table)
7566 return PyUnicode_TranslateCharmap(self->str,
7567 self->length,
7568 table,
7569 "ignore");
7572 PyDoc_STRVAR(upper__doc__,
7573 "S.upper() -> unicode\n\
7575 Return a copy of S converted to uppercase.");
7577 static PyObject*
7578 unicode_upper(PyUnicodeObject *self)
7580 return fixup(self, fixupper);
7583 PyDoc_STRVAR(zfill__doc__,
7584 "S.zfill(width) -> unicode\n\
7586 Pad a numeric string S with zeros on the left, to fill a field\n\
7587 of the specified width. The string S is never truncated.");
7589 static PyObject *
7590 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7592 Py_ssize_t fill;
7593 PyUnicodeObject *u;
7595 Py_ssize_t width;
7596 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7597 return NULL;
7599 if (self->length >= width) {
7600 if (PyUnicode_CheckExact(self)) {
7601 Py_INCREF(self);
7602 return (PyObject*) self;
7604 else
7605 return PyUnicode_FromUnicode(
7606 PyUnicode_AS_UNICODE(self),
7607 PyUnicode_GET_SIZE(self)
7611 fill = width - self->length;
7613 u = pad(self, fill, 0, '0');
7615 if (u == NULL)
7616 return NULL;
7618 if (u->str[fill] == '+' || u->str[fill] == '-') {
7619 /* move sign to beginning of string */
7620 u->str[0] = u->str[fill];
7621 u->str[fill] = '0';
7624 return (PyObject*) u;
#if 0
/* Debugging helper (compiled out): returns the file-level numfree
   counter as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7635 PyDoc_STRVAR(startswith__doc__,
7636 "S.startswith(prefix[, start[, end]]) -> bool\n\
7638 Return True if S starts with the specified prefix, False otherwise.\n\
7639 With optional start, test S beginning at that position.\n\
7640 With optional end, stop comparing S at that position.\n\
7641 prefix can also be a tuple of strings to try.");
7643 static PyObject *
7644 unicode_startswith(PyUnicodeObject *self,
7645 PyObject *args)
7647 PyObject *subobj;
7648 PyUnicodeObject *substring;
7649 Py_ssize_t start = 0;
7650 Py_ssize_t end = PY_SSIZE_T_MAX;
7651 int result;
7653 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7654 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7655 return NULL;
7656 if (PyTuple_Check(subobj)) {
7657 Py_ssize_t i;
7658 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7659 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7660 PyTuple_GET_ITEM(subobj, i));
7661 if (substring == NULL)
7662 return NULL;
7663 result = tailmatch(self, substring, start, end, -1);
7664 Py_DECREF(substring);
7665 if (result) {
7666 Py_RETURN_TRUE;
7669 /* nothing matched */
7670 Py_RETURN_FALSE;
7672 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7673 if (substring == NULL)
7674 return NULL;
7675 result = tailmatch(self, substring, start, end, -1);
7676 Py_DECREF(substring);
7677 return PyBool_FromLong(result);
7681 PyDoc_STRVAR(endswith__doc__,
7682 "S.endswith(suffix[, start[, end]]) -> bool\n\
7684 Return True if S ends with the specified suffix, False otherwise.\n\
7685 With optional start, test S beginning at that position.\n\
7686 With optional end, stop comparing S at that position.\n\
7687 suffix can also be a tuple of strings to try.");
7689 static PyObject *
7690 unicode_endswith(PyUnicodeObject *self,
7691 PyObject *args)
7693 PyObject *subobj;
7694 PyUnicodeObject *substring;
7695 Py_ssize_t start = 0;
7696 Py_ssize_t end = PY_SSIZE_T_MAX;
7697 int result;
7699 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7700 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7701 return NULL;
7702 if (PyTuple_Check(subobj)) {
7703 Py_ssize_t i;
7704 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7705 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7706 PyTuple_GET_ITEM(subobj, i));
7707 if (substring == NULL)
7708 return NULL;
7709 result = tailmatch(self, substring, start, end, +1);
7710 Py_DECREF(substring);
7711 if (result) {
7712 Py_RETURN_TRUE;
7715 Py_RETURN_FALSE;
7717 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7718 if (substring == NULL)
7719 return NULL;
7721 result = tailmatch(self, substring, start, end, +1);
7722 Py_DECREF(substring);
7723 return PyBool_FromLong(result);
7727 /* Implements do_string_format, which is unicode because of stringlib */
7728 #include "stringlib/string_format.h"
7730 PyDoc_STRVAR(format__doc__,
7731 "S.format(*args, **kwargs) -> unicode\n\
7735 static PyObject *
7736 unicode__format__(PyObject *self, PyObject *args)
7738 PyObject *format_spec;
7739 PyObject *result = NULL;
7740 PyObject *tmp = NULL;
7742 /* If 2.x, convert format_spec to the same type as value */
7743 /* This is to allow things like u''.format('') */
7744 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7745 goto done;
7746 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7747 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7748 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7749 goto done;
7751 tmp = PyObject_Unicode(format_spec);
7752 if (tmp == NULL)
7753 goto done;
7754 format_spec = tmp;
7756 result = _PyUnicode_FormatAdvanced(self,
7757 PyUnicode_AS_UNICODE(format_spec),
7758 PyUnicode_GET_SIZE(format_spec));
7759 done:
7760 Py_XDECREF(tmp);
7761 return result;
7764 PyDoc_STRVAR(p_format__doc__,
7765 "S.__format__(format_spec) -> unicode\n\
7769 static PyObject *
7770 unicode__sizeof__(PyUnicodeObject *v)
7772 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7773 sizeof(Py_UNICODE) * (v->length + 1));
7776 PyDoc_STRVAR(sizeof__doc__,
7777 "S.__sizeof__() -> size of S in memory, in bytes\n\
7781 static PyObject *
7782 unicode_getnewargs(PyUnicodeObject *v)
7784 return Py_BuildValue("(u#)", v->str, v->length);
7788 static PyMethodDef unicode_methods[] = {
7790 /* Order is according to common usage: often used methods should
7791 appear first, since lookup is done sequentially. */
7793 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7794 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7795 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7796 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7797 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7798 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7799 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7800 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7801 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7802 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7803 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7804 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7805 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7806 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7807 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7808 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7809 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7810 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7811 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7812 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7813 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7814 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7815 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7816 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7817 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7818 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7819 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7820 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7821 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7822 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7823 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7824 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7825 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7826 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7827 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7828 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7829 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7830 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7831 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7832 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7833 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7834 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7835 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7836 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7837 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7838 #if 0
7839 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7840 #endif
7842 #if 0
7843 /* This one is just used for debugging the implementation. */
7844 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7845 #endif
7847 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7848 {NULL, NULL}
7851 static PyObject *
7852 unicode_mod(PyObject *v, PyObject *w)
7854 if (!PyUnicode_Check(v)) {
7855 Py_INCREF(Py_NotImplemented);
7856 return Py_NotImplemented;
7858 return PyUnicode_Format(v, w);
7861 static PyNumberMethods unicode_as_number = {
7862 0, /*nb_add*/
7863 0, /*nb_subtract*/
7864 0, /*nb_multiply*/
7865 0, /*nb_divide*/
7866 unicode_mod, /*nb_remainder*/
7869 static PySequenceMethods unicode_as_sequence = {
7870 (lenfunc) unicode_length, /* sq_length */
7871 PyUnicode_Concat, /* sq_concat */
7872 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7873 (ssizeargfunc) unicode_getitem, /* sq_item */
7874 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7875 0, /* sq_ass_item */
7876 0, /* sq_ass_slice */
7877 PyUnicode_Contains, /* sq_contains */
7880 static PyObject*
7881 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7883 if (PyIndex_Check(item)) {
7884 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7885 if (i == -1 && PyErr_Occurred())
7886 return NULL;
7887 if (i < 0)
7888 i += PyUnicode_GET_SIZE(self);
7889 return unicode_getitem(self, i);
7890 } else if (PySlice_Check(item)) {
7891 Py_ssize_t start, stop, step, slicelength, cur, i;
7892 Py_UNICODE* source_buf;
7893 Py_UNICODE* result_buf;
7894 PyObject* result;
7896 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7897 &start, &stop, &step, &slicelength) < 0) {
7898 return NULL;
7901 if (slicelength <= 0) {
7902 return PyUnicode_FromUnicode(NULL, 0);
7903 } else if (start == 0 && step == 1 && slicelength == self->length &&
7904 PyUnicode_CheckExact(self)) {
7905 Py_INCREF(self);
7906 return (PyObject *)self;
7907 } else if (step == 1) {
7908 return PyUnicode_FromUnicode(self->str + start, slicelength);
7909 } else {
7910 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7911 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7912 sizeof(Py_UNICODE));
7914 if (result_buf == NULL)
7915 return PyErr_NoMemory();
7917 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7918 result_buf[i] = source_buf[cur];
7921 result = PyUnicode_FromUnicode(result_buf, slicelength);
7922 PyObject_FREE(result_buf);
7923 return result;
7925 } else {
7926 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7927 return NULL;
7931 static PyMappingMethods unicode_as_mapping = {
7932 (lenfunc)unicode_length, /* mp_length */
7933 (binaryfunc)unicode_subscript, /* mp_subscript */
7934 (objobjargproc)0, /* mp_ass_subscript */
7937 static Py_ssize_t
7938 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7939 Py_ssize_t index,
7940 const void **ptr)
7942 if (index != 0) {
7943 PyErr_SetString(PyExc_SystemError,
7944 "accessing non-existent unicode segment");
7945 return -1;
7947 *ptr = (void *) self->str;
7948 return PyUnicode_GET_DATA_SIZE(self);
7951 static Py_ssize_t
7952 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7953 const void **ptr)
7955 PyErr_SetString(PyExc_TypeError,
7956 "cannot use unicode as modifiable buffer");
7957 return -1;
7960 static int
7961 unicode_buffer_getsegcount(PyUnicodeObject *self,
7962 Py_ssize_t *lenp)
7964 if (lenp)
7965 *lenp = PyUnicode_GET_DATA_SIZE(self);
7966 return 1;
7969 static Py_ssize_t
7970 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7971 Py_ssize_t index,
7972 const void **ptr)
7974 PyObject *str;
7976 if (index != 0) {
7977 PyErr_SetString(PyExc_SystemError,
7978 "accessing non-existent unicode segment");
7979 return -1;
7981 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7982 if (str == NULL)
7983 return -1;
7984 *ptr = (void *) PyString_AS_STRING(str);
7985 return PyString_GET_SIZE(str);
7988 /* Helpers for PyUnicode_Format() */
7990 static PyObject *
7991 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7993 Py_ssize_t argidx = *p_argidx;
7994 if (argidx < arglen) {
7995 (*p_argidx)++;
7996 if (arglen < 0)
7997 return args;
7998 else
7999 return PyTuple_GetItem(args, argidx);
8001 PyErr_SetString(PyExc_TypeError,
8002 "not enough arguments for format string");
8003 return NULL;
/* Conversion-flag bits, one per flag character accepted after '%'. */
#define F_LJUST (1<<0)          /* '-' */
#define F_SIGN  (1<<1)          /* '+' */
#define F_BLANK (1<<2)          /* ' ' */
#define F_ALT   (1<<3)          /* '#' */
#define F_ZERO  (1<<4)          /* '0' */
8012 static Py_ssize_t
8013 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8015 register Py_ssize_t i;
8016 Py_ssize_t len = strlen(charbuffer);
8017 for (i = len - 1; i >= 0; i--)
8018 buffer[i] = (Py_UNICODE) charbuffer[i];
8020 return len;
8023 static int
8024 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8026 Py_ssize_t result;
8028 PyOS_snprintf((char *)buffer, len, format, x);
8029 result = strtounicode(buffer, (char *)buffer);
8030 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8033 /* XXX To save some code duplication, formatfloat/long/int could have been
8034 shared with stringobject.c, converting from 8-bit to Unicode after the
8035 formatting is done. */
8037 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8039 static PyObject *
8040 formatfloat(PyObject *v, int flags, int prec, int type)
8042 char *p;
8043 PyObject *result;
8044 double x;
8046 x = PyFloat_AsDouble(v);
8047 if (x == -1.0 && PyErr_Occurred())
8048 return NULL;
8050 if (prec < 0)
8051 prec = 6;
8053 p = PyOS_double_to_string(x, type, prec,
8054 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8055 if (p == NULL)
8056 return NULL;
8057 result = PyUnicode_FromStringAndSize(p, strlen(p));
8058 PyMem_Free(p);
8059 return result;
8062 static PyObject*
8063 formatlong(PyObject *val, int flags, int prec, int type)
8065 char *buf;
8066 int i, len;
8067 PyObject *str; /* temporary string object. */
8068 PyUnicodeObject *result;
8070 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8071 if (!str)
8072 return NULL;
8073 result = _PyUnicode_New(len);
8074 if (!result) {
8075 Py_DECREF(str);
8076 return NULL;
8078 for (i = 0; i < len; i++)
8079 result->str[i] = buf[i];
8080 result->str[len] = 0;
8081 Py_DECREF(str);
8082 return (PyObject*)result;
8085 static int
8086 formatint(Py_UNICODE *buf,
8087 size_t buflen,
8088 int flags,
8089 int prec,
8090 int type,
8091 PyObject *v)
8093 /* fmt = '%#.' + `prec` + 'l' + `type`
8094 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8095 * + 1 + 1
8096 * = 24
8098 char fmt[64]; /* plenty big enough! */
8099 char *sign;
8100 long x;
8102 x = PyInt_AsLong(v);
8103 if (x == -1 && PyErr_Occurred())
8104 return -1;
8105 if (x < 0 && type == 'u') {
8106 type = 'd';
8108 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8109 sign = "-";
8110 else
8111 sign = "";
8112 if (prec < 0)
8113 prec = 1;
8115 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8116 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8118 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8119 PyErr_SetString(PyExc_OverflowError,
8120 "formatted integer is too long (precision too large?)");
8121 return -1;
8124 if ((flags & F_ALT) &&
8125 (type == 'x' || type == 'X')) {
8126 /* When converting under %#x or %#X, there are a number
8127 * of issues that cause pain:
8128 * - when 0 is being converted, the C standard leaves off
8129 * the '0x' or '0X', which is inconsistent with other
8130 * %#x/%#X conversions and inconsistent with Python's
8131 * hex() function
8132 * - there are platforms that violate the standard and
8133 * convert 0 with the '0x' or '0X'
8134 * (Metrowerks, Compaq Tru64)
8135 * - there are platforms that give '0x' when converting
8136 * under %#X, but convert 0 in accordance with the
8137 * standard (OS/2 EMX)
8139 * We can achieve the desired consistency by inserting our
8140 * own '0x' or '0X' prefix, and substituting %x/%X in place
8141 * of %#x/%#X.
8143 * Note that this is the same approach as used in
8144 * formatint() in stringobject.c
8146 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8147 sign, type, prec, type);
8149 else {
8150 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8151 sign, (flags&F_ALT) ? "#" : "",
8152 prec, type);
8154 if (sign[0])
8155 return longtounicode(buf, buflen, fmt, -x);
8156 else
8157 return longtounicode(buf, buflen, fmt, x);
8160 static int
8161 formatchar(Py_UNICODE *buf,
8162 size_t buflen,
8163 PyObject *v)
8165 PyObject *unistr;
8166 char *str;
8167 /* presume that the buffer is at least 2 characters long */
8168 if (PyUnicode_Check(v)) {
8169 if (PyUnicode_GET_SIZE(v) != 1)
8170 goto onError;
8171 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8174 else if (PyString_Check(v)) {
8175 if (PyString_GET_SIZE(v) != 1)
8176 goto onError;
8177 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8178 with a UnicodeDecodeError if 'char' is not decodable with the
8179 default encoding (usually ASCII, but it might be something else) */
8180 str = PyString_AS_STRING(v);
8181 if ((unsigned char)str[0] > 0x7F) {
8182 /* the char is not ASCII; try to decode the string using the
8183 default encoding and return -1 to let the UnicodeDecodeError
8184 be raised if the string can't be decoded */
8185 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8186 if (unistr == NULL)
8187 return -1;
8188 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8189 Py_DECREF(unistr);
8191 else
8192 buf[0] = (Py_UNICODE)str[0];
8195 else {
8196 /* Integer input truncated to a character */
8197 long x;
8198 x = PyInt_AsLong(v);
8199 if (x == -1 && PyErr_Occurred())
8200 goto onError;
8201 #ifdef Py_UNICODE_WIDE
8202 if (x < 0 || x > 0x10ffff) {
8203 PyErr_SetString(PyExc_OverflowError,
8204 "%c arg not in range(0x110000) "
8205 "(wide Python build)");
8206 return -1;
8208 #else
8209 if (x < 0 || x > 0xffff) {
8210 PyErr_SetString(PyExc_OverflowError,
8211 "%c arg not in range(0x10000) "
8212 "(narrow Python build)");
8213 return -1;
8215 #endif
8216 buf[0] = (Py_UNICODE) x;
8218 buf[1] = '\0';
8219 return 1;
8221 onError:
8222 PyErr_SetString(PyExc_TypeError,
8223 "%c requires int or char");
8224 return -1;
8227 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8229 FORMATBUFLEN is the length of the buffer in which the ints &
8230 chars are formatted. XXX This is a magic number. Each formatting
8231 routine does bounds checking to ensure no overflow, but a better
8232 solution may be to malloc a buffer of appropriate size for each
8233 format. For now, the current solution is sufficient.
8235 #define FORMATBUFLEN (size_t)120
8237 PyObject *PyUnicode_Format(PyObject *format,
8238 PyObject *args)
8240 Py_UNICODE *fmt, *res;
8241 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8242 int args_owned = 0;
8243 PyUnicodeObject *result = NULL;
8244 PyObject *dict = NULL;
8245 PyObject *uformat;
8247 if (format == NULL || args == NULL) {
8248 PyErr_BadInternalCall();
8249 return NULL;
8251 uformat = PyUnicode_FromObject(format);
8252 if (uformat == NULL)
8253 return NULL;
8254 fmt = PyUnicode_AS_UNICODE(uformat);
8255 fmtcnt = PyUnicode_GET_SIZE(uformat);
8257 reslen = rescnt = fmtcnt + 100;
8258 result = _PyUnicode_New(reslen);
8259 if (result == NULL)
8260 goto onError;
8261 res = PyUnicode_AS_UNICODE(result);
8263 if (PyTuple_Check(args)) {
8264 arglen = PyTuple_Size(args);
8265 argidx = 0;
8267 else {
8268 arglen = -1;
8269 argidx = -2;
8271 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8272 !PyObject_TypeCheck(args, &PyBaseString_Type))
8273 dict = args;
8275 while (--fmtcnt >= 0) {
8276 if (*fmt != '%') {
8277 if (--rescnt < 0) {
8278 rescnt = fmtcnt + 100;
8279 reslen += rescnt;
8280 if (_PyUnicode_Resize(&result, reslen) < 0)
8281 goto onError;
8282 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8283 --rescnt;
8285 *res++ = *fmt++;
8287 else {
8288 /* Got a format specifier */
8289 int flags = 0;
8290 Py_ssize_t width = -1;
8291 int prec = -1;
8292 Py_UNICODE c = '\0';
8293 Py_UNICODE fill;
8294 int isnumok;
8295 PyObject *v = NULL;
8296 PyObject *temp = NULL;
8297 Py_UNICODE *pbuf;
8298 Py_UNICODE sign;
8299 Py_ssize_t len;
8300 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8302 fmt++;
8303 if (*fmt == '(') {
8304 Py_UNICODE *keystart;
8305 Py_ssize_t keylen;
8306 PyObject *key;
8307 int pcount = 1;
8309 if (dict == NULL) {
8310 PyErr_SetString(PyExc_TypeError,
8311 "format requires a mapping");
8312 goto onError;
8314 ++fmt;
8315 --fmtcnt;
8316 keystart = fmt;
8317 /* Skip over balanced parentheses */
8318 while (pcount > 0 && --fmtcnt >= 0) {
8319 if (*fmt == ')')
8320 --pcount;
8321 else if (*fmt == '(')
8322 ++pcount;
8323 fmt++;
8325 keylen = fmt - keystart - 1;
8326 if (fmtcnt < 0 || pcount > 0) {
8327 PyErr_SetString(PyExc_ValueError,
8328 "incomplete format key");
8329 goto onError;
8331 #if 0
8332 /* keys are converted to strings using UTF-8 and
8333 then looked up since Python uses strings to hold
8334 variables names etc. in its namespaces and we
8335 wouldn't want to break common idioms. */
8336 key = PyUnicode_EncodeUTF8(keystart,
8337 keylen,
8338 NULL);
8339 #else
8340 key = PyUnicode_FromUnicode(keystart, keylen);
8341 #endif
8342 if (key == NULL)
8343 goto onError;
8344 if (args_owned) {
8345 Py_DECREF(args);
8346 args_owned = 0;
8348 args = PyObject_GetItem(dict, key);
8349 Py_DECREF(key);
8350 if (args == NULL) {
8351 goto onError;
8353 args_owned = 1;
8354 arglen = -1;
8355 argidx = -2;
8357 while (--fmtcnt >= 0) {
8358 switch (c = *fmt++) {
8359 case '-': flags |= F_LJUST; continue;
8360 case '+': flags |= F_SIGN; continue;
8361 case ' ': flags |= F_BLANK; continue;
8362 case '#': flags |= F_ALT; continue;
8363 case '0': flags |= F_ZERO; continue;
8365 break;
8367 if (c == '*') {
8368 v = getnextarg(args, arglen, &argidx);
8369 if (v == NULL)
8370 goto onError;
8371 if (!PyInt_Check(v)) {
8372 PyErr_SetString(PyExc_TypeError,
8373 "* wants int");
8374 goto onError;
8376 width = PyInt_AsLong(v);
8377 if (width < 0) {
8378 flags |= F_LJUST;
8379 width = -width;
8381 if (--fmtcnt >= 0)
8382 c = *fmt++;
8384 else if (c >= '0' && c <= '9') {
8385 width = c - '0';
8386 while (--fmtcnt >= 0) {
8387 c = *fmt++;
8388 if (c < '0' || c > '9')
8389 break;
8390 if ((width*10) / 10 != width) {
8391 PyErr_SetString(PyExc_ValueError,
8392 "width too big");
8393 goto onError;
8395 width = width*10 + (c - '0');
8398 if (c == '.') {
8399 prec = 0;
8400 if (--fmtcnt >= 0)
8401 c = *fmt++;
8402 if (c == '*') {
8403 v = getnextarg(args, arglen, &argidx);
8404 if (v == NULL)
8405 goto onError;
8406 if (!PyInt_Check(v)) {
8407 PyErr_SetString(PyExc_TypeError,
8408 "* wants int");
8409 goto onError;
8411 prec = PyInt_AsLong(v);
8412 if (prec < 0)
8413 prec = 0;
8414 if (--fmtcnt >= 0)
8415 c = *fmt++;
8417 else if (c >= '0' && c <= '9') {
8418 prec = c - '0';
8419 while (--fmtcnt >= 0) {
8420 c = Py_CHARMASK(*fmt++);
8421 if (c < '0' || c > '9')
8422 break;
8423 if ((prec*10) / 10 != prec) {
8424 PyErr_SetString(PyExc_ValueError,
8425 "prec too big");
8426 goto onError;
8428 prec = prec*10 + (c - '0');
8431 } /* prec */
8432 if (fmtcnt >= 0) {
8433 if (c == 'h' || c == 'l' || c == 'L') {
8434 if (--fmtcnt >= 0)
8435 c = *fmt++;
8438 if (fmtcnt < 0) {
8439 PyErr_SetString(PyExc_ValueError,
8440 "incomplete format");
8441 goto onError;
8443 if (c != '%') {
8444 v = getnextarg(args, arglen, &argidx);
8445 if (v == NULL)
8446 goto onError;
8448 sign = 0;
8449 fill = ' ';
8450 switch (c) {
8452 case '%':
8453 pbuf = formatbuf;
8454 /* presume that buffer length is at least 1 */
8455 pbuf[0] = '%';
8456 len = 1;
8457 break;
8459 case 's':
8460 case 'r':
8461 if (PyUnicode_CheckExact(v) && c == 's') {
8462 temp = v;
8463 Py_INCREF(temp);
8465 else {
8466 PyObject *unicode;
8467 if (c == 's')
8468 temp = PyObject_Unicode(v);
8469 else
8470 temp = PyObject_Repr(v);
8471 if (temp == NULL)
8472 goto onError;
8473 if (PyUnicode_Check(temp))
8474 /* nothing to do */;
8475 else if (PyString_Check(temp)) {
8476 /* convert to string to Unicode */
8477 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8478 PyString_GET_SIZE(temp),
8479 NULL,
8480 "strict");
8481 Py_DECREF(temp);
8482 temp = unicode;
8483 if (temp == NULL)
8484 goto onError;
8486 else {
8487 Py_DECREF(temp);
8488 PyErr_SetString(PyExc_TypeError,
8489 "%s argument has non-string str()");
8490 goto onError;
8493 pbuf = PyUnicode_AS_UNICODE(temp);
8494 len = PyUnicode_GET_SIZE(temp);
8495 if (prec >= 0 && len > prec)
8496 len = prec;
8497 break;
8499 case 'i':
8500 case 'd':
8501 case 'u':
8502 case 'o':
8503 case 'x':
8504 case 'X':
8505 if (c == 'i')
8506 c = 'd';
8507 isnumok = 0;
8508 if (PyNumber_Check(v)) {
8509 PyObject *iobj=NULL;
8511 if (PyInt_Check(v) || (PyLong_Check(v))) {
8512 iobj = v;
8513 Py_INCREF(iobj);
8515 else {
8516 iobj = PyNumber_Int(v);
8517 if (iobj==NULL) iobj = PyNumber_Long(v);
8519 if (iobj!=NULL) {
8520 if (PyInt_Check(iobj)) {
8521 isnumok = 1;
8522 pbuf = formatbuf;
8523 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8524 flags, prec, c, iobj);
8525 Py_DECREF(iobj);
8526 if (len < 0)
8527 goto onError;
8528 sign = 1;
8530 else if (PyLong_Check(iobj)) {
8531 isnumok = 1;
8532 temp = formatlong(iobj, flags, prec, c);
8533 Py_DECREF(iobj);
8534 if (!temp)
8535 goto onError;
8536 pbuf = PyUnicode_AS_UNICODE(temp);
8537 len = PyUnicode_GET_SIZE(temp);
8538 sign = 1;
8540 else {
8541 Py_DECREF(iobj);
8545 if (!isnumok) {
8546 PyErr_Format(PyExc_TypeError,
8547 "%%%c format: a number is required, "
8548 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8549 goto onError;
8551 if (flags & F_ZERO)
8552 fill = '0';
8553 break;
8555 case 'e':
8556 case 'E':
8557 case 'f':
8558 case 'F':
8559 case 'g':
8560 case 'G':
8561 temp = formatfloat(v, flags, prec, c);
8562 if (temp == NULL)
8563 goto onError;
8564 pbuf = PyUnicode_AS_UNICODE(temp);
8565 len = PyUnicode_GET_SIZE(temp);
8566 sign = 1;
8567 if (flags & F_ZERO)
8568 fill = '0';
8569 break;
8571 case 'c':
8572 pbuf = formatbuf;
8573 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8574 if (len < 0)
8575 goto onError;
8576 break;
8578 default:
8579 PyErr_Format(PyExc_ValueError,
8580 "unsupported format character '%c' (0x%x) "
8581 "at index %zd",
8582 (31<=c && c<=126) ? (char)c : '?',
8583 (int)c,
8584 (Py_ssize_t)(fmt - 1 -
8585 PyUnicode_AS_UNICODE(uformat)));
8586 goto onError;
8588 if (sign) {
8589 if (*pbuf == '-' || *pbuf == '+') {
8590 sign = *pbuf++;
8591 len--;
8593 else if (flags & F_SIGN)
8594 sign = '+';
8595 else if (flags & F_BLANK)
8596 sign = ' ';
8597 else
8598 sign = 0;
8600 if (width < len)
8601 width = len;
8602 if (rescnt - (sign != 0) < width) {
8603 reslen -= rescnt;
8604 rescnt = width + fmtcnt + 100;
8605 reslen += rescnt;
8606 if (reslen < 0) {
8607 Py_XDECREF(temp);
8608 PyErr_NoMemory();
8609 goto onError;
8611 if (_PyUnicode_Resize(&result, reslen) < 0) {
8612 Py_XDECREF(temp);
8613 goto onError;
8615 res = PyUnicode_AS_UNICODE(result)
8616 + reslen - rescnt;
8618 if (sign) {
8619 if (fill != ' ')
8620 *res++ = sign;
8621 rescnt--;
8622 if (width > len)
8623 width--;
8625 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8626 assert(pbuf[0] == '0');
8627 assert(pbuf[1] == c);
8628 if (fill != ' ') {
8629 *res++ = *pbuf++;
8630 *res++ = *pbuf++;
8632 rescnt -= 2;
8633 width -= 2;
8634 if (width < 0)
8635 width = 0;
8636 len -= 2;
8638 if (width > len && !(flags & F_LJUST)) {
8639 do {
8640 --rescnt;
8641 *res++ = fill;
8642 } while (--width > len);
8644 if (fill == ' ') {
8645 if (sign)
8646 *res++ = sign;
8647 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8648 assert(pbuf[0] == '0');
8649 assert(pbuf[1] == c);
8650 *res++ = *pbuf++;
8651 *res++ = *pbuf++;
8654 Py_UNICODE_COPY(res, pbuf, len);
8655 res += len;
8656 rescnt -= len;
8657 while (--width >= len) {
8658 --rescnt;
8659 *res++ = ' ';
8661 if (dict && (argidx < arglen) && c != '%') {
8662 PyErr_SetString(PyExc_TypeError,
8663 "not all arguments converted during string formatting");
8664 Py_XDECREF(temp);
8665 goto onError;
8667 Py_XDECREF(temp);
8668 } /* '%' */
8669 } /* until end */
8670 if (argidx < arglen && !dict) {
8671 PyErr_SetString(PyExc_TypeError,
8672 "not all arguments converted during string formatting");
8673 goto onError;
8676 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8677 goto onError;
8678 if (args_owned) {
8679 Py_DECREF(args);
8681 Py_DECREF(uformat);
8682 return (PyObject *)result;
8684 onError:
8685 Py_XDECREF(result);
8686 Py_DECREF(uformat);
8687 if (args_owned) {
8688 Py_DECREF(args);
8690 return NULL;
8693 static PyBufferProcs unicode_as_buffer = {
8694 (readbufferproc) unicode_buffer_getreadbuf,
8695 (writebufferproc) unicode_buffer_getwritebuf,
8696 (segcountproc) unicode_buffer_getsegcount,
8697 (charbufferproc) unicode_buffer_getcharbuf,
8700 static PyObject *
8701 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8703 static PyObject *
8704 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8706 PyObject *x = NULL;
8707 static char *kwlist[] = {"string", "encoding", "errors", 0};
8708 char *encoding = NULL;
8709 char *errors = NULL;
8711 if (type != &PyUnicode_Type)
8712 return unicode_subtype_new(type, args, kwds);
8713 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8714 kwlist, &x, &encoding, &errors))
8715 return NULL;
8716 if (x == NULL)
8717 return (PyObject *)_PyUnicode_New(0);
8718 if (encoding == NULL && errors == NULL)
8719 return PyObject_Unicode(x);
8720 else
8721 return PyUnicode_FromEncodedObject(x, encoding, errors);
8724 static PyObject *
8725 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8727 PyUnicodeObject *tmp, *pnew;
8728 Py_ssize_t n;
8730 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8731 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8732 if (tmp == NULL)
8733 return NULL;
8734 assert(PyUnicode_Check(tmp));
8735 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8736 if (pnew == NULL) {
8737 Py_DECREF(tmp);
8738 return NULL;
8740 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8741 if (pnew->str == NULL) {
8742 _Py_ForgetReference((PyObject *)pnew);
8743 PyObject_Del(pnew);
8744 Py_DECREF(tmp);
8745 return PyErr_NoMemory();
8747 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8748 pnew->length = n;
8749 pnew->hash = tmp->hash;
8750 Py_DECREF(tmp);
8751 return (PyObject *)pnew;
8754 PyDoc_STRVAR(unicode_doc,
8755 "unicode(string [, encoding[, errors]]) -> object\n\
8757 Create a new Unicode object from the given encoded string.\n\
8758 encoding defaults to the current default string encoding.\n\
8759 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8761 PyTypeObject PyUnicode_Type = {
8762 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8763 "unicode", /* tp_name */
8764 sizeof(PyUnicodeObject), /* tp_size */
8765 0, /* tp_itemsize */
8766 /* Slots */
8767 (destructor)unicode_dealloc, /* tp_dealloc */
8768 0, /* tp_print */
8769 0, /* tp_getattr */
8770 0, /* tp_setattr */
8771 0, /* tp_compare */
8772 unicode_repr, /* tp_repr */
8773 &unicode_as_number, /* tp_as_number */
8774 &unicode_as_sequence, /* tp_as_sequence */
8775 &unicode_as_mapping, /* tp_as_mapping */
8776 (hashfunc) unicode_hash, /* tp_hash*/
8777 0, /* tp_call*/
8778 (reprfunc) unicode_str, /* tp_str */
8779 PyObject_GenericGetAttr, /* tp_getattro */
8780 0, /* tp_setattro */
8781 &unicode_as_buffer, /* tp_as_buffer */
8782 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8783 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8784 unicode_doc, /* tp_doc */
8785 0, /* tp_traverse */
8786 0, /* tp_clear */
8787 PyUnicode_RichCompare, /* tp_richcompare */
8788 0, /* tp_weaklistoffset */
8789 0, /* tp_iter */
8790 0, /* tp_iternext */
8791 unicode_methods, /* tp_methods */
8792 0, /* tp_members */
8793 0, /* tp_getset */
8794 &PyBaseString_Type, /* tp_base */
8795 0, /* tp_dict */
8796 0, /* tp_descr_get */
8797 0, /* tp_descr_set */
8798 0, /* tp_dictoffset */
8799 0, /* tp_init */
8800 0, /* tp_alloc */
8801 unicode_new, /* tp_new */
8802 PyObject_Del, /* tp_free */
8805 /* Initialize the Unicode implementation */
8807 void _PyUnicode_Init(void)
8809 int i;
8811 /* XXX - move this array to unicodectype.c ? */
8812 Py_UNICODE linebreak[] = {
8813 0x000A, /* LINE FEED */
8814 0x000D, /* CARRIAGE RETURN */
8815 0x001C, /* FILE SEPARATOR */
8816 0x001D, /* GROUP SEPARATOR */
8817 0x001E, /* RECORD SEPARATOR */
8818 0x0085, /* NEXT LINE */
8819 0x2028, /* LINE SEPARATOR */
8820 0x2029, /* PARAGRAPH SEPARATOR */
8823 /* Init the implementation */
8824 free_list = NULL;
8825 numfree = 0;
8826 unicode_empty = _PyUnicode_New(0);
8827 if (!unicode_empty)
8828 return;
8830 strcpy(unicode_default_encoding, "ascii");
8831 for (i = 0; i < 256; i++)
8832 unicode_latin1[i] = NULL;
8833 if (PyType_Ready(&PyUnicode_Type) < 0)
8834 Py_FatalError("Can't initialize 'unicode'");
8836 /* initialize the linebreak bloom filter */
8837 bloom_linebreak = make_bloom_mask(
8838 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8841 PyType_Ready(&EncodingMapType);
8844 /* Finalize the Unicode implementation */
8847 PyUnicode_ClearFreeList(void)
8849 int freelist_size = numfree;
8850 PyUnicodeObject *u;
8852 for (u = free_list; u != NULL;) {
8853 PyUnicodeObject *v = u;
8854 u = *(PyUnicodeObject **)u;
8855 if (v->str)
8856 PyObject_DEL(v->str);
8857 Py_XDECREF(v->defenc);
8858 PyObject_Del(v);
8859 numfree--;
8861 free_list = NULL;
8862 assert(numfree == 0);
8863 return freelist_size;
8866 void
8867 _PyUnicode_Fini(void)
8869 int i;
8871 Py_XDECREF(unicode_empty);
8872 unicode_empty = NULL;
8874 for (i = 0; i < 256; i++) {
8875 if (unicode_latin1[i]) {
8876 Py_DECREF(unicode_latin1[i]);
8877 unicode_latin1[i] = NULL;
8880 (void)PyUnicode_ClearFreeList();
8883 #ifdef __cplusplus
8885 #endif