Silence the DeprecationWarning raised by importing mimetools in BaseHTTPServer.
[python.git] / Objects / unicodeobject.c
blob603c5070a8e384abf2758e78b1d6e2d4347a0d83
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
/* Default encoding to use and assume when NULL is passed as the encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
113 static char unicode_default_encoding[100];
115 /* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by character code 0x00..0x7F: an entry of 1 marks
   a character that is treated as whitespace, 0 marks everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION,   0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR,   0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
146 /* Same for linebreaks */
/* Lookup table indexed by character code 0x00..0x7F: an entry of 1 marks
   a character that is treated as a line break, 0 marks everything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
185 /* --- Bloom Filters ----------------------------------------------------- */
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
/* Integer type that holds a bloom mask (one bit per 5-bit index). */
#define BLOOM_MASK unsigned long

/* Mask describing the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of ch is set in
   mask.  The shifted constant is an unsigned long so that bit 31 can be
   selected without shifting into the sign bit of an int, and mask is
   parenthesized so that compound expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak; for the rest the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() only runs when the mask reports a possible
   hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
205 /* calculate simple bloom-style bitmask for a given unicode string */
207 long mask;
208 Py_ssize_t i;
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
214 return mask;
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
219 Py_ssize_t i;
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
225 return 0;
/* Non-zero when chr is one of the setlen characters in set.  The bloom
   mask is tested first so that unicode_member() only runs when the mask
   reports a possible hit.  The expansion is wrapped in parentheses so
   the macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
233 static
234 int unicode_resize(register PyUnicodeObject *unicode,
235 Py_ssize_t length)
237 void *oldstr;
239 /* Shortcut if there's nothing much to do. */
240 if (unicode->length == length)
241 goto reset;
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
251 PyErr_SetString(PyExc_SystemError,
252 "can't resize shared unicode objects");
253 return -1;
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
259 it contains). */
261 oldstr = unicode->str;
262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
264 if (!unicode->str) {
265 unicode->str = (Py_UNICODE *)oldstr;
266 PyErr_NoMemory();
267 return -1;
269 unicode->str[length] = 0;
270 unicode->length = length;
272 reset:
273 /* Reset the object caches */
274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
278 unicode->hash = -1;
280 return 0;
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below one entry.

*/
291 static
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
294 register PyUnicodeObject *unicode;
296 /* Optimization for empty strings */
297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
316 unicode_resize(unicode, length) < 0) {
317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
321 else {
322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
325 PyObject_INIT(unicode, &PyUnicode_Type);
327 else {
328 size_t new_size;
329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
330 if (unicode == NULL)
331 return NULL;
332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 if (!unicode->str) {
337 PyErr_NoMemory();
338 goto onError;
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
347 unicode->str[0] = 0;
348 unicode->str[length] = 0;
349 unicode->length = length;
350 unicode->hash = -1;
351 unicode->defenc = NULL;
352 return unicode;
354 onError:
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
357 _Py_ForgetReference((PyObject *)unicode);
358 PyObject_Del(unicode);
359 return NULL;
362 static
363 void unicode_dealloc(register PyUnicodeObject *unicode)
365 if (PyUnicode_CheckExact(unicode) &&
366 numfree < PyUnicode_MAXFREELIST) {
367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
377 /* Add to free list */
378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
382 else {
383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
389 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
391 register PyUnicodeObject *v;
393 /* Argument checks */
394 if (unicode == NULL) {
395 PyErr_BadInternalCall();
396 return -1;
398 v = (PyUnicodeObject *)*unicode;
399 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
400 PyErr_BadInternalCall();
401 return -1;
404 /* Resizing unicode_empty and single character objects is not
405 possible since these are being shared. We simply return a fresh
406 copy with the same Unicode content. */
407 if (v->length != length &&
408 (v == unicode_empty || v->length == 1)) {
409 PyUnicodeObject *w = _PyUnicode_New(length);
410 if (w == NULL)
411 return -1;
412 Py_UNICODE_COPY(w->str, v->str,
413 length < v->length ? length : v->length);
414 Py_DECREF(*unicode);
415 *unicode = (PyObject *)w;
416 return 0;
419 /* Note that we don't have to modify *unicode for unshared Unicode
420 objects, since we can modify them in-place. */
421 return unicode_resize(v, length);
424 /* Internal API for use in unicodeobject.c only ! */
425 #define _PyUnicode_Resize(unicodevar, length) \
426 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
428 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
429 Py_ssize_t size)
431 PyUnicodeObject *unicode;
433 /* If the Unicode data is known at construction time, we can apply
434 some optimizations which share commonly used objects. */
435 if (u != NULL) {
437 /* Optimization for empty strings */
438 if (size == 0 && unicode_empty != NULL) {
439 Py_INCREF(unicode_empty);
440 return (PyObject *)unicode_empty;
443 /* Single character Unicode objects in the Latin-1 range are
444 shared when using this constructor */
445 if (size == 1 && *u < 256) {
446 unicode = unicode_latin1[*u];
447 if (!unicode) {
448 unicode = _PyUnicode_New(1);
449 if (!unicode)
450 return NULL;
451 unicode->str[0] = *u;
452 unicode_latin1[*u] = unicode;
454 Py_INCREF(unicode);
455 return (PyObject *)unicode;
459 unicode = _PyUnicode_New(size);
460 if (!unicode)
461 return NULL;
463 /* Copy the Unicode data into the new object */
464 if (u != NULL)
465 Py_UNICODE_COPY(unicode->str, u, size);
467 return (PyObject *)unicode;
470 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
472 PyUnicodeObject *unicode;
474 if (size < 0) {
475 PyErr_SetString(PyExc_SystemError,
476 "Negative size passed to PyUnicode_FromStringAndSize");
477 return NULL;
480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects.
482 Also, this means the input must be UTF-8, so fall back to the
483 UTF-8 decoder at the end. */
484 if (u != NULL) {
486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
492 /* Single characters are shared when using this constructor.
493 Restrict to ASCII, since the input must be UTF-8. */
494 if (size == 1 && Py_CHARMASK(*u) < 128) {
495 unicode = unicode_latin1[Py_CHARMASK(*u)];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = Py_CHARMASK(*u);
501 unicode_latin1[Py_CHARMASK(*u)] = unicode;
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
507 return PyUnicode_DecodeUTF8(u, size, NULL);
510 unicode = _PyUnicode_New(size);
511 if (!unicode)
512 return NULL;
514 return (PyObject *)unicode;
517 PyObject *PyUnicode_FromString(const char *u)
519 size_t size = strlen(u);
520 if (size > PY_SSIZE_T_MAX) {
521 PyErr_SetString(PyExc_OverflowError, "input too long");
522 return NULL;
525 return PyUnicode_FromStringAndSize(u, size);
528 #ifdef HAVE_WCHAR_H
530 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
531 Py_ssize_t size)
533 PyUnicodeObject *unicode;
535 if (w == NULL) {
536 PyErr_BadInternalCall();
537 return NULL;
540 unicode = _PyUnicode_New(size);
541 if (!unicode)
542 return NULL;
544 /* Copy the wchar_t data into the new object */
545 #ifdef HAVE_USABLE_WCHAR_T
546 memcpy(unicode->str, w, size * sizeof(wchar_t));
547 #else
549 register Py_UNICODE *u;
550 register Py_ssize_t i;
551 u = PyUnicode_AS_UNICODE(unicode);
552 for (i = size; i > 0; i--)
553 *u++ = *w++;
555 #endif
557 return (PyObject *)unicode;
560 static void
561 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
563 *fmt++ = '%';
564 if (width) {
565 if (zeropad)
566 *fmt++ = '0';
567 fmt += sprintf(fmt, "%d", width);
569 if (precision)
570 fmt += sprintf(fmt, ".%d", precision);
571 if (longflag)
572 *fmt++ = 'l';
573 else if (size_tflag) {
574 char *f = PY_FORMAT_SIZE_T;
575 while (*f)
576 *fmt++ = *f++;
578 *fmt++ = c;
579 *fmt = '\0';
582 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
584 PyObject *
585 PyUnicode_FromFormatV(const char *format, va_list vargs)
587 va_list count;
588 Py_ssize_t callcount = 0;
589 PyObject **callresults = NULL;
590 PyObject **callresult = NULL;
591 Py_ssize_t n = 0;
592 int width = 0;
593 int precision = 0;
594 int zeropad;
595 const char* f;
596 Py_UNICODE *s;
597 PyObject *string;
598 /* used by sprintf */
599 char buffer[21];
600 /* use abuffer instead of buffer, if we need more space
601 * (which can happen if there's a format specifier with width). */
602 char *abuffer = NULL;
603 char *realbuffer;
604 Py_ssize_t abuffersize = 0;
605 char fmt[60]; /* should be enough for %0width.precisionld */
606 const char *copy;
608 #ifdef VA_LIST_IS_ARRAY
609 Py_MEMCPY(count, vargs, sizeof(va_list));
610 #else
611 #ifdef __va_copy
612 __va_copy(count, vargs);
613 #else
614 count = vargs;
615 #endif
616 #endif
617 /* step 1: count the number of %S/%R format specifications
618 * (we call PyObject_Str()/PyObject_Repr() for these objects
619 * once during step 3 and put the result in an array) */
620 for (f = format; *f; f++) {
621 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
622 ++callcount;
624 /* step 2: allocate memory for the results of
625 * PyObject_Str()/PyObject_Repr() calls */
626 if (callcount) {
627 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
628 if (!callresults) {
629 PyErr_NoMemory();
630 return NULL;
632 callresult = callresults;
634 /* step 3: figure out how large a buffer we need */
635 for (f = format; *f; f++) {
636 if (*f == '%') {
637 const char* p = f;
638 width = 0;
639 while (isdigit((unsigned)*f))
640 width = (width*10) + *f++ - '0';
641 while (*++f && *f != '%' && !isalpha((unsigned)*f))
644 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
645 * they don't affect the amount of space we reserve.
647 if ((*f == 'l' || *f == 'z') &&
648 (f[1] == 'd' || f[1] == 'u'))
649 ++f;
651 switch (*f) {
652 case 'c':
653 (void)va_arg(count, int);
654 /* fall through... */
655 case '%':
656 n++;
657 break;
658 case 'd': case 'u': case 'i': case 'x':
659 (void) va_arg(count, int);
660 /* 20 bytes is enough to hold a 64-bit
661 integer. Decimal takes the most space.
662 This isn't enough for octal.
663 If a width is specified we need more
664 (which we allocate later). */
665 if (width < 20)
666 width = 20;
667 n += width;
668 if (abuffersize < width)
669 abuffersize = width;
670 break;
671 case 's':
673 /* UTF-8 */
674 unsigned char*s;
675 s = va_arg(count, unsigned char*);
676 while (*s) {
677 if (*s < 128) {
678 n++; s++;
679 } else if (*s < 0xc0) {
680 /* invalid UTF-8 */
681 n++; s++;
682 } else if (*s < 0xc0) {
683 n++;
684 s++; if(!*s)break;
685 s++;
686 } else if (*s < 0xe0) {
687 n++;
688 s++; if(!*s)break;
689 s++; if(!*s)break;
690 s++;
691 } else {
692 #ifdef Py_UNICODE_WIDE
693 n++;
694 #else
695 n+=2;
696 #endif
697 s++; if(!*s)break;
698 s++; if(!*s)break;
699 s++; if(!*s)break;
700 s++;
703 break;
705 case 'U':
707 PyObject *obj = va_arg(count, PyObject *);
708 assert(obj && PyUnicode_Check(obj));
709 n += PyUnicode_GET_SIZE(obj);
710 break;
712 case 'V':
714 PyObject *obj = va_arg(count, PyObject *);
715 const char *str = va_arg(count, const char *);
716 assert(obj || str);
717 assert(!obj || PyUnicode_Check(obj));
718 if (obj)
719 n += PyUnicode_GET_SIZE(obj);
720 else
721 n += strlen(str);
722 break;
724 case 'S':
726 PyObject *obj = va_arg(count, PyObject *);
727 PyObject *str;
728 assert(obj);
729 str = PyObject_Str(obj);
730 if (!str)
731 goto fail;
732 n += PyUnicode_GET_SIZE(str);
733 /* Remember the str and switch to the next slot */
734 *callresult++ = str;
735 break;
737 case 'R':
739 PyObject *obj = va_arg(count, PyObject *);
740 PyObject *repr;
741 assert(obj);
742 repr = PyObject_Repr(obj);
743 if (!repr)
744 goto fail;
745 n += PyUnicode_GET_SIZE(repr);
746 /* Remember the repr and switch to the next slot */
747 *callresult++ = repr;
748 break;
750 case 'p':
751 (void) va_arg(count, int);
752 /* maximum 64-bit pointer representation:
753 * 0xffffffffffffffff
754 * so 19 characters is enough.
755 * XXX I count 18 -- what's the extra for?
757 n += 19;
758 break;
759 default:
760 /* if we stumble upon an unknown
761 formatting code, copy the rest of
762 the format string to the output
763 string. (we cannot just skip the
764 code, since there's no way to know
765 what's in the argument list) */
766 n += strlen(p);
767 goto expand;
769 } else
770 n++;
772 expand:
773 if (abuffersize > 20) {
774 abuffer = PyObject_Malloc(abuffersize);
775 if (!abuffer) {
776 PyErr_NoMemory();
777 goto fail;
779 realbuffer = abuffer;
781 else
782 realbuffer = buffer;
783 /* step 4: fill the buffer */
784 /* Since we've analyzed how much space we need for the worst case,
785 we don't have to resize the string.
786 There can be no errors beyond this point. */
787 string = PyUnicode_FromUnicode(NULL, n);
788 if (!string)
789 goto fail;
791 s = PyUnicode_AS_UNICODE(string);
792 callresult = callresults;
794 for (f = format; *f; f++) {
795 if (*f == '%') {
796 const char* p = f++;
797 int longflag = 0;
798 int size_tflag = 0;
799 zeropad = (*f == '0');
800 /* parse the width.precision part */
801 width = 0;
802 while (isdigit((unsigned)*f))
803 width = (width*10) + *f++ - '0';
804 precision = 0;
805 if (*f == '.') {
806 f++;
807 while (isdigit((unsigned)*f))
808 precision = (precision*10) + *f++ - '0';
810 /* handle the long flag, but only for %ld and %lu.
811 others can be added when necessary. */
812 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
813 longflag = 1;
814 ++f;
816 /* handle the size_t flag. */
817 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
818 size_tflag = 1;
819 ++f;
822 switch (*f) {
823 case 'c':
824 *s++ = va_arg(vargs, int);
825 break;
826 case 'd':
827 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
828 if (longflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, long));
830 else if (size_tflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
832 else
833 sprintf(realbuffer, fmt, va_arg(vargs, int));
834 appendstring(realbuffer);
835 break;
836 case 'u':
837 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
838 if (longflag)
839 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
840 else if (size_tflag)
841 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
842 else
843 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
844 appendstring(realbuffer);
845 break;
846 case 'i':
847 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
848 sprintf(realbuffer, fmt, va_arg(vargs, int));
849 appendstring(realbuffer);
850 break;
851 case 'x':
852 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
853 sprintf(realbuffer, fmt, va_arg(vargs, int));
854 appendstring(realbuffer);
855 break;
856 case 's':
858 /* Parameter must be UTF-8 encoded.
859 In case of encoding errors, use
860 the replacement character. */
861 PyObject *u;
862 p = va_arg(vargs, char*);
863 u = PyUnicode_DecodeUTF8(p, strlen(p),
864 "replace");
865 if (!u)
866 goto fail;
867 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
868 PyUnicode_GET_SIZE(u));
869 s += PyUnicode_GET_SIZE(u);
870 Py_DECREF(u);
871 break;
873 case 'U':
875 PyObject *obj = va_arg(vargs, PyObject *);
876 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
877 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
878 s += size;
879 break;
881 case 'V':
883 PyObject *obj = va_arg(vargs, PyObject *);
884 const char *str = va_arg(vargs, const char *);
885 if (obj) {
886 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
887 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
888 s += size;
889 } else {
890 appendstring(str);
892 break;
894 case 'S':
895 case 'R':
897 Py_UNICODE *ucopy;
898 Py_ssize_t usize;
899 Py_ssize_t upos;
900 /* unused, since we already have the result */
901 (void) va_arg(vargs, PyObject *);
902 ucopy = PyUnicode_AS_UNICODE(*callresult);
903 usize = PyUnicode_GET_SIZE(*callresult);
904 for (upos = 0; upos<usize;)
905 *s++ = ucopy[upos++];
906 /* We're done with the unicode()/repr() => forget it */
907 Py_DECREF(*callresult);
908 /* switch to next unicode()/repr() result */
909 ++callresult;
910 break;
912 case 'p':
913 sprintf(buffer, "%p", va_arg(vargs, void*));
914 /* %p is ill-defined: ensure leading 0x. */
915 if (buffer[1] == 'X')
916 buffer[1] = 'x';
917 else if (buffer[1] != 'x') {
918 memmove(buffer+2, buffer, strlen(buffer)+1);
919 buffer[0] = '0';
920 buffer[1] = 'x';
922 appendstring(buffer);
923 break;
924 case '%':
925 *s++ = '%';
926 break;
927 default:
928 appendstring(p);
929 goto end;
931 } else
932 *s++ = *f;
935 end:
936 if (callresults)
937 PyObject_Free(callresults);
938 if (abuffer)
939 PyObject_Free(abuffer);
940 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
941 return string;
942 fail:
943 if (callresults) {
944 PyObject **callresult2 = callresults;
945 while (callresult2 < callresult) {
946 Py_DECREF(*callresult2);
947 ++callresult2;
949 PyObject_Free(callresults);
951 if (abuffer)
952 PyObject_Free(abuffer);
953 return NULL;
956 #undef appendstring
958 PyObject *
959 PyUnicode_FromFormat(const char *format, ...)
961 PyObject* ret;
962 va_list vargs;
964 #ifdef HAVE_STDARG_PROTOTYPES
965 va_start(vargs, format);
966 #else
967 va_start(vargs);
968 #endif
969 ret = PyUnicode_FromFormatV(format, vargs);
970 va_end(vargs);
971 return ret;
974 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
975 wchar_t *w,
976 Py_ssize_t size)
978 if (unicode == NULL) {
979 PyErr_BadInternalCall();
980 return -1;
983 /* If possible, try to copy the 0-termination as well */
984 if (size > PyUnicode_GET_SIZE(unicode))
985 size = PyUnicode_GET_SIZE(unicode) + 1;
987 #ifdef HAVE_USABLE_WCHAR_T
988 memcpy(w, unicode->str, size * sizeof(wchar_t));
989 #else
991 register Py_UNICODE *u;
992 register Py_ssize_t i;
993 u = PyUnicode_AS_UNICODE(unicode);
994 for (i = size; i > 0; i--)
995 *w++ = *u++;
997 #endif
999 if (size > PyUnicode_GET_SIZE(unicode))
1000 return PyUnicode_GET_SIZE(unicode);
1001 else
1002 return size;
1005 #endif
1007 PyObject *PyUnicode_FromOrdinal(int ordinal)
1009 Py_UNICODE s[1];
1011 #ifdef Py_UNICODE_WIDE
1012 if (ordinal < 0 || ordinal > 0x10ffff) {
1013 PyErr_SetString(PyExc_ValueError,
1014 "unichr() arg not in range(0x110000) "
1015 "(wide Python build)");
1016 return NULL;
1018 #else
1019 if (ordinal < 0 || ordinal > 0xffff) {
1020 PyErr_SetString(PyExc_ValueError,
1021 "unichr() arg not in range(0x10000) "
1022 "(narrow Python build)");
1023 return NULL;
1025 #endif
1027 s[0] = (Py_UNICODE)ordinal;
1028 return PyUnicode_FromUnicode(s, 1);
1031 PyObject *PyUnicode_FromObject(register PyObject *obj)
1033 /* XXX Perhaps we should make this API an alias of
1034 PyObject_Unicode() instead ?! */
1035 if (PyUnicode_CheckExact(obj)) {
1036 Py_INCREF(obj);
1037 return obj;
1039 if (PyUnicode_Check(obj)) {
1040 /* For a Unicode subtype that's not a Unicode object,
1041 return a true Unicode object with the same data. */
1042 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1043 PyUnicode_GET_SIZE(obj));
1045 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1048 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1049 const char *encoding,
1050 const char *errors)
1052 const char *s = NULL;
1053 Py_ssize_t len;
1054 PyObject *v;
1056 if (obj == NULL) {
1057 PyErr_BadInternalCall();
1058 return NULL;
1061 #if 0
1062 /* For b/w compatibility we also accept Unicode objects provided
1063 that no encodings is given and then redirect to
1064 PyObject_Unicode() which then applies the additional logic for
1065 Unicode subclasses.
1067 NOTE: This API should really only be used for object which
1068 represent *encoded* Unicode !
1071 if (PyUnicode_Check(obj)) {
1072 if (encoding) {
1073 PyErr_SetString(PyExc_TypeError,
1074 "decoding Unicode is not supported");
1075 return NULL;
1077 return PyObject_Unicode(obj);
1079 #else
1080 if (PyUnicode_Check(obj)) {
1081 PyErr_SetString(PyExc_TypeError,
1082 "decoding Unicode is not supported");
1083 return NULL;
1085 #endif
1087 /* Coerce object */
1088 if (PyString_Check(obj)) {
1089 s = PyString_AS_STRING(obj);
1090 len = PyString_GET_SIZE(obj);
1092 else if (PyByteArray_Check(obj)) {
1093 /* Python 2.x specific */
1094 PyErr_Format(PyExc_TypeError,
1095 "decoding bytearray is not supported");
1096 return NULL;
1098 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1099 /* Overwrite the error message with something more useful in
1100 case of a TypeError. */
1101 if (PyErr_ExceptionMatches(PyExc_TypeError))
1102 PyErr_Format(PyExc_TypeError,
1103 "coercing to Unicode: need string or buffer, "
1104 "%.80s found",
1105 Py_TYPE(obj)->tp_name);
1106 goto onError;
1109 /* Convert to Unicode */
1110 if (len == 0) {
1111 Py_INCREF(unicode_empty);
1112 v = (PyObject *)unicode_empty;
1114 else
1115 v = PyUnicode_Decode(s, len, encoding, errors);
1117 return v;
1119 onError:
1120 return NULL;
1123 PyObject *PyUnicode_Decode(const char *s,
1124 Py_ssize_t size,
1125 const char *encoding,
1126 const char *errors)
1128 PyObject *buffer = NULL, *unicode;
1130 if (encoding == NULL)
1131 encoding = PyUnicode_GetDefaultEncoding();
1133 /* Shortcuts for common default encodings */
1134 if (strcmp(encoding, "utf-8") == 0)
1135 return PyUnicode_DecodeUTF8(s, size, errors);
1136 else if (strcmp(encoding, "latin-1") == 0)
1137 return PyUnicode_DecodeLatin1(s, size, errors);
1138 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1139 else if (strcmp(encoding, "mbcs") == 0)
1140 return PyUnicode_DecodeMBCS(s, size, errors);
1141 #endif
1142 else if (strcmp(encoding, "ascii") == 0)
1143 return PyUnicode_DecodeASCII(s, size, errors);
1145 /* Decode via the codec registry */
1146 buffer = PyBuffer_FromMemory((void *)s, size);
1147 if (buffer == NULL)
1148 goto onError;
1149 unicode = PyCodec_Decode(buffer, encoding, errors);
1150 if (unicode == NULL)
1151 goto onError;
1152 if (!PyUnicode_Check(unicode)) {
1153 PyErr_Format(PyExc_TypeError,
1154 "decoder did not return an unicode object (type=%.400s)",
1155 Py_TYPE(unicode)->tp_name);
1156 Py_DECREF(unicode);
1157 goto onError;
1159 Py_DECREF(buffer);
1160 return unicode;
1162 onError:
1163 Py_XDECREF(buffer);
1164 return NULL;
1167 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1168 const char *encoding,
1169 const char *errors)
1171 PyObject *v;
1173 if (!PyUnicode_Check(unicode)) {
1174 PyErr_BadArgument();
1175 goto onError;
1178 if (encoding == NULL)
1179 encoding = PyUnicode_GetDefaultEncoding();
1181 /* Decode via the codec registry */
1182 v = PyCodec_Decode(unicode, encoding, errors);
1183 if (v == NULL)
1184 goto onError;
1185 return v;
1187 onError:
1188 return NULL;
1191 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1192 Py_ssize_t size,
1193 const char *encoding,
1194 const char *errors)
1196 PyObject *v, *unicode;
1198 unicode = PyUnicode_FromUnicode(s, size);
1199 if (unicode == NULL)
1200 return NULL;
1201 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1202 Py_DECREF(unicode);
1203 return v;
1206 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1207 const char *encoding,
1208 const char *errors)
1210 PyObject *v;
1212 if (!PyUnicode_Check(unicode)) {
1213 PyErr_BadArgument();
1214 goto onError;
1217 if (encoding == NULL)
1218 encoding = PyUnicode_GetDefaultEncoding();
1220 /* Encode via the codec registry */
1221 v = PyCodec_Encode(unicode, encoding, errors);
1222 if (v == NULL)
1223 goto onError;
1224 return v;
1226 onError:
1227 return NULL;
1230 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1231 const char *encoding,
1232 const char *errors)
1234 PyObject *v;
1236 if (!PyUnicode_Check(unicode)) {
1237 PyErr_BadArgument();
1238 goto onError;
1241 if (encoding == NULL)
1242 encoding = PyUnicode_GetDefaultEncoding();
1244 /* Shortcuts for common default encodings */
1245 if (errors == NULL) {
1246 if (strcmp(encoding, "utf-8") == 0)
1247 return PyUnicode_AsUTF8String(unicode);
1248 else if (strcmp(encoding, "latin-1") == 0)
1249 return PyUnicode_AsLatin1String(unicode);
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251 else if (strcmp(encoding, "mbcs") == 0)
1252 return PyUnicode_AsMBCSString(unicode);
1253 #endif
1254 else if (strcmp(encoding, "ascii") == 0)
1255 return PyUnicode_AsASCIIString(unicode);
1258 /* Encode via the codec registry */
1259 v = PyCodec_Encode(unicode, encoding, errors);
1260 if (v == NULL)
1261 goto onError;
1262 if (!PyString_Check(v)) {
1263 PyErr_Format(PyExc_TypeError,
1264 "encoder did not return a string object (type=%.400s)",
1265 Py_TYPE(v)->tp_name);
1266 Py_DECREF(v);
1267 goto onError;
1269 return v;
1271 onError:
1272 return NULL;
1275 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1276 const char *errors)
1278 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1280 if (v)
1281 return v;
1282 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1283 if (v && errors == NULL)
1284 ((PyUnicodeObject *)unicode)->defenc = v;
1285 return v;
1288 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1294 return PyUnicode_AS_UNICODE(unicode);
1296 onError:
1297 return NULL;
1300 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1302 if (!PyUnicode_Check(unicode)) {
1303 PyErr_BadArgument();
1304 goto onError;
1306 return PyUnicode_GET_SIZE(unicode);
1308 onError:
1309 return -1;
/* Return the file-level unicode_default_encoding storage itself; the
   caller receives a pointer to that shared storage, not a copy. */
1312 const char *PyUnicode_GetDefaultEncoding(void)
1314 return unicode_default_encoding;
1317 int PyUnicode_SetDefaultEncoding(const char *encoding)
1319 PyObject *v;
1321 /* Make sure the encoding is valid. As side effect, this also
1322 loads the encoding into the codec registry cache. */
1323 v = _PyCodec_Lookup(encoding);
1324 if (v == NULL)
1325 goto onError;
1326 Py_DECREF(v);
1327 strncpy(unicode_default_encoding,
1328 encoding,
1329 sizeof(unicode_default_encoding));
1330 return 0;
1332 onError:
1333 return -1;
1336 /* error handling callback helper:
1337 build arguments, call the callback and check the arguments,
1338 if no exception occurred, copy the replacement to the output
1339 and adjust various state variables.
1340 return 0 on success, -1 on error
1343 static
1344 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1345 const char *encoding, const char *reason,
1346 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1347 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1348 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1350 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1352 PyObject *restuple = NULL;
1353 PyObject *repunicode = NULL;
1354 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1355 Py_ssize_t requiredsize;
1356 Py_ssize_t newpos;
1357 Py_UNICODE *repptr;
1358 Py_ssize_t repsize;
1359 int res = -1;
1361 if (*errorHandler == NULL) {
1362 *errorHandler = PyCodec_LookupError(errors);
1363 if (*errorHandler == NULL)
1364 goto onError;
1367 if (*exceptionObject == NULL) {
1368 *exceptionObject = PyUnicodeDecodeError_Create(
1369 encoding, input, insize, *startinpos, *endinpos, reason);
1370 if (*exceptionObject == NULL)
1371 goto onError;
1373 else {
1374 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1375 goto onError;
1376 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1377 goto onError;
1378 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1379 goto onError;
1382 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1383 if (restuple == NULL)
1384 goto onError;
1385 if (!PyTuple_Check(restuple)) {
1386 PyErr_Format(PyExc_TypeError, &argparse[4]);
1387 goto onError;
1389 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1390 goto onError;
1391 if (newpos<0)
1392 newpos = insize+newpos;
1393 if (newpos<0 || newpos>insize) {
1394 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1395 goto onError;
1398 /* need more space? (at least enough for what we
1399 have+the replacement+the rest of the string (starting
1400 at the new input position), so we won't have to check space
1401 when there are no errors in the rest of the string) */
1402 repptr = PyUnicode_AS_UNICODE(repunicode);
1403 repsize = PyUnicode_GET_SIZE(repunicode);
1404 requiredsize = *outpos + repsize + insize-newpos;
1405 if (requiredsize > outsize) {
1406 if (requiredsize<2*outsize)
1407 requiredsize = 2*outsize;
1408 if (PyUnicode_Resize(output, requiredsize) < 0)
1409 goto onError;
1410 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1412 *endinpos = newpos;
1413 *inptr = input + newpos;
1414 Py_UNICODE_COPY(*outptr, repptr, repsize);
1415 *outptr += repsize;
1416 *outpos += repsize;
1417 /* we made it! */
1418 res = 0;
1420 onError:
1421 Py_XDECREF(restuple);
1422 return res;
1425 /* --- UTF-7 Codec -------------------------------------------------------- */
1427 /* see RFC2152 for details */
/* Classification table for the 128 seven-bit character codes, consulted by
   the SPECIAL() macro defined after it. */
1429 static
1430 char utf7_special[128] = {
1431 /* indicate whether a UTF-7 character is special i.e. cannot be directly
1432 encoded:
1433 0 - not special
1434 1 - special
1435 2 - whitespace (optional)
1436 3 - RFC2152 Set O (optional) */
1437 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1438 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1439 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
1440 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
1441 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1442 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
1443 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1444 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
1448 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
1449 warnings about the comparison always being false; since
1450 utf7_special[0] is 1, we can safely make that one comparison
1451 true */
/* SPECIAL(c, encodeO, encodeWS): true when c must be base64-encoded, i.e.
   c is outside 1..127, is classed 1 in utf7_special, is classed 2 while
   encodeWS is set, or is classed 3 while encodeO is set.  Note that c is
   evaluated several times. */
1453 #define SPECIAL(c, encodeO, encodeWS) \
1454 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
1455 (encodeWS && (utf7_special[(c)] == 2)) || \
1456 (encodeO && (utf7_special[(c)] == 3)))
/* B64(n): base64 alphabet character for the low six bits of n. */
1458 #define B64(n) \
1459 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true for the characters isalnum() accepts, plus '+' and '/'. */
1460 #define B64CHAR(c) \
1461 (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): six-bit value of a base64 character: '+' -> 62, '/' -> 63,
   characters >= 'a' map to c - 71, characters >= 'A' to c - 65, and anything
   else to c + 4 (which places '0'..'9' at 52..61 in ASCII).  No validity
   check is made; callers test B64CHAR() first. */
1462 #define UB64(c) \
1463 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
1464 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
/* ENCODE(out, ch, bits): while at least six bits are buffered in ch, emit
   the top six as one base64 character through out and reduce bits by six.

   DECODE(out, ch, bits, surrogate): while at least sixteen bits are buffered
   in ch, take the top sixteen as one Py_UNICODE.  A value in 0xDC00..0xDFFF
   sets the surrogate flag, assigns errmsg and jumps to the utf7Error label,
   so the macro can only be expanded where errmsg and utf7Error exist; the
   unit following such an error is dropped and the flag cleared.  Any other
   value is stored through out. */
1466 #define ENCODE(out, ch, bits) \
1467 while (bits >= 6) { \
1468 *out++ = B64(ch >> (bits-6)); \
1469 bits -= 6; \
1472 #define DECODE(out, ch, bits, surrogate) \
1473 while (bits >= 16) { \
1474 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1475 bits -= 16; \
1476 if (surrogate) { \
1477 /* We have already generated an error for the high surrogate \
1478 so let's not bother seeing if the low surrogate is correct or not */ \
1479 surrogate = 0; \
1480 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
1481 /* This is a surrogate pair. Unfortunately we can't represent \
1482 it in a 16-bit character */ \
1483 surrogate = 1; \
1484 errmsg = "code pairs are not supported"; \
1485 goto utf7Error; \
1486 } else { \
1487 *out++ = outCh; \
1491 PyObject *PyUnicode_DecodeUTF7(const char *s,
1492 Py_ssize_t size,
1493 const char *errors)
1495 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
/* Decode `size` bytes of UTF-7 starting at `s` into a new unicode object.

   errors   -- error handler name, passed to
               unicode_decode_call_errorhandler() whenever a problem is
               found.
   consumed -- if NULL, input that ends inside a shift sequence is reported
               as "unterminated shift sequence".  If non-NULL, no such error
               is raised; *consumed is set to the offset of the '+' that
               opened the still-open sequence, or to the number of bytes
               processed when no sequence is open.

   Returns the decoded object, or NULL after releasing the partial result
   when the error handler fails or the final resize fails. */
1498 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1499 Py_ssize_t size,
1500 const char *errors,
1501 Py_ssize_t *consumed)
1503 const char *starts = s;
1504 Py_ssize_t startinpos;
1505 Py_ssize_t endinpos;
1506 Py_ssize_t outpos;
1507 const char *e;
1508 PyUnicodeObject *unicode;
1509 Py_UNICODE *p;
1510 const char *errmsg = "";
1511 int inShift = 0;
1512 unsigned int bitsleft = 0;
1513 unsigned long charsleft = 0;
1514 int surrogate = 0;
1515 PyObject *errorHandler = NULL;
1516 PyObject *exc = NULL;
/* One output unit per input byte is allocated up front; the object is
   resized to the number of units actually written before returning. */
1518 unicode = _PyUnicode_New(size);
1519 if (!unicode)
1520 return NULL;
1521 if (size == 0) {
1522 if (consumed)
1523 *consumed = 0;
1524 return (PyObject *)unicode;
1527 p = unicode->str;
1528 e = s + size;
1530 while (s < e) {
1531 Py_UNICODE ch;
1532 restart:
1533 ch = (unsigned char) *s;
1535 if (inShift) {
/* Inside a shift sequence: '-' or any byte that is not a base64 character
   ends it; the bits still buffered are then checked for a partial
   character and for non-zero padding. */
1536 if ((ch == '-') || !B64CHAR(ch)) {
1537 inShift = 0;
1538 s++;
1540 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1541 if (bitsleft >= 6) {
1542 /* The shift sequence has a partial character in it. If
1543 bitsleft < 6 then we could just classify it as padding
1544 but that is not the case here */
1546 errmsg = "partial character in shift sequence";
1547 goto utf7Error;
1549 /* According to RFC2152 the remaining bits should be zero. We
1550 choose to signal an error/insert a replacement character
1551 here so indicate the potential of a misencoded character. */
1553 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1554 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1555 errmsg = "non-zero padding bits in shift sequence";
1556 goto utf7Error;
1559 if (ch == '-') {
1560 if ((s < e) && (*(s) == '-')) {
1561 *p++ = '-';
1562 inShift = 1;
1564 } else if (SPECIAL(ch,0,0)) {
1565 errmsg = "unexpected special character";
1566 goto utf7Error;
1567 } else {
1568 *p++ = ch;
1570 } else {
1571 charsleft = (charsleft << 6) | UB64(ch);
1572 bitsleft += 6;
1573 s++;
1574 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
/* Outside a shift sequence: '+' opens one, except that "+-" stands for a
   literal '+'.  startinpos remembers where the '+' was seen. */
1577 else if ( ch == '+' ) {
1578 startinpos = s-starts;
1579 s++;
1580 if (s < e && *s == '-') {
1581 s++;
1582 *p++ = '+';
1583 } else
1585 inShift = 1;
1586 bitsleft = 0;
1589 else if (SPECIAL(ch,0,0)) {
1590 startinpos = s-starts;
1591 errmsg = "unexpected special character";
1592 s++;
1593 goto utf7Error;
1595 else {
1596 *p++ = ch;
1597 s++;
1599 continue;
/* Shared error path: the handler may resize the output object and moves
   both s and p. */
1600 utf7Error:
1601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 endinpos = s-starts;
1603 if (unicode_decode_call_errorhandler(
1604 errors, &errorHandler,
1605 "utf7", errmsg,
1606 starts, size, &startinpos, &endinpos, &exc, &s,
1607 (PyObject **)&unicode, &outpos, &p))
1608 goto onError;
/* Input ended while still shifted: an error only for the non-stateful
   call (consumed == NULL).  If the handler rewinds s, decoding resumes at
   the restart label inside the loop. */
1611 if (inShift && !consumed) {
1612 outpos = p-PyUnicode_AS_UNICODE(unicode);
1613 endinpos = size;
1614 if (unicode_decode_call_errorhandler(
1615 errors, &errorHandler,
1616 "utf7", "unterminated shift sequence",
1617 starts, size, &startinpos, &endinpos, &exc, &s,
1618 (PyObject **)&unicode, &outpos, &p))
1619 goto onError;
1620 if (s < e)
1621 goto restart;
1623 if (consumed) {
1624 if(inShift)
1625 *consumed = startinpos;
1626 else
1627 *consumed = s-starts;
1630 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1631 goto onError;
1633 Py_XDECREF(errorHandler);
1634 Py_XDECREF(exc);
1635 return (PyObject *)unicode;
1637 onError:
1638 Py_XDECREF(errorHandler);
1639 Py_XDECREF(exc);
1640 Py_DECREF(unicode);
1641 return NULL;
/* Encode `size` Py_UNICODE units starting at `s` as UTF-7 and return a new
   string object.

   encodeSetO, encodeWhiteSpace -- forwarded to SPECIAL(); when non-zero the
       characters classed 3 (Set O) respectively 2 (whitespace) in
       utf7_special are base64-encoded instead of being written directly.
   errors -- not referenced anywhere in the visible body.

   The output is allocated with room for 5 bytes per input unit and shrunk
   to the bytes actually written with _PyString_Resize(), whose return value
   is not inspected here. */
1645 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1646 Py_ssize_t size,
1647 int encodeSetO,
1648 int encodeWhiteSpace,
1649 const char *errors)
1651 PyObject *v;
1652 /* It might be possible to tighten this worst case */
1653 Py_ssize_t cbAllocated = 5 * size;
1654 int inShift = 0;
1655 Py_ssize_t i = 0;
1656 unsigned int bitsleft = 0;
1657 unsigned long charsleft = 0;
1658 char * out;
1659 char * start;
/* Detect overflow of the 5 * size computation by dividing back. */
1661 if (cbAllocated / 5 != size)
1662 return PyErr_NoMemory();
1664 if (size == 0)
1665 return PyString_FromStringAndSize(NULL, 0);
1667 v = PyString_FromStringAndSize(NULL, cbAllocated);
1668 if (v == NULL)
1669 return NULL;
1671 start = out = PyString_AS_STRING(v);
1672 for (;i < size; ++i) {
1673 Py_UNICODE ch = s[i];
1675 if (!inShift) {
/* Not shifted: '+' is written as "+-", special characters open a shift
   sequence, everything else is copied as a single byte. */
1676 if (ch == '+') {
1677 *out++ = '+';
1678 *out++ = '-';
1679 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1680 charsleft = ch;
1681 bitsleft = 16;
1682 *out++ = '+';
1683 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1684 inShift = bitsleft > 0;
1685 } else {
1686 *out++ = (char) ch;
1688 } else {
1689 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
/* A directly encodable character ends the shift sequence: the buffered
   bits are emitted, left-aligned, as one last base64 character. */
1690 *out++ = B64(charsleft << (6-bitsleft));
1691 charsleft = 0;
1692 bitsleft = 0;
1693 /* Characters not in the BASE64 set implicitly unshift the sequence
1694 so no '-' is required, except if the character is itself a '-' */
1695 if (B64CHAR(ch) || ch == '-') {
1696 *out++ = '-';
1698 inShift = 0;
1699 *out++ = (char) ch;
1700 } else {
1701 bitsleft += 16;
1702 charsleft = (charsleft << 16) | ch;
1703 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1705 /* If the next character is special then we dont' need to terminate
1706 the shift sequence. If the next character is not a BASE64 character
1707 or '-' then the shift sequence will be terminated implicitly and we
1708 don't have to insert a '-'. */
1710 if (bitsleft == 0) {
1711 if (i + 1 < size) {
1712 Py_UNICODE ch2 = s[i+1];
1714 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1716 } else if (B64CHAR(ch2) || ch2 == '-') {
1717 *out++ = '-';
1718 inShift = 0;
1719 } else {
1720 inShift = 0;
1724 else {
1725 *out++ = '-';
1726 inShift = 0;
/* Input exhausted with bits still buffered: flush them as one final base64
   character and close the sequence with '-'. */
1732 if (bitsleft) {
1733 *out++= B64(charsleft << (6-bitsleft) );
1734 *out++ = '-';
1737 _PyString_Resize(&v, out - start);
1738 return v;
/* The UTF-7 helper macros are removed again here, so they are not
   available to code that follows this point. */
1741 #undef SPECIAL
1742 #undef B64
1743 #undef B64CHAR
1744 #undef UB64
1745 #undef ENCODE
1746 #undef DECODE
1748 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Lookup table indexed by the first byte of a UTF-8 sequence.
   By range: 0x00-0x7F -> 1, 0x80-0xBF -> 0, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3,
   0xF0-0xF7 -> 4, 0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 0. */
1750 static
1751 char utf8_code_length[256] = {
1752 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1753 illegal prefix. see RFC 2279 for details */
1754 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1755 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1756 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1757 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1758 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1759 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1760 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1761 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1762 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1763 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1764 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1765 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1766 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1767 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1768 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1769 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1772 PyObject *PyUnicode_DecodeUTF8(const char *s,
1773 Py_ssize_t size,
1774 const char *errors)
1776 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.

   errors   -- error handler name, passed to
               unicode_decode_call_errorhandler() for every malformed
               sequence.
   consumed -- if NULL, a sequence cut short by the end of the input is
               reported as "unexpected end of data".  If non-NULL, decoding
               simply stops in front of such a sequence and *consumed is set
               to the number of bytes decoded.

   The sequence length comes from utf8_code_length[]; lengths 5 and 6 fall
   into the default case and are rejected.  Four-byte sequences are stored
   as one unit when Py_UNICODE_WIDE is defined and as a surrogate pair
   otherwise.

   Returns the decoded object, or NULL after releasing the partial result
   when the error handler fails or the final resize fails. */
1779 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1780 Py_ssize_t size,
1781 const char *errors,
1782 Py_ssize_t *consumed)
1784 const char *starts = s;
1785 int n;
1786 Py_ssize_t startinpos;
1787 Py_ssize_t endinpos;
1788 Py_ssize_t outpos;
1789 const char *e;
1790 PyUnicodeObject *unicode;
1791 Py_UNICODE *p;
1792 const char *errmsg = "";
1793 PyObject *errorHandler = NULL;
1794 PyObject *exc = NULL;
1796 /* Note: size will always be longer than the resulting Unicode
1797 character count */
1798 unicode = _PyUnicode_New(size);
1799 if (!unicode)
1800 return NULL;
1801 if (size == 0) {
1802 if (consumed)
1803 *consumed = 0;
1804 return (PyObject *)unicode;
1807 /* Unpack UTF-8 encoded data */
1808 p = unicode->str;
1809 e = s + size;
1811 while (s < e) {
1812 Py_UCS4 ch = (unsigned char)*s;
/* Fast path: bytes below 0x80 are copied through unchanged. */
1814 if (ch < 0x80) {
1815 *p++ = (Py_UNICODE)ch;
1816 s++;
1817 continue;
1820 n = utf8_code_length[ch];
/* The sequence would run past the end of the input: stop quietly in
   stateful mode, otherwise report it. */
1822 if (s + n > e) {
1823 if (consumed)
1824 break;
1825 else {
1826 errmsg = "unexpected end of data";
1827 startinpos = s-starts;
1828 endinpos = size;
1829 goto utf8Error;
1833 switch (n) {
1835 case 0:
1836 errmsg = "unexpected code byte";
1837 startinpos = s-starts;
1838 endinpos = startinpos+1;
1839 goto utf8Error;
1841 case 1:
1842 errmsg = "internal error";
1843 startinpos = s-starts;
1844 endinpos = startinpos+1;
1845 goto utf8Error;
1847 case 2:
1848 if ((s[1] & 0xc0) != 0x80) {
1849 errmsg = "invalid data";
1850 startinpos = s-starts;
1851 endinpos = startinpos+2;
1852 goto utf8Error;
1854 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1855 if (ch < 0x80) {
1856 startinpos = s-starts;
1857 endinpos = startinpos+2;
1858 errmsg = "illegal encoding";
1859 goto utf8Error;
1861 else
1862 *p++ = (Py_UNICODE)ch;
1863 break;
1865 case 3:
1866 if ((s[1] & 0xc0) != 0x80 ||
1867 (s[2] & 0xc0) != 0x80) {
1868 errmsg = "invalid data";
1869 startinpos = s-starts;
1870 endinpos = startinpos+3;
1871 goto utf8Error;
1873 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1874 if (ch < 0x0800) {
1875 /* Note: UTF-8 encodings of surrogates are considered
1876 legal UTF-8 sequences;
1878 XXX For wide builds (UCS-4) we should probably try
1879 to recombine the surrogates into a single code
1880 unit.
1882 errmsg = "illegal encoding";
1883 startinpos = s-starts;
1884 endinpos = startinpos+3;
1885 goto utf8Error;
1887 else
1888 *p++ = (Py_UNICODE)ch;
1889 break;
1891 case 4:
1892 if ((s[1] & 0xc0) != 0x80 ||
1893 (s[2] & 0xc0) != 0x80 ||
1894 (s[3] & 0xc0) != 0x80) {
1895 errmsg = "invalid data";
1896 startinpos = s-starts;
1897 endinpos = startinpos+4;
1898 goto utf8Error;
1900 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1901 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1902 /* validate and convert to UTF-16 */
1903 if ((ch < 0x10000) /* minimum value allowed for 4
1904 byte encoding */
1905 || (ch > 0x10ffff)) /* maximum value allowed for
1906 UTF-16 */
1908 errmsg = "illegal encoding";
1909 startinpos = s-starts;
1910 endinpos = startinpos+4;
1911 goto utf8Error;
1913 #ifdef Py_UNICODE_WIDE
1914 *p++ = (Py_UNICODE)ch;
1915 #else
1916 /* compute and append the two surrogates: */
1918 /* translate from 10000..10FFFF to 0..FFFF */
1919 ch -= 0x10000;
1921 /* high surrogate = top 10 bits added to D800 */
1922 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1924 /* low surrogate = bottom 10 bits added to DC00 */
1925 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1926 #endif
1927 break;
1929 default:
1930 /* Other sizes are only needed for UCS-4 */
1931 errmsg = "unsupported Unicode code range";
1932 startinpos = s-starts;
1933 endinpos = startinpos+n;
1934 goto utf8Error;
1936 s += n;
1937 continue;
/* Shared error path: the handler may resize the output object and moves
   both s and p. */
1939 utf8Error:
1940 outpos = p-PyUnicode_AS_UNICODE(unicode);
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "utf8", errmsg,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&unicode, &outpos, &p))
1946 goto onError;
1948 if (consumed)
1949 *consumed = s-starts;
1951 /* Adjust length */
1952 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1953 goto onError;
1955 Py_XDECREF(errorHandler);
1956 Py_XDECREF(exc);
1957 return (PyObject *)unicode;
1959 onError:
1960 Py_XDECREF(errorHandler);
1961 Py_XDECREF(exc);
1962 Py_DECREF(unicode);
1963 return NULL;
/* PyUnicode_EncodeUTF8: encode `size` Py_UNICODE units starting at `s` as
   UTF-8 and return a new string object (NULL if an allocation fails).
   `errors` is not referenced anywhere in the visible body.  A unit in
   0xD800..0xDBFF that is directly followed by one in 0xDC00..0xDFFF is
   combined with it into a single four-byte sequence; any other unit below
   0x10000, including an unpaired surrogate, is written on its own. */
1966 /* Allocation strategy: if the string is short, convert into a stack buffer
1967 and allocate exactly as much space needed at the end. Else allocate the
1968 maximum possible needed (4 result bytes per Unicode character), and return
1969 the excess memory at the end.
1971 PyObject *
1972 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1973 Py_ssize_t size,
1974 const char *errors)
1976 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1978 Py_ssize_t i; /* index into s of next input byte */
1979 PyObject *v; /* result string object */
1980 char *p; /* next free byte in output buffer */
1981 Py_ssize_t nallocated; /* number of result bytes allocated */
1982 Py_ssize_t nneeded; /* number of result bytes needed */
1983 char stackbuf[MAX_SHORT_UNICHARS * 4];
1985 assert(s != NULL);
1986 assert(size >= 0);
1988 if (size <= MAX_SHORT_UNICHARS) {
1989 /* Write into the stack buffer; nallocated can't overflow.
1990 * At the end, we'll allocate exactly as much heap space as it
1991 * turns out we need.
1993 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1994 v = NULL; /* will allocate after we're done */
1995 p = stackbuf;
1997 else {
1998 /* Overallocate on the heap, and give the excess back at the end. */
1999 nallocated = size * 4;
2000 if (nallocated / 4 != size) /* overflow! */
2001 return PyErr_NoMemory();
2002 v = PyString_FromStringAndSize(NULL, nallocated);
2003 if (v == NULL)
2004 return NULL;
2005 p = PyString_AS_STRING(v);
/* i is advanced inside the loop body: once per unit read, and once more
   when a low surrogate is consumed together with its high surrogate. */
2008 for (i = 0; i < size;) {
2009 Py_UCS4 ch = s[i++];
2011 if (ch < 0x80)
2012 /* Encode ASCII */
2013 *p++ = (char) ch;
2015 else if (ch < 0x0800) {
2016 /* Encode Latin-1 */
2017 *p++ = (char)(0xc0 | (ch >> 6));
2018 *p++ = (char)(0x80 | (ch & 0x3f));
2020 else {
2021 /* Encode UCS2 Unicode ordinals */
2022 if (ch < 0x10000) {
2023 /* Special case: check for high surrogate */
2024 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2025 Py_UCS4 ch2 = s[i];
2026 /* Check for low surrogate and combine the two to
2027 form a UCS4 value */
2028 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2029 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2030 i++;
2031 goto encodeUCS4;
2033 /* Fall through: handles isolated high surrogates */
2035 *p++ = (char)(0xe0 | (ch >> 12));
2036 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2037 *p++ = (char)(0x80 | (ch & 0x3f));
2038 continue;
2040 encodeUCS4:
2041 /* Encode UCS4 Unicode ordinals */
2042 *p++ = (char)(0xf0 | (ch >> 18));
2043 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2044 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2045 *p++ = (char)(0x80 | (ch & 0x3f));
/* v == NULL identifies the stack-buffer case set up before the loop. */
2049 if (v == NULL) {
2050 /* This was stack allocated. */
2051 nneeded = p - stackbuf;
2052 assert(nneeded <= nallocated);
2053 v = PyString_FromStringAndSize(stackbuf, nneeded);
2055 else {
2056 /* Cut back to size actually needed. */
2057 nneeded = p - PyString_AS_STRING(v);
2058 assert(nneeded <= nallocated);
2059 _PyString_Resize(&v, nneeded);
2061 return v;
2063 #undef MAX_SHORT_UNICHARS
2066 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2068 if (!PyUnicode_Check(unicode)) {
2069 PyErr_BadArgument();
2070 return NULL;
2072 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2073 PyUnicode_GET_SIZE(unicode),
2074 NULL);
2077 /* --- UTF-32 Codec ------------------------------------------------------- */
2079 PyObject *
2080 PyUnicode_DecodeUTF32(const char *s,
2081 Py_ssize_t size,
2082 const char *errors,
2083 int *byteorder)
2085 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2088 PyObject *
2089 PyUnicode_DecodeUTF32Stateful(const char *s,
2090 Py_ssize_t size,
2091 const char *errors,
2092 int *byteorder,
2093 Py_ssize_t *consumed)
2095 const char *starts = s;
2096 Py_ssize_t startinpos;
2097 Py_ssize_t endinpos;
2098 Py_ssize_t outpos;
2099 PyUnicodeObject *unicode;
2100 Py_UNICODE *p;
2101 #ifndef Py_UNICODE_WIDE
2102 int i, pairs;
2103 #else
2104 const int pairs = 0;
2105 #endif
2106 const unsigned char *q, *e;
2107 int bo = 0; /* assume native ordering by default */
2108 const char *errmsg = "";
2109 /* Offsets from q for retrieving bytes in the right order. */
2110 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2111 int iorder[] = {0, 1, 2, 3};
2112 #else
2113 int iorder[] = {3, 2, 1, 0};
2114 #endif
2115 PyObject *errorHandler = NULL;
2116 PyObject *exc = NULL;
2117 /* On narrow builds we split characters outside the BMP into two
2118 codepoints => count how much extra space we need. */
2119 #ifndef Py_UNICODE_WIDE
2120 for (i = pairs = 0; i < size/4; i++)
2121 if (((Py_UCS4 *)s)[i] >= 0x10000)
2122 pairs++;
2123 #endif
2125 /* This might be one to much, because of a BOM */
2126 unicode = _PyUnicode_New((size+3)/4+pairs);
2127 if (!unicode)
2128 return NULL;
2129 if (size == 0)
2130 return (PyObject *)unicode;
2132 /* Unpack UTF-32 encoded data */
2133 p = unicode->str;
2134 q = (unsigned char *)s;
2135 e = q + size;
2137 if (byteorder)
2138 bo = *byteorder;
2140 /* Check for BOM marks (U+FEFF) in the input and adjust current
2141 byte order setting accordingly. In native mode, the leading BOM
2142 mark is skipped, in all other modes, it is copied to the output
2143 stream as-is (giving a ZWNBSP character). */
2144 if (bo == 0) {
2145 if (size >= 4) {
2146 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2147 (q[iorder[1]] << 8) | q[iorder[0]];
2148 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2149 if (bom == 0x0000FEFF) {
2150 q += 4;
2151 bo = -1;
2153 else if (bom == 0xFFFE0000) {
2154 q += 4;
2155 bo = 1;
2157 #else
2158 if (bom == 0x0000FEFF) {
2159 q += 4;
2160 bo = 1;
2162 else if (bom == 0xFFFE0000) {
2163 q += 4;
2164 bo = -1;
2166 #endif
2170 if (bo == -1) {
2171 /* force LE */
2172 iorder[0] = 0;
2173 iorder[1] = 1;
2174 iorder[2] = 2;
2175 iorder[3] = 3;
2177 else if (bo == 1) {
2178 /* force BE */
2179 iorder[0] = 3;
2180 iorder[1] = 2;
2181 iorder[2] = 1;
2182 iorder[3] = 0;
2185 while (q < e) {
2186 Py_UCS4 ch;
2187 /* remaining bytes at the end? (size should be divisible by 4) */
2188 if (e-q<4) {
2189 if (consumed)
2190 break;
2191 errmsg = "truncated data";
2192 startinpos = ((const char *)q)-starts;
2193 endinpos = ((const char *)e)-starts;
2194 goto utf32Error;
2195 /* The remaining input chars are ignored if the callback
2196 chooses to skip the input */
2198 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2199 (q[iorder[1]] << 8) | q[iorder[0]];
2201 if (ch >= 0x110000)
2203 errmsg = "codepoint not in range(0x110000)";
2204 startinpos = ((const char *)q)-starts;
2205 endinpos = startinpos+4;
2206 goto utf32Error;
2208 #ifndef Py_UNICODE_WIDE
2209 if (ch >= 0x10000)
2211 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2212 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2214 else
2215 #endif
2216 *p++ = ch;
2217 q += 4;
2218 continue;
2219 utf32Error:
2220 outpos = p-PyUnicode_AS_UNICODE(unicode);
2221 if (unicode_decode_call_errorhandler(
2222 errors, &errorHandler,
2223 "utf32", errmsg,
2224 starts, size, &startinpos, &endinpos, &exc, &s,
2225 (PyObject **)&unicode, &outpos, &p))
2226 goto onError;
2229 if (byteorder)
2230 *byteorder = bo;
2232 if (consumed)
2233 *consumed = (const char *)q-starts;
2235 /* Adjust length */
2236 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2237 goto onError;
2239 Py_XDECREF(errorHandler);
2240 Py_XDECREF(exc);
2241 return (PyObject *)unicode;
2243 onError:
2244 Py_DECREF(unicode);
2245 Py_XDECREF(errorHandler);
2246 Py_XDECREF(exc);
2247 return NULL;
/* Encode `size` Py_UNICODE units starting at `s` as UTF-32 and return a new
   string object.

   errors    -- not referenced anywhere in the visible body.
   byteorder -- 0: write a 0xFEFF mark first and use native byte order;
                -1: little endian, no mark; 1: big endian, no mark.

   When Py_UNICODE_WIDE is not defined, a unit in 0xD800..0xDBFF directly
   followed by one in 0xDC00..0xDFFF is merged into a single code point; the
   pairs are counted beforehand so the output can be allocated at its final
   size.  Returns NULL if the size computation overflows or the allocation
   fails. */
2250 PyObject *
2251 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2252 Py_ssize_t size,
2253 const char *errors,
2254 int byteorder)
2256 PyObject *v;
2257 unsigned char *p;
2258 Py_ssize_t nsize, bytesize;
2259 #ifndef Py_UNICODE_WIDE
2260 Py_ssize_t i, pairs;
2261 #else
2262 const int pairs = 0;
2263 #endif
2264 /* Offsets from p for storing byte pairs in the right order. */
2265 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2266 int iorder[] = {0, 1, 2, 3};
2267 #else
2268 int iorder[] = {3, 2, 1, 0};
2269 #endif
/* STORECHAR writes one 32-bit value at p using the current iorder[] and
   advances p by four bytes. */
2271 #define STORECHAR(CH) \
2272 do { \
2273 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2274 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2275 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2276 p[iorder[0]] = (CH) & 0xff; \
2277 p += 4; \
2278 } while(0)
2280 /* In narrow builds we can output surrogate pairs as one codepoint,
2281 so we need less space. */
2282 #ifndef Py_UNICODE_WIDE
2283 for (i = pairs = 0; i < size-1; i++)
2284 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2285 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2286 pairs++;
2287 #endif
/* One output unit per code point, plus one for the mark when
   byteorder == 0; the multiplication by 4 is checked by dividing back. */
2288 nsize = (size - pairs + (byteorder == 0));
2289 bytesize = nsize * 4;
2290 if (bytesize / 4 != nsize)
2291 return PyErr_NoMemory();
2292 v = PyString_FromStringAndSize(NULL, bytesize);
2293 if (v == NULL)
2294 return NULL;
2296 p = (unsigned char *)PyString_AS_STRING(v);
2297 if (byteorder == 0)
2298 STORECHAR(0xFEFF);
2299 if (size == 0)
2300 return v;
2302 if (byteorder == -1) {
2303 /* force LE */
2304 iorder[0] = 0;
2305 iorder[1] = 1;
2306 iorder[2] = 2;
2307 iorder[3] = 3;
2309 else if (byteorder == 1) {
2310 /* force BE */
2311 iorder[0] = 3;
2312 iorder[1] = 2;
2313 iorder[2] = 1;
2314 iorder[3] = 0;
/* size counts the input units still to be read; it is decremented a second
   time when a low surrogate is consumed together with its high surrogate. */
2317 while (size-- > 0) {
2318 Py_UCS4 ch = *s++;
2319 #ifndef Py_UNICODE_WIDE
2320 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2321 Py_UCS4 ch2 = *s;
2322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2323 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2324 s++;
2325 size--;
2328 #endif
2329 STORECHAR(ch);
2331 return v;
2332 #undef STORECHAR
/* Encode a unicode object as UTF-32 by handing its buffer and length to
   PyUnicode_EncodeUTF32() with a NULL `errors` argument.  An argument that
   fails PyUnicode_Check() is rejected via PyErr_BadArgument() and NULL is
   returned.
   NOTE(review): the final (byteorder) argument and the closing lines of the
   call are not visible in this listing -- confirm against the real file. */
2335 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2337 if (!PyUnicode_Check(unicode)) {
2338 PyErr_BadArgument();
2339 return NULL;
2341 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2342 PyUnicode_GET_SIZE(unicode),
2343 NULL,
2347 /* --- UTF-16 Codec ------------------------------------------------------- */
2349 PyObject *
2350 PyUnicode_DecodeUTF16(const char *s,
2351 Py_ssize_t size,
2352 const char *errors,
2353 int *byteorder)
2355 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
/* Decode `size` bytes of UTF-16 starting at `s` into a new unicode object.

   errors    -- error handler name, passed to
                unicode_decode_call_errorhandler().
   byteorder -- in/out, may be NULL.  On input *byteorder selects the byte
                order: -1 little endian, 1 big endian, 0 native order with
                BOM detection (a leading BOM is then skipped and decides the
                order).  After decoding, the order actually used is written
                back; the early return for size == 0 leaves it untouched.
   consumed  -- if NULL, a single trailing byte is reported as
                "truncated data".  If non-NULL it is left undecoded and
                *consumed is set to the number of bytes decoded; the early
                return for size == 0 does not write it.

   NOTE(review): a surrogate unit that is the last unit of the input takes
   the "unexpected end of data" error path even when consumed is non-NULL --
   confirm whether stateful callers rely on that.

   Returns the decoded object, or NULL after releasing the partial result. */
2358 PyObject *
2359 PyUnicode_DecodeUTF16Stateful(const char *s,
2360 Py_ssize_t size,
2361 const char *errors,
2362 int *byteorder,
2363 Py_ssize_t *consumed)
2365 const char *starts = s;
2366 Py_ssize_t startinpos;
2367 Py_ssize_t endinpos;
2368 Py_ssize_t outpos;
2369 PyUnicodeObject *unicode;
2370 Py_UNICODE *p;
2371 const unsigned char *q, *e;
2372 int bo = 0; /* assume native ordering by default */
2373 const char *errmsg = "";
2374 /* Offsets from q for retrieving byte pairs in the right order. */
2375 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2376 int ihi = 1, ilo = 0;
2377 #else
2378 int ihi = 0, ilo = 1;
2379 #endif
2380 PyObject *errorHandler = NULL;
2381 PyObject *exc = NULL;
2383 /* Note: size will always be longer than the resulting Unicode
2384 character count */
2385 unicode = _PyUnicode_New(size);
2386 if (!unicode)
2387 return NULL;
2388 if (size == 0)
2389 return (PyObject *)unicode;
2391 /* Unpack UTF-16 encoded data */
2392 p = unicode->str;
2393 q = (unsigned char *)s;
2394 e = q + size;
2396 if (byteorder)
2397 bo = *byteorder;
2399 /* Check for BOM marks (U+FEFF) in the input and adjust current
2400 byte order setting accordingly. In native mode, the leading BOM
2401 mark is skipped, in all other modes, it is copied to the output
2402 stream as-is (giving a ZWNBSP character). */
2403 if (bo == 0) {
2404 if (size >= 2) {
2405 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2406 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2407 if (bom == 0xFEFF) {
2408 q += 2;
2409 bo = -1;
2411 else if (bom == 0xFFFE) {
2412 q += 2;
2413 bo = 1;
2415 #else
2416 if (bom == 0xFEFF) {
2417 q += 2;
2418 bo = 1;
2420 else if (bom == 0xFFFE) {
2421 q += 2;
2422 bo = -1;
2424 #endif
2428 if (bo == -1) {
2429 /* force LE */
2430 ihi = 1;
2431 ilo = 0;
2433 else if (bo == 1) {
2434 /* force BE */
2435 ihi = 0;
2436 ilo = 1;
2439 while (q < e) {
2440 Py_UNICODE ch;
2441 /* remaining bytes at the end? (size should be even) */
2442 if (e-q<2) {
2443 if (consumed)
2444 break;
2445 errmsg = "truncated data";
2446 startinpos = ((const char *)q)-starts;
2447 endinpos = ((const char *)e)-starts;
2448 goto utf16Error;
2449 /* The remaining input chars are ignored if the callback
2450 chooses to skip the input */
2452 ch = (q[ihi] << 8) | q[ilo];
2454 q += 2;
/* Anything outside the surrogate range is stored directly. */
2456 if (ch < 0xD800 || ch > 0xDFFF) {
2457 *p++ = ch;
2458 continue;
2461 /* UTF-16 code pair: */
2462 if (q >= e) {
2463 errmsg = "unexpected end of data";
2464 startinpos = (((const char *)q)-2)-starts;
2465 endinpos = ((const char *)e)-starts;
2466 goto utf16Error;
/* High surrogate: the next unit must be a low surrogate.  The pair is kept
   as two units unless Py_UNICODE_WIDE is defined, in which case it is
   combined into one code point. */
2468 if (0xD800 <= ch && ch <= 0xDBFF) {
2469 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2470 q += 2;
2471 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2472 #ifndef Py_UNICODE_WIDE
2473 *p++ = ch;
2474 *p++ = ch2;
2475 #else
2476 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2477 #endif
2478 continue;
2480 else {
2481 errmsg = "illegal UTF-16 surrogate";
2482 startinpos = (((const char *)q)-4)-starts;
2483 endinpos = startinpos+2;
2484 goto utf16Error;
/* Reached only for a low surrogate that was not preceded by a high one. */
2488 errmsg = "illegal encoding";
2489 startinpos = (((const char *)q)-2)-starts;
2490 endinpos = startinpos+2;
2491 /* Fall through to report the error */
/* Shared error path: q is passed as the input pointer, so the resume
   position chosen by the handler moves the read position. */
2493 utf16Error:
2494 outpos = p-PyUnicode_AS_UNICODE(unicode);
2495 if (unicode_decode_call_errorhandler(
2496 errors, &errorHandler,
2497 "utf16", errmsg,
2498 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2499 (PyObject **)&unicode, &outpos, &p))
2500 goto onError;
2503 if (byteorder)
2504 *byteorder = bo;
2506 if (consumed)
2507 *consumed = (const char *)q-starts;
2509 /* Adjust length */
2510 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2511 goto onError;
2513 Py_XDECREF(errorHandler);
2514 Py_XDECREF(exc);
2515 return (PyObject *)unicode;
2517 onError:
2518 Py_DECREF(unicode);
2519 Py_XDECREF(errorHandler);
2520 Py_XDECREF(exc);
2521 return NULL;
2524 PyObject *
2525 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2526 Py_ssize_t size,
2527 const char *errors,
2528 int byteorder)
2530 PyObject *v;
2531 unsigned char *p;
2532 Py_ssize_t nsize, bytesize;
2533 #ifdef Py_UNICODE_WIDE
2534 Py_ssize_t i, pairs;
2535 #else
2536 const int pairs = 0;
2537 #endif
2538 /* Offsets from p for storing byte pairs in the right order. */
2539 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2540 int ihi = 1, ilo = 0;
2541 #else
2542 int ihi = 0, ilo = 1;
2543 #endif
2545 #define STORECHAR(CH) \
2546 do { \
2547 p[ihi] = ((CH) >> 8) & 0xff; \
2548 p[ilo] = (CH) & 0xff; \
2549 p += 2; \
2550 } while(0)
2552 #ifdef Py_UNICODE_WIDE
2553 for (i = pairs = 0; i < size; i++)
2554 if (s[i] >= 0x10000)
2555 pairs++;
2556 #endif
2557 /* 2 * (size + pairs + (byteorder == 0)) */
2558 if (size > PY_SSIZE_T_MAX ||
2559 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2560 return PyErr_NoMemory();
2561 nsize = size + pairs + (byteorder == 0);
2562 bytesize = nsize * 2;
2563 if (bytesize / 2 != nsize)
2564 return PyErr_NoMemory();
2565 v = PyString_FromStringAndSize(NULL, bytesize);
2566 if (v == NULL)
2567 return NULL;
2569 p = (unsigned char *)PyString_AS_STRING(v);
2570 if (byteorder == 0)
2571 STORECHAR(0xFEFF);
2572 if (size == 0)
2573 return v;
2575 if (byteorder == -1) {
2576 /* force LE */
2577 ihi = 1;
2578 ilo = 0;
2580 else if (byteorder == 1) {
2581 /* force BE */
2582 ihi = 0;
2583 ilo = 1;
2586 while (size-- > 0) {
2587 Py_UNICODE ch = *s++;
2588 Py_UNICODE ch2 = 0;
2589 #ifdef Py_UNICODE_WIDE
2590 if (ch >= 0x10000) {
2591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2592 ch = 0xD800 | ((ch-0x10000) >> 10);
2594 #endif
2595 STORECHAR(ch);
2596 if (ch2)
2597 STORECHAR(ch2);
2599 return v;
2600 #undef STORECHAR
2603 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2605 if (!PyUnicode_Check(unicode)) {
2606 PyErr_BadArgument();
2607 return NULL;
2609 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2610 PyUnicode_GET_SIZE(unicode),
2611 NULL,
2615 /* --- Unicode Escape Codec ----------------------------------------------- */
2617 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2619 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2620 Py_ssize_t size,
2621 const char *errors)
2623 const char *starts = s;
2624 Py_ssize_t startinpos;
2625 Py_ssize_t endinpos;
2626 Py_ssize_t outpos;
2627 int i;
2628 PyUnicodeObject *v;
2629 Py_UNICODE *p;
2630 const char *end;
2631 char* message;
2632 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2633 PyObject *errorHandler = NULL;
2634 PyObject *exc = NULL;
2636 /* Escaped strings will always be longer than the resulting
2637 Unicode string, so we start with size here and then reduce the
2638 length after conversion to the true value.
2639 (but if the error callback returns a long replacement string
2640 we'll have to allocate more space) */
2641 v = _PyUnicode_New(size);
2642 if (v == NULL)
2643 goto onError;
2644 if (size == 0)
2645 return (PyObject *)v;
2647 p = PyUnicode_AS_UNICODE(v);
2648 end = s + size;
2650 while (s < end) {
2651 unsigned char c;
2652 Py_UNICODE x;
2653 int digits;
2655 /* Non-escape characters are interpreted as Unicode ordinals */
2656 if (*s != '\\') {
2657 *p++ = (unsigned char) *s++;
2658 continue;
2661 startinpos = s-starts;
2662 /* \ - Escapes */
2663 s++;
2664 c = *s++;
2665 if (s > end)
2666 c = '\0'; /* Invalid after \ */
2667 switch (c) {
2669 /* \x escapes */
2670 case '\n': break;
2671 case '\\': *p++ = '\\'; break;
2672 case '\'': *p++ = '\''; break;
2673 case '\"': *p++ = '\"'; break;
2674 case 'b': *p++ = '\b'; break;
2675 case 'f': *p++ = '\014'; break; /* FF */
2676 case 't': *p++ = '\t'; break;
2677 case 'n': *p++ = '\n'; break;
2678 case 'r': *p++ = '\r'; break;
2679 case 'v': *p++ = '\013'; break; /* VT */
2680 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2682 /* \OOO (octal) escapes */
2683 case '0': case '1': case '2': case '3':
2684 case '4': case '5': case '6': case '7':
2685 x = s[-1] - '0';
2686 if (s < end && '0' <= *s && *s <= '7') {
2687 x = (x<<3) + *s++ - '0';
2688 if (s < end && '0' <= *s && *s <= '7')
2689 x = (x<<3) + *s++ - '0';
2691 *p++ = x;
2692 break;
2694 /* hex escapes */
2695 /* \xXX */
2696 case 'x':
2697 digits = 2;
2698 message = "truncated \\xXX escape";
2699 goto hexescape;
2701 /* \uXXXX */
2702 case 'u':
2703 digits = 4;
2704 message = "truncated \\uXXXX escape";
2705 goto hexescape;
2707 /* \UXXXXXXXX */
2708 case 'U':
2709 digits = 8;
2710 message = "truncated \\UXXXXXXXX escape";
2711 hexescape:
2712 chr = 0;
2713 outpos = p-PyUnicode_AS_UNICODE(v);
2714 if (s+digits>end) {
2715 endinpos = size;
2716 if (unicode_decode_call_errorhandler(
2717 errors, &errorHandler,
2718 "unicodeescape", "end of string in escape sequence",
2719 starts, size, &startinpos, &endinpos, &exc, &s,
2720 (PyObject **)&v, &outpos, &p))
2721 goto onError;
2722 goto nextByte;
2724 for (i = 0; i < digits; ++i) {
2725 c = (unsigned char) s[i];
2726 if (!isxdigit(c)) {
2727 endinpos = (s+i+1)-starts;
2728 if (unicode_decode_call_errorhandler(
2729 errors, &errorHandler,
2730 "unicodeescape", message,
2731 starts, size, &startinpos, &endinpos, &exc, &s,
2732 (PyObject **)&v, &outpos, &p))
2733 goto onError;
2734 goto nextByte;
2736 chr = (chr<<4) & ~0xF;
2737 if (c >= '0' && c <= '9')
2738 chr += c - '0';
2739 else if (c >= 'a' && c <= 'f')
2740 chr += 10 + c - 'a';
2741 else
2742 chr += 10 + c - 'A';
2744 s += i;
2745 if (chr == 0xffffffff && PyErr_Occurred())
2746 /* _decoding_error will have already written into the
2747 target buffer. */
2748 break;
2749 store:
2750 /* when we get here, chr is a 32-bit unicode character */
2751 if (chr <= 0xffff)
2752 /* UCS-2 character */
2753 *p++ = (Py_UNICODE) chr;
2754 else if (chr <= 0x10ffff) {
2755 /* UCS-4 character. Either store directly, or as
2756 surrogate pair. */
2757 #ifdef Py_UNICODE_WIDE
2758 *p++ = chr;
2759 #else
2760 chr -= 0x10000L;
2761 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2762 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2763 #endif
2764 } else {
2765 endinpos = s-starts;
2766 outpos = p-PyUnicode_AS_UNICODE(v);
2767 if (unicode_decode_call_errorhandler(
2768 errors, &errorHandler,
2769 "unicodeescape", "illegal Unicode character",
2770 starts, size, &startinpos, &endinpos, &exc, &s,
2771 (PyObject **)&v, &outpos, &p))
2772 goto onError;
2774 break;
2776 /* \N{name} */
2777 case 'N':
2778 message = "malformed \\N character escape";
2779 if (ucnhash_CAPI == NULL) {
2780 /* load the unicode data module */
2781 PyObject *m, *api;
2782 m = PyImport_ImportModuleNoBlock("unicodedata");
2783 if (m == NULL)
2784 goto ucnhashError;
2785 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2786 Py_DECREF(m);
2787 if (api == NULL)
2788 goto ucnhashError;
2789 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2790 Py_DECREF(api);
2791 if (ucnhash_CAPI == NULL)
2792 goto ucnhashError;
2794 if (*s == '{') {
2795 const char *start = s+1;
2796 /* look for the closing brace */
2797 while (*s != '}' && s < end)
2798 s++;
2799 if (s > start && s < end && *s == '}') {
2800 /* found a name. look it up in the unicode database */
2801 message = "unknown Unicode character name";
2802 s++;
2803 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2804 goto store;
2807 endinpos = s-starts;
2808 outpos = p-PyUnicode_AS_UNICODE(v);
2809 if (unicode_decode_call_errorhandler(
2810 errors, &errorHandler,
2811 "unicodeescape", message,
2812 starts, size, &startinpos, &endinpos, &exc, &s,
2813 (PyObject **)&v, &outpos, &p))
2814 goto onError;
2815 break;
2817 default:
2818 if (s > end) {
2819 message = "\\ at end of string";
2820 s--;
2821 endinpos = s-starts;
2822 outpos = p-PyUnicode_AS_UNICODE(v);
2823 if (unicode_decode_call_errorhandler(
2824 errors, &errorHandler,
2825 "unicodeescape", message,
2826 starts, size, &startinpos, &endinpos, &exc, &s,
2827 (PyObject **)&v, &outpos, &p))
2828 goto onError;
2830 else {
2831 *p++ = '\\';
2832 *p++ = (unsigned char)s[-1];
2834 break;
2836 nextByte:
2839 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2840 goto onError;
2841 Py_XDECREF(errorHandler);
2842 Py_XDECREF(exc);
2843 return (PyObject *)v;
2845 ucnhashError:
2846 PyErr_SetString(
2847 PyExc_UnicodeError,
2848 "\\N escapes not supported (can't load unicodedata module)"
2850 Py_XDECREF(v);
2851 Py_XDECREF(errorHandler);
2852 Py_XDECREF(exc);
2853 return NULL;
2855 onError:
2856 Py_XDECREF(v);
2857 Py_XDECREF(errorHandler);
2858 Py_XDECREF(exc);
2859 return NULL;
/* unicodeescape_string() below returns a Unicode-Escape string version
   of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2869 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2870 Py_ssize_t size,
2871 Py_UNICODE ch)
2873 /* like wcschr, but doesn't stop at NULL characters */
2875 while (size-- > 0) {
2876 if (*s == ch)
2877 return s;
2878 s++;
2881 return NULL;
2884 static
2885 PyObject *unicodeescape_string(const Py_UNICODE *s,
2886 Py_ssize_t size,
2887 int quotes)
2889 PyObject *repr;
2890 char *p;
2892 static const char *hexdigit = "0123456789abcdef";
2893 #ifdef Py_UNICODE_WIDE
2894 const Py_ssize_t expandsize = 10;
2895 #else
2896 const Py_ssize_t expandsize = 6;
2897 #endif
2899 /* XXX(nnorwitz): rather than over-allocating, it would be
2900 better to choose a different scheme. Perhaps scan the
2901 first N-chars of the string and allocate based on that size.
2903 /* Initial allocation is based on the longest-possible unichr
2904 escape.
2906 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2907 unichr, so in this case it's the longest unichr escape. In
2908 narrow (UTF-16) builds this is five chars per source unichr
2909 since there are two unichrs in the surrogate pair, so in narrow
2910 (UTF-16) builds it's not the longest unichr escape.
2912 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2913 so in the narrow (UTF-16) build case it's the longest unichr
2914 escape.
2917 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2918 return PyErr_NoMemory();
2920 repr = PyString_FromStringAndSize(NULL,
2922 + expandsize*size
2923 + 1);
2924 if (repr == NULL)
2925 return NULL;
2927 p = PyString_AS_STRING(repr);
2929 if (quotes) {
2930 *p++ = 'u';
2931 *p++ = (findchar(s, size, '\'') &&
2932 !findchar(s, size, '"')) ? '"' : '\'';
2934 while (size-- > 0) {
2935 Py_UNICODE ch = *s++;
2937 /* Escape quotes and backslashes */
2938 if ((quotes &&
2939 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2940 *p++ = '\\';
2941 *p++ = (char) ch;
2942 continue;
2945 #ifdef Py_UNICODE_WIDE
2946 /* Map 21-bit characters to '\U00xxxxxx' */
2947 else if (ch >= 0x10000) {
2948 *p++ = '\\';
2949 *p++ = 'U';
2950 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2951 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2952 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2953 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2954 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2955 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2956 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2957 *p++ = hexdigit[ch & 0x0000000F];
2958 continue;
2960 #else
2961 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2962 else if (ch >= 0xD800 && ch < 0xDC00) {
2963 Py_UNICODE ch2;
2964 Py_UCS4 ucs;
2966 ch2 = *s++;
2967 size--;
2968 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2969 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2970 *p++ = '\\';
2971 *p++ = 'U';
2972 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2973 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2974 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2975 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2976 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2977 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2978 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2979 *p++ = hexdigit[ucs & 0x0000000F];
2980 continue;
2982 /* Fall through: isolated surrogates are copied as-is */
2983 s--;
2984 size++;
2986 #endif
2988 /* Map 16-bit characters to '\uxxxx' */
2989 if (ch >= 256) {
2990 *p++ = '\\';
2991 *p++ = 'u';
2992 *p++ = hexdigit[(ch >> 12) & 0x000F];
2993 *p++ = hexdigit[(ch >> 8) & 0x000F];
2994 *p++ = hexdigit[(ch >> 4) & 0x000F];
2995 *p++ = hexdigit[ch & 0x000F];
2998 /* Map special whitespace to '\t', \n', '\r' */
2999 else if (ch == '\t') {
3000 *p++ = '\\';
3001 *p++ = 't';
3003 else if (ch == '\n') {
3004 *p++ = '\\';
3005 *p++ = 'n';
3007 else if (ch == '\r') {
3008 *p++ = '\\';
3009 *p++ = 'r';
3012 /* Map non-printable US ASCII to '\xhh' */
3013 else if (ch < ' ' || ch >= 0x7F) {
3014 *p++ = '\\';
3015 *p++ = 'x';
3016 *p++ = hexdigit[(ch >> 4) & 0x000F];
3017 *p++ = hexdigit[ch & 0x000F];
3020 /* Copy everything else as-is */
3021 else
3022 *p++ = (char) ch;
3024 if (quotes)
3025 *p++ = PyString_AS_STRING(repr)[1];
3027 *p = '\0';
3028 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3029 return repr;
3032 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3033 Py_ssize_t size)
3035 return unicodeescape_string(s, size, 0);
3038 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3040 if (!PyUnicode_Check(unicode)) {
3041 PyErr_BadArgument();
3042 return NULL;
3044 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3045 PyUnicode_GET_SIZE(unicode));
3048 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3050 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3051 Py_ssize_t size,
3052 const char *errors)
3054 const char *starts = s;
3055 Py_ssize_t startinpos;
3056 Py_ssize_t endinpos;
3057 Py_ssize_t outpos;
3058 PyUnicodeObject *v;
3059 Py_UNICODE *p;
3060 const char *end;
3061 const char *bs;
3062 PyObject *errorHandler = NULL;
3063 PyObject *exc = NULL;
3065 /* Escaped strings will always be longer than the resulting
3066 Unicode string, so we start with size here and then reduce the
3067 length after conversion to the true value. (But decoding error
3068 handler might have to resize the string) */
3069 v = _PyUnicode_New(size);
3070 if (v == NULL)
3071 goto onError;
3072 if (size == 0)
3073 return (PyObject *)v;
3074 p = PyUnicode_AS_UNICODE(v);
3075 end = s + size;
3076 while (s < end) {
3077 unsigned char c;
3078 Py_UCS4 x;
3079 int i;
3080 int count;
3082 /* Non-escape characters are interpreted as Unicode ordinals */
3083 if (*s != '\\') {
3084 *p++ = (unsigned char)*s++;
3085 continue;
3087 startinpos = s-starts;
3089 /* \u-escapes are only interpreted iff the number of leading
3090 backslashes if odd */
3091 bs = s;
3092 for (;s < end;) {
3093 if (*s != '\\')
3094 break;
3095 *p++ = (unsigned char)*s++;
3097 if (((s - bs) & 1) == 0 ||
3098 s >= end ||
3099 (*s != 'u' && *s != 'U')) {
3100 continue;
3102 p--;
3103 count = *s=='u' ? 4 : 8;
3104 s++;
3106 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3107 outpos = p-PyUnicode_AS_UNICODE(v);
3108 for (x = 0, i = 0; i < count; ++i, ++s) {
3109 c = (unsigned char)*s;
3110 if (!isxdigit(c)) {
3111 endinpos = s-starts;
3112 if (unicode_decode_call_errorhandler(
3113 errors, &errorHandler,
3114 "rawunicodeescape", "truncated \\uXXXX",
3115 starts, size, &startinpos, &endinpos, &exc, &s,
3116 (PyObject **)&v, &outpos, &p))
3117 goto onError;
3118 goto nextByte;
3120 x = (x<<4) & ~0xF;
3121 if (c >= '0' && c <= '9')
3122 x += c - '0';
3123 else if (c >= 'a' && c <= 'f')
3124 x += 10 + c - 'a';
3125 else
3126 x += 10 + c - 'A';
3128 if (x <= 0xffff)
3129 /* UCS-2 character */
3130 *p++ = (Py_UNICODE) x;
3131 else if (x <= 0x10ffff) {
3132 /* UCS-4 character. Either store directly, or as
3133 surrogate pair. */
3134 #ifdef Py_UNICODE_WIDE
3135 *p++ = (Py_UNICODE) x;
3136 #else
3137 x -= 0x10000L;
3138 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3139 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3140 #endif
3141 } else {
3142 endinpos = s-starts;
3143 outpos = p-PyUnicode_AS_UNICODE(v);
3144 if (unicode_decode_call_errorhandler(
3145 errors, &errorHandler,
3146 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3147 starts, size, &startinpos, &endinpos, &exc, &s,
3148 (PyObject **)&v, &outpos, &p))
3149 goto onError;
3151 nextByte:
3154 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3155 goto onError;
3156 Py_XDECREF(errorHandler);
3157 Py_XDECREF(exc);
3158 return (PyObject *)v;
3160 onError:
3161 Py_XDECREF(v);
3162 Py_XDECREF(errorHandler);
3163 Py_XDECREF(exc);
3164 return NULL;
3167 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3168 Py_ssize_t size)
3170 PyObject *repr;
3171 char *p;
3172 char *q;
3174 static const char *hexdigit = "0123456789abcdef";
3175 #ifdef Py_UNICODE_WIDE
3176 const Py_ssize_t expandsize = 10;
3177 #else
3178 const Py_ssize_t expandsize = 6;
3179 #endif
3181 if (size > PY_SSIZE_T_MAX / expandsize)
3182 return PyErr_NoMemory();
3184 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3185 if (repr == NULL)
3186 return NULL;
3187 if (size == 0)
3188 return repr;
3190 p = q = PyString_AS_STRING(repr);
3191 while (size-- > 0) {
3192 Py_UNICODE ch = *s++;
3193 #ifdef Py_UNICODE_WIDE
3194 /* Map 32-bit characters to '\Uxxxxxxxx' */
3195 if (ch >= 0x10000) {
3196 *p++ = '\\';
3197 *p++ = 'U';
3198 *p++ = hexdigit[(ch >> 28) & 0xf];
3199 *p++ = hexdigit[(ch >> 24) & 0xf];
3200 *p++ = hexdigit[(ch >> 20) & 0xf];
3201 *p++ = hexdigit[(ch >> 16) & 0xf];
3202 *p++ = hexdigit[(ch >> 12) & 0xf];
3203 *p++ = hexdigit[(ch >> 8) & 0xf];
3204 *p++ = hexdigit[(ch >> 4) & 0xf];
3205 *p++ = hexdigit[ch & 15];
3207 else
3208 #else
3209 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3210 if (ch >= 0xD800 && ch < 0xDC00) {
3211 Py_UNICODE ch2;
3212 Py_UCS4 ucs;
3214 ch2 = *s++;
3215 size--;
3216 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3217 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3218 *p++ = '\\';
3219 *p++ = 'U';
3220 *p++ = hexdigit[(ucs >> 28) & 0xf];
3221 *p++ = hexdigit[(ucs >> 24) & 0xf];
3222 *p++ = hexdigit[(ucs >> 20) & 0xf];
3223 *p++ = hexdigit[(ucs >> 16) & 0xf];
3224 *p++ = hexdigit[(ucs >> 12) & 0xf];
3225 *p++ = hexdigit[(ucs >> 8) & 0xf];
3226 *p++ = hexdigit[(ucs >> 4) & 0xf];
3227 *p++ = hexdigit[ucs & 0xf];
3228 continue;
3230 /* Fall through: isolated surrogates are copied as-is */
3231 s--;
3232 size++;
3234 #endif
3235 /* Map 16-bit characters to '\uxxxx' */
3236 if (ch >= 256) {
3237 *p++ = '\\';
3238 *p++ = 'u';
3239 *p++ = hexdigit[(ch >> 12) & 0xf];
3240 *p++ = hexdigit[(ch >> 8) & 0xf];
3241 *p++ = hexdigit[(ch >> 4) & 0xf];
3242 *p++ = hexdigit[ch & 15];
3244 /* Copy everything else as-is */
3245 else
3246 *p++ = (char) ch;
3248 *p = '\0';
3249 _PyString_Resize(&repr, p - q);
3250 return repr;
3253 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3255 if (!PyUnicode_Check(unicode)) {
3256 PyErr_BadArgument();
3257 return NULL;
3259 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3260 PyUnicode_GET_SIZE(unicode));
3263 /* --- Unicode Internal Codec ------------------------------------------- */
3265 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3266 Py_ssize_t size,
3267 const char *errors)
3269 const char *starts = s;
3270 Py_ssize_t startinpos;
3271 Py_ssize_t endinpos;
3272 Py_ssize_t outpos;
3273 PyUnicodeObject *v;
3274 Py_UNICODE *p;
3275 const char *end;
3276 const char *reason;
3277 PyObject *errorHandler = NULL;
3278 PyObject *exc = NULL;
3280 #ifdef Py_UNICODE_WIDE
3281 Py_UNICODE unimax = PyUnicode_GetMax();
3282 #endif
3284 /* XXX overflow detection missing */
3285 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3286 if (v == NULL)
3287 goto onError;
3288 if (PyUnicode_GetSize((PyObject *)v) == 0)
3289 return (PyObject *)v;
3290 p = PyUnicode_AS_UNICODE(v);
3291 end = s + size;
3293 while (s < end) {
3294 memcpy(p, s, sizeof(Py_UNICODE));
3295 /* We have to sanity check the raw data, otherwise doom looms for
3296 some malformed UCS-4 data. */
3297 if (
3298 #ifdef Py_UNICODE_WIDE
3299 *p > unimax || *p < 0 ||
3300 #endif
3301 end-s < Py_UNICODE_SIZE
3304 startinpos = s - starts;
3305 if (end-s < Py_UNICODE_SIZE) {
3306 endinpos = end-starts;
3307 reason = "truncated input";
3309 else {
3310 endinpos = s - starts + Py_UNICODE_SIZE;
3311 reason = "illegal code point (> 0x10FFFF)";
3313 outpos = p - PyUnicode_AS_UNICODE(v);
3314 if (unicode_decode_call_errorhandler(
3315 errors, &errorHandler,
3316 "unicode_internal", reason,
3317 starts, size, &startinpos, &endinpos, &exc, &s,
3318 (PyObject **)&v, &outpos, &p)) {
3319 goto onError;
3322 else {
3323 p++;
3324 s += Py_UNICODE_SIZE;
3328 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3329 goto onError;
3330 Py_XDECREF(errorHandler);
3331 Py_XDECREF(exc);
3332 return (PyObject *)v;
3334 onError:
3335 Py_XDECREF(v);
3336 Py_XDECREF(errorHandler);
3337 Py_XDECREF(exc);
3338 return NULL;
3341 /* --- Latin-1 Codec ------------------------------------------------------ */
3343 PyObject *PyUnicode_DecodeLatin1(const char *s,
3344 Py_ssize_t size,
3345 const char *errors)
3347 PyUnicodeObject *v;
3348 Py_UNICODE *p;
3350 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3351 if (size == 1) {
3352 Py_UNICODE r = *(unsigned char*)s;
3353 return PyUnicode_FromUnicode(&r, 1);
3356 v = _PyUnicode_New(size);
3357 if (v == NULL)
3358 goto onError;
3359 if (size == 0)
3360 return (PyObject *)v;
3361 p = PyUnicode_AS_UNICODE(v);
3362 while (size-- > 0)
3363 *p++ = (unsigned char)*s++;
3364 return (PyObject *)v;
3366 onError:
3367 Py_XDECREF(v);
3368 return NULL;
3371 /* create or adjust a UnicodeEncodeError */
3372 static void make_encode_exception(PyObject **exceptionObject,
3373 const char *encoding,
3374 const Py_UNICODE *unicode, Py_ssize_t size,
3375 Py_ssize_t startpos, Py_ssize_t endpos,
3376 const char *reason)
3378 if (*exceptionObject == NULL) {
3379 *exceptionObject = PyUnicodeEncodeError_Create(
3380 encoding, unicode, size, startpos, endpos, reason);
3382 else {
3383 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3384 goto onError;
3385 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3386 goto onError;
3387 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3388 goto onError;
3389 return;
3390 onError:
3391 Py_DECREF(*exceptionObject);
3392 *exceptionObject = NULL;
3396 /* raises a UnicodeEncodeError */
3397 static void raise_encode_exception(PyObject **exceptionObject,
3398 const char *encoding,
3399 const Py_UNICODE *unicode, Py_ssize_t size,
3400 Py_ssize_t startpos, Py_ssize_t endpos,
3401 const char *reason)
3403 make_encode_exception(exceptionObject,
3404 encoding, unicode, size, startpos, endpos, reason);
3405 if (*exceptionObject != NULL)
3406 PyCodec_StrictErrors(*exceptionObject);
3409 /* error handling callback helper:
3410 build arguments, call the callback and check the arguments,
3411 put the result into newpos and return the replacement string, which
3412 has to be freed by the caller */
3413 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3414 PyObject **errorHandler,
3415 const char *encoding, const char *reason,
3416 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3417 Py_ssize_t startpos, Py_ssize_t endpos,
3418 Py_ssize_t *newpos)
3420 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3422 PyObject *restuple;
3423 PyObject *resunicode;
3425 if (*errorHandler == NULL) {
3426 *errorHandler = PyCodec_LookupError(errors);
3427 if (*errorHandler == NULL)
3428 return NULL;
3431 make_encode_exception(exceptionObject,
3432 encoding, unicode, size, startpos, endpos, reason);
3433 if (*exceptionObject == NULL)
3434 return NULL;
3436 restuple = PyObject_CallFunctionObjArgs(
3437 *errorHandler, *exceptionObject, NULL);
3438 if (restuple == NULL)
3439 return NULL;
3440 if (!PyTuple_Check(restuple)) {
3441 PyErr_Format(PyExc_TypeError, &argparse[4]);
3442 Py_DECREF(restuple);
3443 return NULL;
3445 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3446 &resunicode, newpos)) {
3447 Py_DECREF(restuple);
3448 return NULL;
3450 if (*newpos<0)
3451 *newpos = size+*newpos;
3452 if (*newpos<0 || *newpos>size) {
3453 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3454 Py_DECREF(restuple);
3455 return NULL;
3457 Py_INCREF(resunicode);
3458 Py_DECREF(restuple);
3459 return resunicode;
3462 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3463 Py_ssize_t size,
3464 const char *errors,
3465 int limit)
3467 /* output object */
3468 PyObject *res;
3469 /* pointers to the beginning and end+1 of input */
3470 const Py_UNICODE *startp = p;
3471 const Py_UNICODE *endp = p + size;
3472 /* pointer to the beginning of the unencodable characters */
3473 /* const Py_UNICODE *badp = NULL; */
3474 /* pointer into the output */
3475 char *str;
3476 /* current output position */
3477 Py_ssize_t respos = 0;
3478 Py_ssize_t ressize;
3479 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3480 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3481 PyObject *errorHandler = NULL;
3482 PyObject *exc = NULL;
3483 /* the following variable is used for caching string comparisons
3484 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3485 int known_errorHandler = -1;
3487 /* allocate enough for a simple encoding without
3488 replacements, if we need more, we'll resize */
3489 res = PyString_FromStringAndSize(NULL, size);
3490 if (res == NULL)
3491 goto onError;
3492 if (size == 0)
3493 return res;
3494 str = PyString_AS_STRING(res);
3495 ressize = size;
3497 while (p<endp) {
3498 Py_UNICODE c = *p;
3500 /* can we encode this? */
3501 if (c<limit) {
3502 /* no overflow check, because we know that the space is enough */
3503 *str++ = (char)c;
3504 ++p;
3506 else {
3507 Py_ssize_t unicodepos = p-startp;
3508 Py_ssize_t requiredsize;
3509 PyObject *repunicode;
3510 Py_ssize_t repsize;
3511 Py_ssize_t newpos;
3512 Py_ssize_t respos;
3513 Py_UNICODE *uni2;
3514 /* startpos for collecting unencodable chars */
3515 const Py_UNICODE *collstart = p;
3516 const Py_UNICODE *collend = p;
3517 /* find all unecodable characters */
3518 while ((collend < endp) && ((*collend)>=limit))
3519 ++collend;
3520 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3521 if (known_errorHandler==-1) {
3522 if ((errors==NULL) || (!strcmp(errors, "strict")))
3523 known_errorHandler = 1;
3524 else if (!strcmp(errors, "replace"))
3525 known_errorHandler = 2;
3526 else if (!strcmp(errors, "ignore"))
3527 known_errorHandler = 3;
3528 else if (!strcmp(errors, "xmlcharrefreplace"))
3529 known_errorHandler = 4;
3530 else
3531 known_errorHandler = 0;
3533 switch (known_errorHandler) {
3534 case 1: /* strict */
3535 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3536 goto onError;
3537 case 2: /* replace */
3538 while (collstart++<collend)
3539 *str++ = '?'; /* fall through */
3540 case 3: /* ignore */
3541 p = collend;
3542 break;
3543 case 4: /* xmlcharrefreplace */
3544 respos = str-PyString_AS_STRING(res);
3545 /* determine replacement size (temporarily (mis)uses p) */
3546 for (p = collstart, repsize = 0; p < collend; ++p) {
3547 if (*p<10)
3548 repsize += 2+1+1;
3549 else if (*p<100)
3550 repsize += 2+2+1;
3551 else if (*p<1000)
3552 repsize += 2+3+1;
3553 else if (*p<10000)
3554 repsize += 2+4+1;
3555 #ifndef Py_UNICODE_WIDE
3556 else
3557 repsize += 2+5+1;
3558 #else
3559 else if (*p<100000)
3560 repsize += 2+5+1;
3561 else if (*p<1000000)
3562 repsize += 2+6+1;
3563 else
3564 repsize += 2+7+1;
3565 #endif
3567 requiredsize = respos+repsize+(endp-collend);
3568 if (requiredsize > ressize) {
3569 if (requiredsize<2*ressize)
3570 requiredsize = 2*ressize;
3571 if (_PyString_Resize(&res, requiredsize))
3572 goto onError;
3573 str = PyString_AS_STRING(res) + respos;
3574 ressize = requiredsize;
3576 /* generate replacement (temporarily (mis)uses p) */
3577 for (p = collstart; p < collend; ++p) {
3578 str += sprintf(str, "&#%d;", (int)*p);
3580 p = collend;
3581 break;
3582 default:
3583 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3584 encoding, reason, startp, size, &exc,
3585 collstart-startp, collend-startp, &newpos);
3586 if (repunicode == NULL)
3587 goto onError;
3588 /* need more space? (at least enough for what we
3589 have+the replacement+the rest of the string, so
3590 we won't have to check space for encodable characters) */
3591 respos = str-PyString_AS_STRING(res);
3592 repsize = PyUnicode_GET_SIZE(repunicode);
3593 requiredsize = respos+repsize+(endp-collend);
3594 if (requiredsize > ressize) {
3595 if (requiredsize<2*ressize)
3596 requiredsize = 2*ressize;
3597 if (_PyString_Resize(&res, requiredsize)) {
3598 Py_DECREF(repunicode);
3599 goto onError;
3601 str = PyString_AS_STRING(res) + respos;
3602 ressize = requiredsize;
3604 /* check if there is anything unencodable in the replacement
3605 and copy it to the output */
3606 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3607 c = *uni2;
3608 if (c >= limit) {
3609 raise_encode_exception(&exc, encoding, startp, size,
3610 unicodepos, unicodepos+1, reason);
3611 Py_DECREF(repunicode);
3612 goto onError;
3614 *str = (char)c;
3616 p = startp + newpos;
3617 Py_DECREF(repunicode);
3621 /* Resize if we allocated to much */
3622 respos = str-PyString_AS_STRING(res);
3623 if (respos<ressize)
3624 /* If this falls res will be NULL */
3625 _PyString_Resize(&res, respos);
3626 Py_XDECREF(errorHandler);
3627 Py_XDECREF(exc);
3628 return res;
3630 onError:
3631 Py_XDECREF(res);
3632 Py_XDECREF(errorHandler);
3633 Py_XDECREF(exc);
3634 return NULL;
3637 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3638 Py_ssize_t size,
3639 const char *errors)
3641 return unicode_encode_ucs1(p, size, errors, 256);
3644 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3646 if (!PyUnicode_Check(unicode)) {
3647 PyErr_BadArgument();
3648 return NULL;
3650 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3651 PyUnicode_GET_SIZE(unicode),
3652 NULL);
3655 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3657 PyObject *PyUnicode_DecodeASCII(const char *s,
3658 Py_ssize_t size,
3659 const char *errors)
3661 const char *starts = s;
3662 PyUnicodeObject *v;
3663 Py_UNICODE *p;
3664 Py_ssize_t startinpos;
3665 Py_ssize_t endinpos;
3666 Py_ssize_t outpos;
3667 const char *e;
3668 PyObject *errorHandler = NULL;
3669 PyObject *exc = NULL;
3671 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3672 if (size == 1 && *(unsigned char*)s < 128) {
3673 Py_UNICODE r = *(unsigned char*)s;
3674 return PyUnicode_FromUnicode(&r, 1);
3677 v = _PyUnicode_New(size);
3678 if (v == NULL)
3679 goto onError;
3680 if (size == 0)
3681 return (PyObject *)v;
3682 p = PyUnicode_AS_UNICODE(v);
3683 e = s + size;
3684 while (s < e) {
3685 register unsigned char c = (unsigned char)*s;
3686 if (c < 128) {
3687 *p++ = c;
3688 ++s;
3690 else {
3691 startinpos = s-starts;
3692 endinpos = startinpos + 1;
3693 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3694 if (unicode_decode_call_errorhandler(
3695 errors, &errorHandler,
3696 "ascii", "ordinal not in range(128)",
3697 starts, size, &startinpos, &endinpos, &exc, &s,
3698 (PyObject **)&v, &outpos, &p))
3699 goto onError;
3702 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3703 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3704 goto onError;
3705 Py_XDECREF(errorHandler);
3706 Py_XDECREF(exc);
3707 return (PyObject *)v;
3709 onError:
3710 Py_XDECREF(v);
3711 Py_XDECREF(errorHandler);
3712 Py_XDECREF(exc);
3713 return NULL;
3716 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3717 Py_ssize_t size,
3718 const char *errors)
3720 return unicode_encode_ucs1(p, size, errors, 128);
3723 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3725 if (!PyUnicode_Check(unicode)) {
3726 PyErr_BadArgument();
3727 return NULL;
3729 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3730 PyUnicode_GET_SIZE(unicode),
3731 NULL);
3734 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3736 /* --- MBCS codecs for Windows -------------------------------------------- */
3738 #if SIZEOF_INT < SIZEOF_SSIZE_T
3739 #define NEED_RETRY
3740 #endif
3742 /* XXX This code is limited to "true" double-byte encodings, as
3743 a) it assumes an incomplete character consists of a single byte, and
3744 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3745 encodings, see IsDBCSLeadByteEx documentation. */
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte.  The byte itself must satisfy IsDBCSLeadByte(); in addition
   the character preceding it (found with CharPrev) must either not
   exist, not start with a lead byte, or be a full two-byte character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3759 * Decode MBCS string into unicode object. If 'final' is set, converts
3760 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3762 static int decode_mbcs(PyUnicodeObject **v,
3763 const char *s, /* MBCS string */
3764 int size, /* sizeof MBCS string */
3765 int final)
3767 Py_UNICODE *p;
3768 Py_ssize_t n = 0;
3769 int usize = 0;
3771 assert(size >= 0);
3773 /* Skip trailing lead-byte unless 'final' is set */
3774 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3775 --size;
3777 /* First get the size of the result */
3778 if (size > 0) {
3779 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3780 if (usize == 0) {
3781 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3782 return -1;
3786 if (*v == NULL) {
3787 /* Create unicode object */
3788 *v = _PyUnicode_New(usize);
3789 if (*v == NULL)
3790 return -1;
3792 else {
3793 /* Extend unicode object */
3794 n = PyUnicode_GET_SIZE(*v);
3795 if (_PyUnicode_Resize(v, n + usize) < 0)
3796 return -1;
3799 /* Do the conversion */
3800 if (size > 0) {
3801 p = PyUnicode_AS_UNICODE(*v) + n;
3802 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3803 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3804 return -1;
3808 return size;
3811 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3812 Py_ssize_t size,
3813 const char *errors,
3814 Py_ssize_t *consumed)
3816 PyUnicodeObject *v = NULL;
3817 int done;
3819 if (consumed)
3820 *consumed = 0;
3822 #ifdef NEED_RETRY
3823 retry:
3824 if (size > INT_MAX)
3825 done = decode_mbcs(&v, s, INT_MAX, 0);
3826 else
3827 #endif
3828 done = decode_mbcs(&v, s, (int)size, !consumed);
3830 if (done < 0) {
3831 Py_XDECREF(v);
3832 return NULL;
3835 if (consumed)
3836 *consumed += done;
3838 #ifdef NEED_RETRY
3839 if (size > INT_MAX) {
3840 s += done;
3841 size -= done;
3842 goto retry;
3844 #endif
3846 return (PyObject *)v;
3849 PyObject *PyUnicode_DecodeMBCS(const char *s,
3850 Py_ssize_t size,
3851 const char *errors)
3853 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3857 * Convert unicode into string object (MBCS).
3858 * Returns 0 if succeed, -1 otherwise.
3860 static int encode_mbcs(PyObject **repr,
3861 const Py_UNICODE *p, /* unicode */
3862 int size) /* size of unicode */
3864 int mbcssize = 0;
3865 Py_ssize_t n = 0;
3867 assert(size >= 0);
3869 /* First get the size of the result */
3870 if (size > 0) {
3871 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3872 if (mbcssize == 0) {
3873 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3874 return -1;
3878 if (*repr == NULL) {
3879 /* Create string object */
3880 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3881 if (*repr == NULL)
3882 return -1;
3884 else {
3885 /* Extend string object */
3886 n = PyString_Size(*repr);
3887 if (_PyString_Resize(repr, n + mbcssize) < 0)
3888 return -1;
3891 /* Do the conversion */
3892 if (size > 0) {
3893 char *s = PyString_AS_STRING(*repr) + n;
3894 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3895 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3896 return -1;
3900 return 0;
3903 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3904 Py_ssize_t size,
3905 const char *errors)
3907 PyObject *repr = NULL;
3908 int ret;
3910 #ifdef NEED_RETRY
3911 retry:
3912 if (size > INT_MAX)
3913 ret = encode_mbcs(&repr, p, INT_MAX);
3914 else
3915 #endif
3916 ret = encode_mbcs(&repr, p, (int)size);
3918 if (ret < 0) {
3919 Py_XDECREF(repr);
3920 return NULL;
3923 #ifdef NEED_RETRY
3924 if (size > INT_MAX) {
3925 p += INT_MAX;
3926 size -= INT_MAX;
3927 goto retry;
3929 #endif
3931 return repr;
3934 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3936 if (!PyUnicode_Check(unicode)) {
3937 PyErr_BadArgument();
3938 return NULL;
3940 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3941 PyUnicode_GET_SIZE(unicode),
3942 NULL);
3945 #undef NEED_RETRY
3947 #endif /* MS_WINDOWS */
3949 /* --- Character Mapping Codec -------------------------------------------- */
3951 PyObject *PyUnicode_DecodeCharmap(const char *s,
3952 Py_ssize_t size,
3953 PyObject *mapping,
3954 const char *errors)
3956 const char *starts = s;
3957 Py_ssize_t startinpos;
3958 Py_ssize_t endinpos;
3959 Py_ssize_t outpos;
3960 const char *e;
3961 PyUnicodeObject *v;
3962 Py_UNICODE *p;
3963 Py_ssize_t extrachars = 0;
3964 PyObject *errorHandler = NULL;
3965 PyObject *exc = NULL;
3966 Py_UNICODE *mapstring = NULL;
3967 Py_ssize_t maplen = 0;
3969 /* Default to Latin-1 */
3970 if (mapping == NULL)
3971 return PyUnicode_DecodeLatin1(s, size, errors);
3973 v = _PyUnicode_New(size);
3974 if (v == NULL)
3975 goto onError;
3976 if (size == 0)
3977 return (PyObject *)v;
3978 p = PyUnicode_AS_UNICODE(v);
3979 e = s + size;
3980 if (PyUnicode_CheckExact(mapping)) {
3981 mapstring = PyUnicode_AS_UNICODE(mapping);
3982 maplen = PyUnicode_GET_SIZE(mapping);
3983 while (s < e) {
3984 unsigned char ch = *s;
3985 Py_UNICODE x = 0xfffe; /* illegal value */
3987 if (ch < maplen)
3988 x = mapstring[ch];
3990 if (x == 0xfffe) {
3991 /* undefined mapping */
3992 outpos = p-PyUnicode_AS_UNICODE(v);
3993 startinpos = s-starts;
3994 endinpos = startinpos+1;
3995 if (unicode_decode_call_errorhandler(
3996 errors, &errorHandler,
3997 "charmap", "character maps to <undefined>",
3998 starts, size, &startinpos, &endinpos, &exc, &s,
3999 (PyObject **)&v, &outpos, &p)) {
4000 goto onError;
4002 continue;
4004 *p++ = x;
4005 ++s;
4008 else {
4009 while (s < e) {
4010 unsigned char ch = *s;
4011 PyObject *w, *x;
4013 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4014 w = PyInt_FromLong((long)ch);
4015 if (w == NULL)
4016 goto onError;
4017 x = PyObject_GetItem(mapping, w);
4018 Py_DECREF(w);
4019 if (x == NULL) {
4020 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4021 /* No mapping found means: mapping is undefined. */
4022 PyErr_Clear();
4023 x = Py_None;
4024 Py_INCREF(x);
4025 } else
4026 goto onError;
4029 /* Apply mapping */
4030 if (PyInt_Check(x)) {
4031 long value = PyInt_AS_LONG(x);
4032 if (value < 0 || value > 65535) {
4033 PyErr_SetString(PyExc_TypeError,
4034 "character mapping must be in range(65536)");
4035 Py_DECREF(x);
4036 goto onError;
4038 *p++ = (Py_UNICODE)value;
4040 else if (x == Py_None) {
4041 /* undefined mapping */
4042 outpos = p-PyUnicode_AS_UNICODE(v);
4043 startinpos = s-starts;
4044 endinpos = startinpos+1;
4045 if (unicode_decode_call_errorhandler(
4046 errors, &errorHandler,
4047 "charmap", "character maps to <undefined>",
4048 starts, size, &startinpos, &endinpos, &exc, &s,
4049 (PyObject **)&v, &outpos, &p)) {
4050 Py_DECREF(x);
4051 goto onError;
4053 Py_DECREF(x);
4054 continue;
4056 else if (PyUnicode_Check(x)) {
4057 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4059 if (targetsize == 1)
4060 /* 1-1 mapping */
4061 *p++ = *PyUnicode_AS_UNICODE(x);
4063 else if (targetsize > 1) {
4064 /* 1-n mapping */
4065 if (targetsize > extrachars) {
4066 /* resize first */
4067 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4068 Py_ssize_t needed = (targetsize - extrachars) + \
4069 (targetsize << 2);
4070 extrachars += needed;
4071 /* XXX overflow detection missing */
4072 if (_PyUnicode_Resize(&v,
4073 PyUnicode_GET_SIZE(v) + needed) < 0) {
4074 Py_DECREF(x);
4075 goto onError;
4077 p = PyUnicode_AS_UNICODE(v) + oldpos;
4079 Py_UNICODE_COPY(p,
4080 PyUnicode_AS_UNICODE(x),
4081 targetsize);
4082 p += targetsize;
4083 extrachars -= targetsize;
4085 /* 1-0 mapping: skip the character */
4087 else {
4088 /* wrong return value */
4089 PyErr_SetString(PyExc_TypeError,
4090 "character mapping must return integer, None or unicode");
4091 Py_DECREF(x);
4092 goto onError;
4094 Py_DECREF(x);
4095 ++s;
4098 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4099 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4100 goto onError;
4101 Py_XDECREF(errorHandler);
4102 Py_XDECREF(exc);
4103 return (PyObject *)v;
4105 onError:
4106 Py_XDECREF(errorHandler);
4107 Py_XDECREF(exc);
4108 Py_XDECREF(v);
4109 return NULL;
4112 /* Charmap encoding: the lookup table */
4114 struct encoding_map{
4115 PyObject_HEAD
4116 unsigned char level1[32];
4117 int count2, count3;
4118 unsigned char level23[1];
4121 static PyObject*
4122 encoding_map_size(PyObject *obj, PyObject* args)
4124 struct encoding_map *map = (struct encoding_map*)obj;
4125 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4126 128*map->count3);
4129 static PyMethodDef encoding_map_methods[] = {
4130 {"size", encoding_map_size, METH_NOARGS,
4131 PyDoc_STR("Return the size (in bytes) of this object") },
4132 { 0 }
4135 static void
4136 encoding_map_dealloc(PyObject* o)
4138 PyObject_FREE(o);
4141 static PyTypeObject EncodingMapType = {
4142 PyVarObject_HEAD_INIT(NULL, 0)
4143 "EncodingMap", /*tp_name*/
4144 sizeof(struct encoding_map), /*tp_basicsize*/
4145 0, /*tp_itemsize*/
4146 /* methods */
4147 encoding_map_dealloc, /*tp_dealloc*/
4148 0, /*tp_print*/
4149 0, /*tp_getattr*/
4150 0, /*tp_setattr*/
4151 0, /*tp_compare*/
4152 0, /*tp_repr*/
4153 0, /*tp_as_number*/
4154 0, /*tp_as_sequence*/
4155 0, /*tp_as_mapping*/
4156 0, /*tp_hash*/
4157 0, /*tp_call*/
4158 0, /*tp_str*/
4159 0, /*tp_getattro*/
4160 0, /*tp_setattro*/
4161 0, /*tp_as_buffer*/
4162 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4163 0, /*tp_doc*/
4164 0, /*tp_traverse*/
4165 0, /*tp_clear*/
4166 0, /*tp_richcompare*/
4167 0, /*tp_weaklistoffset*/
4168 0, /*tp_iter*/
4169 0, /*tp_iternext*/
4170 encoding_map_methods, /*tp_methods*/
4171 0, /*tp_members*/
4172 0, /*tp_getset*/
4173 0, /*tp_base*/
4174 0, /*tp_dict*/
4175 0, /*tp_descr_get*/
4176 0, /*tp_descr_set*/
4177 0, /*tp_dictoffset*/
4178 0, /*tp_init*/
4179 0, /*tp_alloc*/
4180 0, /*tp_new*/
4181 0, /*tp_free*/
4182 0, /*tp_is_gc*/
4185 PyObject*
4186 PyUnicode_BuildEncodingMap(PyObject* string)
4188 Py_UNICODE *decode;
4189 PyObject *result;
4190 struct encoding_map *mresult;
4191 int i;
4192 int need_dict = 0;
4193 unsigned char level1[32];
4194 unsigned char level2[512];
4195 unsigned char *mlevel1, *mlevel2, *mlevel3;
4196 int count2 = 0, count3 = 0;
4198 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4199 PyErr_BadArgument();
4200 return NULL;
4202 decode = PyUnicode_AS_UNICODE(string);
4203 memset(level1, 0xFF, sizeof level1);
4204 memset(level2, 0xFF, sizeof level2);
4206 /* If there isn't a one-to-one mapping of NULL to \0,
4207 or if there are non-BMP characters, we need to use
4208 a mapping dictionary. */
4209 if (decode[0] != 0)
4210 need_dict = 1;
4211 for (i = 1; i < 256; i++) {
4212 int l1, l2;
4213 if (decode[i] == 0
4214 #ifdef Py_UNICODE_WIDE
4215 || decode[i] > 0xFFFF
4216 #endif
4218 need_dict = 1;
4219 break;
4221 if (decode[i] == 0xFFFE)
4222 /* unmapped character */
4223 continue;
4224 l1 = decode[i] >> 11;
4225 l2 = decode[i] >> 7;
4226 if (level1[l1] == 0xFF)
4227 level1[l1] = count2++;
4228 if (level2[l2] == 0xFF)
4229 level2[l2] = count3++;
4232 if (count2 >= 0xFF || count3 >= 0xFF)
4233 need_dict = 1;
4235 if (need_dict) {
4236 PyObject *result = PyDict_New();
4237 PyObject *key, *value;
4238 if (!result)
4239 return NULL;
4240 for (i = 0; i < 256; i++) {
4241 key = value = NULL;
4242 key = PyInt_FromLong(decode[i]);
4243 value = PyInt_FromLong(i);
4244 if (!key || !value)
4245 goto failed1;
4246 if (PyDict_SetItem(result, key, value) == -1)
4247 goto failed1;
4248 Py_DECREF(key);
4249 Py_DECREF(value);
4251 return result;
4252 failed1:
4253 Py_XDECREF(key);
4254 Py_XDECREF(value);
4255 Py_DECREF(result);
4256 return NULL;
4259 /* Create a three-level trie */
4260 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4261 16*count2 + 128*count3 - 1);
4262 if (!result)
4263 return PyErr_NoMemory();
4264 PyObject_Init(result, &EncodingMapType);
4265 mresult = (struct encoding_map*)result;
4266 mresult->count2 = count2;
4267 mresult->count3 = count3;
4268 mlevel1 = mresult->level1;
4269 mlevel2 = mresult->level23;
4270 mlevel3 = mresult->level23 + 16*count2;
4271 memcpy(mlevel1, level1, 32);
4272 memset(mlevel2, 0xFF, 16*count2);
4273 memset(mlevel3, 0, 128*count3);
4274 count3 = 0;
4275 for (i = 1; i < 256; i++) {
4276 int o1, o2, o3, i2, i3;
4277 if (decode[i] == 0xFFFE)
4278 /* unmapped character */
4279 continue;
4280 o1 = decode[i]>>11;
4281 o2 = (decode[i]>>7) & 0xF;
4282 i2 = 16*mlevel1[o1] + o2;
4283 if (mlevel2[i2] == 0xFF)
4284 mlevel2[i2] = count3++;
4285 o3 = decode[i] & 0x7F;
4286 i3 = 128*mlevel2[i2] + o3;
4287 mlevel3[i3] = i;
4289 return result;
4292 static int
4293 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4295 struct encoding_map *map = (struct encoding_map*)mapping;
4296 int l1 = c>>11;
4297 int l2 = (c>>7) & 0xF;
4298 int l3 = c & 0x7F;
4299 int i;
4301 #ifdef Py_UNICODE_WIDE
4302 if (c > 0xFFFF) {
4303 return -1;
4305 #endif
4306 if (c == 0)
4307 return 0;
4308 /* level 1*/
4309 i = map->level1[l1];
4310 if (i == 0xFF) {
4311 return -1;
4313 /* level 2*/
4314 i = map->level23[16*i+l2];
4315 if (i == 0xFF) {
4316 return -1;
4318 /* level 3 */
4319 i = map->level23[16*map->count2 + 128*i + l3];
4320 if (i == 0) {
4321 return -1;
4323 return i;
4326 /* Lookup the character ch in the mapping. If the character
4327 can't be found, Py_None is returned (or NULL, if another
4328 error occurred). */
4329 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4331 PyObject *w = PyInt_FromLong((long)c);
4332 PyObject *x;
4334 if (w == NULL)
4335 return NULL;
4336 x = PyObject_GetItem(mapping, w);
4337 Py_DECREF(w);
4338 if (x == NULL) {
4339 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4340 /* No mapping found means: mapping is undefined. */
4341 PyErr_Clear();
4342 x = Py_None;
4343 Py_INCREF(x);
4344 return x;
4345 } else
4346 return NULL;
4348 else if (x == Py_None)
4349 return x;
4350 else if (PyInt_Check(x)) {
4351 long value = PyInt_AS_LONG(x);
4352 if (value < 0 || value > 255) {
4353 PyErr_SetString(PyExc_TypeError,
4354 "character mapping must be in range(256)");
4355 Py_DECREF(x);
4356 return NULL;
4358 return x;
4360 else if (PyString_Check(x))
4361 return x;
4362 else {
4363 /* wrong return value */
4364 PyErr_SetString(PyExc_TypeError,
4365 "character mapping must return integer, None or str");
4366 Py_DECREF(x);
4367 return NULL;
4371 static int
4372 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4374 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4375 /* exponentially overallocate to minimize reallocations */
4376 if (requiredsize < 2*outsize)
4377 requiredsize = 2*outsize;
4378 if (_PyString_Resize(outobj, requiredsize)) {
4379 return 0;
4381 return 1;
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* returned after the replacement was written */
    enc_FAILED,     /* returned when the mapping has no entry */
    enc_EXCEPTION   /* returned when a lookup or resize failed */
} charmapencode_result;
4387 /* lookup the character, put the result in the output string and adjust
4388 various state variables. Reallocate the output string if not enough
4389 space is available. Return a new reference to the object that
4390 was put in the output buffer, or Py_None, if the mapping was undefined
4391 (in which case no character was written) or NULL, if a
4392 reallocation error occurred. The caller must decref the result */
4393 static
4394 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4395 PyObject **outobj, Py_ssize_t *outpos)
4397 PyObject *rep;
4398 char *outstart;
4399 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4401 if (Py_TYPE(mapping) == &EncodingMapType) {
4402 int res = encoding_map_lookup(c, mapping);
4403 Py_ssize_t requiredsize = *outpos+1;
4404 if (res == -1)
4405 return enc_FAILED;
4406 if (outsize<requiredsize)
4407 if (!charmapencode_resize(outobj, outpos, requiredsize))
4408 return enc_EXCEPTION;
4409 outstart = PyString_AS_STRING(*outobj);
4410 outstart[(*outpos)++] = (char)res;
4411 return enc_SUCCESS;
4414 rep = charmapencode_lookup(c, mapping);
4415 if (rep==NULL)
4416 return enc_EXCEPTION;
4417 else if (rep==Py_None) {
4418 Py_DECREF(rep);
4419 return enc_FAILED;
4420 } else {
4421 if (PyInt_Check(rep)) {
4422 Py_ssize_t requiredsize = *outpos+1;
4423 if (outsize<requiredsize)
4424 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4425 Py_DECREF(rep);
4426 return enc_EXCEPTION;
4428 outstart = PyString_AS_STRING(*outobj);
4429 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4431 else {
4432 const char *repchars = PyString_AS_STRING(rep);
4433 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4434 Py_ssize_t requiredsize = *outpos+repsize;
4435 if (outsize<requiredsize)
4436 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4437 Py_DECREF(rep);
4438 return enc_EXCEPTION;
4440 outstart = PyString_AS_STRING(*outobj);
4441 memcpy(outstart + *outpos, repchars, repsize);
4442 *outpos += repsize;
4445 Py_DECREF(rep);
4446 return enc_SUCCESS;
4449 /* handle an error in PyUnicode_EncodeCharmap
4450 Return 0 on success, -1 on error */
4451 static
4452 int charmap_encoding_error(
4453 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4454 PyObject **exceptionObject,
4455 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4456 PyObject **res, Py_ssize_t *respos)
4458 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4459 Py_ssize_t repsize;
4460 Py_ssize_t newpos;
4461 Py_UNICODE *uni2;
4462 /* startpos for collecting unencodable chars */
4463 Py_ssize_t collstartpos = *inpos;
4464 Py_ssize_t collendpos = *inpos+1;
4465 Py_ssize_t collpos;
4466 char *encoding = "charmap";
4467 char *reason = "character maps to <undefined>";
4468 charmapencode_result x;
4470 /* find all unencodable characters */
4471 while (collendpos < size) {
4472 PyObject *rep;
4473 if (Py_TYPE(mapping) == &EncodingMapType) {
4474 int res = encoding_map_lookup(p[collendpos], mapping);
4475 if (res != -1)
4476 break;
4477 ++collendpos;
4478 continue;
4481 rep = charmapencode_lookup(p[collendpos], mapping);
4482 if (rep==NULL)
4483 return -1;
4484 else if (rep!=Py_None) {
4485 Py_DECREF(rep);
4486 break;
4488 Py_DECREF(rep);
4489 ++collendpos;
4491 /* cache callback name lookup
4492 * (if not done yet, i.e. it's the first error) */
4493 if (*known_errorHandler==-1) {
4494 if ((errors==NULL) || (!strcmp(errors, "strict")))
4495 *known_errorHandler = 1;
4496 else if (!strcmp(errors, "replace"))
4497 *known_errorHandler = 2;
4498 else if (!strcmp(errors, "ignore"))
4499 *known_errorHandler = 3;
4500 else if (!strcmp(errors, "xmlcharrefreplace"))
4501 *known_errorHandler = 4;
4502 else
4503 *known_errorHandler = 0;
4505 switch (*known_errorHandler) {
4506 case 1: /* strict */
4507 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4508 return -1;
4509 case 2: /* replace */
4510 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4511 x = charmapencode_output('?', mapping, res, respos);
4512 if (x==enc_EXCEPTION) {
4513 return -1;
4515 else if (x==enc_FAILED) {
4516 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4517 return -1;
4520 /* fall through */
4521 case 3: /* ignore */
4522 *inpos = collendpos;
4523 break;
4524 case 4: /* xmlcharrefreplace */
4525 /* generate replacement (temporarily (mis)uses p) */
4526 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4527 char buffer[2+29+1+1];
4528 char *cp;
4529 sprintf(buffer, "&#%d;", (int)p[collpos]);
4530 for (cp = buffer; *cp; ++cp) {
4531 x = charmapencode_output(*cp, mapping, res, respos);
4532 if (x==enc_EXCEPTION)
4533 return -1;
4534 else if (x==enc_FAILED) {
4535 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4536 return -1;
4540 *inpos = collendpos;
4541 break;
4542 default:
4543 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4544 encoding, reason, p, size, exceptionObject,
4545 collstartpos, collendpos, &newpos);
4546 if (repunicode == NULL)
4547 return -1;
4548 /* generate replacement */
4549 repsize = PyUnicode_GET_SIZE(repunicode);
4550 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4551 x = charmapencode_output(*uni2, mapping, res, respos);
4552 if (x==enc_EXCEPTION) {
4553 return -1;
4555 else if (x==enc_FAILED) {
4556 Py_DECREF(repunicode);
4557 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4558 return -1;
4561 *inpos = newpos;
4562 Py_DECREF(repunicode);
4564 return 0;
4567 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4568 Py_ssize_t size,
4569 PyObject *mapping,
4570 const char *errors)
4572 /* output object */
4573 PyObject *res = NULL;
4574 /* current input position */
4575 Py_ssize_t inpos = 0;
4576 /* current output position */
4577 Py_ssize_t respos = 0;
4578 PyObject *errorHandler = NULL;
4579 PyObject *exc = NULL;
4580 /* the following variable is used for caching string comparisons
4581 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4582 * 3=ignore, 4=xmlcharrefreplace */
4583 int known_errorHandler = -1;
4585 /* Default to Latin-1 */
4586 if (mapping == NULL)
4587 return PyUnicode_EncodeLatin1(p, size, errors);
4589 /* allocate enough for a simple encoding without
4590 replacements, if we need more, we'll resize */
4591 res = PyString_FromStringAndSize(NULL, size);
4592 if (res == NULL)
4593 goto onError;
4594 if (size == 0)
4595 return res;
4597 while (inpos<size) {
4598 /* try to encode it */
4599 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4600 if (x==enc_EXCEPTION) /* error */
4601 goto onError;
4602 if (x==enc_FAILED) { /* unencodable character */
4603 if (charmap_encoding_error(p, size, &inpos, mapping,
4604 &exc,
4605 &known_errorHandler, &errorHandler, errors,
4606 &res, &respos)) {
4607 goto onError;
4610 else
4611 /* done with this character => adjust input position */
4612 ++inpos;
4615 /* Resize if we allocated to much */
4616 if (respos<PyString_GET_SIZE(res)) {
4617 if (_PyString_Resize(&res, respos))
4618 goto onError;
4620 Py_XDECREF(exc);
4621 Py_XDECREF(errorHandler);
4622 return res;
4624 onError:
4625 Py_XDECREF(res);
4626 Py_XDECREF(exc);
4627 Py_XDECREF(errorHandler);
4628 return NULL;
4631 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4632 PyObject *mapping)
4634 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4635 PyErr_BadArgument();
4636 return NULL;
4638 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4639 PyUnicode_GET_SIZE(unicode),
4640 mapping,
4641 NULL);
4644 /* create or adjust a UnicodeTranslateError */
4645 static void make_translate_exception(PyObject **exceptionObject,
4646 const Py_UNICODE *unicode, Py_ssize_t size,
4647 Py_ssize_t startpos, Py_ssize_t endpos,
4648 const char *reason)
4650 if (*exceptionObject == NULL) {
4651 *exceptionObject = PyUnicodeTranslateError_Create(
4652 unicode, size, startpos, endpos, reason);
4654 else {
4655 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4656 goto onError;
4657 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4658 goto onError;
4659 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4660 goto onError;
4661 return;
4662 onError:
4663 Py_DECREF(*exceptionObject);
4664 *exceptionObject = NULL;
4668 /* raises a UnicodeTranslateError */
4669 static void raise_translate_exception(PyObject **exceptionObject,
4670 const Py_UNICODE *unicode, Py_ssize_t size,
4671 Py_ssize_t startpos, Py_ssize_t endpos,
4672 const char *reason)
4674 make_translate_exception(exceptionObject,
4675 unicode, size, startpos, endpos, reason);
4676 if (*exceptionObject != NULL)
4677 PyCodec_StrictErrors(*exceptionObject);
4680 /* error handling callback helper:
4681 build arguments, call the callback and check the arguments,
4682 put the result into newpos and return the replacement string, which
4683 has to be freed by the caller */
4684 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4685 PyObject **errorHandler,
4686 const char *reason,
4687 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4688 Py_ssize_t startpos, Py_ssize_t endpos,
4689 Py_ssize_t *newpos)
4691 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4693 Py_ssize_t i_newpos;
4694 PyObject *restuple;
4695 PyObject *resunicode;
4697 if (*errorHandler == NULL) {
4698 *errorHandler = PyCodec_LookupError(errors);
4699 if (*errorHandler == NULL)
4700 return NULL;
4703 make_translate_exception(exceptionObject,
4704 unicode, size, startpos, endpos, reason);
4705 if (*exceptionObject == NULL)
4706 return NULL;
4708 restuple = PyObject_CallFunctionObjArgs(
4709 *errorHandler, *exceptionObject, NULL);
4710 if (restuple == NULL)
4711 return NULL;
4712 if (!PyTuple_Check(restuple)) {
4713 PyErr_Format(PyExc_TypeError, &argparse[4]);
4714 Py_DECREF(restuple);
4715 return NULL;
4717 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4718 &resunicode, &i_newpos)) {
4719 Py_DECREF(restuple);
4720 return NULL;
4722 if (i_newpos<0)
4723 *newpos = size+i_newpos;
4724 else
4725 *newpos = i_newpos;
4726 if (*newpos<0 || *newpos>size) {
4727 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4728 Py_DECREF(restuple);
4729 return NULL;
4731 Py_INCREF(resunicode);
4732 Py_DECREF(restuple);
4733 return resunicode;
4736 /* Lookup the character ch in the mapping and put the result in result,
4737 which must be decrefed by the caller.
4738 Return 0 on success, -1 on error */
4739 static
4740 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4742 PyObject *w = PyInt_FromLong((long)c);
4743 PyObject *x;
4745 if (w == NULL)
4746 return -1;
4747 x = PyObject_GetItem(mapping, w);
4748 Py_DECREF(w);
4749 if (x == NULL) {
4750 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4751 /* No mapping found means: use 1:1 mapping. */
4752 PyErr_Clear();
4753 *result = NULL;
4754 return 0;
4755 } else
4756 return -1;
4758 else if (x == Py_None) {
4759 *result = x;
4760 return 0;
4762 else if (PyInt_Check(x)) {
4763 long value = PyInt_AS_LONG(x);
4764 long max = PyUnicode_GetMax();
4765 if (value < 0 || value > max) {
4766 PyErr_Format(PyExc_TypeError,
4767 "character mapping must be in range(0x%lx)", max+1);
4768 Py_DECREF(x);
4769 return -1;
4771 *result = x;
4772 return 0;
4774 else if (PyUnicode_Check(x)) {
4775 *result = x;
4776 return 0;
4778 else {
4779 /* wrong return value */
4780 PyErr_SetString(PyExc_TypeError,
4781 "character mapping must return integer, None or unicode");
4782 Py_DECREF(x);
4783 return -1;
4786 /* ensure that *outobj is at least requiredsize characters long,
4787 if not reallocate and adjust various state variables.
4788 Return 0 on success, -1 on error */
4789 static
4790 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4791 Py_ssize_t requiredsize)
4793 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4794 if (requiredsize > oldsize) {
4795 /* remember old output position */
4796 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4797 /* exponentially overallocate to minimize reallocations */
4798 if (requiredsize < 2 * oldsize)
4799 requiredsize = 2 * oldsize;
4800 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4801 return -1;
4802 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4804 return 0;
4806 /* lookup the character, put the result in the output string and adjust
4807 various state variables. Return a new reference to the object that
4808 was put in the output buffer in *result, or Py_None, if the mapping was
4809 undefined (in which case no character was written).
4810 The called must decref result.
4811 Return 0 on success, -1 on error. */
4812 static
4813 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4814 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4815 PyObject **res)
4817 if (charmaptranslate_lookup(*curinp, mapping, res))
4818 return -1;
4819 if (*res==NULL) {
4820 /* not found => default to 1:1 mapping */
4821 *(*outp)++ = *curinp;
4823 else if (*res==Py_None)
4825 else if (PyInt_Check(*res)) {
4826 /* no overflow check, because we know that the space is enough */
4827 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4829 else if (PyUnicode_Check(*res)) {
4830 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4831 if (repsize==1) {
4832 /* no overflow check, because we know that the space is enough */
4833 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4835 else if (repsize!=0) {
4836 /* more than one character */
4837 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4838 (insize - (curinp-startinp)) +
4839 repsize - 1;
4840 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4841 return -1;
4842 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4843 *outp += repsize;
4846 else
4847 return -1;
4848 return 0;
4851 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4852 Py_ssize_t size,
4853 PyObject *mapping,
4854 const char *errors)
4856 /* output object */
4857 PyObject *res = NULL;
4858 /* pointers to the beginning and end+1 of input */
4859 const Py_UNICODE *startp = p;
4860 const Py_UNICODE *endp = p + size;
4861 /* pointer into the output */
4862 Py_UNICODE *str;
4863 /* current output position */
4864 Py_ssize_t respos = 0;
4865 char *reason = "character maps to <undefined>";
4866 PyObject *errorHandler = NULL;
4867 PyObject *exc = NULL;
4868 /* the following variable is used for caching string comparisons
4869 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4870 * 3=ignore, 4=xmlcharrefreplace */
4871 int known_errorHandler = -1;
4873 if (mapping == NULL) {
4874 PyErr_BadArgument();
4875 return NULL;
4878 /* allocate enough for a simple 1:1 translation without
4879 replacements, if we need more, we'll resize */
4880 res = PyUnicode_FromUnicode(NULL, size);
4881 if (res == NULL)
4882 goto onError;
4883 if (size == 0)
4884 return res;
4885 str = PyUnicode_AS_UNICODE(res);
4887 while (p<endp) {
4888 /* try to encode it */
4889 PyObject *x = NULL;
4890 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4891 Py_XDECREF(x);
4892 goto onError;
4894 Py_XDECREF(x);
4895 if (x!=Py_None) /* it worked => adjust input pointer */
4896 ++p;
4897 else { /* untranslatable character */
4898 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4899 Py_ssize_t repsize;
4900 Py_ssize_t newpos;
4901 Py_UNICODE *uni2;
4902 /* startpos for collecting untranslatable chars */
4903 const Py_UNICODE *collstart = p;
4904 const Py_UNICODE *collend = p+1;
4905 const Py_UNICODE *coll;
4907 /* find all untranslatable characters */
4908 while (collend < endp) {
4909 if (charmaptranslate_lookup(*collend, mapping, &x))
4910 goto onError;
4911 Py_XDECREF(x);
4912 if (x!=Py_None)
4913 break;
4914 ++collend;
4916 /* cache callback name lookup
4917 * (if not done yet, i.e. it's the first error) */
4918 if (known_errorHandler==-1) {
4919 if ((errors==NULL) || (!strcmp(errors, "strict")))
4920 known_errorHandler = 1;
4921 else if (!strcmp(errors, "replace"))
4922 known_errorHandler = 2;
4923 else if (!strcmp(errors, "ignore"))
4924 known_errorHandler = 3;
4925 else if (!strcmp(errors, "xmlcharrefreplace"))
4926 known_errorHandler = 4;
4927 else
4928 known_errorHandler = 0;
4930 switch (known_errorHandler) {
4931 case 1: /* strict */
4932 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4933 goto onError;
4934 case 2: /* replace */
4935 /* No need to check for space, this is a 1:1 replacement */
4936 for (coll = collstart; coll<collend; ++coll)
4937 *str++ = '?';
4938 /* fall through */
4939 case 3: /* ignore */
4940 p = collend;
4941 break;
4942 case 4: /* xmlcharrefreplace */
4943 /* generate replacement (temporarily (mis)uses p) */
4944 for (p = collstart; p < collend; ++p) {
4945 char buffer[2+29+1+1];
4946 char *cp;
4947 sprintf(buffer, "&#%d;", (int)*p);
4948 if (charmaptranslate_makespace(&res, &str,
4949 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4950 goto onError;
4951 for (cp = buffer; *cp; ++cp)
4952 *str++ = *cp;
4954 p = collend;
4955 break;
4956 default:
4957 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4958 reason, startp, size, &exc,
4959 collstart-startp, collend-startp, &newpos);
4960 if (repunicode == NULL)
4961 goto onError;
4962 /* generate replacement */
4963 repsize = PyUnicode_GET_SIZE(repunicode);
4964 if (charmaptranslate_makespace(&res, &str,
4965 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4966 Py_DECREF(repunicode);
4967 goto onError;
4969 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4970 *str++ = *uni2;
4971 p = startp + newpos;
4972 Py_DECREF(repunicode);
4976 /* Resize if we allocated to much */
4977 respos = str-PyUnicode_AS_UNICODE(res);
4978 if (respos<PyUnicode_GET_SIZE(res)) {
4979 if (_PyUnicode_Resize(&res, respos) < 0)
4980 goto onError;
4982 Py_XDECREF(exc);
4983 Py_XDECREF(errorHandler);
4984 return res;
4986 onError:
4987 Py_XDECREF(res);
4988 Py_XDECREF(exc);
4989 Py_XDECREF(errorHandler);
4990 return NULL;
4993 PyObject *PyUnicode_Translate(PyObject *str,
4994 PyObject *mapping,
4995 const char *errors)
4997 PyObject *result;
4999 str = PyUnicode_FromObject(str);
5000 if (str == NULL)
5001 goto onError;
5002 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5003 PyUnicode_GET_SIZE(str),
5004 mapping,
5005 errors);
5006 Py_DECREF(str);
5007 return result;
5009 onError:
5010 Py_XDECREF(str);
5011 return NULL;
5014 /* --- Decimal Encoder ---------------------------------------------------- */
5016 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5017 Py_ssize_t length,
5018 char *output,
5019 const char *errors)
5021 Py_UNICODE *p, *end;
5022 PyObject *errorHandler = NULL;
5023 PyObject *exc = NULL;
5024 const char *encoding = "decimal";
5025 const char *reason = "invalid decimal Unicode string";
5026 /* the following variable is used for caching string comparisons
5027 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5028 int known_errorHandler = -1;
5030 if (output == NULL) {
5031 PyErr_BadArgument();
5032 return -1;
5035 p = s;
5036 end = s + length;
5037 while (p < end) {
5038 register Py_UNICODE ch = *p;
5039 int decimal;
5040 PyObject *repunicode;
5041 Py_ssize_t repsize;
5042 Py_ssize_t newpos;
5043 Py_UNICODE *uni2;
5044 Py_UNICODE *collstart;
5045 Py_UNICODE *collend;
5047 if (Py_UNICODE_ISSPACE(ch)) {
5048 *output++ = ' ';
5049 ++p;
5050 continue;
5052 decimal = Py_UNICODE_TODECIMAL(ch);
5053 if (decimal >= 0) {
5054 *output++ = '0' + decimal;
5055 ++p;
5056 continue;
5058 if (0 < ch && ch < 256) {
5059 *output++ = (char)ch;
5060 ++p;
5061 continue;
5063 /* All other characters are considered unencodable */
5064 collstart = p;
5065 collend = p+1;
5066 while (collend < end) {
5067 if ((0 < *collend && *collend < 256) ||
5068 !Py_UNICODE_ISSPACE(*collend) ||
5069 Py_UNICODE_TODECIMAL(*collend))
5070 break;
5072 /* cache callback name lookup
5073 * (if not done yet, i.e. it's the first error) */
5074 if (known_errorHandler==-1) {
5075 if ((errors==NULL) || (!strcmp(errors, "strict")))
5076 known_errorHandler = 1;
5077 else if (!strcmp(errors, "replace"))
5078 known_errorHandler = 2;
5079 else if (!strcmp(errors, "ignore"))
5080 known_errorHandler = 3;
5081 else if (!strcmp(errors, "xmlcharrefreplace"))
5082 known_errorHandler = 4;
5083 else
5084 known_errorHandler = 0;
5086 switch (known_errorHandler) {
5087 case 1: /* strict */
5088 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5089 goto onError;
5090 case 2: /* replace */
5091 for (p = collstart; p < collend; ++p)
5092 *output++ = '?';
5093 /* fall through */
5094 case 3: /* ignore */
5095 p = collend;
5096 break;
5097 case 4: /* xmlcharrefreplace */
5098 /* generate replacement (temporarily (mis)uses p) */
5099 for (p = collstart; p < collend; ++p)
5100 output += sprintf(output, "&#%d;", (int)*p);
5101 p = collend;
5102 break;
5103 default:
5104 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5105 encoding, reason, s, length, &exc,
5106 collstart-s, collend-s, &newpos);
5107 if (repunicode == NULL)
5108 goto onError;
5109 /* generate replacement */
5110 repsize = PyUnicode_GET_SIZE(repunicode);
5111 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5112 Py_UNICODE ch = *uni2;
5113 if (Py_UNICODE_ISSPACE(ch))
5114 *output++ = ' ';
5115 else {
5116 decimal = Py_UNICODE_TODECIMAL(ch);
5117 if (decimal >= 0)
5118 *output++ = '0' + decimal;
5119 else if (0 < ch && ch < 256)
5120 *output++ = (char)ch;
5121 else {
5122 Py_DECREF(repunicode);
5123 raise_encode_exception(&exc, encoding,
5124 s, length, collstart-s, collend-s, reason);
5125 goto onError;
5129 p = s + newpos;
5130 Py_DECREF(repunicode);
5133 /* 0-terminate the output string */
5134 *output++ = '\0';
5135 Py_XDECREF(exc);
5136 Py_XDECREF(errorHandler);
5137 return 0;
5139 onError:
5140 Py_XDECREF(exc);
5141 Py_XDECREF(errorHandler);
5142 return -1;
5145 /* --- Helpers ------------------------------------------------------------ */
5147 #include "stringlib/unicodedefs.h"
5149 #define FROM_UNICODE
5151 #include "stringlib/fastsearch.h"
5153 #include "stringlib/count.h"
5154 #include "stringlib/find.h"
5155 #include "stringlib/partition.h"
/* Helper macro to fix up start/end slice values against (obj)->length.

   Operates on the variables named `start` and `end` in the calling scope:
   a negative value is taken relative to the length and then clamped to 0,
   and `end` is additionally clamped to the length.  `start` is not clamped
   to the length.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct as the body of an unbraced if/else; invoke it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5170 Py_ssize_t PyUnicode_Count(PyObject *str,
5171 PyObject *substr,
5172 Py_ssize_t start,
5173 Py_ssize_t end)
5175 Py_ssize_t result;
5176 PyUnicodeObject* str_obj;
5177 PyUnicodeObject* sub_obj;
5179 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5180 if (!str_obj)
5181 return -1;
5182 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5183 if (!sub_obj) {
5184 Py_DECREF(str_obj);
5185 return -1;
5188 FIX_START_END(str_obj);
5190 result = stringlib_count(
5191 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5194 Py_DECREF(sub_obj);
5195 Py_DECREF(str_obj);
5197 return result;
5200 Py_ssize_t PyUnicode_Find(PyObject *str,
5201 PyObject *sub,
5202 Py_ssize_t start,
5203 Py_ssize_t end,
5204 int direction)
5206 Py_ssize_t result;
5208 str = PyUnicode_FromObject(str);
5209 if (!str)
5210 return -2;
5211 sub = PyUnicode_FromObject(sub);
5212 if (!sub) {
5213 Py_DECREF(str);
5214 return -2;
5217 if (direction > 0)
5218 result = stringlib_find_slice(
5219 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5220 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5221 start, end
5223 else
5224 result = stringlib_rfind_slice(
5225 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5226 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5227 start, end
5230 Py_DECREF(str);
5231 Py_DECREF(sub);
5233 return result;
5236 static
5237 int tailmatch(PyUnicodeObject *self,
5238 PyUnicodeObject *substring,
5239 Py_ssize_t start,
5240 Py_ssize_t end,
5241 int direction)
5243 if (substring->length == 0)
5244 return 1;
5246 FIX_START_END(self);
5248 end -= substring->length;
5249 if (end < start)
5250 return 0;
5252 if (direction > 0) {
5253 if (Py_UNICODE_MATCH(self, end, substring))
5254 return 1;
5255 } else {
5256 if (Py_UNICODE_MATCH(self, start, substring))
5257 return 1;
5260 return 0;
5263 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5264 PyObject *substr,
5265 Py_ssize_t start,
5266 Py_ssize_t end,
5267 int direction)
5269 Py_ssize_t result;
5271 str = PyUnicode_FromObject(str);
5272 if (str == NULL)
5273 return -1;
5274 substr = PyUnicode_FromObject(substr);
5275 if (substr == NULL) {
5276 Py_DECREF(str);
5277 return -1;
5280 result = tailmatch((PyUnicodeObject *)str,
5281 (PyUnicodeObject *)substr,
5282 start, end, direction);
5283 Py_DECREF(str);
5284 Py_DECREF(substr);
5285 return result;
5288 /* Apply fixfct filter to the Unicode object self and return a
5289 reference to the modified object */
5291 static
5292 PyObject *fixup(PyUnicodeObject *self,
5293 int (*fixfct)(PyUnicodeObject *s))
5296 PyUnicodeObject *u;
5298 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5299 if (u == NULL)
5300 return NULL;
5302 Py_UNICODE_COPY(u->str, self->str, self->length);
5304 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5305 /* fixfct should return TRUE if it modified the buffer. If
5306 FALSE, return a reference to the original buffer instead
5307 (to save space, not time) */
5308 Py_INCREF(self);
5309 Py_DECREF(u);
5310 return (PyObject*) self;
5312 return (PyObject*) u;
5315 static
5316 int fixupper(PyUnicodeObject *self)
5318 Py_ssize_t len = self->length;
5319 Py_UNICODE *s = self->str;
5320 int status = 0;
5322 while (len-- > 0) {
5323 register Py_UNICODE ch;
5325 ch = Py_UNICODE_TOUPPER(*s);
5326 if (ch != *s) {
5327 status = 1;
5328 *s = ch;
5330 s++;
5333 return status;
5336 static
5337 int fixlower(PyUnicodeObject *self)
5339 Py_ssize_t len = self->length;
5340 Py_UNICODE *s = self->str;
5341 int status = 0;
5343 while (len-- > 0) {
5344 register Py_UNICODE ch;
5346 ch = Py_UNICODE_TOLOWER(*s);
5347 if (ch != *s) {
5348 status = 1;
5349 *s = ch;
5351 s++;
5354 return status;
5357 static
5358 int fixswapcase(PyUnicodeObject *self)
5360 Py_ssize_t len = self->length;
5361 Py_UNICODE *s = self->str;
5362 int status = 0;
5364 while (len-- > 0) {
5365 if (Py_UNICODE_ISUPPER(*s)) {
5366 *s = Py_UNICODE_TOLOWER(*s);
5367 status = 1;
5368 } else if (Py_UNICODE_ISLOWER(*s)) {
5369 *s = Py_UNICODE_TOUPPER(*s);
5370 status = 1;
5372 s++;
5375 return status;
5378 static
5379 int fixcapitalize(PyUnicodeObject *self)
5381 Py_ssize_t len = self->length;
5382 Py_UNICODE *s = self->str;
5383 int status = 0;
5385 if (len == 0)
5386 return 0;
5387 if (Py_UNICODE_ISLOWER(*s)) {
5388 *s = Py_UNICODE_TOUPPER(*s);
5389 status = 1;
5391 s++;
5392 while (--len > 0) {
5393 if (Py_UNICODE_ISUPPER(*s)) {
5394 *s = Py_UNICODE_TOLOWER(*s);
5395 status = 1;
5397 s++;
5399 return status;
5402 static
5403 int fixtitle(PyUnicodeObject *self)
5405 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5406 register Py_UNICODE *e;
5407 int previous_is_cased;
5409 /* Shortcut for single character strings */
5410 if (PyUnicode_GET_SIZE(self) == 1) {
5411 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5412 if (*p != ch) {
5413 *p = ch;
5414 return 1;
5416 else
5417 return 0;
5420 e = p + PyUnicode_GET_SIZE(self);
5421 previous_is_cased = 0;
5422 for (; p < e; p++) {
5423 register const Py_UNICODE ch = *p;
5425 if (previous_is_cased)
5426 *p = Py_UNICODE_TOLOWER(ch);
5427 else
5428 *p = Py_UNICODE_TOTITLE(ch);
5430 if (Py_UNICODE_ISLOWER(ch) ||
5431 Py_UNICODE_ISUPPER(ch) ||
5432 Py_UNICODE_ISTITLE(ch))
5433 previous_is_cased = 1;
5434 else
5435 previous_is_cased = 0;
5437 return 1;
5440 PyObject *
5441 PyUnicode_Join(PyObject *separator, PyObject *seq)
5443 PyObject *internal_separator = NULL;
5444 const Py_UNICODE blank = ' ';
5445 const Py_UNICODE *sep = &blank;
5446 Py_ssize_t seplen = 1;
5447 PyUnicodeObject *res = NULL; /* the result */
5448 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5449 Py_ssize_t res_used; /* # used bytes */
5450 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5451 PyObject *fseq; /* PySequence_Fast(seq) */
5452 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5453 PyObject *item;
5454 Py_ssize_t i;
5456 fseq = PySequence_Fast(seq, "");
5457 if (fseq == NULL) {
5458 return NULL;
5461 /* Grrrr. A codec may be invoked to convert str objects to
5462 * Unicode, and so it's possible to call back into Python code
5463 * during PyUnicode_FromObject(), and so it's possible for a sick
5464 * codec to change the size of fseq (if seq is a list). Therefore
5465 * we have to keep refetching the size -- can't assume seqlen
5466 * is invariant.
5468 seqlen = PySequence_Fast_GET_SIZE(fseq);
5469 /* If empty sequence, return u"". */
5470 if (seqlen == 0) {
5471 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5472 goto Done;
5474 /* If singleton sequence with an exact Unicode, return that. */
5475 if (seqlen == 1) {
5476 item = PySequence_Fast_GET_ITEM(fseq, 0);
5477 if (PyUnicode_CheckExact(item)) {
5478 Py_INCREF(item);
5479 res = (PyUnicodeObject *)item;
5480 goto Done;
5484 /* At least two items to join, or one that isn't exact Unicode. */
5485 if (seqlen > 1) {
5486 /* Set up sep and seplen -- they're needed. */
5487 if (separator == NULL) {
5488 sep = &blank;
5489 seplen = 1;
5491 else {
5492 internal_separator = PyUnicode_FromObject(separator);
5493 if (internal_separator == NULL)
5494 goto onError;
5495 sep = PyUnicode_AS_UNICODE(internal_separator);
5496 seplen = PyUnicode_GET_SIZE(internal_separator);
5497 /* In case PyUnicode_FromObject() mutated seq. */
5498 seqlen = PySequence_Fast_GET_SIZE(fseq);
5502 /* Get space. */
5503 res = _PyUnicode_New(res_alloc);
5504 if (res == NULL)
5505 goto onError;
5506 res_p = PyUnicode_AS_UNICODE(res);
5507 res_used = 0;
5509 for (i = 0; i < seqlen; ++i) {
5510 Py_ssize_t itemlen;
5511 Py_ssize_t new_res_used;
5513 item = PySequence_Fast_GET_ITEM(fseq, i);
5514 /* Convert item to Unicode. */
5515 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5516 PyErr_Format(PyExc_TypeError,
5517 "sequence item %zd: expected string or Unicode,"
5518 " %.80s found",
5519 i, Py_TYPE(item)->tp_name);
5520 goto onError;
5522 item = PyUnicode_FromObject(item);
5523 if (item == NULL)
5524 goto onError;
5525 /* We own a reference to item from here on. */
5527 /* In case PyUnicode_FromObject() mutated seq. */
5528 seqlen = PySequence_Fast_GET_SIZE(fseq);
5530 /* Make sure we have enough space for the separator and the item. */
5531 itemlen = PyUnicode_GET_SIZE(item);
5532 new_res_used = res_used + itemlen;
5533 if (new_res_used < 0)
5534 goto Overflow;
5535 if (i < seqlen - 1) {
5536 new_res_used += seplen;
5537 if (new_res_used < 0)
5538 goto Overflow;
5540 if (new_res_used > res_alloc) {
5541 /* double allocated size until it's big enough */
5542 do {
5543 res_alloc += res_alloc;
5544 if (res_alloc <= 0)
5545 goto Overflow;
5546 } while (new_res_used > res_alloc);
5547 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5548 Py_DECREF(item);
5549 goto onError;
5551 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5554 /* Copy item, and maybe the separator. */
5555 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5556 res_p += itemlen;
5557 if (i < seqlen - 1) {
5558 Py_UNICODE_COPY(res_p, sep, seplen);
5559 res_p += seplen;
5561 Py_DECREF(item);
5562 res_used = new_res_used;
5565 /* Shrink res to match the used area; this probably can't fail,
5566 * but it's cheap to check.
5568 if (_PyUnicode_Resize(&res, res_used) < 0)
5569 goto onError;
5571 Done:
5572 Py_XDECREF(internal_separator);
5573 Py_DECREF(fseq);
5574 return (PyObject *)res;
5576 Overflow:
5577 PyErr_SetString(PyExc_OverflowError,
5578 "join() result is too long for a Python string");
5579 Py_DECREF(item);
5580 /* fall through */
5582 onError:
5583 Py_XDECREF(internal_separator);
5584 Py_DECREF(fseq);
5585 Py_XDECREF(res);
5586 return NULL;
5589 static
5590 PyUnicodeObject *pad(PyUnicodeObject *self,
5591 Py_ssize_t left,
5592 Py_ssize_t right,
5593 Py_UNICODE fill)
5595 PyUnicodeObject *u;
5597 if (left < 0)
5598 left = 0;
5599 if (right < 0)
5600 right = 0;
5602 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5603 Py_INCREF(self);
5604 return self;
5607 if (left > PY_SSIZE_T_MAX - self->length ||
5608 right > PY_SSIZE_T_MAX - (left + self->length)) {
5609 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5610 return NULL;
5612 u = _PyUnicode_New(left + self->length + right);
5613 if (u) {
5614 if (left)
5615 Py_UNICODE_FILL(u->str, fill, left);
5616 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5617 if (right)
5618 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5621 return u;
/* Append data[left:right] to `list` as a new unicode object.

   Relies on the calling function providing a `PyObject *str` temporary, a
   `PyObject *list` and an `onError` label, which is jumped to when the
   object cannot be created or appended.  The temporary reference is
   released in either case.

   Wrapped in do { } while (0) so the expansion is a single statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5635 static
5636 PyObject *split_whitespace(PyUnicodeObject *self,
5637 PyObject *list,
5638 Py_ssize_t maxcount)
5640 register Py_ssize_t i;
5641 register Py_ssize_t j;
5642 Py_ssize_t len = self->length;
5643 PyObject *str;
5644 register const Py_UNICODE *buf = self->str;
5646 for (i = j = 0; i < len; ) {
5647 /* find a token */
5648 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5649 i++;
5650 j = i;
5651 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5652 i++;
5653 if (j < i) {
5654 if (maxcount-- <= 0)
5655 break;
5656 SPLIT_APPEND(buf, j, i);
5657 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5658 i++;
5659 j = i;
5662 if (j < len) {
5663 SPLIT_APPEND(buf, j, len);
5665 return list;
5667 onError:
5668 Py_DECREF(list);
5669 return NULL;
5672 PyObject *PyUnicode_Splitlines(PyObject *string,
5673 int keepends)
5675 register Py_ssize_t i;
5676 register Py_ssize_t j;
5677 Py_ssize_t len;
5678 PyObject *list;
5679 PyObject *str;
5680 Py_UNICODE *data;
5682 string = PyUnicode_FromObject(string);
5683 if (string == NULL)
5684 return NULL;
5685 data = PyUnicode_AS_UNICODE(string);
5686 len = PyUnicode_GET_SIZE(string);
5688 list = PyList_New(0);
5689 if (!list)
5690 goto onError;
5692 for (i = j = 0; i < len; ) {
5693 Py_ssize_t eol;
5695 /* Find a line and append it */
5696 while (i < len && !BLOOM_LINEBREAK(data[i]))
5697 i++;
5699 /* Skip the line break reading CRLF as one line break */
5700 eol = i;
5701 if (i < len) {
5702 if (data[i] == '\r' && i + 1 < len &&
5703 data[i+1] == '\n')
5704 i += 2;
5705 else
5706 i++;
5707 if (keepends)
5708 eol = i;
5710 SPLIT_APPEND(data, j, eol);
5711 j = i;
5713 if (j < len) {
5714 SPLIT_APPEND(data, j, len);
5717 Py_DECREF(string);
5718 return list;
5720 onError:
5721 Py_XDECREF(list);
5722 Py_DECREF(string);
5723 return NULL;
5726 static
5727 PyObject *split_char(PyUnicodeObject *self,
5728 PyObject *list,
5729 Py_UNICODE ch,
5730 Py_ssize_t maxcount)
5732 register Py_ssize_t i;
5733 register Py_ssize_t j;
5734 Py_ssize_t len = self->length;
5735 PyObject *str;
5736 register const Py_UNICODE *buf = self->str;
5738 for (i = j = 0; i < len; ) {
5739 if (buf[i] == ch) {
5740 if (maxcount-- <= 0)
5741 break;
5742 SPLIT_APPEND(buf, j, i);
5743 i = j = i + 1;
5744 } else
5745 i++;
5747 if (j <= len) {
5748 SPLIT_APPEND(buf, j, len);
5750 return list;
5752 onError:
5753 Py_DECREF(list);
5754 return NULL;
5757 static
5758 PyObject *split_substring(PyUnicodeObject *self,
5759 PyObject *list,
5760 PyUnicodeObject *substring,
5761 Py_ssize_t maxcount)
5763 register Py_ssize_t i;
5764 register Py_ssize_t j;
5765 Py_ssize_t len = self->length;
5766 Py_ssize_t sublen = substring->length;
5767 PyObject *str;
5769 for (i = j = 0; i <= len - sublen; ) {
5770 if (Py_UNICODE_MATCH(self, i, substring)) {
5771 if (maxcount-- <= 0)
5772 break;
5773 SPLIT_APPEND(self->str, j, i);
5774 i = j = i + sublen;
5775 } else
5776 i++;
5778 if (j <= len) {
5779 SPLIT_APPEND(self->str, j, len);
5781 return list;
5783 onError:
5784 Py_DECREF(list);
5785 return NULL;
5788 static
5789 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5790 PyObject *list,
5791 Py_ssize_t maxcount)
5793 register Py_ssize_t i;
5794 register Py_ssize_t j;
5795 Py_ssize_t len = self->length;
5796 PyObject *str;
5797 register const Py_UNICODE *buf = self->str;
5799 for (i = j = len - 1; i >= 0; ) {
5800 /* find a token */
5801 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5802 i--;
5803 j = i;
5804 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5805 i--;
5806 if (j > i) {
5807 if (maxcount-- <= 0)
5808 break;
5809 SPLIT_APPEND(buf, i + 1, j + 1);
5810 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5811 i--;
5812 j = i;
5815 if (j >= 0) {
5816 SPLIT_APPEND(buf, 0, j + 1);
5818 if (PyList_Reverse(list) < 0)
5819 goto onError;
5820 return list;
5822 onError:
5823 Py_DECREF(list);
5824 return NULL;
5827 static
5828 PyObject *rsplit_char(PyUnicodeObject *self,
5829 PyObject *list,
5830 Py_UNICODE ch,
5831 Py_ssize_t maxcount)
5833 register Py_ssize_t i;
5834 register Py_ssize_t j;
5835 Py_ssize_t len = self->length;
5836 PyObject *str;
5837 register const Py_UNICODE *buf = self->str;
5839 for (i = j = len - 1; i >= 0; ) {
5840 if (buf[i] == ch) {
5841 if (maxcount-- <= 0)
5842 break;
5843 SPLIT_APPEND(buf, i + 1, j + 1);
5844 j = i = i - 1;
5845 } else
5846 i--;
5848 if (j >= -1) {
5849 SPLIT_APPEND(buf, 0, j + 1);
5851 if (PyList_Reverse(list) < 0)
5852 goto onError;
5853 return list;
5855 onError:
5856 Py_DECREF(list);
5857 return NULL;
5860 static
5861 PyObject *rsplit_substring(PyUnicodeObject *self,
5862 PyObject *list,
5863 PyUnicodeObject *substring,
5864 Py_ssize_t maxcount)
5866 register Py_ssize_t i;
5867 register Py_ssize_t j;
5868 Py_ssize_t len = self->length;
5869 Py_ssize_t sublen = substring->length;
5870 PyObject *str;
5872 for (i = len - sublen, j = len; i >= 0; ) {
5873 if (Py_UNICODE_MATCH(self, i, substring)) {
5874 if (maxcount-- <= 0)
5875 break;
5876 SPLIT_APPEND(self->str, i + sublen, j);
5877 j = i;
5878 i -= sublen;
5879 } else
5880 i--;
5882 if (j >= 0) {
5883 SPLIT_APPEND(self->str, 0, j);
5885 if (PyList_Reverse(list) < 0)
5886 goto onError;
5887 return list;
5889 onError:
5890 Py_DECREF(list);
5891 return NULL;
5894 #undef SPLIT_APPEND
5896 static
5897 PyObject *split(PyUnicodeObject *self,
5898 PyUnicodeObject *substring,
5899 Py_ssize_t maxcount)
5901 PyObject *list;
5903 if (maxcount < 0)
5904 maxcount = PY_SSIZE_T_MAX;
5906 list = PyList_New(0);
5907 if (!list)
5908 return NULL;
5910 if (substring == NULL)
5911 return split_whitespace(self,list,maxcount);
5913 else if (substring->length == 1)
5914 return split_char(self,list,substring->str[0],maxcount);
5916 else if (substring->length == 0) {
5917 Py_DECREF(list);
5918 PyErr_SetString(PyExc_ValueError, "empty separator");
5919 return NULL;
5921 else
5922 return split_substring(self,list,substring,maxcount);
5925 static
5926 PyObject *rsplit(PyUnicodeObject *self,
5927 PyUnicodeObject *substring,
5928 Py_ssize_t maxcount)
5930 PyObject *list;
5932 if (maxcount < 0)
5933 maxcount = PY_SSIZE_T_MAX;
5935 list = PyList_New(0);
5936 if (!list)
5937 return NULL;
5939 if (substring == NULL)
5940 return rsplit_whitespace(self,list,maxcount);
5942 else if (substring->length == 1)
5943 return rsplit_char(self,list,substring->str[0],maxcount);
5945 else if (substring->length == 0) {
5946 Py_DECREF(list);
5947 PyErr_SetString(PyExc_ValueError, "empty separator");
5948 return NULL;
5950 else
5951 return rsplit_substring(self,list,substring,maxcount);
5954 static
5955 PyObject *replace(PyUnicodeObject *self,
5956 PyUnicodeObject *str1,
5957 PyUnicodeObject *str2,
5958 Py_ssize_t maxcount)
5960 PyUnicodeObject *u;
5962 if (maxcount < 0)
5963 maxcount = PY_SSIZE_T_MAX;
5965 if (str1->length == str2->length) {
5966 /* same length */
5967 Py_ssize_t i;
5968 if (str1->length == 1) {
5969 /* replace characters */
5970 Py_UNICODE u1, u2;
5971 if (!findchar(self->str, self->length, str1->str[0]))
5972 goto nothing;
5973 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5974 if (!u)
5975 return NULL;
5976 Py_UNICODE_COPY(u->str, self->str, self->length);
5977 u1 = str1->str[0];
5978 u2 = str2->str[0];
5979 for (i = 0; i < u->length; i++)
5980 if (u->str[i] == u1) {
5981 if (--maxcount < 0)
5982 break;
5983 u->str[i] = u2;
5985 } else {
5986 i = fastsearch(
5987 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5989 if (i < 0)
5990 goto nothing;
5991 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5992 if (!u)
5993 return NULL;
5994 Py_UNICODE_COPY(u->str, self->str, self->length);
5995 while (i <= self->length - str1->length)
5996 if (Py_UNICODE_MATCH(self, i, str1)) {
5997 if (--maxcount < 0)
5998 break;
5999 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6000 i += str1->length;
6001 } else
6002 i++;
6004 } else {
6006 Py_ssize_t n, i, j, e;
6007 Py_ssize_t product, new_size, delta;
6008 Py_UNICODE *p;
6010 /* replace strings */
6011 n = stringlib_count(self->str, self->length, str1->str, str1->length);
6012 if (n > maxcount)
6013 n = maxcount;
6014 if (n == 0)
6015 goto nothing;
6016 /* new_size = self->length + n * (str2->length - str1->length)); */
6017 delta = (str2->length - str1->length);
6018 if (delta == 0) {
6019 new_size = self->length;
6020 } else {
6021 product = n * (str2->length - str1->length);
6022 if ((product / (str2->length - str1->length)) != n) {
6023 PyErr_SetString(PyExc_OverflowError,
6024 "replace string is too long");
6025 return NULL;
6027 new_size = self->length + product;
6028 if (new_size < 0) {
6029 PyErr_SetString(PyExc_OverflowError,
6030 "replace string is too long");
6031 return NULL;
6034 u = _PyUnicode_New(new_size);
6035 if (!u)
6036 return NULL;
6037 i = 0;
6038 p = u->str;
6039 e = self->length - str1->length;
6040 if (str1->length > 0) {
6041 while (n-- > 0) {
6042 /* look for next match */
6043 j = i;
6044 while (j <= e) {
6045 if (Py_UNICODE_MATCH(self, j, str1))
6046 break;
6047 j++;
6049 if (j > i) {
6050 if (j > e)
6051 break;
6052 /* copy unchanged part [i:j] */
6053 Py_UNICODE_COPY(p, self->str+i, j-i);
6054 p += j - i;
6056 /* copy substitution string */
6057 if (str2->length > 0) {
6058 Py_UNICODE_COPY(p, str2->str, str2->length);
6059 p += str2->length;
6061 i = j + str1->length;
6063 if (i < self->length)
6064 /* copy tail [i:] */
6065 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6066 } else {
6067 /* interleave */
6068 while (n > 0) {
6069 Py_UNICODE_COPY(p, str2->str, str2->length);
6070 p += str2->length;
6071 if (--n <= 0)
6072 break;
6073 *p++ = self->str[i++];
6075 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6078 return (PyObject *) u;
6080 nothing:
6081 /* nothing to replace; return original string (when possible) */
6082 if (PyUnicode_CheckExact(self)) {
6083 Py_INCREF(self);
6084 return (PyObject *) self;
6086 return PyUnicode_FromUnicode(self->str, self->length);
6089 /* --- Unicode Object Methods --------------------------------------------- */
6091 PyDoc_STRVAR(title__doc__,
6092 "S.title() -> unicode\n\
6094 Return a titlecased version of S, i.e. words start with title case\n\
6095 characters, all remaining cased characters have lower case.");
/* Implementation of S.title(): runs the fixtitle filter over self via
   fixup() and returns fixup()'s result unchanged. */
6097 static PyObject*
6098 unicode_title(PyUnicodeObject *self)
6100 return fixup(self, fixtitle);
6103 PyDoc_STRVAR(capitalize__doc__,
6104 "S.capitalize() -> unicode\n\
6106 Return a capitalized version of S, i.e. make the first character\n\
6107 have upper case.");
/* Implementation of S.capitalize(): runs the fixcapitalize filter over
   self via fixup() and returns fixup()'s result unchanged. */
6109 static PyObject*
6110 unicode_capitalize(PyUnicodeObject *self)
6112 return fixup(self, fixcapitalize);
6115 #if 0
6116 PyDoc_STRVAR(capwords__doc__,
6117 "S.capwords() -> unicode\n\
6119 Apply .capitalize() to all words in S and return the result with\n\
6120 normalized whitespace (all whitespace strings are replaced by ' ').");
6122 static PyObject*
6123 unicode_capwords(PyUnicodeObject *self)
6125 PyObject *list;
6126 PyObject *item;
6127 Py_ssize_t i;
6129 /* Split into words */
6130 list = split(self, NULL, -1);
6131 if (!list)
6132 return NULL;
6134 /* Capitalize each word */
6135 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6136 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6137 fixcapitalize);
6138 if (item == NULL)
6139 goto onError;
6140 Py_DECREF(PyList_GET_ITEM(list, i));
6141 PyList_SET_ITEM(list, i, item);
6144 /* Join the words to form a new string */
6145 item = PyUnicode_Join(NULL, list);
6147 onError:
6148 Py_DECREF(list);
6149 return (PyObject *)item;
6151 #endif
6153 /* Argument converter. Coerces to a single unicode character */
6155 static int
6156 convert_uc(PyObject *obj, void *addr)
6158 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6159 PyObject *uniobj;
6160 Py_UNICODE *unistr;
6162 uniobj = PyUnicode_FromObject(obj);
6163 if (uniobj == NULL) {
6164 PyErr_SetString(PyExc_TypeError,
6165 "The fill character cannot be converted to Unicode");
6166 return 0;
6168 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6169 PyErr_SetString(PyExc_TypeError,
6170 "The fill character must be exactly one character long");
6171 Py_DECREF(uniobj);
6172 return 0;
6174 unistr = PyUnicode_AS_UNICODE(uniobj);
6175 *fillcharloc = unistr[0];
6176 Py_DECREF(uniobj);
6177 return 1;
6180 PyDoc_STRVAR(center__doc__,
6181 "S.center(width[, fillchar]) -> unicode\n\
6183 Return S centered in a Unicode string of length width. Padding is\n\
6184 done using the specified fill character (default is a space)");
6186 static PyObject *
6187 unicode_center(PyUnicodeObject *self, PyObject *args)
6189 Py_ssize_t marg, left;
6190 Py_ssize_t width;
6191 Py_UNICODE fillchar = ' ';
6193 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6194 return NULL;
6196 if (self->length >= width && PyUnicode_CheckExact(self)) {
6197 Py_INCREF(self);
6198 return (PyObject*) self;
6201 marg = width - self->length;
6202 left = marg / 2 + (marg & width & 1);
6204 return (PyObject*) pad(self, left, marg - left, fillchar);
6207 #if 0
6209 /* This code should go into some future Unicode collation support
6210 module. The basic comparison should compare ordinals on a naive
6211 basis (this is what Java does and thus JPython too). */
6213 /* speedy UTF-16 code point order comparison */
6214 /* gleaned from: */
6215 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6217 static short utf16Fixup[32] =
6219 0, 0, 0, 0, 0, 0, 0, 0,
6220 0, 0, 0, 0, 0, 0, 0, 0,
6221 0, 0, 0, 0, 0, 0, 0, 0,
6222 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6225 static int
6226 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6228 Py_ssize_t len1, len2;
6230 Py_UNICODE *s1 = str1->str;
6231 Py_UNICODE *s2 = str2->str;
6233 len1 = str1->length;
6234 len2 = str2->length;
6236 while (len1 > 0 && len2 > 0) {
6237 Py_UNICODE c1, c2;
6239 c1 = *s1++;
6240 c2 = *s2++;
6242 if (c1 > (1<<11) * 26)
6243 c1 += utf16Fixup[c1>>11];
6244 if (c2 > (1<<11) * 26)
6245 c2 += utf16Fixup[c2>>11];
6246 /* now c1 and c2 are in UTF-32-compatible order */
6248 if (c1 != c2)
6249 return (c1 < c2) ? -1 : 1;
6251 len1--; len2--;
6254 return (len1 < len2) ? -1 : (len1 != len2);
6257 #else
6259 static int
6260 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6262 register Py_ssize_t len1, len2;
6264 Py_UNICODE *s1 = str1->str;
6265 Py_UNICODE *s2 = str2->str;
6267 len1 = str1->length;
6268 len2 = str2->length;
6270 while (len1 > 0 && len2 > 0) {
6271 Py_UNICODE c1, c2;
6273 c1 = *s1++;
6274 c2 = *s2++;
6276 if (c1 != c2)
6277 return (c1 < c2) ? -1 : 1;
6279 len1--; len2--;
6282 return (len1 < len2) ? -1 : (len1 != len2);
6285 #endif
6287 int PyUnicode_Compare(PyObject *left,
6288 PyObject *right)
6290 PyUnicodeObject *u = NULL, *v = NULL;
6291 int result;
6293 /* Coerce the two arguments */
6294 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6295 if (u == NULL)
6296 goto onError;
6297 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6298 if (v == NULL)
6299 goto onError;
6301 /* Shortcut for empty or interned objects */
6302 if (v == u) {
6303 Py_DECREF(u);
6304 Py_DECREF(v);
6305 return 0;
6308 result = unicode_compare(u, v);
6310 Py_DECREF(u);
6311 Py_DECREF(v);
6312 return result;
6314 onError:
6315 Py_XDECREF(u);
6316 Py_XDECREF(v);
6317 return -1;
6320 PyObject *PyUnicode_RichCompare(PyObject *left,
6321 PyObject *right,
6322 int op)
6324 int result;
6326 result = PyUnicode_Compare(left, right);
6327 if (result == -1 && PyErr_Occurred())
6328 goto onError;
6330 /* Convert the return value to a Boolean */
6331 switch (op) {
6332 case Py_EQ:
6333 result = (result == 0);
6334 break;
6335 case Py_NE:
6336 result = (result != 0);
6337 break;
6338 case Py_LE:
6339 result = (result <= 0);
6340 break;
6341 case Py_GE:
6342 result = (result >= 0);
6343 break;
6344 case Py_LT:
6345 result = (result == -1);
6346 break;
6347 case Py_GT:
6348 result = (result == 1);
6349 break;
6351 return PyBool_FromLong(result);
6353 onError:
6355 /* Standard case
6357 Type errors mean that PyUnicode_FromObject() could not convert
6358 one of the arguments (usually the right hand side) to Unicode,
6359 ie. we can't handle the comparison request. However, it is
6360 possible that the other object knows a comparison method, which
6361 is why we return Py_NotImplemented to give the other object a
6362 chance.
6365 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6366 PyErr_Clear();
6367 Py_INCREF(Py_NotImplemented);
6368 return Py_NotImplemented;
6370 if (op != Py_EQ && op != Py_NE)
6371 return NULL;
6373 /* Equality comparison.
6375 This is a special case: we silence any PyExc_UnicodeDecodeError
6376 and instead turn it into a PyErr_UnicodeWarning.
6379 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6380 return NULL;
6381 PyErr_Clear();
6382 if (PyErr_Warn(PyExc_UnicodeWarning,
6383 (op == Py_EQ) ?
6384 "Unicode equal comparison "
6385 "failed to convert both arguments to Unicode - "
6386 "interpreting them as being unequal" :
6387 "Unicode unequal comparison "
6388 "failed to convert both arguments to Unicode - "
6389 "interpreting them as being unequal"
6390 ) < 0)
6391 return NULL;
6392 result = (op == Py_NE);
6393 return PyBool_FromLong(result);
6396 int PyUnicode_Contains(PyObject *container,
6397 PyObject *element)
6399 PyObject *str, *sub;
6400 int result;
6402 /* Coerce the two arguments */
6403 sub = PyUnicode_FromObject(element);
6404 if (!sub) {
6405 PyErr_SetString(PyExc_TypeError,
6406 "'in <string>' requires string as left operand");
6407 return -1;
6410 str = PyUnicode_FromObject(container);
6411 if (!str) {
6412 Py_DECREF(sub);
6413 return -1;
6416 result = stringlib_contains_obj(str, sub);
6418 Py_DECREF(str);
6419 Py_DECREF(sub);
6421 return result;
6424 /* Concat to string or Unicode object giving a new Unicode object. */
6426 PyObject *PyUnicode_Concat(PyObject *left,
6427 PyObject *right)
6429 PyUnicodeObject *u = NULL, *v = NULL, *w;
6431 /* Coerce the two arguments */
6432 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6433 if (u == NULL)
6434 goto onError;
6435 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6436 if (v == NULL)
6437 goto onError;
6439 /* Shortcuts */
6440 if (v == unicode_empty) {
6441 Py_DECREF(v);
6442 return (PyObject *)u;
6444 if (u == unicode_empty) {
6445 Py_DECREF(u);
6446 return (PyObject *)v;
6449 /* Concat the two Unicode strings */
6450 w = _PyUnicode_New(u->length + v->length);
6451 if (w == NULL)
6452 goto onError;
6453 Py_UNICODE_COPY(w->str, u->str, u->length);
6454 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6456 Py_DECREF(u);
6457 Py_DECREF(v);
6458 return (PyObject *)w;
6460 onError:
6461 Py_XDECREF(u);
6462 Py_XDECREF(v);
6463 return NULL;
6466 PyDoc_STRVAR(count__doc__,
6467 "S.count(sub[, start[, end]]) -> int\n\
6469 Return the number of non-overlapping occurrences of substring sub in\n\
6470 Unicode string S[start:end]. Optional arguments start and end are\n\
6471 interpreted as in slice notation.");
6473 static PyObject *
6474 unicode_count(PyUnicodeObject *self, PyObject *args)
6476 PyUnicodeObject *substring;
6477 Py_ssize_t start = 0;
6478 Py_ssize_t end = PY_SSIZE_T_MAX;
6479 PyObject *result;
6481 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6482 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6483 return NULL;
6485 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6486 (PyObject *)substring);
6487 if (substring == NULL)
6488 return NULL;
6490 FIX_START_END(self);
6492 result = PyInt_FromSsize_t(
6493 stringlib_count(self->str + start, end - start,
6494 substring->str, substring->length)
6497 Py_DECREF(substring);
6499 return result;
6502 PyDoc_STRVAR(encode__doc__,
6503 "S.encode([encoding[,errors]]) -> string or unicode\n\
6505 Encodes S using the codec registered for encoding. encoding defaults\n\
6506 to the default encoding. errors may be given to set a different error\n\
6507 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6508 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6509 'xmlcharrefreplace' as well as any other name registered with\n\
6510 codecs.register_error that can handle UnicodeEncodeErrors.");
6512 static PyObject *
6513 unicode_encode(PyUnicodeObject *self, PyObject *args)
6515 char *encoding = NULL;
6516 char *errors = NULL;
6517 PyObject *v;
6519 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6520 return NULL;
6521 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6522 if (v == NULL)
6523 goto onError;
6524 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6525 PyErr_Format(PyExc_TypeError,
6526 "encoder did not return a string/unicode object "
6527 "(type=%.400s)",
6528 Py_TYPE(v)->tp_name);
6529 Py_DECREF(v);
6530 return NULL;
6532 return v;
6534 onError:
6535 return NULL;
6538 PyDoc_STRVAR(decode__doc__,
6539 "S.decode([encoding[,errors]]) -> string or unicode\n\
6541 Decodes S using the codec registered for encoding. encoding defaults\n\
6542 to the default encoding. errors may be given to set a different error\n\
6543 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6544 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6545 as well as any other name registerd with codecs.register_error that is\n\
6546 able to handle UnicodeDecodeErrors.");
6548 static PyObject *
6549 unicode_decode(PyUnicodeObject *self, PyObject *args)
6551 char *encoding = NULL;
6552 char *errors = NULL;
6553 PyObject *v;
6555 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6556 return NULL;
6557 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6558 if (v == NULL)
6559 goto onError;
6560 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6561 PyErr_Format(PyExc_TypeError,
6562 "decoder did not return a string/unicode object "
6563 "(type=%.400s)",
6564 Py_TYPE(v)->tp_name);
6565 Py_DECREF(v);
6566 return NULL;
6568 return v;
6570 onError:
6571 return NULL;
6574 PyDoc_STRVAR(expandtabs__doc__,
6575 "S.expandtabs([tabsize]) -> unicode\n\
6577 Return a copy of S where all tab characters are expanded using spaces.\n\
6578 If tabsize is not given, a tab size of 8 characters is assumed.");
6580 static PyObject*
6581 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6583 Py_UNICODE *e;
6584 Py_UNICODE *p;
6585 Py_UNICODE *q;
6586 Py_UNICODE *qe;
6587 Py_ssize_t i, j, incr;
6588 PyUnicodeObject *u;
6589 int tabsize = 8;
6591 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6592 return NULL;
6594 /* First pass: determine size of output string */
6595 i = 0; /* chars up to and including most recent \n or \r */
6596 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6597 e = self->str + self->length; /* end of input */
6598 for (p = self->str; p < e; p++)
6599 if (*p == '\t') {
6600 if (tabsize > 0) {
6601 incr = tabsize - (j % tabsize); /* cannot overflow */
6602 if (j > PY_SSIZE_T_MAX - incr)
6603 goto overflow1;
6604 j += incr;
6607 else {
6608 if (j > PY_SSIZE_T_MAX - 1)
6609 goto overflow1;
6610 j++;
6611 if (*p == '\n' || *p == '\r') {
6612 if (i > PY_SSIZE_T_MAX - j)
6613 goto overflow1;
6614 i += j;
6615 j = 0;
6619 if (i > PY_SSIZE_T_MAX - j)
6620 goto overflow1;
6622 /* Second pass: create output string and fill it */
6623 u = _PyUnicode_New(i + j);
6624 if (!u)
6625 return NULL;
6627 j = 0; /* same as in first pass */
6628 q = u->str; /* next output char */
6629 qe = u->str + u->length; /* end of output */
6631 for (p = self->str; p < e; p++)
6632 if (*p == '\t') {
6633 if (tabsize > 0) {
6634 i = tabsize - (j % tabsize);
6635 j += i;
6636 while (i--) {
6637 if (q >= qe)
6638 goto overflow2;
6639 *q++ = ' ';
6643 else {
6644 if (q >= qe)
6645 goto overflow2;
6646 *q++ = *p;
6647 j++;
6648 if (*p == '\n' || *p == '\r')
6649 j = 0;
6652 return (PyObject*) u;
6654 overflow2:
6655 Py_DECREF(u);
6656 overflow1:
6657 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6658 return NULL;
6661 PyDoc_STRVAR(find__doc__,
6662 "S.find(sub [,start [,end]]) -> int\n\
6664 Return the lowest index in S where substring sub is found,\n\
6665 such that sub is contained within s[start:end]. Optional\n\
6666 arguments start and end are interpreted as in slice notation.\n\
6668 Return -1 on failure.");
6670 static PyObject *
6671 unicode_find(PyUnicodeObject *self, PyObject *args)
6673 PyObject *substring;
6674 Py_ssize_t start;
6675 Py_ssize_t end;
6676 Py_ssize_t result;
6678 if (!_ParseTupleFinds(args, &substring, &start, &end))
6679 return NULL;
6681 result = stringlib_find_slice(
6682 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6683 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6684 start, end
6687 Py_DECREF(substring);
6689 return PyInt_FromSsize_t(result);
6692 static PyObject *
6693 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6695 if (index < 0 || index >= self->length) {
6696 PyErr_SetString(PyExc_IndexError, "string index out of range");
6697 return NULL;
6700 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6703 static long
6704 unicode_hash(PyUnicodeObject *self)
6706 /* Since Unicode objects compare equal to their ASCII string
6707 counterparts, they should use the individual character values
6708 as basis for their hash value. This is needed to assure that
6709 strings and Unicode objects behave in the same way as
6710 dictionary keys. */
6712 register Py_ssize_t len;
6713 register Py_UNICODE *p;
6714 register long x;
6716 if (self->hash != -1)
6717 return self->hash;
6718 len = PyUnicode_GET_SIZE(self);
6719 p = PyUnicode_AS_UNICODE(self);
6720 x = *p << 7;
6721 while (--len >= 0)
6722 x = (1000003*x) ^ *p++;
6723 x ^= PyUnicode_GET_SIZE(self);
6724 if (x == -1)
6725 x = -2;
6726 self->hash = x;
6727 return x;
6730 PyDoc_STRVAR(index__doc__,
6731 "S.index(sub [,start [,end]]) -> int\n\
6733 Like S.find() but raise ValueError when the substring is not found.");
6735 static PyObject *
6736 unicode_index(PyUnicodeObject *self, PyObject *args)
6738 Py_ssize_t result;
6739 PyObject *substring;
6740 Py_ssize_t start;
6741 Py_ssize_t end;
6743 if (!_ParseTupleFinds(args, &substring, &start, &end))
6744 return NULL;
6746 result = stringlib_find_slice(
6747 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6748 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6749 start, end
6752 Py_DECREF(substring);
6754 if (result < 0) {
6755 PyErr_SetString(PyExc_ValueError, "substring not found");
6756 return NULL;
6759 return PyInt_FromSsize_t(result);
6762 PyDoc_STRVAR(islower__doc__,
6763 "S.islower() -> bool\n\
6765 Return True if all cased characters in S are lowercase and there is\n\
6766 at least one cased character in S, False otherwise.");
6768 static PyObject*
6769 unicode_islower(PyUnicodeObject *self)
6771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6772 register const Py_UNICODE *e;
6773 int cased;
6775 /* Shortcut for single character strings */
6776 if (PyUnicode_GET_SIZE(self) == 1)
6777 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6779 /* Special case for empty strings */
6780 if (PyUnicode_GET_SIZE(self) == 0)
6781 return PyBool_FromLong(0);
6783 e = p + PyUnicode_GET_SIZE(self);
6784 cased = 0;
6785 for (; p < e; p++) {
6786 register const Py_UNICODE ch = *p;
6788 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6789 return PyBool_FromLong(0);
6790 else if (!cased && Py_UNICODE_ISLOWER(ch))
6791 cased = 1;
6793 return PyBool_FromLong(cased);
6796 PyDoc_STRVAR(isupper__doc__,
6797 "S.isupper() -> bool\n\
6799 Return True if all cased characters in S are uppercase and there is\n\
6800 at least one cased character in S, False otherwise.");
6802 static PyObject*
6803 unicode_isupper(PyUnicodeObject *self)
6805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6806 register const Py_UNICODE *e;
6807 int cased;
6809 /* Shortcut for single character strings */
6810 if (PyUnicode_GET_SIZE(self) == 1)
6811 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6813 /* Special case for empty strings */
6814 if (PyUnicode_GET_SIZE(self) == 0)
6815 return PyBool_FromLong(0);
6817 e = p + PyUnicode_GET_SIZE(self);
6818 cased = 0;
6819 for (; p < e; p++) {
6820 register const Py_UNICODE ch = *p;
6822 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6823 return PyBool_FromLong(0);
6824 else if (!cased && Py_UNICODE_ISUPPER(ch))
6825 cased = 1;
6827 return PyBool_FromLong(cased);
6830 PyDoc_STRVAR(istitle__doc__,
6831 "S.istitle() -> bool\n\
6833 Return True if S is a titlecased string and there is at least one\n\
6834 character in S, i.e. upper- and titlecase characters may only\n\
6835 follow uncased characters and lowercase characters only cased ones.\n\
6836 Return False otherwise.");
6838 static PyObject*
6839 unicode_istitle(PyUnicodeObject *self)
6841 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6842 register const Py_UNICODE *e;
6843 int cased, previous_is_cased;
6845 /* Shortcut for single character strings */
6846 if (PyUnicode_GET_SIZE(self) == 1)
6847 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6848 (Py_UNICODE_ISUPPER(*p) != 0));
6850 /* Special case for empty strings */
6851 if (PyUnicode_GET_SIZE(self) == 0)
6852 return PyBool_FromLong(0);
6854 e = p + PyUnicode_GET_SIZE(self);
6855 cased = 0;
6856 previous_is_cased = 0;
6857 for (; p < e; p++) {
6858 register const Py_UNICODE ch = *p;
6860 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6861 if (previous_is_cased)
6862 return PyBool_FromLong(0);
6863 previous_is_cased = 1;
6864 cased = 1;
6866 else if (Py_UNICODE_ISLOWER(ch)) {
6867 if (!previous_is_cased)
6868 return PyBool_FromLong(0);
6869 previous_is_cased = 1;
6870 cased = 1;
6872 else
6873 previous_is_cased = 0;
6875 return PyBool_FromLong(cased);
6878 PyDoc_STRVAR(isspace__doc__,
6879 "S.isspace() -> bool\n\
6881 Return True if all characters in S are whitespace\n\
6882 and there is at least one character in S, False otherwise.");
6884 static PyObject*
6885 unicode_isspace(PyUnicodeObject *self)
6887 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6888 register const Py_UNICODE *e;
6890 /* Shortcut for single character strings */
6891 if (PyUnicode_GET_SIZE(self) == 1 &&
6892 Py_UNICODE_ISSPACE(*p))
6893 return PyBool_FromLong(1);
6895 /* Special case for empty strings */
6896 if (PyUnicode_GET_SIZE(self) == 0)
6897 return PyBool_FromLong(0);
6899 e = p + PyUnicode_GET_SIZE(self);
6900 for (; p < e; p++) {
6901 if (!Py_UNICODE_ISSPACE(*p))
6902 return PyBool_FromLong(0);
6904 return PyBool_FromLong(1);
6907 PyDoc_STRVAR(isalpha__doc__,
6908 "S.isalpha() -> bool\n\
6910 Return True if all characters in S are alphabetic\n\
6911 and there is at least one character in S, False otherwise.");
6913 static PyObject*
6914 unicode_isalpha(PyUnicodeObject *self)
6916 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6917 register const Py_UNICODE *e;
6919 /* Shortcut for single character strings */
6920 if (PyUnicode_GET_SIZE(self) == 1 &&
6921 Py_UNICODE_ISALPHA(*p))
6922 return PyBool_FromLong(1);
6924 /* Special case for empty strings */
6925 if (PyUnicode_GET_SIZE(self) == 0)
6926 return PyBool_FromLong(0);
6928 e = p + PyUnicode_GET_SIZE(self);
6929 for (; p < e; p++) {
6930 if (!Py_UNICODE_ISALPHA(*p))
6931 return PyBool_FromLong(0);
6933 return PyBool_FromLong(1);
6936 PyDoc_STRVAR(isalnum__doc__,
6937 "S.isalnum() -> bool\n\
6939 Return True if all characters in S are alphanumeric\n\
6940 and there is at least one character in S, False otherwise.");
6942 static PyObject*
6943 unicode_isalnum(PyUnicodeObject *self)
6945 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6946 register const Py_UNICODE *e;
6948 /* Shortcut for single character strings */
6949 if (PyUnicode_GET_SIZE(self) == 1 &&
6950 Py_UNICODE_ISALNUM(*p))
6951 return PyBool_FromLong(1);
6953 /* Special case for empty strings */
6954 if (PyUnicode_GET_SIZE(self) == 0)
6955 return PyBool_FromLong(0);
6957 e = p + PyUnicode_GET_SIZE(self);
6958 for (; p < e; p++) {
6959 if (!Py_UNICODE_ISALNUM(*p))
6960 return PyBool_FromLong(0);
6962 return PyBool_FromLong(1);
6965 PyDoc_STRVAR(isdecimal__doc__,
6966 "S.isdecimal() -> bool\n\
6968 Return True if there are only decimal characters in S,\n\
6969 False otherwise.");
6971 static PyObject*
6972 unicode_isdecimal(PyUnicodeObject *self)
6974 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6975 register const Py_UNICODE *e;
6977 /* Shortcut for single character strings */
6978 if (PyUnicode_GET_SIZE(self) == 1 &&
6979 Py_UNICODE_ISDECIMAL(*p))
6980 return PyBool_FromLong(1);
6982 /* Special case for empty strings */
6983 if (PyUnicode_GET_SIZE(self) == 0)
6984 return PyBool_FromLong(0);
6986 e = p + PyUnicode_GET_SIZE(self);
6987 for (; p < e; p++) {
6988 if (!Py_UNICODE_ISDECIMAL(*p))
6989 return PyBool_FromLong(0);
6991 return PyBool_FromLong(1);
6994 PyDoc_STRVAR(isdigit__doc__,
6995 "S.isdigit() -> bool\n\
6997 Return True if all characters in S are digits\n\
6998 and there is at least one character in S, False otherwise.");
7000 static PyObject*
7001 unicode_isdigit(PyUnicodeObject *self)
7003 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7004 register const Py_UNICODE *e;
7006 /* Shortcut for single character strings */
7007 if (PyUnicode_GET_SIZE(self) == 1 &&
7008 Py_UNICODE_ISDIGIT(*p))
7009 return PyBool_FromLong(1);
7011 /* Special case for empty strings */
7012 if (PyUnicode_GET_SIZE(self) == 0)
7013 return PyBool_FromLong(0);
7015 e = p + PyUnicode_GET_SIZE(self);
7016 for (; p < e; p++) {
7017 if (!Py_UNICODE_ISDIGIT(*p))
7018 return PyBool_FromLong(0);
7020 return PyBool_FromLong(1);
7023 PyDoc_STRVAR(isnumeric__doc__,
7024 "S.isnumeric() -> bool\n\
7026 Return True if there are only numeric characters in S,\n\
7027 False otherwise.");
7029 static PyObject*
7030 unicode_isnumeric(PyUnicodeObject *self)
7032 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7033 register const Py_UNICODE *e;
7035 /* Shortcut for single character strings */
7036 if (PyUnicode_GET_SIZE(self) == 1 &&
7037 Py_UNICODE_ISNUMERIC(*p))
7038 return PyBool_FromLong(1);
7040 /* Special case for empty strings */
7041 if (PyUnicode_GET_SIZE(self) == 0)
7042 return PyBool_FromLong(0);
7044 e = p + PyUnicode_GET_SIZE(self);
7045 for (; p < e; p++) {
7046 if (!Py_UNICODE_ISNUMERIC(*p))
7047 return PyBool_FromLong(0);
7049 return PyBool_FromLong(1);
7052 PyDoc_STRVAR(join__doc__,
7053 "S.join(sequence) -> unicode\n\
7055 Return a string which is the concatenation of the strings in the\n\
7056 sequence. The separator between elements is S.");
7058 static PyObject*
7059 unicode_join(PyObject *self, PyObject *data)
7061 return PyUnicode_Join(self, data);
7064 static Py_ssize_t
7065 unicode_length(PyUnicodeObject *self)
7067 return self->length;
7070 PyDoc_STRVAR(ljust__doc__,
7071 "S.ljust(width[, fillchar]) -> int\n\
7073 Return S left justified in a Unicode string of length width. Padding is\n\
7074 done using the specified fill character (default is a space).");
7076 static PyObject *
7077 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7079 Py_ssize_t width;
7080 Py_UNICODE fillchar = ' ';
7082 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7083 return NULL;
7085 if (self->length >= width && PyUnicode_CheckExact(self)) {
7086 Py_INCREF(self);
7087 return (PyObject*) self;
7090 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7093 PyDoc_STRVAR(lower__doc__,
7094 "S.lower() -> unicode\n\
7096 Return a copy of the string S converted to lowercase.");
7098 static PyObject*
7099 unicode_lower(PyUnicodeObject *self)
7101 return fixup(self, fixlower);
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7113 /* externally visible for str.strip(unicode) */
7114 PyObject *
7115 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7117 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7118 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7119 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7120 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7121 Py_ssize_t i, j;
7123 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7125 i = 0;
7126 if (striptype != RIGHTSTRIP) {
7127 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7128 i++;
7132 j = len;
7133 if (striptype != LEFTSTRIP) {
7134 do {
7135 j--;
7136 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7137 j++;
7140 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7141 Py_INCREF(self);
7142 return (PyObject*)self;
7144 else
7145 return PyUnicode_FromUnicode(s+i, j-i);
7149 static PyObject *
7150 do_strip(PyUnicodeObject *self, int striptype)
7152 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7153 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7155 i = 0;
7156 if (striptype != RIGHTSTRIP) {
7157 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7158 i++;
7162 j = len;
7163 if (striptype != LEFTSTRIP) {
7164 do {
7165 j--;
7166 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7167 j++;
7170 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7171 Py_INCREF(self);
7172 return (PyObject*)self;
7174 else
7175 return PyUnicode_FromUnicode(s+i, j-i);
7179 static PyObject *
7180 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7182 PyObject *sep = NULL;
7184 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7185 return NULL;
7187 if (sep != NULL && sep != Py_None) {
7188 if (PyUnicode_Check(sep))
7189 return _PyUnicode_XStrip(self, striptype, sep);
7190 else if (PyString_Check(sep)) {
7191 PyObject *res;
7192 sep = PyUnicode_FromObject(sep);
7193 if (sep==NULL)
7194 return NULL;
7195 res = _PyUnicode_XStrip(self, striptype, sep);
7196 Py_DECREF(sep);
7197 return res;
7199 else {
7200 PyErr_Format(PyExc_TypeError,
7201 "%s arg must be None, unicode or str",
7202 STRIPNAME(striptype));
7203 return NULL;
7207 return do_strip(self, striptype);
7211 PyDoc_STRVAR(strip__doc__,
7212 "S.strip([chars]) -> unicode\n\
7214 Return a copy of the string S with leading and trailing\n\
7215 whitespace removed.\n\
7216 If chars is given and not None, remove characters in chars instead.\n\
7217 If chars is a str, it will be converted to unicode before stripping");
7219 static PyObject *
7220 unicode_strip(PyUnicodeObject *self, PyObject *args)
7222 if (PyTuple_GET_SIZE(args) == 0)
7223 return do_strip(self, BOTHSTRIP); /* Common case */
7224 else
7225 return do_argstrip(self, BOTHSTRIP, args);
7229 PyDoc_STRVAR(lstrip__doc__,
7230 "S.lstrip([chars]) -> unicode\n\
7232 Return a copy of the string S with leading whitespace removed.\n\
7233 If chars is given and not None, remove characters in chars instead.\n\
7234 If chars is a str, it will be converted to unicode before stripping");
7236 static PyObject *
7237 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7239 if (PyTuple_GET_SIZE(args) == 0)
7240 return do_strip(self, LEFTSTRIP); /* Common case */
7241 else
7242 return do_argstrip(self, LEFTSTRIP, args);
7246 PyDoc_STRVAR(rstrip__doc__,
7247 "S.rstrip([chars]) -> unicode\n\
7249 Return a copy of the string S with trailing whitespace removed.\n\
7250 If chars is given and not None, remove characters in chars instead.\n\
7251 If chars is a str, it will be converted to unicode before stripping");
7253 static PyObject *
7254 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7256 if (PyTuple_GET_SIZE(args) == 0)
7257 return do_strip(self, RIGHTSTRIP); /* Common case */
7258 else
7259 return do_argstrip(self, RIGHTSTRIP, args);
7263 static PyObject*
7264 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7266 PyUnicodeObject *u;
7267 Py_UNICODE *p;
7268 Py_ssize_t nchars;
7269 size_t nbytes;
7271 if (len < 0)
7272 len = 0;
7274 if (len == 1 && PyUnicode_CheckExact(str)) {
7275 /* no repeat, return original string */
7276 Py_INCREF(str);
7277 return (PyObject*) str;
7280 /* ensure # of chars needed doesn't overflow int and # of bytes
7281 * needed doesn't overflow size_t
7283 nchars = len * str->length;
7284 if (len && nchars / len != str->length) {
7285 PyErr_SetString(PyExc_OverflowError,
7286 "repeated string is too long");
7287 return NULL;
7289 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7290 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7291 PyErr_SetString(PyExc_OverflowError,
7292 "repeated string is too long");
7293 return NULL;
7295 u = _PyUnicode_New(nchars);
7296 if (!u)
7297 return NULL;
7299 p = u->str;
7301 if (str->length == 1 && len > 0) {
7302 Py_UNICODE_FILL(p, str->str[0], len);
7303 } else {
7304 Py_ssize_t done = 0; /* number of characters copied this far */
7305 if (done < nchars) {
7306 Py_UNICODE_COPY(p, str->str, str->length);
7307 done = str->length;
7309 while (done < nchars) {
7310 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7311 Py_UNICODE_COPY(p+done, p, n);
7312 done += n;
7316 return (PyObject*) u;
7319 PyObject *PyUnicode_Replace(PyObject *obj,
7320 PyObject *subobj,
7321 PyObject *replobj,
7322 Py_ssize_t maxcount)
7324 PyObject *self;
7325 PyObject *str1;
7326 PyObject *str2;
7327 PyObject *result;
7329 self = PyUnicode_FromObject(obj);
7330 if (self == NULL)
7331 return NULL;
7332 str1 = PyUnicode_FromObject(subobj);
7333 if (str1 == NULL) {
7334 Py_DECREF(self);
7335 return NULL;
7337 str2 = PyUnicode_FromObject(replobj);
7338 if (str2 == NULL) {
7339 Py_DECREF(self);
7340 Py_DECREF(str1);
7341 return NULL;
7343 result = replace((PyUnicodeObject *)self,
7344 (PyUnicodeObject *)str1,
7345 (PyUnicodeObject *)str2,
7346 maxcount);
7347 Py_DECREF(self);
7348 Py_DECREF(str1);
7349 Py_DECREF(str2);
7350 return result;
7353 PyDoc_STRVAR(replace__doc__,
7354 "S.replace (old, new[, count]) -> unicode\n\
7356 Return a copy of S with all occurrences of substring\n\
7357 old replaced by new. If the optional argument count is\n\
7358 given, only the first count occurrences are replaced.");
7360 static PyObject*
7361 unicode_replace(PyUnicodeObject *self, PyObject *args)
7363 PyUnicodeObject *str1;
7364 PyUnicodeObject *str2;
7365 Py_ssize_t maxcount = -1;
7366 PyObject *result;
7368 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7369 return NULL;
7370 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7371 if (str1 == NULL)
7372 return NULL;
7373 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7374 if (str2 == NULL) {
7375 Py_DECREF(str1);
7376 return NULL;
7379 result = replace(self, str1, str2, maxcount);
7381 Py_DECREF(str1);
7382 Py_DECREF(str2);
7383 return result;
7386 static
7387 PyObject *unicode_repr(PyObject *unicode)
7389 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7390 PyUnicode_GET_SIZE(unicode),
7394 PyDoc_STRVAR(rfind__doc__,
7395 "S.rfind(sub [,start [,end]]) -> int\n\
7397 Return the highest index in S where substring sub is found,\n\
7398 such that sub is contained within s[start:end]. Optional\n\
7399 arguments start and end are interpreted as in slice notation.\n\
7401 Return -1 on failure.");
7403 static PyObject *
7404 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7406 PyObject *substring;
7407 Py_ssize_t start;
7408 Py_ssize_t end;
7409 Py_ssize_t result;
7411 if (!_ParseTupleFinds(args, &substring, &start, &end))
7412 return NULL;
7414 result = stringlib_rfind_slice(
7415 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7416 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7417 start, end
7420 Py_DECREF(substring);
7422 return PyInt_FromSsize_t(result);
7425 PyDoc_STRVAR(rindex__doc__,
7426 "S.rindex(sub [,start [,end]]) -> int\n\
7428 Like S.rfind() but raise ValueError when the substring is not found.");
7430 static PyObject *
7431 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7433 PyObject *substring;
7434 Py_ssize_t start;
7435 Py_ssize_t end;
7436 Py_ssize_t result;
7438 if (!_ParseTupleFinds(args, &substring, &start, &end))
7439 return NULL;
7441 result = stringlib_rfind_slice(
7442 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7443 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7444 start, end
7447 Py_DECREF(substring);
7449 if (result < 0) {
7450 PyErr_SetString(PyExc_ValueError, "substring not found");
7451 return NULL;
7453 return PyInt_FromSsize_t(result);
7456 PyDoc_STRVAR(rjust__doc__,
7457 "S.rjust(width[, fillchar]) -> unicode\n\
7459 Return S right justified in a Unicode string of length width. Padding is\n\
7460 done using the specified fill character (default is a space).");
7462 static PyObject *
7463 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7465 Py_ssize_t width;
7466 Py_UNICODE fillchar = ' ';
7468 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7469 return NULL;
7471 if (self->length >= width && PyUnicode_CheckExact(self)) {
7472 Py_INCREF(self);
7473 return (PyObject*) self;
7476 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7479 static PyObject*
7480 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7482 /* standard clamping */
7483 if (start < 0)
7484 start = 0;
7485 if (end < 0)
7486 end = 0;
7487 if (end > self->length)
7488 end = self->length;
7489 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7490 /* full slice, return original string */
7491 Py_INCREF(self);
7492 return (PyObject*) self;
7494 if (start > end)
7495 start = end;
7496 /* copy slice */
7497 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7498 end - start);
7501 PyObject *PyUnicode_Split(PyObject *s,
7502 PyObject *sep,
7503 Py_ssize_t maxsplit)
7505 PyObject *result;
7507 s = PyUnicode_FromObject(s);
7508 if (s == NULL)
7509 return NULL;
7510 if (sep != NULL) {
7511 sep = PyUnicode_FromObject(sep);
7512 if (sep == NULL) {
7513 Py_DECREF(s);
7514 return NULL;
7518 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7520 Py_DECREF(s);
7521 Py_XDECREF(sep);
7522 return result;
7525 PyDoc_STRVAR(split__doc__,
7526 "S.split([sep [,maxsplit]]) -> list of strings\n\
7528 Return a list of the words in S, using sep as the\n\
7529 delimiter string. If maxsplit is given, at most maxsplit\n\
7530 splits are done. If sep is not specified or is None, any\n\
7531 whitespace string is a separator and empty strings are\n\
7532 removed from the result.");
7534 static PyObject*
7535 unicode_split(PyUnicodeObject *self, PyObject *args)
7537 PyObject *substring = Py_None;
7538 Py_ssize_t maxcount = -1;
7540 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7541 return NULL;
7543 if (substring == Py_None)
7544 return split(self, NULL, maxcount);
7545 else if (PyUnicode_Check(substring))
7546 return split(self, (PyUnicodeObject *)substring, maxcount);
7547 else
7548 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7551 PyObject *
7552 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7554 PyObject* str_obj;
7555 PyObject* sep_obj;
7556 PyObject* out;
7558 str_obj = PyUnicode_FromObject(str_in);
7559 if (!str_obj)
7560 return NULL;
7561 sep_obj = PyUnicode_FromObject(sep_in);
7562 if (!sep_obj) {
7563 Py_DECREF(str_obj);
7564 return NULL;
7567 out = stringlib_partition(
7568 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7569 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7572 Py_DECREF(sep_obj);
7573 Py_DECREF(str_obj);
7575 return out;
7579 PyObject *
7580 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7582 PyObject* str_obj;
7583 PyObject* sep_obj;
7584 PyObject* out;
7586 str_obj = PyUnicode_FromObject(str_in);
7587 if (!str_obj)
7588 return NULL;
7589 sep_obj = PyUnicode_FromObject(sep_in);
7590 if (!sep_obj) {
7591 Py_DECREF(str_obj);
7592 return NULL;
7595 out = stringlib_rpartition(
7596 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7597 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7600 Py_DECREF(sep_obj);
7601 Py_DECREF(str_obj);
7603 return out;
7606 PyDoc_STRVAR(partition__doc__,
7607 "S.partition(sep) -> (head, sep, tail)\n\
7609 Searches for the separator sep in S, and returns the part before it,\n\
7610 the separator itself, and the part after it. If the separator is not\n\
7611 found, returns S and two empty strings.");
7613 static PyObject*
7614 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7616 return PyUnicode_Partition((PyObject *)self, separator);
7619 PyDoc_STRVAR(rpartition__doc__,
7620 "S.rpartition(sep) -> (tail, sep, head)\n\
7622 Searches for the separator sep in S, starting at the end of S, and returns\n\
7623 the part before it, the separator itself, and the part after it. If the\n\
7624 separator is not found, returns two empty strings and S.");
7626 static PyObject*
7627 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7629 return PyUnicode_RPartition((PyObject *)self, separator);
7632 PyObject *PyUnicode_RSplit(PyObject *s,
7633 PyObject *sep,
7634 Py_ssize_t maxsplit)
7636 PyObject *result;
7638 s = PyUnicode_FromObject(s);
7639 if (s == NULL)
7640 return NULL;
7641 if (sep != NULL) {
7642 sep = PyUnicode_FromObject(sep);
7643 if (sep == NULL) {
7644 Py_DECREF(s);
7645 return NULL;
7649 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7651 Py_DECREF(s);
7652 Py_XDECREF(sep);
7653 return result;
7656 PyDoc_STRVAR(rsplit__doc__,
7657 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7659 Return a list of the words in S, using sep as the\n\
7660 delimiter string, starting at the end of the string and\n\
7661 working to the front. If maxsplit is given, at most maxsplit\n\
7662 splits are done. If sep is not specified, any whitespace string\n\
7663 is a separator.");
7665 static PyObject*
7666 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7668 PyObject *substring = Py_None;
7669 Py_ssize_t maxcount = -1;
7671 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7672 return NULL;
7674 if (substring == Py_None)
7675 return rsplit(self, NULL, maxcount);
7676 else if (PyUnicode_Check(substring))
7677 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7678 else
7679 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7682 PyDoc_STRVAR(splitlines__doc__,
7683 "S.splitlines([keepends]]) -> list of strings\n\
7685 Return a list of the lines in S, breaking at line boundaries.\n\
7686 Line breaks are not included in the resulting list unless keepends\n\
7687 is given and true.");
7689 static PyObject*
7690 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7692 int keepends = 0;
7694 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7695 return NULL;
7697 return PyUnicode_Splitlines((PyObject *)self, keepends);
7700 static
7701 PyObject *unicode_str(PyUnicodeObject *self)
7703 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7706 PyDoc_STRVAR(swapcase__doc__,
7707 "S.swapcase() -> unicode\n\
7709 Return a copy of S with uppercase characters converted to lowercase\n\
7710 and vice versa.");
7712 static PyObject*
7713 unicode_swapcase(PyUnicodeObject *self)
7715 return fixup(self, fixswapcase);
7718 PyDoc_STRVAR(translate__doc__,
7719 "S.translate(table) -> unicode\n\
7721 Return a copy of the string S, where all characters have been mapped\n\
7722 through the given translation table, which must be a mapping of\n\
7723 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7724 Unmapped characters are left untouched. Characters mapped to None\n\
7725 are deleted.");
7727 static PyObject*
7728 unicode_translate(PyUnicodeObject *self, PyObject *table)
7730 return PyUnicode_TranslateCharmap(self->str,
7731 self->length,
7732 table,
7733 "ignore");
7736 PyDoc_STRVAR(upper__doc__,
7737 "S.upper() -> unicode\n\
7739 Return a copy of S converted to uppercase.");
7741 static PyObject*
7742 unicode_upper(PyUnicodeObject *self)
7744 return fixup(self, fixupper);
7747 PyDoc_STRVAR(zfill__doc__,
7748 "S.zfill(width) -> unicode\n\
7750 Pad a numeric string x with zeros on the left, to fill a field\n\
7751 of the specified width. The string x is never truncated.");
7753 static PyObject *
7754 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7756 Py_ssize_t fill;
7757 PyUnicodeObject *u;
7759 Py_ssize_t width;
7760 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7761 return NULL;
7763 if (self->length >= width) {
7764 if (PyUnicode_CheckExact(self)) {
7765 Py_INCREF(self);
7766 return (PyObject*) self;
7768 else
7769 return PyUnicode_FromUnicode(
7770 PyUnicode_AS_UNICODE(self),
7771 PyUnicode_GET_SIZE(self)
7775 fill = width - self->length;
7777 u = pad(self, fill, 0, '0');
7779 if (u == NULL)
7780 return NULL;
7782 if (u->str[fill] == '+' || u->str[fill] == '-') {
7783 /* move sign to beginning of string */
7784 u->str[0] = u->str[fill];
7785 u->str[fill] = '0';
7788 return (PyObject*) u;
#if 0
/* Compiled out.  Returns the file-level `numfree` counter as a Python
   int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7799 PyDoc_STRVAR(startswith__doc__,
7800 "S.startswith(prefix[, start[, end]]) -> bool\n\
7802 Return True if S starts with the specified prefix, False otherwise.\n\
7803 With optional start, test S beginning at that position.\n\
7804 With optional end, stop comparing S at that position.\n\
7805 prefix can also be a tuple of strings to try.");
7807 static PyObject *
7808 unicode_startswith(PyUnicodeObject *self,
7809 PyObject *args)
7811 PyObject *subobj;
7812 PyUnicodeObject *substring;
7813 Py_ssize_t start = 0;
7814 Py_ssize_t end = PY_SSIZE_T_MAX;
7815 int result;
7817 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7818 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7819 return NULL;
7820 if (PyTuple_Check(subobj)) {
7821 Py_ssize_t i;
7822 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7823 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7824 PyTuple_GET_ITEM(subobj, i));
7825 if (substring == NULL)
7826 return NULL;
7827 result = tailmatch(self, substring, start, end, -1);
7828 Py_DECREF(substring);
7829 if (result) {
7830 Py_RETURN_TRUE;
7833 /* nothing matched */
7834 Py_RETURN_FALSE;
7836 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7837 if (substring == NULL)
7838 return NULL;
7839 result = tailmatch(self, substring, start, end, -1);
7840 Py_DECREF(substring);
7841 return PyBool_FromLong(result);
7845 PyDoc_STRVAR(endswith__doc__,
7846 "S.endswith(suffix[, start[, end]]) -> bool\n\
7848 Return True if S ends with the specified suffix, False otherwise.\n\
7849 With optional start, test S beginning at that position.\n\
7850 With optional end, stop comparing S at that position.\n\
7851 suffix can also be a tuple of strings to try.");
7853 static PyObject *
7854 unicode_endswith(PyUnicodeObject *self,
7855 PyObject *args)
7857 PyObject *subobj;
7858 PyUnicodeObject *substring;
7859 Py_ssize_t start = 0;
7860 Py_ssize_t end = PY_SSIZE_T_MAX;
7861 int result;
7863 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7864 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7865 return NULL;
7866 if (PyTuple_Check(subobj)) {
7867 Py_ssize_t i;
7868 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7869 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7870 PyTuple_GET_ITEM(subobj, i));
7871 if (substring == NULL)
7872 return NULL;
7873 result = tailmatch(self, substring, start, end, +1);
7874 Py_DECREF(substring);
7875 if (result) {
7876 Py_RETURN_TRUE;
7879 Py_RETURN_FALSE;
7881 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7882 if (substring == NULL)
7883 return NULL;
7885 result = tailmatch(self, substring, start, end, +1);
7886 Py_DECREF(substring);
7887 return PyBool_FromLong(result);
7891 /* Implements do_string_format, which is unicode because of stringlib */
7892 #include "stringlib/string_format.h"
7894 PyDoc_STRVAR(format__doc__,
7895 "S.format(*args, **kwargs) -> unicode\n\
7899 static PyObject *
7900 unicode__format__(PyObject *self, PyObject *args)
7902 PyObject *format_spec;
7903 PyObject *result = NULL;
7904 PyObject *tmp = NULL;
7906 /* If 2.x, convert format_spec to the same type as value */
7907 /* This is to allow things like u''.format('') */
7908 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7909 goto done;
7910 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7911 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7912 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7913 goto done;
7915 tmp = PyObject_Unicode(format_spec);
7916 if (tmp == NULL)
7917 goto done;
7918 format_spec = tmp;
7920 result = _PyUnicode_FormatAdvanced(self,
7921 PyUnicode_AS_UNICODE(format_spec),
7922 PyUnicode_GET_SIZE(format_spec));
7923 done:
7924 Py_XDECREF(tmp);
7925 return result;
7928 PyDoc_STRVAR(p_format__doc__,
7929 "S.__format__(format_spec) -> unicode\n\
7933 static PyObject *
7934 unicode__sizeof__(PyUnicodeObject *v)
7936 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7937 sizeof(Py_UNICODE) * (v->length + 1));
7940 PyDoc_STRVAR(sizeof__doc__,
7941 "S.__sizeof__() -> size of S in memory, in bytes\n\
7945 static PyObject *
7946 unicode_getnewargs(PyUnicodeObject *v)
7948 return Py_BuildValue("(u#)", v->str, v->length);
7952 static PyMethodDef unicode_methods[] = {
7954 /* Order is according to common usage: often used methods should
7955 appear first, since lookup is done sequentially. */
7957 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7958 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7959 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7960 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7961 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7962 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7963 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7964 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7965 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7966 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7967 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7968 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7969 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7970 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7971 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7972 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7973 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7974 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7975 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7976 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7977 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7978 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7979 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7980 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7981 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7982 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7983 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7984 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7985 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7986 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7987 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7988 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7989 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7990 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7991 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7992 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7993 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7994 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7995 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7996 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7997 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7998 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7999 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8000 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8001 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8002 #if 0
8003 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8004 #endif
8006 #if 0
8007 /* This one is just used for debugging the implementation. */
8008 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8009 #endif
8011 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
8012 {NULL, NULL}
8015 static PyObject *
8016 unicode_mod(PyObject *v, PyObject *w)
8018 if (!PyUnicode_Check(v)) {
8019 Py_INCREF(Py_NotImplemented);
8020 return Py_NotImplemented;
8022 return PyUnicode_Format(v, w);
8025 static PyNumberMethods unicode_as_number = {
8026 0, /*nb_add*/
8027 0, /*nb_subtract*/
8028 0, /*nb_multiply*/
8029 0, /*nb_divide*/
8030 unicode_mod, /*nb_remainder*/
8033 static PySequenceMethods unicode_as_sequence = {
8034 (lenfunc) unicode_length, /* sq_length */
8035 PyUnicode_Concat, /* sq_concat */
8036 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8037 (ssizeargfunc) unicode_getitem, /* sq_item */
8038 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8039 0, /* sq_ass_item */
8040 0, /* sq_ass_slice */
8041 PyUnicode_Contains, /* sq_contains */
8044 static PyObject*
8045 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8047 if (PyIndex_Check(item)) {
8048 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8049 if (i == -1 && PyErr_Occurred())
8050 return NULL;
8051 if (i < 0)
8052 i += PyUnicode_GET_SIZE(self);
8053 return unicode_getitem(self, i);
8054 } else if (PySlice_Check(item)) {
8055 Py_ssize_t start, stop, step, slicelength, cur, i;
8056 Py_UNICODE* source_buf;
8057 Py_UNICODE* result_buf;
8058 PyObject* result;
8060 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8061 &start, &stop, &step, &slicelength) < 0) {
8062 return NULL;
8065 if (slicelength <= 0) {
8066 return PyUnicode_FromUnicode(NULL, 0);
8067 } else if (start == 0 && step == 1 && slicelength == self->length &&
8068 PyUnicode_CheckExact(self)) {
8069 Py_INCREF(self);
8070 return (PyObject *)self;
8071 } else if (step == 1) {
8072 return PyUnicode_FromUnicode(self->str + start, slicelength);
8073 } else {
8074 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8075 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8076 sizeof(Py_UNICODE));
8078 if (result_buf == NULL)
8079 return PyErr_NoMemory();
8081 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8082 result_buf[i] = source_buf[cur];
8085 result = PyUnicode_FromUnicode(result_buf, slicelength);
8086 PyObject_FREE(result_buf);
8087 return result;
8089 } else {
8090 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8091 return NULL;
8095 static PyMappingMethods unicode_as_mapping = {
8096 (lenfunc)unicode_length, /* mp_length */
8097 (binaryfunc)unicode_subscript, /* mp_subscript */
8098 (objobjargproc)0, /* mp_ass_subscript */
8101 static Py_ssize_t
8102 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8103 Py_ssize_t index,
8104 const void **ptr)
8106 if (index != 0) {
8107 PyErr_SetString(PyExc_SystemError,
8108 "accessing non-existent unicode segment");
8109 return -1;
8111 *ptr = (void *) self->str;
8112 return PyUnicode_GET_DATA_SIZE(self);
8115 static Py_ssize_t
8116 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8117 const void **ptr)
8119 PyErr_SetString(PyExc_TypeError,
8120 "cannot use unicode as modifiable buffer");
8121 return -1;
8124 static int
8125 unicode_buffer_getsegcount(PyUnicodeObject *self,
8126 Py_ssize_t *lenp)
8128 if (lenp)
8129 *lenp = PyUnicode_GET_DATA_SIZE(self);
8130 return 1;
8133 static Py_ssize_t
8134 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8135 Py_ssize_t index,
8136 const void **ptr)
8138 PyObject *str;
8140 if (index != 0) {
8141 PyErr_SetString(PyExc_SystemError,
8142 "accessing non-existent unicode segment");
8143 return -1;
8145 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8146 if (str == NULL)
8147 return -1;
8148 *ptr = (void *) PyString_AS_STRING(str);
8149 return PyString_GET_SIZE(str);
8152 /* Helpers for PyUnicode_Format() */
8154 static PyObject *
8155 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8157 Py_ssize_t argidx = *p_argidx;
8158 if (argidx < arglen) {
8159 (*p_argidx)++;
8160 if (arglen < 0)
8161 return args;
8162 else
8163 return PyTuple_GetItem(args, argidx);
8165 PyErr_SetString(PyExc_TypeError,
8166 "not enough arguments for format string");
8167 return NULL;
/* Bit flags collected while parsing a %-conversion in PyUnicode_Format(). */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
8176 static Py_ssize_t
8177 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8179 register Py_ssize_t i;
8180 Py_ssize_t len = strlen(charbuffer);
8181 for (i = len - 1; i >= 0; i--)
8182 buffer[i] = (Py_UNICODE) charbuffer[i];
8184 return len;
8187 static int
8188 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8190 Py_ssize_t result;
8192 PyOS_ascii_formatd((char *)buffer, len, format, x);
8193 result = strtounicode(buffer, (char *)buffer);
8194 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8197 static int
8198 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8200 Py_ssize_t result;
8202 PyOS_snprintf((char *)buffer, len, format, x);
8203 result = strtounicode(buffer, (char *)buffer);
8204 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8207 /* XXX To save some code duplication, formatfloat/long/int could have been
8208 shared with stringobject.c, converting from 8-bit to Unicode after the
8209 formatting is done. */
8211 static int
8212 formatfloat(Py_UNICODE *buf,
8213 size_t buflen,
8214 int flags,
8215 int prec,
8216 int type,
8217 PyObject *v)
8219 /* fmt = '%#.' + `prec` + `type`
8220 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8221 char fmt[20];
8222 double x;
8224 x = PyFloat_AsDouble(v);
8225 if (x == -1.0 && PyErr_Occurred())
8226 return -1;
8227 if (prec < 0)
8228 prec = 6;
8229 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8230 type = 'g';
8231 /* Worst case length calc to ensure no buffer overrun:
8233 'g' formats:
8234 fmt = %#.<prec>g
8235 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8236 for any double rep.)
8237 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8239 'f' formats:
8240 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8241 len = 1 + 50 + 1 + prec = 52 + prec
8243 If prec=0 the effective precision is 1 (the leading digit is
8244 always given), therefore increase the length by one.
8247 if (((type == 'g' || type == 'G') &&
8248 buflen <= (size_t)10 + (size_t)prec) ||
8249 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8250 PyErr_SetString(PyExc_OverflowError,
8251 "formatted float is too long (precision too large?)");
8252 return -1;
8254 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8255 (flags&F_ALT) ? "#" : "",
8256 prec, type);
8257 return doubletounicode(buf, buflen, fmt, x);
8260 static PyObject*
8261 formatlong(PyObject *val, int flags, int prec, int type)
8263 char *buf;
8264 int i, len;
8265 PyObject *str; /* temporary string object. */
8266 PyUnicodeObject *result;
8268 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8269 if (!str)
8270 return NULL;
8271 result = _PyUnicode_New(len);
8272 if (!result) {
8273 Py_DECREF(str);
8274 return NULL;
8276 for (i = 0; i < len; i++)
8277 result->str[i] = buf[i];
8278 result->str[len] = 0;
8279 Py_DECREF(str);
8280 return (PyObject*)result;
8283 static int
8284 formatint(Py_UNICODE *buf,
8285 size_t buflen,
8286 int flags,
8287 int prec,
8288 int type,
8289 PyObject *v)
8291 /* fmt = '%#.' + `prec` + 'l' + `type`
8292 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8293 * + 1 + 1
8294 * = 24
8296 char fmt[64]; /* plenty big enough! */
8297 char *sign;
8298 long x;
8300 x = PyInt_AsLong(v);
8301 if (x == -1 && PyErr_Occurred())
8302 return -1;
8303 if (x < 0 && type == 'u') {
8304 type = 'd';
8306 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8307 sign = "-";
8308 else
8309 sign = "";
8310 if (prec < 0)
8311 prec = 1;
8313 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8314 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8316 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8317 PyErr_SetString(PyExc_OverflowError,
8318 "formatted integer is too long (precision too large?)");
8319 return -1;
8322 if ((flags & F_ALT) &&
8323 (type == 'x' || type == 'X')) {
8324 /* When converting under %#x or %#X, there are a number
8325 * of issues that cause pain:
8326 * - when 0 is being converted, the C standard leaves off
8327 * the '0x' or '0X', which is inconsistent with other
8328 * %#x/%#X conversions and inconsistent with Python's
8329 * hex() function
8330 * - there are platforms that violate the standard and
8331 * convert 0 with the '0x' or '0X'
8332 * (Metrowerks, Compaq Tru64)
8333 * - there are platforms that give '0x' when converting
8334 * under %#X, but convert 0 in accordance with the
8335 * standard (OS/2 EMX)
8337 * We can achieve the desired consistency by inserting our
8338 * own '0x' or '0X' prefix, and substituting %x/%X in place
8339 * of %#x/%#X.
8341 * Note that this is the same approach as used in
8342 * formatint() in stringobject.c
8344 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8345 sign, type, prec, type);
8347 else {
8348 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8349 sign, (flags&F_ALT) ? "#" : "",
8350 prec, type);
8352 if (sign[0])
8353 return longtounicode(buf, buflen, fmt, -x);
8354 else
8355 return longtounicode(buf, buflen, fmt, x);
8358 static int
8359 formatchar(Py_UNICODE *buf,
8360 size_t buflen,
8361 PyObject *v)
8363 /* presume that the buffer is at least 2 characters long */
8364 if (PyUnicode_Check(v)) {
8365 if (PyUnicode_GET_SIZE(v) != 1)
8366 goto onError;
8367 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8370 else if (PyString_Check(v)) {
8371 if (PyString_GET_SIZE(v) != 1)
8372 goto onError;
8373 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8376 else {
8377 /* Integer input truncated to a character */
8378 long x;
8379 x = PyInt_AsLong(v);
8380 if (x == -1 && PyErr_Occurred())
8381 goto onError;
8382 #ifdef Py_UNICODE_WIDE
8383 if (x < 0 || x > 0x10ffff) {
8384 PyErr_SetString(PyExc_OverflowError,
8385 "%c arg not in range(0x110000) "
8386 "(wide Python build)");
8387 return -1;
8389 #else
8390 if (x < 0 || x > 0xffff) {
8391 PyErr_SetString(PyExc_OverflowError,
8392 "%c arg not in range(0x10000) "
8393 "(narrow Python build)");
8394 return -1;
8396 #endif
8397 buf[0] = (Py_UNICODE) x;
8399 buf[1] = '\0';
8400 return 1;
8402 onError:
8403 PyErr_SetString(PyExc_TypeError,
8404 "%c requires int or char");
8405 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
8418 PyObject *PyUnicode_Format(PyObject *format,
8419 PyObject *args)
8421 Py_UNICODE *fmt, *res;
8422 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8423 int args_owned = 0;
8424 PyUnicodeObject *result = NULL;
8425 PyObject *dict = NULL;
8426 PyObject *uformat;
8428 if (format == NULL || args == NULL) {
8429 PyErr_BadInternalCall();
8430 return NULL;
8432 uformat = PyUnicode_FromObject(format);
8433 if (uformat == NULL)
8434 return NULL;
8435 fmt = PyUnicode_AS_UNICODE(uformat);
8436 fmtcnt = PyUnicode_GET_SIZE(uformat);
8438 reslen = rescnt = fmtcnt + 100;
8439 result = _PyUnicode_New(reslen);
8440 if (result == NULL)
8441 goto onError;
8442 res = PyUnicode_AS_UNICODE(result);
8444 if (PyTuple_Check(args)) {
8445 arglen = PyTuple_Size(args);
8446 argidx = 0;
8448 else {
8449 arglen = -1;
8450 argidx = -2;
8452 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8453 !PyObject_TypeCheck(args, &PyBaseString_Type))
8454 dict = args;
8456 while (--fmtcnt >= 0) {
8457 if (*fmt != '%') {
8458 if (--rescnt < 0) {
8459 rescnt = fmtcnt + 100;
8460 reslen += rescnt;
8461 if (_PyUnicode_Resize(&result, reslen) < 0)
8462 goto onError;
8463 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8464 --rescnt;
8466 *res++ = *fmt++;
8468 else {
8469 /* Got a format specifier */
8470 int flags = 0;
8471 Py_ssize_t width = -1;
8472 int prec = -1;
8473 Py_UNICODE c = '\0';
8474 Py_UNICODE fill;
8475 int isnumok;
8476 PyObject *v = NULL;
8477 PyObject *temp = NULL;
8478 Py_UNICODE *pbuf;
8479 Py_UNICODE sign;
8480 Py_ssize_t len;
8481 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8483 fmt++;
8484 if (*fmt == '(') {
8485 Py_UNICODE *keystart;
8486 Py_ssize_t keylen;
8487 PyObject *key;
8488 int pcount = 1;
8490 if (dict == NULL) {
8491 PyErr_SetString(PyExc_TypeError,
8492 "format requires a mapping");
8493 goto onError;
8495 ++fmt;
8496 --fmtcnt;
8497 keystart = fmt;
8498 /* Skip over balanced parentheses */
8499 while (pcount > 0 && --fmtcnt >= 0) {
8500 if (*fmt == ')')
8501 --pcount;
8502 else if (*fmt == '(')
8503 ++pcount;
8504 fmt++;
8506 keylen = fmt - keystart - 1;
8507 if (fmtcnt < 0 || pcount > 0) {
8508 PyErr_SetString(PyExc_ValueError,
8509 "incomplete format key");
8510 goto onError;
8512 #if 0
8513 /* keys are converted to strings using UTF-8 and
8514 then looked up since Python uses strings to hold
8515 variables names etc. in its namespaces and we
8516 wouldn't want to break common idioms. */
8517 key = PyUnicode_EncodeUTF8(keystart,
8518 keylen,
8519 NULL);
8520 #else
8521 key = PyUnicode_FromUnicode(keystart, keylen);
8522 #endif
8523 if (key == NULL)
8524 goto onError;
8525 if (args_owned) {
8526 Py_DECREF(args);
8527 args_owned = 0;
8529 args = PyObject_GetItem(dict, key);
8530 Py_DECREF(key);
8531 if (args == NULL) {
8532 goto onError;
8534 args_owned = 1;
8535 arglen = -1;
8536 argidx = -2;
8538 while (--fmtcnt >= 0) {
8539 switch (c = *fmt++) {
8540 case '-': flags |= F_LJUST; continue;
8541 case '+': flags |= F_SIGN; continue;
8542 case ' ': flags |= F_BLANK; continue;
8543 case '#': flags |= F_ALT; continue;
8544 case '0': flags |= F_ZERO; continue;
8546 break;
8548 if (c == '*') {
8549 v = getnextarg(args, arglen, &argidx);
8550 if (v == NULL)
8551 goto onError;
8552 if (!PyInt_Check(v)) {
8553 PyErr_SetString(PyExc_TypeError,
8554 "* wants int");
8555 goto onError;
8557 width = PyInt_AsLong(v);
8558 if (width < 0) {
8559 flags |= F_LJUST;
8560 width = -width;
8562 if (--fmtcnt >= 0)
8563 c = *fmt++;
8565 else if (c >= '0' && c <= '9') {
8566 width = c - '0';
8567 while (--fmtcnt >= 0) {
8568 c = *fmt++;
8569 if (c < '0' || c > '9')
8570 break;
8571 if ((width*10) / 10 != width) {
8572 PyErr_SetString(PyExc_ValueError,
8573 "width too big");
8574 goto onError;
8576 width = width*10 + (c - '0');
8579 if (c == '.') {
8580 prec = 0;
8581 if (--fmtcnt >= 0)
8582 c = *fmt++;
8583 if (c == '*') {
8584 v = getnextarg(args, arglen, &argidx);
8585 if (v == NULL)
8586 goto onError;
8587 if (!PyInt_Check(v)) {
8588 PyErr_SetString(PyExc_TypeError,
8589 "* wants int");
8590 goto onError;
8592 prec = PyInt_AsLong(v);
8593 if (prec < 0)
8594 prec = 0;
8595 if (--fmtcnt >= 0)
8596 c = *fmt++;
8598 else if (c >= '0' && c <= '9') {
8599 prec = c - '0';
8600 while (--fmtcnt >= 0) {
8601 c = Py_CHARMASK(*fmt++);
8602 if (c < '0' || c > '9')
8603 break;
8604 if ((prec*10) / 10 != prec) {
8605 PyErr_SetString(PyExc_ValueError,
8606 "prec too big");
8607 goto onError;
8609 prec = prec*10 + (c - '0');
8612 } /* prec */
8613 if (fmtcnt >= 0) {
8614 if (c == 'h' || c == 'l' || c == 'L') {
8615 if (--fmtcnt >= 0)
8616 c = *fmt++;
8619 if (fmtcnt < 0) {
8620 PyErr_SetString(PyExc_ValueError,
8621 "incomplete format");
8622 goto onError;
8624 if (c != '%') {
8625 v = getnextarg(args, arglen, &argidx);
8626 if (v == NULL)
8627 goto onError;
8629 sign = 0;
8630 fill = ' ';
8631 switch (c) {
8633 case '%':
8634 pbuf = formatbuf;
8635 /* presume that buffer length is at least 1 */
8636 pbuf[0] = '%';
8637 len = 1;
8638 break;
8640 case 's':
8641 case 'r':
8642 if (PyUnicode_Check(v) && c == 's') {
8643 temp = v;
8644 Py_INCREF(temp);
8646 else {
8647 PyObject *unicode;
8648 if (c == 's')
8649 temp = PyObject_Unicode(v);
8650 else
8651 temp = PyObject_Repr(v);
8652 if (temp == NULL)
8653 goto onError;
8654 if (PyUnicode_Check(temp))
8655 /* nothing to do */;
8656 else if (PyString_Check(temp)) {
8657 /* convert to string to Unicode */
8658 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8659 PyString_GET_SIZE(temp),
8660 NULL,
8661 "strict");
8662 Py_DECREF(temp);
8663 temp = unicode;
8664 if (temp == NULL)
8665 goto onError;
8667 else {
8668 Py_DECREF(temp);
8669 PyErr_SetString(PyExc_TypeError,
8670 "%s argument has non-string str()");
8671 goto onError;
8674 pbuf = PyUnicode_AS_UNICODE(temp);
8675 len = PyUnicode_GET_SIZE(temp);
8676 if (prec >= 0 && len > prec)
8677 len = prec;
8678 break;
8680 case 'i':
8681 case 'd':
8682 case 'u':
8683 case 'o':
8684 case 'x':
8685 case 'X':
8686 if (c == 'i')
8687 c = 'd';
8688 isnumok = 0;
8689 if (PyNumber_Check(v)) {
8690 PyObject *iobj=NULL;
8692 if (PyInt_Check(v) || (PyLong_Check(v))) {
8693 iobj = v;
8694 Py_INCREF(iobj);
8696 else {
8697 iobj = PyNumber_Int(v);
8698 if (iobj==NULL) iobj = PyNumber_Long(v);
8700 if (iobj!=NULL) {
8701 if (PyInt_Check(iobj)) {
8702 isnumok = 1;
8703 pbuf = formatbuf;
8704 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8705 flags, prec, c, iobj);
8706 Py_DECREF(iobj);
8707 if (len < 0)
8708 goto onError;
8709 sign = 1;
8711 else if (PyLong_Check(iobj)) {
8712 isnumok = 1;
8713 temp = formatlong(iobj, flags, prec, c);
8714 Py_DECREF(iobj);
8715 if (!temp)
8716 goto onError;
8717 pbuf = PyUnicode_AS_UNICODE(temp);
8718 len = PyUnicode_GET_SIZE(temp);
8719 sign = 1;
8721 else {
8722 Py_DECREF(iobj);
8726 if (!isnumok) {
8727 PyErr_Format(PyExc_TypeError,
8728 "%%%c format: a number is required, "
8729 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8730 goto onError;
8732 if (flags & F_ZERO)
8733 fill = '0';
8734 break;
8736 case 'e':
8737 case 'E':
8738 case 'f':
8739 case 'F':
8740 case 'g':
8741 case 'G':
8742 if (c == 'F')
8743 c = 'f';
8744 pbuf = formatbuf;
8745 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8746 flags, prec, c, v);
8747 if (len < 0)
8748 goto onError;
8749 sign = 1;
8750 if (flags & F_ZERO)
8751 fill = '0';
8752 break;
8754 case 'c':
8755 pbuf = formatbuf;
8756 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8757 if (len < 0)
8758 goto onError;
8759 break;
8761 default:
8762 PyErr_Format(PyExc_ValueError,
8763 "unsupported format character '%c' (0x%x) "
8764 "at index %zd",
8765 (31<=c && c<=126) ? (char)c : '?',
8766 (int)c,
8767 (Py_ssize_t)(fmt - 1 -
8768 PyUnicode_AS_UNICODE(uformat)));
8769 goto onError;
8771 if (sign) {
8772 if (*pbuf == '-' || *pbuf == '+') {
8773 sign = *pbuf++;
8774 len--;
8776 else if (flags & F_SIGN)
8777 sign = '+';
8778 else if (flags & F_BLANK)
8779 sign = ' ';
8780 else
8781 sign = 0;
8783 if (width < len)
8784 width = len;
8785 if (rescnt - (sign != 0) < width) {
8786 reslen -= rescnt;
8787 rescnt = width + fmtcnt + 100;
8788 reslen += rescnt;
8789 if (reslen < 0) {
8790 Py_XDECREF(temp);
8791 PyErr_NoMemory();
8792 goto onError;
8794 if (_PyUnicode_Resize(&result, reslen) < 0) {
8795 Py_XDECREF(temp);
8796 goto onError;
8798 res = PyUnicode_AS_UNICODE(result)
8799 + reslen - rescnt;
8801 if (sign) {
8802 if (fill != ' ')
8803 *res++ = sign;
8804 rescnt--;
8805 if (width > len)
8806 width--;
8808 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8809 assert(pbuf[0] == '0');
8810 assert(pbuf[1] == c);
8811 if (fill != ' ') {
8812 *res++ = *pbuf++;
8813 *res++ = *pbuf++;
8815 rescnt -= 2;
8816 width -= 2;
8817 if (width < 0)
8818 width = 0;
8819 len -= 2;
8821 if (width > len && !(flags & F_LJUST)) {
8822 do {
8823 --rescnt;
8824 *res++ = fill;
8825 } while (--width > len);
8827 if (fill == ' ') {
8828 if (sign)
8829 *res++ = sign;
8830 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8831 assert(pbuf[0] == '0');
8832 assert(pbuf[1] == c);
8833 *res++ = *pbuf++;
8834 *res++ = *pbuf++;
8837 Py_UNICODE_COPY(res, pbuf, len);
8838 res += len;
8839 rescnt -= len;
8840 while (--width >= len) {
8841 --rescnt;
8842 *res++ = ' ';
8844 if (dict && (argidx < arglen) && c != '%') {
8845 PyErr_SetString(PyExc_TypeError,
8846 "not all arguments converted during string formatting");
8847 Py_XDECREF(temp);
8848 goto onError;
8850 Py_XDECREF(temp);
8851 } /* '%' */
8852 } /* until end */
8853 if (argidx < arglen && !dict) {
8854 PyErr_SetString(PyExc_TypeError,
8855 "not all arguments converted during string formatting");
8856 goto onError;
8859 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8860 goto onError;
8861 if (args_owned) {
8862 Py_DECREF(args);
8864 Py_DECREF(uformat);
8865 return (PyObject *)result;
8867 onError:
8868 Py_XDECREF(result);
8869 Py_DECREF(uformat);
8870 if (args_owned) {
8871 Py_DECREF(args);
8873 return NULL;
8876 static PyBufferProcs unicode_as_buffer = {
8877 (readbufferproc) unicode_buffer_getreadbuf,
8878 (writebufferproc) unicode_buffer_getwritebuf,
8879 (segcountproc) unicode_buffer_getsegcount,
8880 (charbufferproc) unicode_buffer_getcharbuf,
8883 static PyObject *
8884 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8886 static PyObject *
8887 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8889 PyObject *x = NULL;
8890 static char *kwlist[] = {"string", "encoding", "errors", 0};
8891 char *encoding = NULL;
8892 char *errors = NULL;
8894 if (type != &PyUnicode_Type)
8895 return unicode_subtype_new(type, args, kwds);
8896 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8897 kwlist, &x, &encoding, &errors))
8898 return NULL;
8899 if (x == NULL)
8900 return (PyObject *)_PyUnicode_New(0);
8901 if (encoding == NULL && errors == NULL)
8902 return PyObject_Unicode(x);
8903 else
8904 return PyUnicode_FromEncodedObject(x, encoding, errors);
8907 static PyObject *
8908 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8910 PyUnicodeObject *tmp, *pnew;
8911 Py_ssize_t n;
8913 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8914 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8915 if (tmp == NULL)
8916 return NULL;
8917 assert(PyUnicode_Check(tmp));
8918 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8919 if (pnew == NULL) {
8920 Py_DECREF(tmp);
8921 return NULL;
8923 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8924 if (pnew->str == NULL) {
8925 _Py_ForgetReference((PyObject *)pnew);
8926 PyObject_Del(pnew);
8927 Py_DECREF(tmp);
8928 return PyErr_NoMemory();
8930 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8931 pnew->length = n;
8932 pnew->hash = tmp->hash;
8933 Py_DECREF(tmp);
8934 return (PyObject *)pnew;
/* Docstring text for the unicode type: the constructor signature followed
   by a short description of the encoding and errors arguments.
   PyUnicode_Type stores unicode_doc in its tp_doc slot. */
8937 PyDoc_STRVAR(unicode_doc,
8938 "unicode(string [, encoding[, errors]]) -> object\n\
8940 Create a new Unicode object from the given encoded string.\n\
8941 encoding defaults to the current default string encoding.\n\
8942 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8944 PyTypeObject PyUnicode_Type = {
8945 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8946 "unicode", /* tp_name */
8947 sizeof(PyUnicodeObject), /* tp_size */
8948 0, /* tp_itemsize */
8949 /* Slots */
8950 (destructor)unicode_dealloc, /* tp_dealloc */
8951 0, /* tp_print */
8952 0, /* tp_getattr */
8953 0, /* tp_setattr */
8954 0, /* tp_compare */
8955 unicode_repr, /* tp_repr */
8956 &unicode_as_number, /* tp_as_number */
8957 &unicode_as_sequence, /* tp_as_sequence */
8958 &unicode_as_mapping, /* tp_as_mapping */
8959 (hashfunc) unicode_hash, /* tp_hash*/
8960 0, /* tp_call*/
8961 (reprfunc) unicode_str, /* tp_str */
8962 PyObject_GenericGetAttr, /* tp_getattro */
8963 0, /* tp_setattro */
8964 &unicode_as_buffer, /* tp_as_buffer */
8965 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8966 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8967 unicode_doc, /* tp_doc */
8968 0, /* tp_traverse */
8969 0, /* tp_clear */
8970 PyUnicode_RichCompare, /* tp_richcompare */
8971 0, /* tp_weaklistoffset */
8972 0, /* tp_iter */
8973 0, /* tp_iternext */
8974 unicode_methods, /* tp_methods */
8975 0, /* tp_members */
8976 0, /* tp_getset */
8977 &PyBaseString_Type, /* tp_base */
8978 0, /* tp_dict */
8979 0, /* tp_descr_get */
8980 0, /* tp_descr_set */
8981 0, /* tp_dictoffset */
8982 0, /* tp_init */
8983 0, /* tp_alloc */
8984 unicode_new, /* tp_new */
8985 PyObject_Del, /* tp_free */
8988 /* Initialize the Unicode implementation */
8990 void _PyUnicode_Init(void)
8992 int i;
8994 /* XXX - move this array to unicodectype.c ? */
8995 Py_UNICODE linebreak[] = {
8996 0x000A, /* LINE FEED */
8997 0x000D, /* CARRIAGE RETURN */
8998 0x001C, /* FILE SEPARATOR */
8999 0x001D, /* GROUP SEPARATOR */
9000 0x001E, /* RECORD SEPARATOR */
9001 0x0085, /* NEXT LINE */
9002 0x2028, /* LINE SEPARATOR */
9003 0x2029, /* PARAGRAPH SEPARATOR */
9006 /* Init the implementation */
9007 free_list = NULL;
9008 numfree = 0;
9009 unicode_empty = _PyUnicode_New(0);
9010 if (!unicode_empty)
9011 return;
9013 strcpy(unicode_default_encoding, "ascii");
9014 for (i = 0; i < 256; i++)
9015 unicode_latin1[i] = NULL;
9016 if (PyType_Ready(&PyUnicode_Type) < 0)
9017 Py_FatalError("Can't initialize 'unicode'");
9019 /* initialize the linebreak bloom filter */
9020 bloom_linebreak = make_bloom_mask(
9021 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9024 PyType_Ready(&EncodingMapType);
9027 /* Finalize the Unicode implementation */
9030 PyUnicode_ClearFreeList(void)
9032 int freelist_size = numfree;
9033 PyUnicodeObject *u;
9035 for (u = free_list; u != NULL;) {
9036 PyUnicodeObject *v = u;
9037 u = *(PyUnicodeObject **)u;
9038 if (v->str)
9039 PyObject_DEL(v->str);
9040 Py_XDECREF(v->defenc);
9041 PyObject_Del(v);
9042 numfree--;
9044 free_list = NULL;
9045 assert(numfree == 0);
9046 return freelist_size;
9049 void
9050 _PyUnicode_Fini(void)
9052 int i;
9054 Py_XDECREF(unicode_empty);
9055 unicode_empty = NULL;
9057 for (i = 0; i < 256; i++) {
9058 if (unicode_latin1[i]) {
9059 Py_DECREF(unicode_latin1[i]);
9060 unicode_latin1[i] = NULL;
9063 (void)PyUnicode_ClearFreeList();
9066 #ifdef __cplusplus
9068 #endif
9072 Local variables:
9073 c-basic-offset: 4
9074 indent-tabs-mode: nil
9075 End: