More Python 2.3 compatibility fixes for decimal.py.
[python.git] / Objects / unicodeobject.c
blob9615d432274f5f4d0f0a3b939b9e9629c6aa9c9a
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
115 /* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by an ASCII code (0..127): 1 marks a whitespace
   character, 0 anything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x09 HORIZONTAL TABULATION
     *   0x0A LINE FEED
     *   0x0B VERTICAL TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
146 /* Same for linebreaks */
/* Lookup table indexed by an ASCII code (0..127): 1 marks a line break
   character, 0 anything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0A LINE FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173 Py_UNICODE
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
177 return 0x10FFFF;
178 #else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182 #endif
185 /* --- Bloom Filters ----------------------------------------------------- */
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, we use a single bitmask and take the low 5
   bits of each Unicode character as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
/* Type of a bloom bitmask: one bit for each possible value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that bit 31 can be reached
   without overflowing a signed int. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break: ASCII characters are looked up in
   ascii_linebreak, everything else is pre-filtered through the bloom mask
   before the Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
205 /* calculate simple bloom-style bitmask for a given unicode string */
207 long mask;
208 Py_ssize_t i;
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
214 return mask;
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
219 Py_ssize_t i;
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
225 return 0;
/* Non-zero if chr is a member of set: the bloom mask is consulted first
   and the exact search only runs when the mask does not rule chr out.
   The expansion is fully parenthesized so the macro can be negated or
   combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
233 static
234 int unicode_resize(register PyUnicodeObject *unicode,
235 Py_ssize_t length)
237 void *oldstr;
239 /* Shortcut if there's nothing much to do. */
240 if (unicode->length == length)
241 goto reset;
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
251 PyErr_SetString(PyExc_SystemError,
252 "can't resize shared unicode objects");
253 return -1;
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
259 it contains). */
261 oldstr = unicode->str;
262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
264 if (!unicode->str) {
265 unicode->str = (Py_UNICODE *)oldstr;
266 PyErr_NoMemory();
267 return -1;
269 unicode->str[length] = 0;
270 unicode->length = length;
272 reset:
273 /* Reset the object caches */
274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
278 unicode->hash = -1;
280 return 0;
283 /* We allocate one more byte to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
291 static
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
294 register PyUnicodeObject *unicode;
296 /* Optimization for empty strings */
297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
316 unicode_resize(unicode, length) < 0) {
317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
321 else {
322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
325 PyObject_INIT(unicode, &PyUnicode_Type);
327 else {
328 size_t new_size;
329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
330 if (unicode == NULL)
331 return NULL;
332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 if (!unicode->str) {
337 PyErr_NoMemory();
338 goto onError;
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
347 unicode->str[0] = 0;
348 unicode->str[length] = 0;
349 unicode->length = length;
350 unicode->hash = -1;
351 unicode->defenc = NULL;
352 return unicode;
354 onError:
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
357 _Py_ForgetReference((PyObject *)unicode);
358 PyObject_Del(unicode);
359 return NULL;
362 static
363 void unicode_dealloc(register PyUnicodeObject *unicode)
365 if (PyUnicode_CheckExact(unicode) &&
366 numfree < PyUnicode_MAXFREELIST) {
367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
377 /* Add to free list */
378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
382 else {
383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
389 static
390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
392 register PyUnicodeObject *v;
394 /* Argument checks */
395 if (unicode == NULL) {
396 PyErr_BadInternalCall();
397 return -1;
399 v = *unicode;
400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
401 PyErr_BadInternalCall();
402 return -1;
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
408 if (v->length != length &&
409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
431 Py_ssize_t size)
433 PyUnicodeObject *unicode;
435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
467 Py_UNICODE_COPY(unicode->str, u, size);
469 return (PyObject *)unicode;
472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
474 PyUnicodeObject *unicode;
476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
478 "Negative size passed to PyUnicode_FromStringAndSize");
479 return NULL;
482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
509 return PyUnicode_DecodeUTF8(u, size, NULL);
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
516 return (PyObject *)unicode;
519 PyObject *PyUnicode_FromString(const char *u)
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
527 return PyUnicode_FromStringAndSize(u, size);
530 #ifdef HAVE_WCHAR_H
532 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
533 Py_ssize_t size)
535 PyUnicodeObject *unicode;
537 if (w == NULL) {
538 PyErr_BadInternalCall();
539 return NULL;
542 unicode = _PyUnicode_New(size);
543 if (!unicode)
544 return NULL;
546 /* Copy the wchar_t data into the new object */
547 #ifdef HAVE_USABLE_WCHAR_T
548 memcpy(unicode->str, w, size * sizeof(wchar_t));
549 #else
551 register Py_UNICODE *u;
552 register Py_ssize_t i;
553 u = PyUnicode_AS_UNICODE(unicode);
554 for (i = size; i > 0; i--)
555 *u++ = *w++;
557 #endif
559 return (PyObject *)unicode;
/* Build a printf-style conversion specification in fmt:

       '%' ['0'] [width] ['.' precision] ['l' | PY_FORMAT_SIZE_T] c

   The '0' flag is only emitted together with a non-zero width, a zero
   width or precision is omitted, and longflag takes precedence over
   size_tflag.  The caller must supply a buffer large enough for the
   result including the terminating NUL; nothing is bounds-checked here. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (size_tflag) {
        /* the modifier is a string literal, so only read through it */
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}
584 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
586 PyObject *
587 PyUnicode_FromFormatV(const char *format, va_list vargs)
589 va_list count;
590 Py_ssize_t callcount = 0;
591 PyObject **callresults = NULL;
592 PyObject **callresult = NULL;
593 Py_ssize_t n = 0;
594 int width = 0;
595 int precision = 0;
596 int zeropad;
597 const char* f;
598 Py_UNICODE *s;
599 PyObject *string;
600 /* used by sprintf */
601 char buffer[21];
602 /* use abuffer instead of buffer, if we need more space
603 * (which can happen if there's a format specifier with width). */
604 char *abuffer = NULL;
605 char *realbuffer;
606 Py_ssize_t abuffersize = 0;
607 char fmt[60]; /* should be enough for %0width.precisionld */
608 const char *copy;
610 #ifdef VA_LIST_IS_ARRAY
611 Py_MEMCPY(count, vargs, sizeof(va_list));
612 #else
613 #ifdef __va_copy
614 __va_copy(count, vargs);
615 #else
616 count = vargs;
617 #endif
618 #endif
619 /* step 1: count the number of %S/%R format specifications
620 * (we call PyObject_Str()/PyObject_Repr() for these objects
621 * once during step 3 and put the result in an array) */
622 for (f = format; *f; f++) {
623 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
624 ++callcount;
626 /* step 2: allocate memory for the results of
627 * PyObject_Str()/PyObject_Repr() calls */
628 if (callcount) {
629 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
630 if (!callresults) {
631 PyErr_NoMemory();
632 return NULL;
634 callresult = callresults;
636 /* step 3: figure out how large a buffer we need */
637 for (f = format; *f; f++) {
638 if (*f == '%') {
639 const char* p = f;
640 width = 0;
641 while (isdigit((unsigned)*f))
642 width = (width*10) + *f++ - '0';
643 while (*++f && *f != '%' && !isalpha((unsigned)*f))
646 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
647 * they don't affect the amount of space we reserve.
649 if ((*f == 'l' || *f == 'z') &&
650 (f[1] == 'd' || f[1] == 'u'))
651 ++f;
653 switch (*f) {
654 case 'c':
655 (void)va_arg(count, int);
656 /* fall through... */
657 case '%':
658 n++;
659 break;
660 case 'd': case 'u': case 'i': case 'x':
661 (void) va_arg(count, int);
662 /* 20 bytes is enough to hold a 64-bit
663 integer. Decimal takes the most space.
664 This isn't enough for octal.
665 If a width is specified we need more
666 (which we allocate later). */
667 if (width < 20)
668 width = 20;
669 n += width;
670 if (abuffersize < width)
671 abuffersize = width;
672 break;
673 case 's':
675 /* UTF-8 */
676 unsigned char*s;
677 s = va_arg(count, unsigned char*);
678 while (*s) {
679 if (*s < 128) {
680 n++; s++;
681 } else if (*s < 0xc0) {
682 /* invalid UTF-8 */
683 n++; s++;
684 } else if (*s < 0xc0) {
685 n++;
686 s++; if(!*s)break;
687 s++;
688 } else if (*s < 0xe0) {
689 n++;
690 s++; if(!*s)break;
691 s++; if(!*s)break;
692 s++;
693 } else {
694 #ifdef Py_UNICODE_WIDE
695 n++;
696 #else
697 n+=2;
698 #endif
699 s++; if(!*s)break;
700 s++; if(!*s)break;
701 s++; if(!*s)break;
702 s++;
705 break;
707 case 'U':
709 PyObject *obj = va_arg(count, PyObject *);
710 assert(obj && PyUnicode_Check(obj));
711 n += PyUnicode_GET_SIZE(obj);
712 break;
714 case 'V':
716 PyObject *obj = va_arg(count, PyObject *);
717 const char *str = va_arg(count, const char *);
718 assert(obj || str);
719 assert(!obj || PyUnicode_Check(obj));
720 if (obj)
721 n += PyUnicode_GET_SIZE(obj);
722 else
723 n += strlen(str);
724 break;
726 case 'S':
728 PyObject *obj = va_arg(count, PyObject *);
729 PyObject *str;
730 assert(obj);
731 str = PyObject_Str(obj);
732 if (!str)
733 goto fail;
734 n += PyUnicode_GET_SIZE(str);
735 /* Remember the str and switch to the next slot */
736 *callresult++ = str;
737 break;
739 case 'R':
741 PyObject *obj = va_arg(count, PyObject *);
742 PyObject *repr;
743 assert(obj);
744 repr = PyObject_Repr(obj);
745 if (!repr)
746 goto fail;
747 n += PyUnicode_GET_SIZE(repr);
748 /* Remember the repr and switch to the next slot */
749 *callresult++ = repr;
750 break;
752 case 'p':
753 (void) va_arg(count, int);
754 /* maximum 64-bit pointer representation:
755 * 0xffffffffffffffff
756 * so 19 characters is enough.
757 * XXX I count 18 -- what's the extra for?
759 n += 19;
760 break;
761 default:
762 /* if we stumble upon an unknown
763 formatting code, copy the rest of
764 the format string to the output
765 string. (we cannot just skip the
766 code, since there's no way to know
767 what's in the argument list) */
768 n += strlen(p);
769 goto expand;
771 } else
772 n++;
774 expand:
775 if (abuffersize > 20) {
776 abuffer = PyObject_Malloc(abuffersize);
777 if (!abuffer) {
778 PyErr_NoMemory();
779 goto fail;
781 realbuffer = abuffer;
783 else
784 realbuffer = buffer;
785 /* step 4: fill the buffer */
786 /* Since we've analyzed how much space we need for the worst case,
787 we don't have to resize the string.
788 There can be no errors beyond this point. */
789 string = PyUnicode_FromUnicode(NULL, n);
790 if (!string)
791 goto fail;
793 s = PyUnicode_AS_UNICODE(string);
794 callresult = callresults;
796 for (f = format; *f; f++) {
797 if (*f == '%') {
798 const char* p = f++;
799 int longflag = 0;
800 int size_tflag = 0;
801 zeropad = (*f == '0');
802 /* parse the width.precision part */
803 width = 0;
804 while (isdigit((unsigned)*f))
805 width = (width*10) + *f++ - '0';
806 precision = 0;
807 if (*f == '.') {
808 f++;
809 while (isdigit((unsigned)*f))
810 precision = (precision*10) + *f++ - '0';
812 /* handle the long flag, but only for %ld and %lu.
813 others can be added when necessary. */
814 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
815 longflag = 1;
816 ++f;
818 /* handle the size_t flag. */
819 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
820 size_tflag = 1;
821 ++f;
824 switch (*f) {
825 case 'c':
826 *s++ = va_arg(vargs, int);
827 break;
828 case 'd':
829 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
830 if (longflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, long));
832 else if (size_tflag)
833 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
834 else
835 sprintf(realbuffer, fmt, va_arg(vargs, int));
836 appendstring(realbuffer);
837 break;
838 case 'u':
839 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
840 if (longflag)
841 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
842 else if (size_tflag)
843 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
844 else
845 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
846 appendstring(realbuffer);
847 break;
848 case 'i':
849 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
850 sprintf(realbuffer, fmt, va_arg(vargs, int));
851 appendstring(realbuffer);
852 break;
853 case 'x':
854 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
855 sprintf(realbuffer, fmt, va_arg(vargs, int));
856 appendstring(realbuffer);
857 break;
858 case 's':
860 /* Parameter must be UTF-8 encoded.
861 In case of encoding errors, use
862 the replacement character. */
863 PyObject *u;
864 p = va_arg(vargs, char*);
865 u = PyUnicode_DecodeUTF8(p, strlen(p),
866 "replace");
867 if (!u)
868 goto fail;
869 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
870 PyUnicode_GET_SIZE(u));
871 s += PyUnicode_GET_SIZE(u);
872 Py_DECREF(u);
873 break;
875 case 'U':
877 PyObject *obj = va_arg(vargs, PyObject *);
878 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
879 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
880 s += size;
881 break;
883 case 'V':
885 PyObject *obj = va_arg(vargs, PyObject *);
886 const char *str = va_arg(vargs, const char *);
887 if (obj) {
888 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
889 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
890 s += size;
891 } else {
892 appendstring(str);
894 break;
896 case 'S':
897 case 'R':
899 Py_UNICODE *ucopy;
900 Py_ssize_t usize;
901 Py_ssize_t upos;
902 /* unused, since we already have the result */
903 (void) va_arg(vargs, PyObject *);
904 ucopy = PyUnicode_AS_UNICODE(*callresult);
905 usize = PyUnicode_GET_SIZE(*callresult);
906 for (upos = 0; upos<usize;)
907 *s++ = ucopy[upos++];
908 /* We're done with the unicode()/repr() => forget it */
909 Py_DECREF(*callresult);
910 /* switch to next unicode()/repr() result */
911 ++callresult;
912 break;
914 case 'p':
915 sprintf(buffer, "%p", va_arg(vargs, void*));
916 /* %p is ill-defined: ensure leading 0x. */
917 if (buffer[1] == 'X')
918 buffer[1] = 'x';
919 else if (buffer[1] != 'x') {
920 memmove(buffer+2, buffer, strlen(buffer)+1);
921 buffer[0] = '0';
922 buffer[1] = 'x';
924 appendstring(buffer);
925 break;
926 case '%':
927 *s++ = '%';
928 break;
929 default:
930 appendstring(p);
931 goto end;
933 } else
934 *s++ = *f;
937 end:
938 if (callresults)
939 PyObject_Free(callresults);
940 if (abuffer)
941 PyObject_Free(abuffer);
942 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
943 return string;
944 fail:
945 if (callresults) {
946 PyObject **callresult2 = callresults;
947 while (callresult2 < callresult) {
948 Py_DECREF(*callresult2);
949 ++callresult2;
951 PyObject_Free(callresults);
953 if (abuffer)
954 PyObject_Free(abuffer);
955 return NULL;
958 #undef appendstring
960 PyObject *
961 PyUnicode_FromFormat(const char *format, ...)
963 PyObject* ret;
964 va_list vargs;
966 #ifdef HAVE_STDARG_PROTOTYPES
967 va_start(vargs, format);
968 #else
969 va_start(vargs);
970 #endif
971 ret = PyUnicode_FromFormatV(format, vargs);
972 va_end(vargs);
973 return ret;
976 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
977 wchar_t *w,
978 Py_ssize_t size)
980 if (unicode == NULL) {
981 PyErr_BadInternalCall();
982 return -1;
985 /* If possible, try to copy the 0-termination as well */
986 if (size > PyUnicode_GET_SIZE(unicode))
987 size = PyUnicode_GET_SIZE(unicode) + 1;
989 #ifdef HAVE_USABLE_WCHAR_T
990 memcpy(w, unicode->str, size * sizeof(wchar_t));
991 #else
993 register Py_UNICODE *u;
994 register Py_ssize_t i;
995 u = PyUnicode_AS_UNICODE(unicode);
996 for (i = size; i > 0; i--)
997 *w++ = *u++;
999 #endif
1001 if (size > PyUnicode_GET_SIZE(unicode))
1002 return PyUnicode_GET_SIZE(unicode);
1003 else
1004 return size;
1007 #endif
1009 PyObject *PyUnicode_FromOrdinal(int ordinal)
1011 Py_UNICODE s[1];
1013 #ifdef Py_UNICODE_WIDE
1014 if (ordinal < 0 || ordinal > 0x10ffff) {
1015 PyErr_SetString(PyExc_ValueError,
1016 "unichr() arg not in range(0x110000) "
1017 "(wide Python build)");
1018 return NULL;
1020 #else
1021 if (ordinal < 0 || ordinal > 0xffff) {
1022 PyErr_SetString(PyExc_ValueError,
1023 "unichr() arg not in range(0x10000) "
1024 "(narrow Python build)");
1025 return NULL;
1027 #endif
1029 s[0] = (Py_UNICODE)ordinal;
1030 return PyUnicode_FromUnicode(s, 1);
1033 PyObject *PyUnicode_FromObject(register PyObject *obj)
1035 /* XXX Perhaps we should make this API an alias of
1036 PyObject_Unicode() instead ?! */
1037 if (PyUnicode_CheckExact(obj)) {
1038 Py_INCREF(obj);
1039 return obj;
1041 if (PyUnicode_Check(obj)) {
1042 /* For a Unicode subtype that's not a Unicode object,
1043 return a true Unicode object with the same data. */
1044 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1045 PyUnicode_GET_SIZE(obj));
1047 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1050 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1051 const char *encoding,
1052 const char *errors)
1054 const char *s = NULL;
1055 Py_ssize_t len;
1056 PyObject *v;
1058 if (obj == NULL) {
1059 PyErr_BadInternalCall();
1060 return NULL;
1063 #if 0
1064 /* For b/w compatibility we also accept Unicode objects provided
1065 that no encodings is given and then redirect to
1066 PyObject_Unicode() which then applies the additional logic for
1067 Unicode subclasses.
1069 NOTE: This API should really only be used for object which
1070 represent *encoded* Unicode !
1073 if (PyUnicode_Check(obj)) {
1074 if (encoding) {
1075 PyErr_SetString(PyExc_TypeError,
1076 "decoding Unicode is not supported");
1077 return NULL;
1079 return PyObject_Unicode(obj);
1081 #else
1082 if (PyUnicode_Check(obj)) {
1083 PyErr_SetString(PyExc_TypeError,
1084 "decoding Unicode is not supported");
1085 return NULL;
1087 #endif
1089 /* Coerce object */
1090 if (PyString_Check(obj)) {
1091 s = PyString_AS_STRING(obj);
1092 len = PyString_GET_SIZE(obj);
1094 else if (PyByteArray_Check(obj)) {
1095 /* Python 2.x specific */
1096 PyErr_Format(PyExc_TypeError,
1097 "decoding bytearray is not supported");
1098 return NULL;
1100 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1101 /* Overwrite the error message with something more useful in
1102 case of a TypeError. */
1103 if (PyErr_ExceptionMatches(PyExc_TypeError))
1104 PyErr_Format(PyExc_TypeError,
1105 "coercing to Unicode: need string or buffer, "
1106 "%.80s found",
1107 Py_TYPE(obj)->tp_name);
1108 goto onError;
1111 /* Convert to Unicode */
1112 if (len == 0) {
1113 Py_INCREF(unicode_empty);
1114 v = (PyObject *)unicode_empty;
1116 else
1117 v = PyUnicode_Decode(s, len, encoding, errors);
1119 return v;
1121 onError:
1122 return NULL;
1125 PyObject *PyUnicode_Decode(const char *s,
1126 Py_ssize_t size,
1127 const char *encoding,
1128 const char *errors)
1130 PyObject *buffer = NULL, *unicode;
1132 if (encoding == NULL)
1133 encoding = PyUnicode_GetDefaultEncoding();
1135 /* Shortcuts for common default encodings */
1136 if (strcmp(encoding, "utf-8") == 0)
1137 return PyUnicode_DecodeUTF8(s, size, errors);
1138 else if (strcmp(encoding, "latin-1") == 0)
1139 return PyUnicode_DecodeLatin1(s, size, errors);
1140 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1141 else if (strcmp(encoding, "mbcs") == 0)
1142 return PyUnicode_DecodeMBCS(s, size, errors);
1143 #endif
1144 else if (strcmp(encoding, "ascii") == 0)
1145 return PyUnicode_DecodeASCII(s, size, errors);
1147 /* Decode via the codec registry */
1148 buffer = PyBuffer_FromMemory((void *)s, size);
1149 if (buffer == NULL)
1150 goto onError;
1151 unicode = PyCodec_Decode(buffer, encoding, errors);
1152 if (unicode == NULL)
1153 goto onError;
1154 if (!PyUnicode_Check(unicode)) {
1155 PyErr_Format(PyExc_TypeError,
1156 "decoder did not return an unicode object (type=%.400s)",
1157 Py_TYPE(unicode)->tp_name);
1158 Py_DECREF(unicode);
1159 goto onError;
1161 Py_DECREF(buffer);
1162 return unicode;
1164 onError:
1165 Py_XDECREF(buffer);
1166 return NULL;
1169 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1170 const char *encoding,
1171 const char *errors)
1173 PyObject *v;
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 goto onError;
1180 if (encoding == NULL)
1181 encoding = PyUnicode_GetDefaultEncoding();
1183 /* Decode via the codec registry */
1184 v = PyCodec_Decode(unicode, encoding, errors);
1185 if (v == NULL)
1186 goto onError;
1187 return v;
1189 onError:
1190 return NULL;
1193 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1194 Py_ssize_t size,
1195 const char *encoding,
1196 const char *errors)
1198 PyObject *v, *unicode;
1200 unicode = PyUnicode_FromUnicode(s, size);
1201 if (unicode == NULL)
1202 return NULL;
1203 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1204 Py_DECREF(unicode);
1205 return v;
1208 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1209 const char *encoding,
1210 const char *errors)
1212 PyObject *v;
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_BadArgument();
1216 goto onError;
1219 if (encoding == NULL)
1220 encoding = PyUnicode_GetDefaultEncoding();
1222 /* Encode via the codec registry */
1223 v = PyCodec_Encode(unicode, encoding, errors);
1224 if (v == NULL)
1225 goto onError;
1226 return v;
1228 onError:
1229 return NULL;
1232 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1233 const char *encoding,
1234 const char *errors)
1236 PyObject *v;
1238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1243 if (encoding == NULL)
1244 encoding = PyUnicode_GetDefaultEncoding();
1246 /* Shortcuts for common default encodings */
1247 if (errors == NULL) {
1248 if (strcmp(encoding, "utf-8") == 0)
1249 return PyUnicode_AsUTF8String(unicode);
1250 else if (strcmp(encoding, "latin-1") == 0)
1251 return PyUnicode_AsLatin1String(unicode);
1252 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1253 else if (strcmp(encoding, "mbcs") == 0)
1254 return PyUnicode_AsMBCSString(unicode);
1255 #endif
1256 else if (strcmp(encoding, "ascii") == 0)
1257 return PyUnicode_AsASCIIString(unicode);
1260 /* Encode via the codec registry */
1261 v = PyCodec_Encode(unicode, encoding, errors);
1262 if (v == NULL)
1263 goto onError;
1264 if (!PyString_Check(v)) {
1265 PyErr_Format(PyExc_TypeError,
1266 "encoder did not return a string object (type=%.400s)",
1267 Py_TYPE(v)->tp_name);
1268 Py_DECREF(v);
1269 goto onError;
1271 return v;
1273 onError:
1274 return NULL;
1277 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1278 const char *errors)
1280 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1282 if (v)
1283 return v;
1284 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1285 if (v && errors == NULL)
1286 ((PyUnicodeObject *)unicode)->defenc = v;
1287 return v;
1290 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1292 if (!PyUnicode_Check(unicode)) {
1293 PyErr_BadArgument();
1294 goto onError;
1296 return PyUnicode_AS_UNICODE(unicode);
1298 onError:
1299 return NULL;
1302 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1304 if (!PyUnicode_Check(unicode)) {
1305 PyErr_BadArgument();
1306 goto onError;
1308 return PyUnicode_GET_SIZE(unicode);
1310 onError:
1311 return -1;
1314 const char *PyUnicode_GetDefaultEncoding(void)
1316 return unicode_default_encoding;
1319 int PyUnicode_SetDefaultEncoding(const char *encoding)
1321 PyObject *v;
1323 /* Make sure the encoding is valid. As side effect, this also
1324 loads the encoding into the codec registry cache. */
1325 v = _PyCodec_Lookup(encoding);
1326 if (v == NULL)
1327 goto onError;
1328 Py_DECREF(v);
1329 strncpy(unicode_default_encoding,
1330 encoding,
1331 sizeof(unicode_default_encoding));
1332 return 0;
1334 onError:
1335 return -1;
1338 /* error handling callback helper:
1339 build arguments, call the callback and check the arguments,
1340 if no exception occurred, copy the replacement to the output
1341 and adjust various state variables.
1342 return 0 on success, -1 on error
1345 static
1346 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1347 const char *encoding, const char *reason,
1348 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1349 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1350 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1352 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1354 PyObject *restuple = NULL;
1355 PyObject *repunicode = NULL;
1356 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1357 Py_ssize_t requiredsize;
1358 Py_ssize_t newpos;
1359 Py_UNICODE *repptr;
1360 Py_ssize_t repsize;
1361 int res = -1;
1363 if (*errorHandler == NULL) {
1364 *errorHandler = PyCodec_LookupError(errors);
1365 if (*errorHandler == NULL)
1366 goto onError;
1369 if (*exceptionObject == NULL) {
1370 *exceptionObject = PyUnicodeDecodeError_Create(
1371 encoding, input, insize, *startinpos, *endinpos, reason);
1372 if (*exceptionObject == NULL)
1373 goto onError;
1375 else {
1376 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1377 goto onError;
1378 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1379 goto onError;
1380 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1381 goto onError;
1384 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1385 if (restuple == NULL)
1386 goto onError;
1387 if (!PyTuple_Check(restuple)) {
1388 PyErr_Format(PyExc_TypeError, &argparse[4]);
1389 goto onError;
1391 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1392 goto onError;
1393 if (newpos<0)
1394 newpos = insize+newpos;
1395 if (newpos<0 || newpos>insize) {
1396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1397 goto onError;
1400 /* need more space? (at least enough for what we
1401 have+the replacement+the rest of the string (starting
1402 at the new input position), so we won't have to check space
1403 when there are no errors in the rest of the string) */
1404 repptr = PyUnicode_AS_UNICODE(repunicode);
1405 repsize = PyUnicode_GET_SIZE(repunicode);
1406 requiredsize = *outpos + repsize + insize-newpos;
1407 if (requiredsize > outsize) {
1408 if (requiredsize<2*outsize)
1409 requiredsize = 2*outsize;
1410 if (_PyUnicode_Resize(output, requiredsize) < 0)
1411 goto onError;
1412 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1414 *endinpos = newpos;
1415 *inptr = input + newpos;
1416 Py_UNICODE_COPY(*outptr, repptr, repsize);
1417 *outptr += repsize;
1418 *outpos += repsize;
1419 /* we made it! */
1420 res = 0;
1422 onError:
1423 Py_XDECREF(restuple);
1424 return res;
1427 /* --- UTF-7 Codec -------------------------------------------------------- */
1429 /* see RFC2152 for details */
/* Classification table for the 128 ASCII code points, consulted by the
   SPECIAL() macro.  Rows are 16 entries wide, so row r covers code points
   16*r .. 16*r+15. */
1431 static
1432 char utf7_special[128] = {
1433 /* indicate whether a UTF-7 character is special i.e. cannot be directly
1434 encoded:
1435 0 - not special
1436 1 - special
1437 2 - whitespace (optional)
1438 3 - RFC2152 Set O (optional) */
1439 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1440 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1441 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
1442 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
1443 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1444 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
1445 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1446 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
1450 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
1451 warnings about the comparison always being false; since
1452 utf7_special[0] is 1, we can safely make that one comparison
1453 true */
/* SPECIAL(c, encodeO, encodeWS): true when c may not be written directly.
   That is the case for anything outside 1..127, for table class 1, for
   class 2 (whitespace) when encodeWS is set, and for class 3 (Set O) when
   encodeO is set.  The argument c is evaluated several times. */
1455 #define SPECIAL(c, encodeO, encodeWS) \
1456 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
1457 (encodeWS && (utf7_special[(c)] == 2)) || \
1458 (encodeO && (utf7_special[(c)] == 3)))
/* B64(n): base64 alphabet character for the low six bits of n. */
1460 #define B64(n) \
1461 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is accepted as a base64 character.
   NOTE(review): this relies on isalnum(), whose result for values above
   127 presumably depends on the C locale -- confirm that callers only
   pass ASCII or that this is acceptable. */
1462 #define B64CHAR(c) \
1463 (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): six-bit value of base64 character c: '+' -> 62, '/' -> 63,
   'a'.. -> c - 71, 'A'.. -> c - 65, anything else (the digits) -> c + 4.
   No validation is performed; callers test B64CHAR() first. */
1464 #define UB64(c) \
1465 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
1466 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the top six pending bits as one base64 character through out and reduce
   bits by six.

   DECODE(out, ch, bits, surrogate): while at least sixteen bits are pending
   in ch, extract one 16-bit unit.  If `surrogate` is set it is cleared and
   the unit is dropped; a unit in 0xDC00..0xDFFF sets `surrogate`, sets the
   enclosing function's `errmsg` and jumps to its `utf7Error` label; any
   other unit is stored through out.  Both macros modify their arguments in
   place, and DECODE requires `errmsg` and `utf7Error` to exist at the point
   of use. */
1468 #define ENCODE(out, ch, bits) \
1469 while (bits >= 6) { \
1470 *out++ = B64(ch >> (bits-6)); \
1471 bits -= 6; \
1474 #define DECODE(out, ch, bits, surrogate) \
1475 while (bits >= 16) { \
1476 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1477 bits -= 16; \
1478 if (surrogate) { \
1479 /* We have already generated an error for the high surrogate \
1480 so let's not bother seeing if the low surrogate is correct or not */ \
1481 surrogate = 0; \
1482 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
1483 /* This is a surrogate pair. Unfortunately we can't represent \
1484 it in a 16-bit character */ \
1485 surrogate = 1; \
1486 errmsg = "code pairs are not supported"; \
1487 goto utf7Error; \
1488 } else { \
1489 *out++ = outCh; \
1493 PyObject *PyUnicode_DecodeUTF7(const char *s,
1494 Py_ssize_t size,
1495 const char *errors)
1497 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
/* Decode `size` bytes of UTF-7 starting at `s` into a new unicode object.

   s, size   -- input buffer; the output is allocated with `size` code
                units up front and resized to the real length at the end.
   errors    -- error scheme name, passed to
                unicode_decode_call_errorhandler() under codec name "utf7".
   consumed  -- optional.  When NULL, input that ends inside a shift
                sequence is reported as "unterminated shift sequence".
                When non-NULL that is not an error: *consumed is set to
                the start of the pending shift sequence (startinpos), or
                to the number of bytes read when no shift is pending.

   Returns the decoded object, or NULL when the error handler fails or the
   final resize fails.

   State: inShift is non-zero while inside a '+'...'-' base64 run;
   charsleft accumulates the pending bits and bitsleft counts them.  The
   DECODE() macro may jump to the utf7Error label, and the `restart` label
   re-enters the loop body after the end-of-input error handler has moved
   `s` back into the buffer. */
1500 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1501 Py_ssize_t size,
1502 const char *errors,
1503 Py_ssize_t *consumed)
1505 const char *starts = s;
1506 Py_ssize_t startinpos;
1507 Py_ssize_t endinpos;
1508 Py_ssize_t outpos;
1509 const char *e;
1510 PyUnicodeObject *unicode;
1511 Py_UNICODE *p;
1512 const char *errmsg = "";
1513 int inShift = 0;
1514 unsigned int bitsleft = 0;
1515 unsigned long charsleft = 0;
1516 int surrogate = 0;
1517 PyObject *errorHandler = NULL;
1518 PyObject *exc = NULL;
1520 unicode = _PyUnicode_New(size);
1521 if (!unicode)
1522 return NULL;
/* empty input: nothing to decode, report zero bytes consumed */
1523 if (size == 0) {
1524 if (consumed)
1525 *consumed = 0;
1526 return (PyObject *)unicode;
1529 p = unicode->str;
1530 e = s + size;
1532 while (s < e) {
1533 Py_UNICODE ch;
1534 restart:
1535 ch = (unsigned char) *s;
1537 if (inShift) {
/* '-' or any non-base64 byte ends the shift sequence */
1538 if ((ch == '-') || !B64CHAR(ch)) {
1539 inShift = 0;
1540 s++;
1542 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1543 if (bitsleft >= 6) {
1544 /* The shift sequence has a partial character in it. If
1545 bitsleft < 6 then we could just classify it as padding
1546 but that is not the case here */
1548 errmsg = "partial character in shift sequence";
1549 goto utf7Error;
1551 /* According to RFC2152 the remaining bits should be zero. We
1552 choose to signal an error/insert a replacement character
1553 here so indicate the potential of a misencoded character. */
1555 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1556 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1557 errmsg = "non-zero padding bits in shift sequence";
1558 goto utf7Error;
1561 if (ch == '-') {
1562 if ((s < e) && (*(s) == '-')) {
1563 *p++ = '-';
1564 inShift = 1;
1566 } else if (SPECIAL(ch,0,0)) {
1567 errmsg = "unexpected special character";
1568 goto utf7Error;
1569 } else {
1570 *p++ = ch;
1572 } else {
/* base64 byte: append its six bits and flush complete 16-bit units */
1573 charsleft = (charsleft << 6) | UB64(ch);
1574 bitsleft += 6;
1575 s++;
1576 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1579 else if ( ch == '+' ) {
/* "+-" is a literal '+'; any other '+' opens a shift sequence */
1580 startinpos = s-starts;
1581 s++;
1582 if (s < e && *s == '-') {
1583 s++;
1584 *p++ = '+';
1585 } else
1587 inShift = 1;
1588 bitsleft = 0;
1591 else if (SPECIAL(ch,0,0)) {
1592 startinpos = s-starts;
1593 errmsg = "unexpected special character";
1594 s++;
1595 goto utf7Error;
1597 else {
1598 *p++ = ch;
1599 s++;
1601 continue;
1602 utf7Error:
1603 outpos = p-PyUnicode_AS_UNICODE(unicode);
1604 endinpos = s-starts;
1605 if (unicode_decode_call_errorhandler(
1606 errors, &errorHandler,
1607 "utf7", errmsg,
1608 starts, size, &startinpos, &endinpos, &exc, &s,
1609 &unicode, &outpos, &p))
1610 goto onError;
/* input exhausted inside a shift sequence and the caller is not stateful */
1613 if (inShift && !consumed) {
1614 outpos = p-PyUnicode_AS_UNICODE(unicode);
1615 endinpos = size;
1616 if (unicode_decode_call_errorhandler(
1617 errors, &errorHandler,
1618 "utf7", "unterminated shift sequence",
1619 starts, size, &startinpos, &endinpos, &exc, &s,
1620 &unicode, &outpos, &p))
1621 goto onError;
1622 if (s < e)
1623 goto restart;
1625 if (consumed) {
1626 if(inShift)
1627 *consumed = startinpos;
1628 else
1629 *consumed = s-starts;
/* shrink the result to the number of code units actually written */
1632 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1633 goto onError;
1635 Py_XDECREF(errorHandler);
1636 Py_XDECREF(exc);
1637 return (PyObject *)unicode;
1639 onError:
1640 Py_XDECREF(errorHandler);
1641 Py_XDECREF(exc);
1642 Py_DECREF(unicode);
1643 return NULL;
/* Encode `size` code units starting at `s` as UTF-7 and return a string.

   s, size          -- input code units.
   encodeSetO       -- non-zero: characters of table class 3 (RFC2152
                       Set O) are base64-encoded instead of written as-is.
   encodeWhiteSpace -- non-zero: characters of table class 2 (whitespace)
                       are base64-encoded instead of written as-is.
   errors           -- not referenced in this function body.

   The output string is allocated with 5 * size bytes up front (with an
   overflow check on the multiplication) and shrunk to the bytes actually
   written at the end.  Returns NULL when allocation fails.

   State: inShift is non-zero while a base64 run is open; charsleft holds
   the pending bits and bitsleft counts them.
   NOTE(review): the return value of the final _PyString_Resize() call is
   not checked here; `v` is returned whatever the outcome. */
1647 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1648 Py_ssize_t size,
1649 int encodeSetO,
1650 int encodeWhiteSpace,
1651 const char *errors)
1653 PyObject *v;
1654 /* It might be possible to tighten this worst case */
1655 Py_ssize_t cbAllocated = 5 * size;
1656 int inShift = 0;
1657 Py_ssize_t i = 0;
1658 unsigned int bitsleft = 0;
1659 unsigned long charsleft = 0;
1660 char * out;
1661 char * start;
/* dividing back detects overflow of 5 * size */
1663 if (cbAllocated / 5 != size)
1664 return PyErr_NoMemory();
1666 if (size == 0)
1667 return PyString_FromStringAndSize(NULL, 0);
1669 v = PyString_FromStringAndSize(NULL, cbAllocated);
1670 if (v == NULL)
1671 return NULL;
1673 start = out = PyString_AS_STRING(v);
1674 for (;i < size; ++i) {
1675 Py_UNICODE ch = s[i];
1677 if (!inShift) {
/* a literal '+' is escaped as "+-" */
1678 if (ch == '+') {
1679 *out++ = '+';
1680 *out++ = '-';
1681 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
/* open a base64 run with this character's 16 bits */
1682 charsleft = ch;
1683 bitsleft = 16;
1684 *out++ = '+';
1685 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1686 inShift = bitsleft > 0;
1687 } else {
1688 *out++ = (char) ch;
1690 } else {
1691 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
/* flush the pending bits, padded on the right, then leave the run */
1692 *out++ = B64(charsleft << (6-bitsleft));
1693 charsleft = 0;
1694 bitsleft = 0;
1695 /* Characters not in the BASE64 set implicitly unshift the sequence
1696 so no '-' is required, except if the character is itself a '-' */
1697 if (B64CHAR(ch) || ch == '-') {
1698 *out++ = '-';
1700 inShift = 0;
1701 *out++ = (char) ch;
1702 } else {
1703 bitsleft += 16;
1704 charsleft = (charsleft << 16) | ch;
1705 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1707 /* If the next character is special then we don't need to terminate
1708 the shift sequence. If the next character is not a BASE64 character
1709 or '-' then the shift sequence will be terminated implicitly and we
1710 don't have to insert a '-'. */
1712 if (bitsleft == 0) {
1713 if (i + 1 < size) {
1714 Py_UNICODE ch2 = s[i+1];
1716 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1718 } else if (B64CHAR(ch2) || ch2 == '-') {
1719 *out++ = '-';
1720 inShift = 0;
1721 } else {
1722 inShift = 0;
1726 else {
1727 *out++ = '-';
1728 inShift = 0;
/* input ended with bits still pending: flush them and close the run */
1734 if (bitsleft) {
1735 *out++= B64(charsleft << (6-bitsleft) );
1736 *out++ = '-';
1739 _PyString_Resize(&v, out - start);
1740 return v;
/* The UTF-7 helper macros are private to the UTF-7 codec above; remove
   them so the names cannot leak into the rest of the file. */
1743 #undef SPECIAL
1744 #undef B64
1745 #undef B64CHAR
1746 #undef UB64
1747 #undef ENCODE
1748 #undef DECODE
1750 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Lookup table indexed by the first byte of a UTF-8 sequence.  Rows are
   16 entries wide: the first eight rows (0x00-0x7F) hold 1, the next four
   (0x80-0xBF) hold 0, then 2 for 0xC0-0xDF, 3 for 0xE0-0xEF, and the last
   row holds 4, 5 and 6 followed by 0 for 0xFE and 0xFF. */
1752 static
1753 char utf8_code_length[256] = {
1754 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1755 illegal prefix. see RFC 2279 for details */
1756 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1757 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1758 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1759 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1760 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1761 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1762 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1763 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1764 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1765 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1766 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1767 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1768 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1769 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1770 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1771 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1774 PyObject *PyUnicode_DecodeUTF8(const char *s,
1775 Py_ssize_t size,
1776 const char *errors)
1778 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.

   s, size   -- input buffer; the output is allocated with `size` code
                units and resized to the real length at the end.
   errors    -- error scheme name, passed to
                unicode_decode_call_errorhandler() under codec name "utf8".
   consumed  -- optional.  When non-NULL, a multi-byte sequence that runs
                past the end of the input stops decoding without an error
                and *consumed receives the number of bytes decoded.  When
                NULL the same situation is reported as
                "unexpected end of data".

   Returns the decoded object, or NULL when the error handler fails or the
   final resize fails.

   Per lead byte, utf8_code_length[] gives the sequence length n.  Lengths
   2, 3 and 4 are checked for continuation bytes (10xxxxxx) and for
   encodings that are longer than necessary.  A 4-byte sequence is stored
   as one code unit when Py_UNICODE_WIDE is defined and as a surrogate pair
   otherwise.  n == 0 is an invalid lead byte, and lengths above 4 are
   rejected as "unsupported Unicode code range". */
1781 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1782 Py_ssize_t size,
1783 const char *errors,
1784 Py_ssize_t *consumed)
1786 const char *starts = s;
1787 int n;
1788 Py_ssize_t startinpos;
1789 Py_ssize_t endinpos;
1790 Py_ssize_t outpos;
1791 const char *e;
1792 PyUnicodeObject *unicode;
1793 Py_UNICODE *p;
1794 const char *errmsg = "";
1795 PyObject *errorHandler = NULL;
1796 PyObject *exc = NULL;
1798 /* Note: size will always be longer than the resulting Unicode
1799 character count */
1800 unicode = _PyUnicode_New(size);
1801 if (!unicode)
1802 return NULL;
/* empty input: nothing to decode, report zero bytes consumed */
1803 if (size == 0) {
1804 if (consumed)
1805 *consumed = 0;
1806 return (PyObject *)unicode;
1809 /* Unpack UTF-8 encoded data */
1810 p = unicode->str;
1811 e = s + size;
1813 while (s < e) {
1814 Py_UCS4 ch = (unsigned char)*s;
/* fast path: a plain ASCII byte is copied through */
1816 if (ch < 0x80) {
1817 *p++ = (Py_UNICODE)ch;
1818 s++;
1819 continue;
1822 n = utf8_code_length[ch];
/* the sequence announced by the lead byte extends past the input */
1824 if (s + n > e) {
1825 if (consumed)
1826 break;
1827 else {
1828 errmsg = "unexpected end of data";
1829 startinpos = s-starts;
1830 endinpos = size;
1831 goto utf8Error;
1835 switch (n) {
1837 case 0:
1838 errmsg = "unexpected code byte";
1839 startinpos = s-starts;
1840 endinpos = startinpos+1;
1841 goto utf8Error;
1843 case 1:
1844 errmsg = "internal error";
1845 startinpos = s-starts;
1846 endinpos = startinpos+1;
1847 goto utf8Error;
1849 case 2:
1850 if ((s[1] & 0xc0) != 0x80) {
1851 errmsg = "invalid data";
1852 startinpos = s-starts;
1853 endinpos = startinpos+2;
1854 goto utf8Error;
1856 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1857 if (ch < 0x80) {
1858 startinpos = s-starts;
1859 endinpos = startinpos+2;
1860 errmsg = "illegal encoding";
1861 goto utf8Error;
1863 else
1864 *p++ = (Py_UNICODE)ch;
1865 break;
1867 case 3:
1868 if ((s[1] & 0xc0) != 0x80 ||
1869 (s[2] & 0xc0) != 0x80) {
1870 errmsg = "invalid data";
1871 startinpos = s-starts;
1872 endinpos = startinpos+3;
1873 goto utf8Error;
1875 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1876 if (ch < 0x0800) {
1877 /* Note: UTF-8 encodings of surrogates are considered
1878 legal UTF-8 sequences;
1880 XXX For wide builds (UCS-4) we should probably try
1881 to recombine the surrogates into a single code
1882 unit.
1884 errmsg = "illegal encoding";
1885 startinpos = s-starts;
1886 endinpos = startinpos+3;
1887 goto utf8Error;
1889 else
1890 *p++ = (Py_UNICODE)ch;
1891 break;
1893 case 4:
1894 if ((s[1] & 0xc0) != 0x80 ||
1895 (s[2] & 0xc0) != 0x80 ||
1896 (s[3] & 0xc0) != 0x80) {
1897 errmsg = "invalid data";
1898 startinpos = s-starts;
1899 endinpos = startinpos+4;
1900 goto utf8Error;
1902 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1903 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1904 /* validate and convert to UTF-16 */
1905 if ((ch < 0x10000) /* minimum value allowed for 4
1906 byte encoding */
1907 || (ch > 0x10ffff)) /* maximum value allowed for
1908 UTF-16 */
1910 errmsg = "illegal encoding";
1911 startinpos = s-starts;
1912 endinpos = startinpos+4;
1913 goto utf8Error;
1915 #ifdef Py_UNICODE_WIDE
1916 *p++ = (Py_UNICODE)ch;
1917 #else
1918 /* compute and append the two surrogates: */
1920 /* translate from 10000..10FFFF to 0..FFFF */
1921 ch -= 0x10000;
1923 /* high surrogate = top 10 bits added to D800 */
1924 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1926 /* low surrogate = bottom 10 bits added to DC00 */
1927 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1928 #endif
1929 break;
1931 default:
1932 /* Other sizes are only needed for UCS-4 */
1933 errmsg = "unsupported Unicode code range";
1934 startinpos = s-starts;
1935 endinpos = startinpos+n;
1936 goto utf8Error;
1938 s += n;
1939 continue;
1941 utf8Error:
1942 outpos = p-PyUnicode_AS_UNICODE(unicode);
1943 if (unicode_decode_call_errorhandler(
1944 errors, &errorHandler,
1945 "utf8", errmsg,
1946 starts, size, &startinpos, &endinpos, &exc, &s,
1947 &unicode, &outpos, &p))
1948 goto onError;
1950 if (consumed)
1951 *consumed = s-starts;
1953 /* Adjust length */
1954 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1955 goto onError;
1957 Py_XDECREF(errorHandler);
1958 Py_XDECREF(exc);
1959 return (PyObject *)unicode;
1961 onError:
1962 Py_XDECREF(errorHandler);
1963 Py_XDECREF(exc);
1964 Py_DECREF(unicode);
1965 return NULL;
1968 /* Allocation strategy: if the string is short, convert into a stack buffer
1969 and allocate exactly as much space needed at the end. Else allocate the
1970 maximum possible needed (4 result bytes per Unicode character), and return
1971 the excess memory at the end.
1973 PyObject *
1974 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1975 Py_ssize_t size,
1976 const char *errors)
1978 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1980 Py_ssize_t i; /* index into s of next input byte */
1981 PyObject *v; /* result string object */
1982 char *p; /* next free byte in output buffer */
1983 Py_ssize_t nallocated; /* number of result bytes allocated */
1984 Py_ssize_t nneeded; /* number of result bytes needed */
1985 char stackbuf[MAX_SHORT_UNICHARS * 4];
1987 assert(s != NULL);
1988 assert(size >= 0);
1990 if (size <= MAX_SHORT_UNICHARS) {
1991 /* Write into the stack buffer; nallocated can't overflow.
1992 * At the end, we'll allocate exactly as much heap space as it
1993 * turns out we need.
1995 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1996 v = NULL; /* will allocate after we're done */
1997 p = stackbuf;
1999 else {
2000 /* Overallocate on the heap, and give the excess back at the end. */
2001 nallocated = size * 4;
2002 if (nallocated / 4 != size) /* overflow! */
2003 return PyErr_NoMemory();
2004 v = PyString_FromStringAndSize(NULL, nallocated);
2005 if (v == NULL)
2006 return NULL;
2007 p = PyString_AS_STRING(v);
2010 for (i = 0; i < size;) {
2011 Py_UCS4 ch = s[i++];
2013 if (ch < 0x80)
2014 /* Encode ASCII */
2015 *p++ = (char) ch;
2017 else if (ch < 0x0800) {
2018 /* Encode Latin-1 */
2019 *p++ = (char)(0xc0 | (ch >> 6));
2020 *p++ = (char)(0x80 | (ch & 0x3f));
2022 else {
2023 /* Encode UCS2 Unicode ordinals */
2024 if (ch < 0x10000) {
2025 /* Special case: check for high surrogate */
2026 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2027 Py_UCS4 ch2 = s[i];
2028 /* Check for low surrogate and combine the two to
2029 form a UCS4 value */
2030 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2031 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2032 i++;
2033 goto encodeUCS4;
2035 /* Fall through: handles isolated high surrogates */
2037 *p++ = (char)(0xe0 | (ch >> 12));
2038 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2039 *p++ = (char)(0x80 | (ch & 0x3f));
2040 continue;
2042 encodeUCS4:
2043 /* Encode UCS4 Unicode ordinals */
2044 *p++ = (char)(0xf0 | (ch >> 18));
2045 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2046 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2047 *p++ = (char)(0x80 | (ch & 0x3f));
2051 if (v == NULL) {
2052 /* This was stack allocated. */
2053 nneeded = p - stackbuf;
2054 assert(nneeded <= nallocated);
2055 v = PyString_FromStringAndSize(stackbuf, nneeded);
2057 else {
2058 /* Cut back to size actually needed. */
2059 nneeded = p - PyString_AS_STRING(v);
2060 assert(nneeded <= nallocated);
2061 _PyString_Resize(&v, nneeded);
2063 return v;
2065 #undef MAX_SHORT_UNICHARS
2068 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2070 if (!PyUnicode_Check(unicode)) {
2071 PyErr_BadArgument();
2072 return NULL;
2074 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2075 PyUnicode_GET_SIZE(unicode),
2076 NULL);
2079 /* --- UTF-32 Codec ------------------------------------------------------- */
2081 PyObject *
2082 PyUnicode_DecodeUTF32(const char *s,
2083 Py_ssize_t size,
2084 const char *errors,
2085 int *byteorder)
2087 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2090 PyObject *
2091 PyUnicode_DecodeUTF32Stateful(const char *s,
2092 Py_ssize_t size,
2093 const char *errors,
2094 int *byteorder,
2095 Py_ssize_t *consumed)
2097 const char *starts = s;
2098 Py_ssize_t startinpos;
2099 Py_ssize_t endinpos;
2100 Py_ssize_t outpos;
2101 PyUnicodeObject *unicode;
2102 Py_UNICODE *p;
2103 #ifndef Py_UNICODE_WIDE
2104 int i, pairs;
2105 #else
2106 const int pairs = 0;
2107 #endif
2108 const unsigned char *q, *e;
2109 int bo = 0; /* assume native ordering by default */
2110 const char *errmsg = "";
2111 /* Offsets from q for retrieving bytes in the right order. */
2112 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2113 int iorder[] = {0, 1, 2, 3};
2114 #else
2115 int iorder[] = {3, 2, 1, 0};
2116 #endif
2117 PyObject *errorHandler = NULL;
2118 PyObject *exc = NULL;
2119 /* On narrow builds we split characters outside the BMP into two
2120 codepoints => count how much extra space we need. */
2121 #ifndef Py_UNICODE_WIDE
2122 for (i = pairs = 0; i < size/4; i++)
2123 if (((Py_UCS4 *)s)[i] >= 0x10000)
2124 pairs++;
2125 #endif
2127 /* This might be one to much, because of a BOM */
2128 unicode = _PyUnicode_New((size+3)/4+pairs);
2129 if (!unicode)
2130 return NULL;
2131 if (size == 0)
2132 return (PyObject *)unicode;
2134 /* Unpack UTF-32 encoded data */
2135 p = unicode->str;
2136 q = (unsigned char *)s;
2137 e = q + size;
2139 if (byteorder)
2140 bo = *byteorder;
2142 /* Check for BOM marks (U+FEFF) in the input and adjust current
2143 byte order setting accordingly. In native mode, the leading BOM
2144 mark is skipped, in all other modes, it is copied to the output
2145 stream as-is (giving a ZWNBSP character). */
2146 if (bo == 0) {
2147 if (size >= 4) {
2148 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2149 (q[iorder[1]] << 8) | q[iorder[0]];
2150 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2151 if (bom == 0x0000FEFF) {
2152 q += 4;
2153 bo = -1;
2155 else if (bom == 0xFFFE0000) {
2156 q += 4;
2157 bo = 1;
2159 #else
2160 if (bom == 0x0000FEFF) {
2161 q += 4;
2162 bo = 1;
2164 else if (bom == 0xFFFE0000) {
2165 q += 4;
2166 bo = -1;
2168 #endif
2172 if (bo == -1) {
2173 /* force LE */
2174 iorder[0] = 0;
2175 iorder[1] = 1;
2176 iorder[2] = 2;
2177 iorder[3] = 3;
2179 else if (bo == 1) {
2180 /* force BE */
2181 iorder[0] = 3;
2182 iorder[1] = 2;
2183 iorder[2] = 1;
2184 iorder[3] = 0;
2187 while (q < e) {
2188 Py_UCS4 ch;
2189 /* remaining bytes at the end? (size should be divisible by 4) */
2190 if (e-q<4) {
2191 if (consumed)
2192 break;
2193 errmsg = "truncated data";
2194 startinpos = ((const char *)q)-starts;
2195 endinpos = ((const char *)e)-starts;
2196 goto utf32Error;
2197 /* The remaining input chars are ignored if the callback
2198 chooses to skip the input */
2200 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2201 (q[iorder[1]] << 8) | q[iorder[0]];
2203 if (ch >= 0x110000)
2205 errmsg = "codepoint not in range(0x110000)";
2206 startinpos = ((const char *)q)-starts;
2207 endinpos = startinpos+4;
2208 goto utf32Error;
2210 #ifndef Py_UNICODE_WIDE
2211 if (ch >= 0x10000)
2213 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2214 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2216 else
2217 #endif
2218 *p++ = ch;
2219 q += 4;
2220 continue;
2221 utf32Error:
2222 outpos = p-PyUnicode_AS_UNICODE(unicode);
2223 if (unicode_decode_call_errorhandler(
2224 errors, &errorHandler,
2225 "utf32", errmsg,
2226 starts, size, &startinpos, &endinpos, &exc, &s,
2227 &unicode, &outpos, &p))
2228 goto onError;
2231 if (byteorder)
2232 *byteorder = bo;
2234 if (consumed)
2235 *consumed = (const char *)q-starts;
2237 /* Adjust length */
2238 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2239 goto onError;
2241 Py_XDECREF(errorHandler);
2242 Py_XDECREF(exc);
2243 return (PyObject *)unicode;
2245 onError:
2246 Py_DECREF(unicode);
2247 Py_XDECREF(errorHandler);
2248 Py_XDECREF(exc);
2249 return NULL;
2252 PyObject *
2253 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2254 Py_ssize_t size,
2255 const char *errors,
2256 int byteorder)
2258 PyObject *v;
2259 unsigned char *p;
2260 Py_ssize_t nsize, bytesize;
2261 #ifndef Py_UNICODE_WIDE
2262 Py_ssize_t i, pairs;
2263 #else
2264 const int pairs = 0;
2265 #endif
2266 /* Offsets from p for storing byte pairs in the right order. */
2267 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2268 int iorder[] = {0, 1, 2, 3};
2269 #else
2270 int iorder[] = {3, 2, 1, 0};
2271 #endif
2273 #define STORECHAR(CH) \
2274 do { \
2275 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2276 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2277 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2278 p[iorder[0]] = (CH) & 0xff; \
2279 p += 4; \
2280 } while(0)
2282 /* In narrow builds we can output surrogate pairs as one codepoint,
2283 so we need less space. */
2284 #ifndef Py_UNICODE_WIDE
2285 for (i = pairs = 0; i < size-1; i++)
2286 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2287 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2288 pairs++;
2289 #endif
2290 nsize = (size - pairs + (byteorder == 0));
2291 bytesize = nsize * 4;
2292 if (bytesize / 4 != nsize)
2293 return PyErr_NoMemory();
2294 v = PyString_FromStringAndSize(NULL, bytesize);
2295 if (v == NULL)
2296 return NULL;
2298 p = (unsigned char *)PyString_AS_STRING(v);
2299 if (byteorder == 0)
2300 STORECHAR(0xFEFF);
2301 if (size == 0)
2302 return v;
2304 if (byteorder == -1) {
2305 /* force LE */
2306 iorder[0] = 0;
2307 iorder[1] = 1;
2308 iorder[2] = 2;
2309 iorder[3] = 3;
2311 else if (byteorder == 1) {
2312 /* force BE */
2313 iorder[0] = 3;
2314 iorder[1] = 2;
2315 iorder[2] = 1;
2316 iorder[3] = 0;
2319 while (size-- > 0) {
2320 Py_UCS4 ch = *s++;
2321 #ifndef Py_UNICODE_WIDE
2322 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2323 Py_UCS4 ch2 = *s;
2324 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2325 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2326 s++;
2327 size--;
2330 #endif
2331 STORECHAR(ch);
2333 return v;
2334 #undef STORECHAR
/* Encode a unicode object as UTF-32 by handing its buffer and length to
   PyUnicode_EncodeUTF32() with a NULL error scheme.  Returns NULL after
   PyErr_BadArgument() when `unicode` fails PyUnicode_Check().
   NOTE(review): the byte-order argument of the call is not visible in
   this listing -- confirm against the full source. */
2337 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2339 if (!PyUnicode_Check(unicode)) {
2340 PyErr_BadArgument();
2341 return NULL;
2343 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2344 PyUnicode_GET_SIZE(unicode),
2345 NULL,
2349 /* --- UTF-16 Codec ------------------------------------------------------- */
2351 PyObject *
2352 PyUnicode_DecodeUTF16(const char *s,
2353 Py_ssize_t size,
2354 const char *errors,
2355 int *byteorder)
2357 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2360 PyObject *
2361 PyUnicode_DecodeUTF16Stateful(const char *s,
2362 Py_ssize_t size,
2363 const char *errors,
2364 int *byteorder,
2365 Py_ssize_t *consumed)
2367 const char *starts = s;
2368 Py_ssize_t startinpos;
2369 Py_ssize_t endinpos;
2370 Py_ssize_t outpos;
2371 PyUnicodeObject *unicode;
2372 Py_UNICODE *p;
2373 const unsigned char *q, *e;
2374 int bo = 0; /* assume native ordering by default */
2375 const char *errmsg = "";
2376 /* Offsets from q for retrieving byte pairs in the right order. */
2377 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2378 int ihi = 1, ilo = 0;
2379 #else
2380 int ihi = 0, ilo = 1;
2381 #endif
2382 PyObject *errorHandler = NULL;
2383 PyObject *exc = NULL;
2385 /* Note: size will always be longer than the resulting Unicode
2386 character count */
2387 unicode = _PyUnicode_New(size);
2388 if (!unicode)
2389 return NULL;
2390 if (size == 0)
2391 return (PyObject *)unicode;
2393 /* Unpack UTF-16 encoded data */
2394 p = unicode->str;
2395 q = (unsigned char *)s;
2396 e = q + size;
2398 if (byteorder)
2399 bo = *byteorder;
2401 /* Check for BOM marks (U+FEFF) in the input and adjust current
2402 byte order setting accordingly. In native mode, the leading BOM
2403 mark is skipped, in all other modes, it is copied to the output
2404 stream as-is (giving a ZWNBSP character). */
2405 if (bo == 0) {
2406 if (size >= 2) {
2407 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2408 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2409 if (bom == 0xFEFF) {
2410 q += 2;
2411 bo = -1;
2413 else if (bom == 0xFFFE) {
2414 q += 2;
2415 bo = 1;
2417 #else
2418 if (bom == 0xFEFF) {
2419 q += 2;
2420 bo = 1;
2422 else if (bom == 0xFFFE) {
2423 q += 2;
2424 bo = -1;
2426 #endif
2430 if (bo == -1) {
2431 /* force LE */
2432 ihi = 1;
2433 ilo = 0;
2435 else if (bo == 1) {
2436 /* force BE */
2437 ihi = 0;
2438 ilo = 1;
2441 while (q < e) {
2442 Py_UNICODE ch;
2443 /* remaining bytes at the end? (size should be even) */
2444 if (e-q<2) {
2445 if (consumed)
2446 break;
2447 errmsg = "truncated data";
2448 startinpos = ((const char *)q)-starts;
2449 endinpos = ((const char *)e)-starts;
2450 goto utf16Error;
2451 /* The remaining input chars are ignored if the callback
2452 chooses to skip the input */
2454 ch = (q[ihi] << 8) | q[ilo];
2456 q += 2;
2458 if (ch < 0xD800 || ch > 0xDFFF) {
2459 *p++ = ch;
2460 continue;
2463 /* UTF-16 code pair: */
2464 if (q >= e) {
2465 errmsg = "unexpected end of data";
2466 startinpos = (((const char *)q)-2)-starts;
2467 endinpos = ((const char *)e)-starts;
2468 goto utf16Error;
2470 if (0xD800 <= ch && ch <= 0xDBFF) {
2471 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2472 q += 2;
2473 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2474 #ifndef Py_UNICODE_WIDE
2475 *p++ = ch;
2476 *p++ = ch2;
2477 #else
2478 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2479 #endif
2480 continue;
2482 else {
2483 errmsg = "illegal UTF-16 surrogate";
2484 startinpos = (((const char *)q)-4)-starts;
2485 endinpos = startinpos+2;
2486 goto utf16Error;
2490 errmsg = "illegal encoding";
2491 startinpos = (((const char *)q)-2)-starts;
2492 endinpos = startinpos+2;
2493 /* Fall through to report the error */
2495 utf16Error:
2496 outpos = p-PyUnicode_AS_UNICODE(unicode);
2497 if (unicode_decode_call_errorhandler(
2498 errors, &errorHandler,
2499 "utf16", errmsg,
2500 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2501 &unicode, &outpos, &p))
2502 goto onError;
2505 if (byteorder)
2506 *byteorder = bo;
2508 if (consumed)
2509 *consumed = (const char *)q-starts;
2511 /* Adjust length */
2512 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2513 goto onError;
2515 Py_XDECREF(errorHandler);
2516 Py_XDECREF(exc);
2517 return (PyObject *)unicode;
2519 onError:
2520 Py_DECREF(unicode);
2521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
2523 return NULL;
2526 PyObject *
2527 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2528 Py_ssize_t size,
2529 const char *errors,
2530 int byteorder)
2532 PyObject *v;
2533 unsigned char *p;
2534 Py_ssize_t nsize, bytesize;
2535 #ifdef Py_UNICODE_WIDE
2536 Py_ssize_t i, pairs;
2537 #else
2538 const int pairs = 0;
2539 #endif
2540 /* Offsets from p for storing byte pairs in the right order. */
2541 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2542 int ihi = 1, ilo = 0;
2543 #else
2544 int ihi = 0, ilo = 1;
2545 #endif
2547 #define STORECHAR(CH) \
2548 do { \
2549 p[ihi] = ((CH) >> 8) & 0xff; \
2550 p[ilo] = (CH) & 0xff; \
2551 p += 2; \
2552 } while(0)
2554 #ifdef Py_UNICODE_WIDE
2555 for (i = pairs = 0; i < size; i++)
2556 if (s[i] >= 0x10000)
2557 pairs++;
2558 #endif
2559 /* 2 * (size + pairs + (byteorder == 0)) */
2560 if (size > PY_SSIZE_T_MAX ||
2561 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2562 return PyErr_NoMemory();
2563 nsize = size + pairs + (byteorder == 0);
2564 bytesize = nsize * 2;
2565 if (bytesize / 2 != nsize)
2566 return PyErr_NoMemory();
2567 v = PyString_FromStringAndSize(NULL, bytesize);
2568 if (v == NULL)
2569 return NULL;
2571 p = (unsigned char *)PyString_AS_STRING(v);
2572 if (byteorder == 0)
2573 STORECHAR(0xFEFF);
2574 if (size == 0)
2575 return v;
2577 if (byteorder == -1) {
2578 /* force LE */
2579 ihi = 1;
2580 ilo = 0;
2582 else if (byteorder == 1) {
2583 /* force BE */
2584 ihi = 0;
2585 ilo = 1;
2588 while (size-- > 0) {
2589 Py_UNICODE ch = *s++;
2590 Py_UNICODE ch2 = 0;
2591 #ifdef Py_UNICODE_WIDE
2592 if (ch >= 0x10000) {
2593 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2594 ch = 0xD800 | ((ch-0x10000) >> 10);
2596 #endif
2597 STORECHAR(ch);
2598 if (ch2)
2599 STORECHAR(ch2);
2601 return v;
2602 #undef STORECHAR
2605 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2607 if (!PyUnicode_Check(unicode)) {
2608 PyErr_BadArgument();
2609 return NULL;
2611 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2612 PyUnicode_GET_SIZE(unicode),
2613 NULL,
2617 /* --- Unicode Escape Codec ----------------------------------------------- */
/* Cached pointer to the name-lookup C API exported by the unicodedata
   module as "ucnhash_CAPI".  It stays NULL until the first \N{...} escape
   is decoded by PyUnicode_DecodeUnicodeEscape(), which loads it on demand. */
2619 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2621 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2622 Py_ssize_t size,
2623 const char *errors)
2625 const char *starts = s;
2626 Py_ssize_t startinpos;
2627 Py_ssize_t endinpos;
2628 Py_ssize_t outpos;
2629 int i;
2630 PyUnicodeObject *v;
2631 Py_UNICODE *p;
2632 const char *end;
2633 char* message;
2634 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2635 PyObject *errorHandler = NULL;
2636 PyObject *exc = NULL;
2638 /* Escaped strings will always be longer than the resulting
2639 Unicode string, so we start with size here and then reduce the
2640 length after conversion to the true value.
2641 (but if the error callback returns a long replacement string
2642 we'll have to allocate more space) */
2643 v = _PyUnicode_New(size);
2644 if (v == NULL)
2645 goto onError;
2646 if (size == 0)
2647 return (PyObject *)v;
2649 p = PyUnicode_AS_UNICODE(v);
2650 end = s + size;
2652 while (s < end) {
2653 unsigned char c;
2654 Py_UNICODE x;
2655 int digits;
2657 /* Non-escape characters are interpreted as Unicode ordinals */
2658 if (*s != '\\') {
2659 *p++ = (unsigned char) *s++;
2660 continue;
2663 startinpos = s-starts;
2664 /* \ - Escapes */
2665 s++;
2666 c = *s++;
2667 if (s > end)
2668 c = '\0'; /* Invalid after \ */
2669 switch (c) {
2671 /* \x escapes */
2672 case '\n': break;
2673 case '\\': *p++ = '\\'; break;
2674 case '\'': *p++ = '\''; break;
2675 case '\"': *p++ = '\"'; break;
2676 case 'b': *p++ = '\b'; break;
2677 case 'f': *p++ = '\014'; break; /* FF */
2678 case 't': *p++ = '\t'; break;
2679 case 'n': *p++ = '\n'; break;
2680 case 'r': *p++ = '\r'; break;
2681 case 'v': *p++ = '\013'; break; /* VT */
2682 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2684 /* \OOO (octal) escapes */
2685 case '0': case '1': case '2': case '3':
2686 case '4': case '5': case '6': case '7':
2687 x = s[-1] - '0';
2688 if (s < end && '0' <= *s && *s <= '7') {
2689 x = (x<<3) + *s++ - '0';
2690 if (s < end && '0' <= *s && *s <= '7')
2691 x = (x<<3) + *s++ - '0';
2693 *p++ = x;
2694 break;
2696 /* hex escapes */
2697 /* \xXX */
2698 case 'x':
2699 digits = 2;
2700 message = "truncated \\xXX escape";
2701 goto hexescape;
2703 /* \uXXXX */
2704 case 'u':
2705 digits = 4;
2706 message = "truncated \\uXXXX escape";
2707 goto hexescape;
2709 /* \UXXXXXXXX */
2710 case 'U':
2711 digits = 8;
2712 message = "truncated \\UXXXXXXXX escape";
2713 hexescape:
2714 chr = 0;
2715 outpos = p-PyUnicode_AS_UNICODE(v);
2716 if (s+digits>end) {
2717 endinpos = size;
2718 if (unicode_decode_call_errorhandler(
2719 errors, &errorHandler,
2720 "unicodeescape", "end of string in escape sequence",
2721 starts, size, &startinpos, &endinpos, &exc, &s,
2722 &v, &outpos, &p))
2723 goto onError;
2724 goto nextByte;
2726 for (i = 0; i < digits; ++i) {
2727 c = (unsigned char) s[i];
2728 if (!isxdigit(c)) {
2729 endinpos = (s+i+1)-starts;
2730 if (unicode_decode_call_errorhandler(
2731 errors, &errorHandler,
2732 "unicodeescape", message,
2733 starts, size, &startinpos, &endinpos, &exc, &s,
2734 &v, &outpos, &p))
2735 goto onError;
2736 goto nextByte;
2738 chr = (chr<<4) & ~0xF;
2739 if (c >= '0' && c <= '9')
2740 chr += c - '0';
2741 else if (c >= 'a' && c <= 'f')
2742 chr += 10 + c - 'a';
2743 else
2744 chr += 10 + c - 'A';
2746 s += i;
2747 if (chr == 0xffffffff && PyErr_Occurred())
2748 /* _decoding_error will have already written into the
2749 target buffer. */
2750 break;
2751 store:
2752 /* when we get here, chr is a 32-bit unicode character */
2753 if (chr <= 0xffff)
2754 /* UCS-2 character */
2755 *p++ = (Py_UNICODE) chr;
2756 else if (chr <= 0x10ffff) {
2757 /* UCS-4 character. Either store directly, or as
2758 surrogate pair. */
2759 #ifdef Py_UNICODE_WIDE
2760 *p++ = chr;
2761 #else
2762 chr -= 0x10000L;
2763 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2764 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2765 #endif
2766 } else {
2767 endinpos = s-starts;
2768 outpos = p-PyUnicode_AS_UNICODE(v);
2769 if (unicode_decode_call_errorhandler(
2770 errors, &errorHandler,
2771 "unicodeescape", "illegal Unicode character",
2772 starts, size, &startinpos, &endinpos, &exc, &s,
2773 &v, &outpos, &p))
2774 goto onError;
2776 break;
2778 /* \N{name} */
2779 case 'N':
2780 message = "malformed \\N character escape";
2781 if (ucnhash_CAPI == NULL) {
2782 /* load the unicode data module */
2783 PyObject *m, *api;
2784 m = PyImport_ImportModuleNoBlock("unicodedata");
2785 if (m == NULL)
2786 goto ucnhashError;
2787 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2788 Py_DECREF(m);
2789 if (api == NULL)
2790 goto ucnhashError;
2791 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2792 Py_DECREF(api);
2793 if (ucnhash_CAPI == NULL)
2794 goto ucnhashError;
2796 if (*s == '{') {
2797 const char *start = s+1;
2798 /* look for the closing brace */
2799 while (*s != '}' && s < end)
2800 s++;
2801 if (s > start && s < end && *s == '}') {
2802 /* found a name. look it up in the unicode database */
2803 message = "unknown Unicode character name";
2804 s++;
2805 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2806 goto store;
2809 endinpos = s-starts;
2810 outpos = p-PyUnicode_AS_UNICODE(v);
2811 if (unicode_decode_call_errorhandler(
2812 errors, &errorHandler,
2813 "unicodeescape", message,
2814 starts, size, &startinpos, &endinpos, &exc, &s,
2815 &v, &outpos, &p))
2816 goto onError;
2817 break;
2819 default:
2820 if (s > end) {
2821 message = "\\ at end of string";
2822 s--;
2823 endinpos = s-starts;
2824 outpos = p-PyUnicode_AS_UNICODE(v);
2825 if (unicode_decode_call_errorhandler(
2826 errors, &errorHandler,
2827 "unicodeescape", message,
2828 starts, size, &startinpos, &endinpos, &exc, &s,
2829 &v, &outpos, &p))
2830 goto onError;
2832 else {
2833 *p++ = '\\';
2834 *p++ = (unsigned char)s[-1];
2836 break;
2838 nextByte:
2841 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2842 goto onError;
2843 Py_XDECREF(errorHandler);
2844 Py_XDECREF(exc);
2845 return (PyObject *)v;
2847 ucnhashError:
2848 PyErr_SetString(
2849 PyExc_UnicodeError,
2850 "\\N escapes not supported (can't load unicodedata module)"
2852 Py_XDECREF(v);
2853 Py_XDECREF(errorHandler);
2854 Py_XDECREF(exc);
2855 return NULL;
2857 onError:
2858 Py_XDECREF(v);
2859 Py_XDECREF(errorHandler);
2860 Py_XDECREF(exc);
2861 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2871 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2872 Py_ssize_t size,
2873 Py_UNICODE ch)
2875 /* like wcschr, but doesn't stop at NULL characters */
2877 while (size-- > 0) {
2878 if (*s == ch)
2879 return s;
2880 s++;
2883 return NULL;
2886 static
2887 PyObject *unicodeescape_string(const Py_UNICODE *s,
2888 Py_ssize_t size,
2889 int quotes)
2891 PyObject *repr;
2892 char *p;
2894 static const char *hexdigit = "0123456789abcdef";
2895 #ifdef Py_UNICODE_WIDE
2896 const Py_ssize_t expandsize = 10;
2897 #else
2898 const Py_ssize_t expandsize = 6;
2899 #endif
2901 /* XXX(nnorwitz): rather than over-allocating, it would be
2902 better to choose a different scheme. Perhaps scan the
2903 first N-chars of the string and allocate based on that size.
2905 /* Initial allocation is based on the longest-possible unichr
2906 escape.
2908 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2909 unichr, so in this case it's the longest unichr escape. In
2910 narrow (UTF-16) builds this is five chars per source unichr
2911 since there are two unichrs in the surrogate pair, so in narrow
2912 (UTF-16) builds it's not the longest unichr escape.
2914 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2915 so in the narrow (UTF-16) build case it's the longest unichr
2916 escape.
2919 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2920 return PyErr_NoMemory();
2922 repr = PyString_FromStringAndSize(NULL,
2924 + expandsize*size
2925 + 1);
2926 if (repr == NULL)
2927 return NULL;
2929 p = PyString_AS_STRING(repr);
2931 if (quotes) {
2932 *p++ = 'u';
2933 *p++ = (findchar(s, size, '\'') &&
2934 !findchar(s, size, '"')) ? '"' : '\'';
2936 while (size-- > 0) {
2937 Py_UNICODE ch = *s++;
2939 /* Escape quotes and backslashes */
2940 if ((quotes &&
2941 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2942 *p++ = '\\';
2943 *p++ = (char) ch;
2944 continue;
2947 #ifdef Py_UNICODE_WIDE
2948 /* Map 21-bit characters to '\U00xxxxxx' */
2949 else if (ch >= 0x10000) {
2950 *p++ = '\\';
2951 *p++ = 'U';
2952 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2953 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2954 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2955 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2956 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2957 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2958 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2959 *p++ = hexdigit[ch & 0x0000000F];
2960 continue;
2962 #else
2963 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2964 else if (ch >= 0xD800 && ch < 0xDC00) {
2965 Py_UNICODE ch2;
2966 Py_UCS4 ucs;
2968 ch2 = *s++;
2969 size--;
2970 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2971 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2972 *p++ = '\\';
2973 *p++ = 'U';
2974 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2975 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2976 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2977 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2978 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2979 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2980 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2981 *p++ = hexdigit[ucs & 0x0000000F];
2982 continue;
2984 /* Fall through: isolated surrogates are copied as-is */
2985 s--;
2986 size++;
2988 #endif
2990 /* Map 16-bit characters to '\uxxxx' */
2991 if (ch >= 256) {
2992 *p++ = '\\';
2993 *p++ = 'u';
2994 *p++ = hexdigit[(ch >> 12) & 0x000F];
2995 *p++ = hexdigit[(ch >> 8) & 0x000F];
2996 *p++ = hexdigit[(ch >> 4) & 0x000F];
2997 *p++ = hexdigit[ch & 0x000F];
3000 /* Map special whitespace to '\t', \n', '\r' */
3001 else if (ch == '\t') {
3002 *p++ = '\\';
3003 *p++ = 't';
3005 else if (ch == '\n') {
3006 *p++ = '\\';
3007 *p++ = 'n';
3009 else if (ch == '\r') {
3010 *p++ = '\\';
3011 *p++ = 'r';
3014 /* Map non-printable US ASCII to '\xhh' */
3015 else if (ch < ' ' || ch >= 0x7F) {
3016 *p++ = '\\';
3017 *p++ = 'x';
3018 *p++ = hexdigit[(ch >> 4) & 0x000F];
3019 *p++ = hexdigit[ch & 0x000F];
3022 /* Copy everything else as-is */
3023 else
3024 *p++ = (char) ch;
3026 if (quotes)
3027 *p++ = PyString_AS_STRING(repr)[1];
3029 *p = '\0';
3030 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3031 return repr;
3034 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3035 Py_ssize_t size)
3037 return unicodeescape_string(s, size, 0);
3040 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3042 if (!PyUnicode_Check(unicode)) {
3043 PyErr_BadArgument();
3044 return NULL;
3046 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3047 PyUnicode_GET_SIZE(unicode));
3050 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3052 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3053 Py_ssize_t size,
3054 const char *errors)
3056 const char *starts = s;
3057 Py_ssize_t startinpos;
3058 Py_ssize_t endinpos;
3059 Py_ssize_t outpos;
3060 PyUnicodeObject *v;
3061 Py_UNICODE *p;
3062 const char *end;
3063 const char *bs;
3064 PyObject *errorHandler = NULL;
3065 PyObject *exc = NULL;
3067 /* Escaped strings will always be longer than the resulting
3068 Unicode string, so we start with size here and then reduce the
3069 length after conversion to the true value. (But decoding error
3070 handler might have to resize the string) */
3071 v = _PyUnicode_New(size);
3072 if (v == NULL)
3073 goto onError;
3074 if (size == 0)
3075 return (PyObject *)v;
3076 p = PyUnicode_AS_UNICODE(v);
3077 end = s + size;
3078 while (s < end) {
3079 unsigned char c;
3080 Py_UCS4 x;
3081 int i;
3082 int count;
3084 /* Non-escape characters are interpreted as Unicode ordinals */
3085 if (*s != '\\') {
3086 *p++ = (unsigned char)*s++;
3087 continue;
3089 startinpos = s-starts;
3091 /* \u-escapes are only interpreted iff the number of leading
3092 backslashes if odd */
3093 bs = s;
3094 for (;s < end;) {
3095 if (*s != '\\')
3096 break;
3097 *p++ = (unsigned char)*s++;
3099 if (((s - bs) & 1) == 0 ||
3100 s >= end ||
3101 (*s != 'u' && *s != 'U')) {
3102 continue;
3104 p--;
3105 count = *s=='u' ? 4 : 8;
3106 s++;
3108 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3109 outpos = p-PyUnicode_AS_UNICODE(v);
3110 for (x = 0, i = 0; i < count; ++i, ++s) {
3111 c = (unsigned char)*s;
3112 if (!isxdigit(c)) {
3113 endinpos = s-starts;
3114 if (unicode_decode_call_errorhandler(
3115 errors, &errorHandler,
3116 "rawunicodeescape", "truncated \\uXXXX",
3117 starts, size, &startinpos, &endinpos, &exc, &s,
3118 &v, &outpos, &p))
3119 goto onError;
3120 goto nextByte;
3122 x = (x<<4) & ~0xF;
3123 if (c >= '0' && c <= '9')
3124 x += c - '0';
3125 else if (c >= 'a' && c <= 'f')
3126 x += 10 + c - 'a';
3127 else
3128 x += 10 + c - 'A';
3130 if (x <= 0xffff)
3131 /* UCS-2 character */
3132 *p++ = (Py_UNICODE) x;
3133 else if (x <= 0x10ffff) {
3134 /* UCS-4 character. Either store directly, or as
3135 surrogate pair. */
3136 #ifdef Py_UNICODE_WIDE
3137 *p++ = (Py_UNICODE) x;
3138 #else
3139 x -= 0x10000L;
3140 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3141 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3142 #endif
3143 } else {
3144 endinpos = s-starts;
3145 outpos = p-PyUnicode_AS_UNICODE(v);
3146 if (unicode_decode_call_errorhandler(
3147 errors, &errorHandler,
3148 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3149 starts, size, &startinpos, &endinpos, &exc, &s,
3150 &v, &outpos, &p))
3151 goto onError;
3153 nextByte:
3156 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3157 goto onError;
3158 Py_XDECREF(errorHandler);
3159 Py_XDECREF(exc);
3160 return (PyObject *)v;
3162 onError:
3163 Py_XDECREF(v);
3164 Py_XDECREF(errorHandler);
3165 Py_XDECREF(exc);
3166 return NULL;
3169 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3170 Py_ssize_t size)
3172 PyObject *repr;
3173 char *p;
3174 char *q;
3176 static const char *hexdigit = "0123456789abcdef";
3177 #ifdef Py_UNICODE_WIDE
3178 const Py_ssize_t expandsize = 10;
3179 #else
3180 const Py_ssize_t expandsize = 6;
3181 #endif
3183 if (size > PY_SSIZE_T_MAX / expandsize)
3184 return PyErr_NoMemory();
3186 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3187 if (repr == NULL)
3188 return NULL;
3189 if (size == 0)
3190 return repr;
3192 p = q = PyString_AS_STRING(repr);
3193 while (size-- > 0) {
3194 Py_UNICODE ch = *s++;
3195 #ifdef Py_UNICODE_WIDE
3196 /* Map 32-bit characters to '\Uxxxxxxxx' */
3197 if (ch >= 0x10000) {
3198 *p++ = '\\';
3199 *p++ = 'U';
3200 *p++ = hexdigit[(ch >> 28) & 0xf];
3201 *p++ = hexdigit[(ch >> 24) & 0xf];
3202 *p++ = hexdigit[(ch >> 20) & 0xf];
3203 *p++ = hexdigit[(ch >> 16) & 0xf];
3204 *p++ = hexdigit[(ch >> 12) & 0xf];
3205 *p++ = hexdigit[(ch >> 8) & 0xf];
3206 *p++ = hexdigit[(ch >> 4) & 0xf];
3207 *p++ = hexdigit[ch & 15];
3209 else
3210 #else
3211 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3212 if (ch >= 0xD800 && ch < 0xDC00) {
3213 Py_UNICODE ch2;
3214 Py_UCS4 ucs;
3216 ch2 = *s++;
3217 size--;
3218 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3219 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3220 *p++ = '\\';
3221 *p++ = 'U';
3222 *p++ = hexdigit[(ucs >> 28) & 0xf];
3223 *p++ = hexdigit[(ucs >> 24) & 0xf];
3224 *p++ = hexdigit[(ucs >> 20) & 0xf];
3225 *p++ = hexdigit[(ucs >> 16) & 0xf];
3226 *p++ = hexdigit[(ucs >> 12) & 0xf];
3227 *p++ = hexdigit[(ucs >> 8) & 0xf];
3228 *p++ = hexdigit[(ucs >> 4) & 0xf];
3229 *p++ = hexdigit[ucs & 0xf];
3230 continue;
3232 /* Fall through: isolated surrogates are copied as-is */
3233 s--;
3234 size++;
3236 #endif
3237 /* Map 16-bit characters to '\uxxxx' */
3238 if (ch >= 256) {
3239 *p++ = '\\';
3240 *p++ = 'u';
3241 *p++ = hexdigit[(ch >> 12) & 0xf];
3242 *p++ = hexdigit[(ch >> 8) & 0xf];
3243 *p++ = hexdigit[(ch >> 4) & 0xf];
3244 *p++ = hexdigit[ch & 15];
3246 /* Copy everything else as-is */
3247 else
3248 *p++ = (char) ch;
3250 *p = '\0';
3251 _PyString_Resize(&repr, p - q);
3252 return repr;
3255 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3257 if (!PyUnicode_Check(unicode)) {
3258 PyErr_BadArgument();
3259 return NULL;
3261 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3262 PyUnicode_GET_SIZE(unicode));
3265 /* --- Unicode Internal Codec ------------------------------------------- */
3267 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3268 Py_ssize_t size,
3269 const char *errors)
3271 const char *starts = s;
3272 Py_ssize_t startinpos;
3273 Py_ssize_t endinpos;
3274 Py_ssize_t outpos;
3275 PyUnicodeObject *v;
3276 Py_UNICODE *p;
3277 const char *end;
3278 const char *reason;
3279 PyObject *errorHandler = NULL;
3280 PyObject *exc = NULL;
3282 #ifdef Py_UNICODE_WIDE
3283 Py_UNICODE unimax = PyUnicode_GetMax();
3284 #endif
3286 /* XXX overflow detection missing */
3287 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3288 if (v == NULL)
3289 goto onError;
3290 if (PyUnicode_GetSize((PyObject *)v) == 0)
3291 return (PyObject *)v;
3292 p = PyUnicode_AS_UNICODE(v);
3293 end = s + size;
3295 while (s < end) {
3296 memcpy(p, s, sizeof(Py_UNICODE));
3297 /* We have to sanity check the raw data, otherwise doom looms for
3298 some malformed UCS-4 data. */
3299 if (
3300 #ifdef Py_UNICODE_WIDE
3301 *p > unimax || *p < 0 ||
3302 #endif
3303 end-s < Py_UNICODE_SIZE
3306 startinpos = s - starts;
3307 if (end-s < Py_UNICODE_SIZE) {
3308 endinpos = end-starts;
3309 reason = "truncated input";
3311 else {
3312 endinpos = s - starts + Py_UNICODE_SIZE;
3313 reason = "illegal code point (> 0x10FFFF)";
3315 outpos = p - PyUnicode_AS_UNICODE(v);
3316 if (unicode_decode_call_errorhandler(
3317 errors, &errorHandler,
3318 "unicode_internal", reason,
3319 starts, size, &startinpos, &endinpos, &exc, &s,
3320 &v, &outpos, &p)) {
3321 goto onError;
3324 else {
3325 p++;
3326 s += Py_UNICODE_SIZE;
3330 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3331 goto onError;
3332 Py_XDECREF(errorHandler);
3333 Py_XDECREF(exc);
3334 return (PyObject *)v;
3336 onError:
3337 Py_XDECREF(v);
3338 Py_XDECREF(errorHandler);
3339 Py_XDECREF(exc);
3340 return NULL;
3343 /* --- Latin-1 Codec ------------------------------------------------------ */
3345 PyObject *PyUnicode_DecodeLatin1(const char *s,
3346 Py_ssize_t size,
3347 const char *errors)
3349 PyUnicodeObject *v;
3350 Py_UNICODE *p;
3352 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3353 if (size == 1) {
3354 Py_UNICODE r = *(unsigned char*)s;
3355 return PyUnicode_FromUnicode(&r, 1);
3358 v = _PyUnicode_New(size);
3359 if (v == NULL)
3360 goto onError;
3361 if (size == 0)
3362 return (PyObject *)v;
3363 p = PyUnicode_AS_UNICODE(v);
3364 while (size-- > 0)
3365 *p++ = (unsigned char)*s++;
3366 return (PyObject *)v;
3368 onError:
3369 Py_XDECREF(v);
3370 return NULL;
3373 /* create or adjust a UnicodeEncodeError */
3374 static void make_encode_exception(PyObject **exceptionObject,
3375 const char *encoding,
3376 const Py_UNICODE *unicode, Py_ssize_t size,
3377 Py_ssize_t startpos, Py_ssize_t endpos,
3378 const char *reason)
3380 if (*exceptionObject == NULL) {
3381 *exceptionObject = PyUnicodeEncodeError_Create(
3382 encoding, unicode, size, startpos, endpos, reason);
3384 else {
3385 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3386 goto onError;
3387 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3388 goto onError;
3389 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3390 goto onError;
3391 return;
3392 onError:
3393 Py_DECREF(*exceptionObject);
3394 *exceptionObject = NULL;
3398 /* raises a UnicodeEncodeError */
3399 static void raise_encode_exception(PyObject **exceptionObject,
3400 const char *encoding,
3401 const Py_UNICODE *unicode, Py_ssize_t size,
3402 Py_ssize_t startpos, Py_ssize_t endpos,
3403 const char *reason)
3405 make_encode_exception(exceptionObject,
3406 encoding, unicode, size, startpos, endpos, reason);
3407 if (*exceptionObject != NULL)
3408 PyCodec_StrictErrors(*exceptionObject);
3411 /* error handling callback helper:
3412 build arguments, call the callback and check the arguments,
3413 put the result into newpos and return the replacement string, which
3414 has to be freed by the caller */
3415 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3416 PyObject **errorHandler,
3417 const char *encoding, const char *reason,
3418 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3419 Py_ssize_t startpos, Py_ssize_t endpos,
3420 Py_ssize_t *newpos)
3422 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3424 PyObject *restuple;
3425 PyObject *resunicode;
3427 if (*errorHandler == NULL) {
3428 *errorHandler = PyCodec_LookupError(errors);
3429 if (*errorHandler == NULL)
3430 return NULL;
3433 make_encode_exception(exceptionObject,
3434 encoding, unicode, size, startpos, endpos, reason);
3435 if (*exceptionObject == NULL)
3436 return NULL;
3438 restuple = PyObject_CallFunctionObjArgs(
3439 *errorHandler, *exceptionObject, NULL);
3440 if (restuple == NULL)
3441 return NULL;
3442 if (!PyTuple_Check(restuple)) {
3443 PyErr_Format(PyExc_TypeError, &argparse[4]);
3444 Py_DECREF(restuple);
3445 return NULL;
3447 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3448 &resunicode, newpos)) {
3449 Py_DECREF(restuple);
3450 return NULL;
3452 if (*newpos<0)
3453 *newpos = size+*newpos;
3454 if (*newpos<0 || *newpos>size) {
3455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3456 Py_DECREF(restuple);
3457 return NULL;
3459 Py_INCREF(resunicode);
3460 Py_DECREF(restuple);
3461 return resunicode;
3464 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3465 Py_ssize_t size,
3466 const char *errors,
3467 int limit)
3469 /* output object */
3470 PyObject *res;
3471 /* pointers to the beginning and end+1 of input */
3472 const Py_UNICODE *startp = p;
3473 const Py_UNICODE *endp = p + size;
3474 /* pointer to the beginning of the unencodable characters */
3475 /* const Py_UNICODE *badp = NULL; */
3476 /* pointer into the output */
3477 char *str;
3478 /* current output position */
3479 Py_ssize_t respos = 0;
3480 Py_ssize_t ressize;
3481 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3482 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3483 PyObject *errorHandler = NULL;
3484 PyObject *exc = NULL;
3485 /* the following variable is used for caching string comparisons
3486 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3487 int known_errorHandler = -1;
3489 /* allocate enough for a simple encoding without
3490 replacements, if we need more, we'll resize */
3491 res = PyString_FromStringAndSize(NULL, size);
3492 if (res == NULL)
3493 goto onError;
3494 if (size == 0)
3495 return res;
3496 str = PyString_AS_STRING(res);
3497 ressize = size;
3499 while (p<endp) {
3500 Py_UNICODE c = *p;
3502 /* can we encode this? */
3503 if (c<limit) {
3504 /* no overflow check, because we know that the space is enough */
3505 *str++ = (char)c;
3506 ++p;
3508 else {
3509 Py_ssize_t unicodepos = p-startp;
3510 Py_ssize_t requiredsize;
3511 PyObject *repunicode;
3512 Py_ssize_t repsize;
3513 Py_ssize_t newpos;
3514 Py_ssize_t respos;
3515 Py_UNICODE *uni2;
3516 /* startpos for collecting unencodable chars */
3517 const Py_UNICODE *collstart = p;
3518 const Py_UNICODE *collend = p;
3519 /* find all unecodable characters */
3520 while ((collend < endp) && ((*collend)>=limit))
3521 ++collend;
3522 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3523 if (known_errorHandler==-1) {
3524 if ((errors==NULL) || (!strcmp(errors, "strict")))
3525 known_errorHandler = 1;
3526 else if (!strcmp(errors, "replace"))
3527 known_errorHandler = 2;
3528 else if (!strcmp(errors, "ignore"))
3529 known_errorHandler = 3;
3530 else if (!strcmp(errors, "xmlcharrefreplace"))
3531 known_errorHandler = 4;
3532 else
3533 known_errorHandler = 0;
3535 switch (known_errorHandler) {
3536 case 1: /* strict */
3537 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3538 goto onError;
3539 case 2: /* replace */
3540 while (collstart++<collend)
3541 *str++ = '?'; /* fall through */
3542 case 3: /* ignore */
3543 p = collend;
3544 break;
3545 case 4: /* xmlcharrefreplace */
3546 respos = str-PyString_AS_STRING(res);
3547 /* determine replacement size (temporarily (mis)uses p) */
3548 for (p = collstart, repsize = 0; p < collend; ++p) {
3549 if (*p<10)
3550 repsize += 2+1+1;
3551 else if (*p<100)
3552 repsize += 2+2+1;
3553 else if (*p<1000)
3554 repsize += 2+3+1;
3555 else if (*p<10000)
3556 repsize += 2+4+1;
3557 #ifndef Py_UNICODE_WIDE
3558 else
3559 repsize += 2+5+1;
3560 #else
3561 else if (*p<100000)
3562 repsize += 2+5+1;
3563 else if (*p<1000000)
3564 repsize += 2+6+1;
3565 else
3566 repsize += 2+7+1;
3567 #endif
3569 requiredsize = respos+repsize+(endp-collend);
3570 if (requiredsize > ressize) {
3571 if (requiredsize<2*ressize)
3572 requiredsize = 2*ressize;
3573 if (_PyString_Resize(&res, requiredsize))
3574 goto onError;
3575 str = PyString_AS_STRING(res) + respos;
3576 ressize = requiredsize;
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p) {
3580 str += sprintf(str, "&#%d;", (int)*p);
3582 p = collend;
3583 break;
3584 default:
3585 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3586 encoding, reason, startp, size, &exc,
3587 collstart-startp, collend-startp, &newpos);
3588 if (repunicode == NULL)
3589 goto onError;
3590 /* need more space? (at least enough for what we
3591 have+the replacement+the rest of the string, so
3592 we won't have to check space for encodable characters) */
3593 respos = str-PyString_AS_STRING(res);
3594 repsize = PyUnicode_GET_SIZE(repunicode);
3595 requiredsize = respos+repsize+(endp-collend);
3596 if (requiredsize > ressize) {
3597 if (requiredsize<2*ressize)
3598 requiredsize = 2*ressize;
3599 if (_PyString_Resize(&res, requiredsize)) {
3600 Py_DECREF(repunicode);
3601 goto onError;
3603 str = PyString_AS_STRING(res) + respos;
3604 ressize = requiredsize;
3606 /* check if there is anything unencodable in the replacement
3607 and copy it to the output */
3608 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3609 c = *uni2;
3610 if (c >= limit) {
3611 raise_encode_exception(&exc, encoding, startp, size,
3612 unicodepos, unicodepos+1, reason);
3613 Py_DECREF(repunicode);
3614 goto onError;
3616 *str = (char)c;
3618 p = startp + newpos;
3619 Py_DECREF(repunicode);
3623 /* Resize if we allocated to much */
3624 respos = str-PyString_AS_STRING(res);
3625 if (respos<ressize)
3626 /* If this falls res will be NULL */
3627 _PyString_Resize(&res, respos);
3628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
3630 return res;
3632 onError:
3633 Py_XDECREF(res);
3634 Py_XDECREF(errorHandler);
3635 Py_XDECREF(exc);
3636 return NULL;
3639 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3640 Py_ssize_t size,
3641 const char *errors)
3643 return unicode_encode_ucs1(p, size, errors, 256);
3646 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3648 if (!PyUnicode_Check(unicode)) {
3649 PyErr_BadArgument();
3650 return NULL;
3652 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3653 PyUnicode_GET_SIZE(unicode),
3654 NULL);
3657 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3659 PyObject *PyUnicode_DecodeASCII(const char *s,
3660 Py_ssize_t size,
3661 const char *errors)
3663 const char *starts = s;
3664 PyUnicodeObject *v;
3665 Py_UNICODE *p;
3666 Py_ssize_t startinpos;
3667 Py_ssize_t endinpos;
3668 Py_ssize_t outpos;
3669 const char *e;
3670 PyObject *errorHandler = NULL;
3671 PyObject *exc = NULL;
3673 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3674 if (size == 1 && *(unsigned char*)s < 128) {
3675 Py_UNICODE r = *(unsigned char*)s;
3676 return PyUnicode_FromUnicode(&r, 1);
3679 v = _PyUnicode_New(size);
3680 if (v == NULL)
3681 goto onError;
3682 if (size == 0)
3683 return (PyObject *)v;
3684 p = PyUnicode_AS_UNICODE(v);
3685 e = s + size;
3686 while (s < e) {
3687 register unsigned char c = (unsigned char)*s;
3688 if (c < 128) {
3689 *p++ = c;
3690 ++s;
3692 else {
3693 startinpos = s-starts;
3694 endinpos = startinpos + 1;
3695 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3696 if (unicode_decode_call_errorhandler(
3697 errors, &errorHandler,
3698 "ascii", "ordinal not in range(128)",
3699 starts, size, &startinpos, &endinpos, &exc, &s,
3700 &v, &outpos, &p))
3701 goto onError;
3704 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3705 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3706 goto onError;
3707 Py_XDECREF(errorHandler);
3708 Py_XDECREF(exc);
3709 return (PyObject *)v;
3711 onError:
3712 Py_XDECREF(v);
3713 Py_XDECREF(errorHandler);
3714 Py_XDECREF(exc);
3715 return NULL;
3718 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3719 Py_ssize_t size,
3720 const char *errors)
3722 return unicode_encode_ucs1(p, size, errors, 128);
3725 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3727 if (!PyUnicode_Check(unicode)) {
3728 PyErr_BadArgument();
3729 return NULL;
3731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3732 PyUnicode_GET_SIZE(unicode),
3733 NULL);
3736 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3738 /* --- MBCS codecs for Windows -------------------------------------------- */
3740 #if SIZEOF_INT < SIZEOF_SSIZE_T
3741 #define NEED_RETRY
3742 #endif
3744 /* XXX This code is limited to "true" double-byte encodings, as
3745 a) it assumes an incomplete character consists of a single byte, and
3746 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3747 encodings, see IsDBCSLeadByteEx documentation. */
/* Return 1 when the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  The byte itself must satisfy IsDBCSLeadByte(); in
   addition, the character preceding it (found with CharPrev()) must be
   absent, must not start with a lead byte, or must be exactly two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return target - before == 2;
}
3761 * Decode MBCS string into unicode object. If 'final' is set, converts
3762 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3764 static int decode_mbcs(PyUnicodeObject **v,
3765 const char *s, /* MBCS string */
3766 int size, /* sizeof MBCS string */
3767 int final)
3769 Py_UNICODE *p;
3770 Py_ssize_t n = 0;
3771 int usize = 0;
3773 assert(size >= 0);
3775 /* Skip trailing lead-byte unless 'final' is set */
3776 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3777 --size;
3779 /* First get the size of the result */
3780 if (size > 0) {
3781 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3782 if (usize == 0) {
3783 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3784 return -1;
3788 if (*v == NULL) {
3789 /* Create unicode object */
3790 *v = _PyUnicode_New(usize);
3791 if (*v == NULL)
3792 return -1;
3794 else {
3795 /* Extend unicode object */
3796 n = PyUnicode_GET_SIZE(*v);
3797 if (_PyUnicode_Resize(v, n + usize) < 0)
3798 return -1;
3801 /* Do the conversion */
3802 if (size > 0) {
3803 p = PyUnicode_AS_UNICODE(*v) + n;
3804 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3805 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3806 return -1;
3810 return size;
3813 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3814 Py_ssize_t size,
3815 const char *errors,
3816 Py_ssize_t *consumed)
3818 PyUnicodeObject *v = NULL;
3819 int done;
3821 if (consumed)
3822 *consumed = 0;
3824 #ifdef NEED_RETRY
3825 retry:
3826 if (size > INT_MAX)
3827 done = decode_mbcs(&v, s, INT_MAX, 0);
3828 else
3829 #endif
3830 done = decode_mbcs(&v, s, (int)size, !consumed);
3832 if (done < 0) {
3833 Py_XDECREF(v);
3834 return NULL;
3837 if (consumed)
3838 *consumed += done;
3840 #ifdef NEED_RETRY
3841 if (size > INT_MAX) {
3842 s += done;
3843 size -= done;
3844 goto retry;
3846 #endif
3848 return (PyObject *)v;
3851 PyObject *PyUnicode_DecodeMBCS(const char *s,
3852 Py_ssize_t size,
3853 const char *errors)
3855 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3859 * Convert unicode into string object (MBCS).
3860 * Returns 0 if succeed, -1 otherwise.
3862 static int encode_mbcs(PyObject **repr,
3863 const Py_UNICODE *p, /* unicode */
3864 int size) /* size of unicode */
3866 int mbcssize = 0;
3867 Py_ssize_t n = 0;
3869 assert(size >= 0);
3871 /* First get the size of the result */
3872 if (size > 0) {
3873 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3874 if (mbcssize == 0) {
3875 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3876 return -1;
3880 if (*repr == NULL) {
3881 /* Create string object */
3882 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3883 if (*repr == NULL)
3884 return -1;
3886 else {
3887 /* Extend string object */
3888 n = PyString_Size(*repr);
3889 if (_PyString_Resize(repr, n + mbcssize) < 0)
3890 return -1;
3893 /* Do the conversion */
3894 if (size > 0) {
3895 char *s = PyString_AS_STRING(*repr) + n;
3896 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3897 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3898 return -1;
3902 return 0;
3905 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3906 Py_ssize_t size,
3907 const char *errors)
3909 PyObject *repr = NULL;
3910 int ret;
3912 #ifdef NEED_RETRY
3913 retry:
3914 if (size > INT_MAX)
3915 ret = encode_mbcs(&repr, p, INT_MAX);
3916 else
3917 #endif
3918 ret = encode_mbcs(&repr, p, (int)size);
3920 if (ret < 0) {
3921 Py_XDECREF(repr);
3922 return NULL;
3925 #ifdef NEED_RETRY
3926 if (size > INT_MAX) {
3927 p += INT_MAX;
3928 size -= INT_MAX;
3929 goto retry;
3931 #endif
3933 return repr;
3936 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3938 if (!PyUnicode_Check(unicode)) {
3939 PyErr_BadArgument();
3940 return NULL;
3942 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3943 PyUnicode_GET_SIZE(unicode),
3944 NULL);
3947 #undef NEED_RETRY
3949 #endif /* MS_WINDOWS */
3951 /* --- Character Mapping Codec -------------------------------------------- */
3953 PyObject *PyUnicode_DecodeCharmap(const char *s,
3954 Py_ssize_t size,
3955 PyObject *mapping,
3956 const char *errors)
3958 const char *starts = s;
3959 Py_ssize_t startinpos;
3960 Py_ssize_t endinpos;
3961 Py_ssize_t outpos;
3962 const char *e;
3963 PyUnicodeObject *v;
3964 Py_UNICODE *p;
3965 Py_ssize_t extrachars = 0;
3966 PyObject *errorHandler = NULL;
3967 PyObject *exc = NULL;
3968 Py_UNICODE *mapstring = NULL;
3969 Py_ssize_t maplen = 0;
3971 /* Default to Latin-1 */
3972 if (mapping == NULL)
3973 return PyUnicode_DecodeLatin1(s, size, errors);
3975 v = _PyUnicode_New(size);
3976 if (v == NULL)
3977 goto onError;
3978 if (size == 0)
3979 return (PyObject *)v;
3980 p = PyUnicode_AS_UNICODE(v);
3981 e = s + size;
3982 if (PyUnicode_CheckExact(mapping)) {
3983 mapstring = PyUnicode_AS_UNICODE(mapping);
3984 maplen = PyUnicode_GET_SIZE(mapping);
3985 while (s < e) {
3986 unsigned char ch = *s;
3987 Py_UNICODE x = 0xfffe; /* illegal value */
3989 if (ch < maplen)
3990 x = mapstring[ch];
3992 if (x == 0xfffe) {
3993 /* undefined mapping */
3994 outpos = p-PyUnicode_AS_UNICODE(v);
3995 startinpos = s-starts;
3996 endinpos = startinpos+1;
3997 if (unicode_decode_call_errorhandler(
3998 errors, &errorHandler,
3999 "charmap", "character maps to <undefined>",
4000 starts, size, &startinpos, &endinpos, &exc, &s,
4001 &v, &outpos, &p)) {
4002 goto onError;
4004 continue;
4006 *p++ = x;
4007 ++s;
4010 else {
4011 while (s < e) {
4012 unsigned char ch = *s;
4013 PyObject *w, *x;
4015 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4016 w = PyInt_FromLong((long)ch);
4017 if (w == NULL)
4018 goto onError;
4019 x = PyObject_GetItem(mapping, w);
4020 Py_DECREF(w);
4021 if (x == NULL) {
4022 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4023 /* No mapping found means: mapping is undefined. */
4024 PyErr_Clear();
4025 x = Py_None;
4026 Py_INCREF(x);
4027 } else
4028 goto onError;
4031 /* Apply mapping */
4032 if (PyInt_Check(x)) {
4033 long value = PyInt_AS_LONG(x);
4034 if (value < 0 || value > 65535) {
4035 PyErr_SetString(PyExc_TypeError,
4036 "character mapping must be in range(65536)");
4037 Py_DECREF(x);
4038 goto onError;
4040 *p++ = (Py_UNICODE)value;
4042 else if (x == Py_None) {
4043 /* undefined mapping */
4044 outpos = p-PyUnicode_AS_UNICODE(v);
4045 startinpos = s-starts;
4046 endinpos = startinpos+1;
4047 if (unicode_decode_call_errorhandler(
4048 errors, &errorHandler,
4049 "charmap", "character maps to <undefined>",
4050 starts, size, &startinpos, &endinpos, &exc, &s,
4051 &v, &outpos, &p)) {
4052 Py_DECREF(x);
4053 goto onError;
4055 Py_DECREF(x);
4056 continue;
4058 else if (PyUnicode_Check(x)) {
4059 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4061 if (targetsize == 1)
4062 /* 1-1 mapping */
4063 *p++ = *PyUnicode_AS_UNICODE(x);
4065 else if (targetsize > 1) {
4066 /* 1-n mapping */
4067 if (targetsize > extrachars) {
4068 /* resize first */
4069 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4070 Py_ssize_t needed = (targetsize - extrachars) + \
4071 (targetsize << 2);
4072 extrachars += needed;
4073 /* XXX overflow detection missing */
4074 if (_PyUnicode_Resize(&v,
4075 PyUnicode_GET_SIZE(v) + needed) < 0) {
4076 Py_DECREF(x);
4077 goto onError;
4079 p = PyUnicode_AS_UNICODE(v) + oldpos;
4081 Py_UNICODE_COPY(p,
4082 PyUnicode_AS_UNICODE(x),
4083 targetsize);
4084 p += targetsize;
4085 extrachars -= targetsize;
4087 /* 1-0 mapping: skip the character */
4089 else {
4090 /* wrong return value */
4091 PyErr_SetString(PyExc_TypeError,
4092 "character mapping must return integer, None or unicode");
4093 Py_DECREF(x);
4094 goto onError;
4096 Py_DECREF(x);
4097 ++s;
4100 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4101 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4102 goto onError;
4103 Py_XDECREF(errorHandler);
4104 Py_XDECREF(exc);
4105 return (PyObject *)v;
4107 onError:
4108 Py_XDECREF(errorHandler);
4109 Py_XDECREF(exc);
4110 Py_XDECREF(v);
4111 return NULL;
4114 /* Charmap encoding: the lookup table */
4116 struct encoding_map{
4117 PyObject_HEAD
4118 unsigned char level1[32];
4119 int count2, count3;
4120 unsigned char level23[1];
4123 static PyObject*
4124 encoding_map_size(PyObject *obj, PyObject* args)
4126 struct encoding_map *map = (struct encoding_map*)obj;
4127 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4128 128*map->count3);
4131 static PyMethodDef encoding_map_methods[] = {
4132 {"size", encoding_map_size, METH_NOARGS,
4133 PyDoc_STR("Return the size (in bytes) of this object") },
4134 { 0 }
4137 static void
4138 encoding_map_dealloc(PyObject* o)
4140 PyObject_FREE(o);
4143 static PyTypeObject EncodingMapType = {
4144 PyVarObject_HEAD_INIT(NULL, 0)
4145 "EncodingMap", /*tp_name*/
4146 sizeof(struct encoding_map), /*tp_basicsize*/
4147 0, /*tp_itemsize*/
4148 /* methods */
4149 encoding_map_dealloc, /*tp_dealloc*/
4150 0, /*tp_print*/
4151 0, /*tp_getattr*/
4152 0, /*tp_setattr*/
4153 0, /*tp_compare*/
4154 0, /*tp_repr*/
4155 0, /*tp_as_number*/
4156 0, /*tp_as_sequence*/
4157 0, /*tp_as_mapping*/
4158 0, /*tp_hash*/
4159 0, /*tp_call*/
4160 0, /*tp_str*/
4161 0, /*tp_getattro*/
4162 0, /*tp_setattro*/
4163 0, /*tp_as_buffer*/
4164 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4165 0, /*tp_doc*/
4166 0, /*tp_traverse*/
4167 0, /*tp_clear*/
4168 0, /*tp_richcompare*/
4169 0, /*tp_weaklistoffset*/
4170 0, /*tp_iter*/
4171 0, /*tp_iternext*/
4172 encoding_map_methods, /*tp_methods*/
4173 0, /*tp_members*/
4174 0, /*tp_getset*/
4175 0, /*tp_base*/
4176 0, /*tp_dict*/
4177 0, /*tp_descr_get*/
4178 0, /*tp_descr_set*/
4179 0, /*tp_dictoffset*/
4180 0, /*tp_init*/
4181 0, /*tp_alloc*/
4182 0, /*tp_new*/
4183 0, /*tp_free*/
4184 0, /*tp_is_gc*/
4187 PyObject*
4188 PyUnicode_BuildEncodingMap(PyObject* string)
4190 Py_UNICODE *decode;
4191 PyObject *result;
4192 struct encoding_map *mresult;
4193 int i;
4194 int need_dict = 0;
4195 unsigned char level1[32];
4196 unsigned char level2[512];
4197 unsigned char *mlevel1, *mlevel2, *mlevel3;
4198 int count2 = 0, count3 = 0;
4200 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4201 PyErr_BadArgument();
4202 return NULL;
4204 decode = PyUnicode_AS_UNICODE(string);
4205 memset(level1, 0xFF, sizeof level1);
4206 memset(level2, 0xFF, sizeof level2);
4208 /* If there isn't a one-to-one mapping of NULL to \0,
4209 or if there are non-BMP characters, we need to use
4210 a mapping dictionary. */
4211 if (decode[0] != 0)
4212 need_dict = 1;
4213 for (i = 1; i < 256; i++) {
4214 int l1, l2;
4215 if (decode[i] == 0
4216 #ifdef Py_UNICODE_WIDE
4217 || decode[i] > 0xFFFF
4218 #endif
4220 need_dict = 1;
4221 break;
4223 if (decode[i] == 0xFFFE)
4224 /* unmapped character */
4225 continue;
4226 l1 = decode[i] >> 11;
4227 l2 = decode[i] >> 7;
4228 if (level1[l1] == 0xFF)
4229 level1[l1] = count2++;
4230 if (level2[l2] == 0xFF)
4231 level2[l2] = count3++;
4234 if (count2 >= 0xFF || count3 >= 0xFF)
4235 need_dict = 1;
4237 if (need_dict) {
4238 PyObject *result = PyDict_New();
4239 PyObject *key, *value;
4240 if (!result)
4241 return NULL;
4242 for (i = 0; i < 256; i++) {
4243 key = value = NULL;
4244 key = PyInt_FromLong(decode[i]);
4245 value = PyInt_FromLong(i);
4246 if (!key || !value)
4247 goto failed1;
4248 if (PyDict_SetItem(result, key, value) == -1)
4249 goto failed1;
4250 Py_DECREF(key);
4251 Py_DECREF(value);
4253 return result;
4254 failed1:
4255 Py_XDECREF(key);
4256 Py_XDECREF(value);
4257 Py_DECREF(result);
4258 return NULL;
4261 /* Create a three-level trie */
4262 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4263 16*count2 + 128*count3 - 1);
4264 if (!result)
4265 return PyErr_NoMemory();
4266 PyObject_Init(result, &EncodingMapType);
4267 mresult = (struct encoding_map*)result;
4268 mresult->count2 = count2;
4269 mresult->count3 = count3;
4270 mlevel1 = mresult->level1;
4271 mlevel2 = mresult->level23;
4272 mlevel3 = mresult->level23 + 16*count2;
4273 memcpy(mlevel1, level1, 32);
4274 memset(mlevel2, 0xFF, 16*count2);
4275 memset(mlevel3, 0, 128*count3);
4276 count3 = 0;
4277 for (i = 1; i < 256; i++) {
4278 int o1, o2, o3, i2, i3;
4279 if (decode[i] == 0xFFFE)
4280 /* unmapped character */
4281 continue;
4282 o1 = decode[i]>>11;
4283 o2 = (decode[i]>>7) & 0xF;
4284 i2 = 16*mlevel1[o1] + o2;
4285 if (mlevel2[i2] == 0xFF)
4286 mlevel2[i2] = count3++;
4287 o3 = decode[i] & 0x7F;
4288 i3 = 128*mlevel2[i2] + o3;
4289 mlevel3[i3] = i;
4291 return result;
4294 static int
4295 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4297 struct encoding_map *map = (struct encoding_map*)mapping;
4298 int l1 = c>>11;
4299 int l2 = (c>>7) & 0xF;
4300 int l3 = c & 0x7F;
4301 int i;
4303 #ifdef Py_UNICODE_WIDE
4304 if (c > 0xFFFF) {
4305 return -1;
4307 #endif
4308 if (c == 0)
4309 return 0;
4310 /* level 1*/
4311 i = map->level1[l1];
4312 if (i == 0xFF) {
4313 return -1;
4315 /* level 2*/
4316 i = map->level23[16*i+l2];
4317 if (i == 0xFF) {
4318 return -1;
4320 /* level 3 */
4321 i = map->level23[16*map->count2 + 128*i + l3];
4322 if (i == 0) {
4323 return -1;
4325 return i;
4328 /* Lookup the character ch in the mapping. If the character
4329 can't be found, Py_None is returned (or NULL, if another
4330 error occurred). */
4331 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4333 PyObject *w = PyInt_FromLong((long)c);
4334 PyObject *x;
4336 if (w == NULL)
4337 return NULL;
4338 x = PyObject_GetItem(mapping, w);
4339 Py_DECREF(w);
4340 if (x == NULL) {
4341 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4342 /* No mapping found means: mapping is undefined. */
4343 PyErr_Clear();
4344 x = Py_None;
4345 Py_INCREF(x);
4346 return x;
4347 } else
4348 return NULL;
4350 else if (x == Py_None)
4351 return x;
4352 else if (PyInt_Check(x)) {
4353 long value = PyInt_AS_LONG(x);
4354 if (value < 0 || value > 255) {
4355 PyErr_SetString(PyExc_TypeError,
4356 "character mapping must be in range(256)");
4357 Py_DECREF(x);
4358 return NULL;
4360 return x;
4362 else if (PyString_Check(x))
4363 return x;
4364 else {
4365 /* wrong return value */
4366 PyErr_SetString(PyExc_TypeError,
4367 "character mapping must return integer, None or str");
4368 Py_DECREF(x);
4369 return NULL;
4373 static int
4374 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4376 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4377 /* exponentially overallocate to minimize reallocations */
4378 if (requiredsize < 2*outsize)
4379 requiredsize = 2*outsize;
4380 if (_PyString_Resize(outobj, requiredsize)) {
4381 return 0;
4383 return 1;
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
4389 /* lookup the character, put the result in the output string and adjust
4390 various state variables. Reallocate the output string if not enough
4391 space is available. Return a new reference to the object that
4392 was put in the output buffer, or Py_None, if the mapping was undefined
4393 (in which case no character was written) or NULL, if a
4394 reallocation error occurred. The caller must decref the result */
4395 static
4396 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4397 PyObject **outobj, Py_ssize_t *outpos)
4399 PyObject *rep;
4400 char *outstart;
4401 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4403 if (Py_TYPE(mapping) == &EncodingMapType) {
4404 int res = encoding_map_lookup(c, mapping);
4405 Py_ssize_t requiredsize = *outpos+1;
4406 if (res == -1)
4407 return enc_FAILED;
4408 if (outsize<requiredsize)
4409 if (!charmapencode_resize(outobj, outpos, requiredsize))
4410 return enc_EXCEPTION;
4411 outstart = PyString_AS_STRING(*outobj);
4412 outstart[(*outpos)++] = (char)res;
4413 return enc_SUCCESS;
4416 rep = charmapencode_lookup(c, mapping);
4417 if (rep==NULL)
4418 return enc_EXCEPTION;
4419 else if (rep==Py_None) {
4420 Py_DECREF(rep);
4421 return enc_FAILED;
4422 } else {
4423 if (PyInt_Check(rep)) {
4424 Py_ssize_t requiredsize = *outpos+1;
4425 if (outsize<requiredsize)
4426 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4427 Py_DECREF(rep);
4428 return enc_EXCEPTION;
4430 outstart = PyString_AS_STRING(*outobj);
4431 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4433 else {
4434 const char *repchars = PyString_AS_STRING(rep);
4435 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4436 Py_ssize_t requiredsize = *outpos+repsize;
4437 if (outsize<requiredsize)
4438 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4439 Py_DECREF(rep);
4440 return enc_EXCEPTION;
4442 outstart = PyString_AS_STRING(*outobj);
4443 memcpy(outstart + *outpos, repchars, repsize);
4444 *outpos += repsize;
4447 Py_DECREF(rep);
4448 return enc_SUCCESS;
4451 /* handle an error in PyUnicode_EncodeCharmap
4452 Return 0 on success, -1 on error */
4453 static
4454 int charmap_encoding_error(
4455 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4456 PyObject **exceptionObject,
4457 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4458 PyObject **res, Py_ssize_t *respos)
4460 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4461 Py_ssize_t repsize;
4462 Py_ssize_t newpos;
4463 Py_UNICODE *uni2;
4464 /* startpos for collecting unencodable chars */
4465 Py_ssize_t collstartpos = *inpos;
4466 Py_ssize_t collendpos = *inpos+1;
4467 Py_ssize_t collpos;
4468 char *encoding = "charmap";
4469 char *reason = "character maps to <undefined>";
4470 charmapencode_result x;
4472 /* find all unencodable characters */
4473 while (collendpos < size) {
4474 PyObject *rep;
4475 if (Py_TYPE(mapping) == &EncodingMapType) {
4476 int res = encoding_map_lookup(p[collendpos], mapping);
4477 if (res != -1)
4478 break;
4479 ++collendpos;
4480 continue;
4483 rep = charmapencode_lookup(p[collendpos], mapping);
4484 if (rep==NULL)
4485 return -1;
4486 else if (rep!=Py_None) {
4487 Py_DECREF(rep);
4488 break;
4490 Py_DECREF(rep);
4491 ++collendpos;
4493 /* cache callback name lookup
4494 * (if not done yet, i.e. it's the first error) */
4495 if (*known_errorHandler==-1) {
4496 if ((errors==NULL) || (!strcmp(errors, "strict")))
4497 *known_errorHandler = 1;
4498 else if (!strcmp(errors, "replace"))
4499 *known_errorHandler = 2;
4500 else if (!strcmp(errors, "ignore"))
4501 *known_errorHandler = 3;
4502 else if (!strcmp(errors, "xmlcharrefreplace"))
4503 *known_errorHandler = 4;
4504 else
4505 *known_errorHandler = 0;
4507 switch (*known_errorHandler) {
4508 case 1: /* strict */
4509 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4510 return -1;
4511 case 2: /* replace */
4512 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4513 x = charmapencode_output('?', mapping, res, respos);
4514 if (x==enc_EXCEPTION) {
4515 return -1;
4517 else if (x==enc_FAILED) {
4518 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4519 return -1;
4522 /* fall through */
4523 case 3: /* ignore */
4524 *inpos = collendpos;
4525 break;
4526 case 4: /* xmlcharrefreplace */
4527 /* generate replacement (temporarily (mis)uses p) */
4528 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4529 char buffer[2+29+1+1];
4530 char *cp;
4531 sprintf(buffer, "&#%d;", (int)p[collpos]);
4532 for (cp = buffer; *cp; ++cp) {
4533 x = charmapencode_output(*cp, mapping, res, respos);
4534 if (x==enc_EXCEPTION)
4535 return -1;
4536 else if (x==enc_FAILED) {
4537 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4538 return -1;
4542 *inpos = collendpos;
4543 break;
4544 default:
4545 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4546 encoding, reason, p, size, exceptionObject,
4547 collstartpos, collendpos, &newpos);
4548 if (repunicode == NULL)
4549 return -1;
4550 /* generate replacement */
4551 repsize = PyUnicode_GET_SIZE(repunicode);
4552 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4553 x = charmapencode_output(*uni2, mapping, res, respos);
4554 if (x==enc_EXCEPTION) {
4555 return -1;
4557 else if (x==enc_FAILED) {
4558 Py_DECREF(repunicode);
4559 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4560 return -1;
4563 *inpos = newpos;
4564 Py_DECREF(repunicode);
4566 return 0;
4569 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4570 Py_ssize_t size,
4571 PyObject *mapping,
4572 const char *errors)
4574 /* output object */
4575 PyObject *res = NULL;
4576 /* current input position */
4577 Py_ssize_t inpos = 0;
4578 /* current output position */
4579 Py_ssize_t respos = 0;
4580 PyObject *errorHandler = NULL;
4581 PyObject *exc = NULL;
4582 /* the following variable is used for caching string comparisons
4583 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4584 * 3=ignore, 4=xmlcharrefreplace */
4585 int known_errorHandler = -1;
4587 /* Default to Latin-1 */
4588 if (mapping == NULL)
4589 return PyUnicode_EncodeLatin1(p, size, errors);
4591 /* allocate enough for a simple encoding without
4592 replacements, if we need more, we'll resize */
4593 res = PyString_FromStringAndSize(NULL, size);
4594 if (res == NULL)
4595 goto onError;
4596 if (size == 0)
4597 return res;
4599 while (inpos<size) {
4600 /* try to encode it */
4601 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4602 if (x==enc_EXCEPTION) /* error */
4603 goto onError;
4604 if (x==enc_FAILED) { /* unencodable character */
4605 if (charmap_encoding_error(p, size, &inpos, mapping,
4606 &exc,
4607 &known_errorHandler, &errorHandler, errors,
4608 &res, &respos)) {
4609 goto onError;
4612 else
4613 /* done with this character => adjust input position */
4614 ++inpos;
4617 /* Resize if we allocated to much */
4618 if (respos<PyString_GET_SIZE(res)) {
4619 if (_PyString_Resize(&res, respos))
4620 goto onError;
4622 Py_XDECREF(exc);
4623 Py_XDECREF(errorHandler);
4624 return res;
4626 onError:
4627 Py_XDECREF(res);
4628 Py_XDECREF(exc);
4629 Py_XDECREF(errorHandler);
4630 return NULL;
4633 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4634 PyObject *mapping)
4636 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4637 PyErr_BadArgument();
4638 return NULL;
4640 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4641 PyUnicode_GET_SIZE(unicode),
4642 mapping,
4643 NULL);
4646 /* create or adjust a UnicodeTranslateError */
4647 static void make_translate_exception(PyObject **exceptionObject,
4648 const Py_UNICODE *unicode, Py_ssize_t size,
4649 Py_ssize_t startpos, Py_ssize_t endpos,
4650 const char *reason)
4652 if (*exceptionObject == NULL) {
4653 *exceptionObject = PyUnicodeTranslateError_Create(
4654 unicode, size, startpos, endpos, reason);
4656 else {
4657 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4658 goto onError;
4659 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4660 goto onError;
4661 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4662 goto onError;
4663 return;
4664 onError:
4665 Py_DECREF(*exceptionObject);
4666 *exceptionObject = NULL;
4670 /* raises a UnicodeTranslateError */
4671 static void raise_translate_exception(PyObject **exceptionObject,
4672 const Py_UNICODE *unicode, Py_ssize_t size,
4673 Py_ssize_t startpos, Py_ssize_t endpos,
4674 const char *reason)
4676 make_translate_exception(exceptionObject,
4677 unicode, size, startpos, endpos, reason);
4678 if (*exceptionObject != NULL)
4679 PyCodec_StrictErrors(*exceptionObject);
4682 /* error handling callback helper:
4683 build arguments, call the callback and check the arguments,
4684 put the result into newpos and return the replacement string, which
4685 has to be freed by the caller */
4686 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4687 PyObject **errorHandler,
4688 const char *reason,
4689 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4690 Py_ssize_t startpos, Py_ssize_t endpos,
4691 Py_ssize_t *newpos)
4693 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4695 Py_ssize_t i_newpos;
4696 PyObject *restuple;
4697 PyObject *resunicode;
4699 if (*errorHandler == NULL) {
4700 *errorHandler = PyCodec_LookupError(errors);
4701 if (*errorHandler == NULL)
4702 return NULL;
4705 make_translate_exception(exceptionObject,
4706 unicode, size, startpos, endpos, reason);
4707 if (*exceptionObject == NULL)
4708 return NULL;
4710 restuple = PyObject_CallFunctionObjArgs(
4711 *errorHandler, *exceptionObject, NULL);
4712 if (restuple == NULL)
4713 return NULL;
4714 if (!PyTuple_Check(restuple)) {
4715 PyErr_Format(PyExc_TypeError, &argparse[4]);
4716 Py_DECREF(restuple);
4717 return NULL;
4719 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4720 &resunicode, &i_newpos)) {
4721 Py_DECREF(restuple);
4722 return NULL;
4724 if (i_newpos<0)
4725 *newpos = size+i_newpos;
4726 else
4727 *newpos = i_newpos;
4728 if (*newpos<0 || *newpos>size) {
4729 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4730 Py_DECREF(restuple);
4731 return NULL;
4733 Py_INCREF(resunicode);
4734 Py_DECREF(restuple);
4735 return resunicode;
4738 /* Lookup the character ch in the mapping and put the result in result,
4739 which must be decrefed by the caller.
4740 Return 0 on success, -1 on error */
4741 static
4742 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4744 PyObject *w = PyInt_FromLong((long)c);
4745 PyObject *x;
4747 if (w == NULL)
4748 return -1;
4749 x = PyObject_GetItem(mapping, w);
4750 Py_DECREF(w);
4751 if (x == NULL) {
4752 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4753 /* No mapping found means: use 1:1 mapping. */
4754 PyErr_Clear();
4755 *result = NULL;
4756 return 0;
4757 } else
4758 return -1;
4760 else if (x == Py_None) {
4761 *result = x;
4762 return 0;
4764 else if (PyInt_Check(x)) {
4765 long value = PyInt_AS_LONG(x);
4766 long max = PyUnicode_GetMax();
4767 if (value < 0 || value > max) {
4768 PyErr_Format(PyExc_TypeError,
4769 "character mapping must be in range(0x%lx)", max+1);
4770 Py_DECREF(x);
4771 return -1;
4773 *result = x;
4774 return 0;
4776 else if (PyUnicode_Check(x)) {
4777 *result = x;
4778 return 0;
4780 else {
4781 /* wrong return value */
4782 PyErr_SetString(PyExc_TypeError,
4783 "character mapping must return integer, None or unicode");
4784 Py_DECREF(x);
4785 return -1;
4788 /* ensure that *outobj is at least requiredsize characters long,
4789 if not reallocate and adjust various state variables.
4790 Return 0 on success, -1 on error */
4791 static
4792 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4793 Py_ssize_t requiredsize)
4795 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4796 if (requiredsize > oldsize) {
4797 /* remember old output position */
4798 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4799 /* exponentially overallocate to minimize reallocations */
4800 if (requiredsize < 2 * oldsize)
4801 requiredsize = 2 * oldsize;
4802 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4803 return -1;
4804 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4806 return 0;
4808 /* lookup the character, put the result in the output string and adjust
4809 various state variables. Return a new reference to the object that
4810 was put in the output buffer in *result, or Py_None, if the mapping was
4811 undefined (in which case no character was written).
4812 The called must decref result.
4813 Return 0 on success, -1 on error. */
4814 static
4815 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4816 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4817 PyObject **res)
4819 if (charmaptranslate_lookup(*curinp, mapping, res))
4820 return -1;
4821 if (*res==NULL) {
4822 /* not found => default to 1:1 mapping */
4823 *(*outp)++ = *curinp;
4825 else if (*res==Py_None)
4827 else if (PyInt_Check(*res)) {
4828 /* no overflow check, because we know that the space is enough */
4829 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4831 else if (PyUnicode_Check(*res)) {
4832 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4833 if (repsize==1) {
4834 /* no overflow check, because we know that the space is enough */
4835 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4837 else if (repsize!=0) {
4838 /* more than one character */
4839 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4840 (insize - (curinp-startinp)) +
4841 repsize - 1;
4842 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4843 return -1;
4844 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4845 *outp += repsize;
4848 else
4849 return -1;
4850 return 0;
4853 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4854 Py_ssize_t size,
4855 PyObject *mapping,
4856 const char *errors)
4858 /* output object */
4859 PyObject *res = NULL;
4860 /* pointers to the beginning and end+1 of input */
4861 const Py_UNICODE *startp = p;
4862 const Py_UNICODE *endp = p + size;
4863 /* pointer into the output */
4864 Py_UNICODE *str;
4865 /* current output position */
4866 Py_ssize_t respos = 0;
4867 char *reason = "character maps to <undefined>";
4868 PyObject *errorHandler = NULL;
4869 PyObject *exc = NULL;
4870 /* the following variable is used for caching string comparisons
4871 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4872 * 3=ignore, 4=xmlcharrefreplace */
4873 int known_errorHandler = -1;
4875 if (mapping == NULL) {
4876 PyErr_BadArgument();
4877 return NULL;
4880 /* allocate enough for a simple 1:1 translation without
4881 replacements, if we need more, we'll resize */
4882 res = PyUnicode_FromUnicode(NULL, size);
4883 if (res == NULL)
4884 goto onError;
4885 if (size == 0)
4886 return res;
4887 str = PyUnicode_AS_UNICODE(res);
4889 while (p<endp) {
4890 /* try to encode it */
4891 PyObject *x = NULL;
4892 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4893 Py_XDECREF(x);
4894 goto onError;
4896 Py_XDECREF(x);
4897 if (x!=Py_None) /* it worked => adjust input pointer */
4898 ++p;
4899 else { /* untranslatable character */
4900 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4901 Py_ssize_t repsize;
4902 Py_ssize_t newpos;
4903 Py_UNICODE *uni2;
4904 /* startpos for collecting untranslatable chars */
4905 const Py_UNICODE *collstart = p;
4906 const Py_UNICODE *collend = p+1;
4907 const Py_UNICODE *coll;
4909 /* find all untranslatable characters */
4910 while (collend < endp) {
4911 if (charmaptranslate_lookup(*collend, mapping, &x))
4912 goto onError;
4913 Py_XDECREF(x);
4914 if (x!=Py_None)
4915 break;
4916 ++collend;
4918 /* cache callback name lookup
4919 * (if not done yet, i.e. it's the first error) */
4920 if (known_errorHandler==-1) {
4921 if ((errors==NULL) || (!strcmp(errors, "strict")))
4922 known_errorHandler = 1;
4923 else if (!strcmp(errors, "replace"))
4924 known_errorHandler = 2;
4925 else if (!strcmp(errors, "ignore"))
4926 known_errorHandler = 3;
4927 else if (!strcmp(errors, "xmlcharrefreplace"))
4928 known_errorHandler = 4;
4929 else
4930 known_errorHandler = 0;
4932 switch (known_errorHandler) {
4933 case 1: /* strict */
4934 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4935 goto onError;
4936 case 2: /* replace */
4937 /* No need to check for space, this is a 1:1 replacement */
4938 for (coll = collstart; coll<collend; ++coll)
4939 *str++ = '?';
4940 /* fall through */
4941 case 3: /* ignore */
4942 p = collend;
4943 break;
4944 case 4: /* xmlcharrefreplace */
4945 /* generate replacement (temporarily (mis)uses p) */
4946 for (p = collstart; p < collend; ++p) {
4947 char buffer[2+29+1+1];
4948 char *cp;
4949 sprintf(buffer, "&#%d;", (int)*p);
4950 if (charmaptranslate_makespace(&res, &str,
4951 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4952 goto onError;
4953 for (cp = buffer; *cp; ++cp)
4954 *str++ = *cp;
4956 p = collend;
4957 break;
4958 default:
4959 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4960 reason, startp, size, &exc,
4961 collstart-startp, collend-startp, &newpos);
4962 if (repunicode == NULL)
4963 goto onError;
4964 /* generate replacement */
4965 repsize = PyUnicode_GET_SIZE(repunicode);
4966 if (charmaptranslate_makespace(&res, &str,
4967 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4968 Py_DECREF(repunicode);
4969 goto onError;
4971 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4972 *str++ = *uni2;
4973 p = startp + newpos;
4974 Py_DECREF(repunicode);
4978 /* Resize if we allocated to much */
4979 respos = str-PyUnicode_AS_UNICODE(res);
4980 if (respos<PyUnicode_GET_SIZE(res)) {
4981 if (PyUnicode_Resize(&res, respos) < 0)
4982 goto onError;
4984 Py_XDECREF(exc);
4985 Py_XDECREF(errorHandler);
4986 return res;
4988 onError:
4989 Py_XDECREF(res);
4990 Py_XDECREF(exc);
4991 Py_XDECREF(errorHandler);
4992 return NULL;
4995 PyObject *PyUnicode_Translate(PyObject *str,
4996 PyObject *mapping,
4997 const char *errors)
4999 PyObject *result;
5001 str = PyUnicode_FromObject(str);
5002 if (str == NULL)
5003 goto onError;
5004 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5005 PyUnicode_GET_SIZE(str),
5006 mapping,
5007 errors);
5008 Py_DECREF(str);
5009 return result;
5011 onError:
5012 Py_XDECREF(str);
5013 return NULL;
5016 /* --- Decimal Encoder ---------------------------------------------------- */
5018 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5019 Py_ssize_t length,
5020 char *output,
5021 const char *errors)
5023 Py_UNICODE *p, *end;
5024 PyObject *errorHandler = NULL;
5025 PyObject *exc = NULL;
5026 const char *encoding = "decimal";
5027 const char *reason = "invalid decimal Unicode string";
5028 /* the following variable is used for caching string comparisons
5029 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5030 int known_errorHandler = -1;
5032 if (output == NULL) {
5033 PyErr_BadArgument();
5034 return -1;
5037 p = s;
5038 end = s + length;
5039 while (p < end) {
5040 register Py_UNICODE ch = *p;
5041 int decimal;
5042 PyObject *repunicode;
5043 Py_ssize_t repsize;
5044 Py_ssize_t newpos;
5045 Py_UNICODE *uni2;
5046 Py_UNICODE *collstart;
5047 Py_UNICODE *collend;
5049 if (Py_UNICODE_ISSPACE(ch)) {
5050 *output++ = ' ';
5051 ++p;
5052 continue;
5054 decimal = Py_UNICODE_TODECIMAL(ch);
5055 if (decimal >= 0) {
5056 *output++ = '0' + decimal;
5057 ++p;
5058 continue;
5060 if (0 < ch && ch < 256) {
5061 *output++ = (char)ch;
5062 ++p;
5063 continue;
5065 /* All other characters are considered unencodable */
5066 collstart = p;
5067 collend = p+1;
5068 while (collend < end) {
5069 if ((0 < *collend && *collend < 256) ||
5070 !Py_UNICODE_ISSPACE(*collend) ||
5071 Py_UNICODE_TODECIMAL(*collend))
5072 break;
5074 /* cache callback name lookup
5075 * (if not done yet, i.e. it's the first error) */
5076 if (known_errorHandler==-1) {
5077 if ((errors==NULL) || (!strcmp(errors, "strict")))
5078 known_errorHandler = 1;
5079 else if (!strcmp(errors, "replace"))
5080 known_errorHandler = 2;
5081 else if (!strcmp(errors, "ignore"))
5082 known_errorHandler = 3;
5083 else if (!strcmp(errors, "xmlcharrefreplace"))
5084 known_errorHandler = 4;
5085 else
5086 known_errorHandler = 0;
5088 switch (known_errorHandler) {
5089 case 1: /* strict */
5090 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5091 goto onError;
5092 case 2: /* replace */
5093 for (p = collstart; p < collend; ++p)
5094 *output++ = '?';
5095 /* fall through */
5096 case 3: /* ignore */
5097 p = collend;
5098 break;
5099 case 4: /* xmlcharrefreplace */
5100 /* generate replacement (temporarily (mis)uses p) */
5101 for (p = collstart; p < collend; ++p)
5102 output += sprintf(output, "&#%d;", (int)*p);
5103 p = collend;
5104 break;
5105 default:
5106 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5107 encoding, reason, s, length, &exc,
5108 collstart-s, collend-s, &newpos);
5109 if (repunicode == NULL)
5110 goto onError;
5111 /* generate replacement */
5112 repsize = PyUnicode_GET_SIZE(repunicode);
5113 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5114 Py_UNICODE ch = *uni2;
5115 if (Py_UNICODE_ISSPACE(ch))
5116 *output++ = ' ';
5117 else {
5118 decimal = Py_UNICODE_TODECIMAL(ch);
5119 if (decimal >= 0)
5120 *output++ = '0' + decimal;
5121 else if (0 < ch && ch < 256)
5122 *output++ = (char)ch;
5123 else {
5124 Py_DECREF(repunicode);
5125 raise_encode_exception(&exc, encoding,
5126 s, length, collstart-s, collend-s, reason);
5127 goto onError;
5131 p = s + newpos;
5132 Py_DECREF(repunicode);
5135 /* 0-terminate the output string */
5136 *output++ = '\0';
5137 Py_XDECREF(exc);
5138 Py_XDECREF(errorHandler);
5139 return 0;
5141 onError:
5142 Py_XDECREF(exc);
5143 Py_XDECREF(errorHandler);
5144 return -1;
5147 /* --- Helpers ------------------------------------------------------------ */
5149 #include "stringlib/unicodedefs.h"
5151 #define FROM_UNICODE
5153 #include "stringlib/fastsearch.h"
5155 #include "stringlib/count.h"
5156 #include "stringlib/find.h"
5157 #include "stringlib/partition.h"
/* Helper macro to fix up start/end slice values.

   Operates on the caller's local variables `start` and `end`: a
   negative value is made relative to (obj)->length and then clamped to
   0, and `end` is clamped to (obj)->length.  Wrapped in
   do { } while (0) so that it expands to a single statement and can be
   used safely under an unbraced if/else; invoke it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5172 Py_ssize_t PyUnicode_Count(PyObject *str,
5173 PyObject *substr,
5174 Py_ssize_t start,
5175 Py_ssize_t end)
5177 Py_ssize_t result;
5178 PyUnicodeObject* str_obj;
5179 PyUnicodeObject* sub_obj;
5181 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5182 if (!str_obj)
5183 return -1;
5184 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5185 if (!sub_obj) {
5186 Py_DECREF(str_obj);
5187 return -1;
5190 FIX_START_END(str_obj);
5192 result = stringlib_count(
5193 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5196 Py_DECREF(sub_obj);
5197 Py_DECREF(str_obj);
5199 return result;
5202 Py_ssize_t PyUnicode_Find(PyObject *str,
5203 PyObject *sub,
5204 Py_ssize_t start,
5205 Py_ssize_t end,
5206 int direction)
5208 Py_ssize_t result;
5210 str = PyUnicode_FromObject(str);
5211 if (!str)
5212 return -2;
5213 sub = PyUnicode_FromObject(sub);
5214 if (!sub) {
5215 Py_DECREF(str);
5216 return -2;
5219 if (direction > 0)
5220 result = stringlib_find_slice(
5221 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5222 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5223 start, end
5225 else
5226 result = stringlib_rfind_slice(
5227 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5228 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5229 start, end
5232 Py_DECREF(str);
5233 Py_DECREF(sub);
5235 return result;
5238 static
5239 int tailmatch(PyUnicodeObject *self,
5240 PyUnicodeObject *substring,
5241 Py_ssize_t start,
5242 Py_ssize_t end,
5243 int direction)
5245 if (substring->length == 0)
5246 return 1;
5248 FIX_START_END(self);
5250 end -= substring->length;
5251 if (end < start)
5252 return 0;
5254 if (direction > 0) {
5255 if (Py_UNICODE_MATCH(self, end, substring))
5256 return 1;
5257 } else {
5258 if (Py_UNICODE_MATCH(self, start, substring))
5259 return 1;
5262 return 0;
5265 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5266 PyObject *substr,
5267 Py_ssize_t start,
5268 Py_ssize_t end,
5269 int direction)
5271 Py_ssize_t result;
5273 str = PyUnicode_FromObject(str);
5274 if (str == NULL)
5275 return -1;
5276 substr = PyUnicode_FromObject(substr);
5277 if (substr == NULL) {
5278 Py_DECREF(str);
5279 return -1;
5282 result = tailmatch((PyUnicodeObject *)str,
5283 (PyUnicodeObject *)substr,
5284 start, end, direction);
5285 Py_DECREF(str);
5286 Py_DECREF(substr);
5287 return result;
5290 /* Apply fixfct filter to the Unicode object self and return a
5291 reference to the modified object */
5293 static
5294 PyObject *fixup(PyUnicodeObject *self,
5295 int (*fixfct)(PyUnicodeObject *s))
5298 PyUnicodeObject *u;
5300 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5301 if (u == NULL)
5302 return NULL;
5304 Py_UNICODE_COPY(u->str, self->str, self->length);
5306 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5307 /* fixfct should return TRUE if it modified the buffer. If
5308 FALSE, return a reference to the original buffer instead
5309 (to save space, not time) */
5310 Py_INCREF(self);
5311 Py_DECREF(u);
5312 return (PyObject*) self;
5314 return (PyObject*) u;
5317 static
5318 int fixupper(PyUnicodeObject *self)
5320 Py_ssize_t len = self->length;
5321 Py_UNICODE *s = self->str;
5322 int status = 0;
5324 while (len-- > 0) {
5325 register Py_UNICODE ch;
5327 ch = Py_UNICODE_TOUPPER(*s);
5328 if (ch != *s) {
5329 status = 1;
5330 *s = ch;
5332 s++;
5335 return status;
5338 static
5339 int fixlower(PyUnicodeObject *self)
5341 Py_ssize_t len = self->length;
5342 Py_UNICODE *s = self->str;
5343 int status = 0;
5345 while (len-- > 0) {
5346 register Py_UNICODE ch;
5348 ch = Py_UNICODE_TOLOWER(*s);
5349 if (ch != *s) {
5350 status = 1;
5351 *s = ch;
5353 s++;
5356 return status;
5359 static
5360 int fixswapcase(PyUnicodeObject *self)
5362 Py_ssize_t len = self->length;
5363 Py_UNICODE *s = self->str;
5364 int status = 0;
5366 while (len-- > 0) {
5367 if (Py_UNICODE_ISUPPER(*s)) {
5368 *s = Py_UNICODE_TOLOWER(*s);
5369 status = 1;
5370 } else if (Py_UNICODE_ISLOWER(*s)) {
5371 *s = Py_UNICODE_TOUPPER(*s);
5372 status = 1;
5374 s++;
5377 return status;
5380 static
5381 int fixcapitalize(PyUnicodeObject *self)
5383 Py_ssize_t len = self->length;
5384 Py_UNICODE *s = self->str;
5385 int status = 0;
5387 if (len == 0)
5388 return 0;
5389 if (Py_UNICODE_ISLOWER(*s)) {
5390 *s = Py_UNICODE_TOUPPER(*s);
5391 status = 1;
5393 s++;
5394 while (--len > 0) {
5395 if (Py_UNICODE_ISUPPER(*s)) {
5396 *s = Py_UNICODE_TOLOWER(*s);
5397 status = 1;
5399 s++;
5401 return status;
5404 static
5405 int fixtitle(PyUnicodeObject *self)
5407 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5408 register Py_UNICODE *e;
5409 int previous_is_cased;
5411 /* Shortcut for single character strings */
5412 if (PyUnicode_GET_SIZE(self) == 1) {
5413 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5414 if (*p != ch) {
5415 *p = ch;
5416 return 1;
5418 else
5419 return 0;
5422 e = p + PyUnicode_GET_SIZE(self);
5423 previous_is_cased = 0;
5424 for (; p < e; p++) {
5425 register const Py_UNICODE ch = *p;
5427 if (previous_is_cased)
5428 *p = Py_UNICODE_TOLOWER(ch);
5429 else
5430 *p = Py_UNICODE_TOTITLE(ch);
5432 if (Py_UNICODE_ISLOWER(ch) ||
5433 Py_UNICODE_ISUPPER(ch) ||
5434 Py_UNICODE_ISTITLE(ch))
5435 previous_is_cased = 1;
5436 else
5437 previous_is_cased = 0;
5439 return 1;
5442 PyObject *
5443 PyUnicode_Join(PyObject *separator, PyObject *seq)
5445 PyObject *internal_separator = NULL;
5446 const Py_UNICODE blank = ' ';
5447 const Py_UNICODE *sep = &blank;
5448 Py_ssize_t seplen = 1;
5449 PyUnicodeObject *res = NULL; /* the result */
5450 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5451 Py_ssize_t res_used; /* # used bytes */
5452 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5453 PyObject *fseq; /* PySequence_Fast(seq) */
5454 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5455 PyObject *item;
5456 Py_ssize_t i;
5458 fseq = PySequence_Fast(seq, "");
5459 if (fseq == NULL) {
5460 return NULL;
5463 /* Grrrr. A codec may be invoked to convert str objects to
5464 * Unicode, and so it's possible to call back into Python code
5465 * during PyUnicode_FromObject(), and so it's possible for a sick
5466 * codec to change the size of fseq (if seq is a list). Therefore
5467 * we have to keep refetching the size -- can't assume seqlen
5468 * is invariant.
5470 seqlen = PySequence_Fast_GET_SIZE(fseq);
5471 /* If empty sequence, return u"". */
5472 if (seqlen == 0) {
5473 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5474 goto Done;
5476 /* If singleton sequence with an exact Unicode, return that. */
5477 if (seqlen == 1) {
5478 item = PySequence_Fast_GET_ITEM(fseq, 0);
5479 if (PyUnicode_CheckExact(item)) {
5480 Py_INCREF(item);
5481 res = (PyUnicodeObject *)item;
5482 goto Done;
5486 /* At least two items to join, or one that isn't exact Unicode. */
5487 if (seqlen > 1) {
5488 /* Set up sep and seplen -- they're needed. */
5489 if (separator == NULL) {
5490 sep = &blank;
5491 seplen = 1;
5493 else {
5494 internal_separator = PyUnicode_FromObject(separator);
5495 if (internal_separator == NULL)
5496 goto onError;
5497 sep = PyUnicode_AS_UNICODE(internal_separator);
5498 seplen = PyUnicode_GET_SIZE(internal_separator);
5499 /* In case PyUnicode_FromObject() mutated seq. */
5500 seqlen = PySequence_Fast_GET_SIZE(fseq);
5504 /* Get space. */
5505 res = _PyUnicode_New(res_alloc);
5506 if (res == NULL)
5507 goto onError;
5508 res_p = PyUnicode_AS_UNICODE(res);
5509 res_used = 0;
5511 for (i = 0; i < seqlen; ++i) {
5512 Py_ssize_t itemlen;
5513 Py_ssize_t new_res_used;
5515 item = PySequence_Fast_GET_ITEM(fseq, i);
5516 /* Convert item to Unicode. */
5517 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5518 PyErr_Format(PyExc_TypeError,
5519 "sequence item %zd: expected string or Unicode,"
5520 " %.80s found",
5521 i, Py_TYPE(item)->tp_name);
5522 goto onError;
5524 item = PyUnicode_FromObject(item);
5525 if (item == NULL)
5526 goto onError;
5527 /* We own a reference to item from here on. */
5529 /* In case PyUnicode_FromObject() mutated seq. */
5530 seqlen = PySequence_Fast_GET_SIZE(fseq);
5532 /* Make sure we have enough space for the separator and the item. */
5533 itemlen = PyUnicode_GET_SIZE(item);
5534 new_res_used = res_used + itemlen;
5535 if (new_res_used < 0)
5536 goto Overflow;
5537 if (i < seqlen - 1) {
5538 new_res_used += seplen;
5539 if (new_res_used < 0)
5540 goto Overflow;
5542 if (new_res_used > res_alloc) {
5543 /* double allocated size until it's big enough */
5544 do {
5545 res_alloc += res_alloc;
5546 if (res_alloc <= 0)
5547 goto Overflow;
5548 } while (new_res_used > res_alloc);
5549 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5550 Py_DECREF(item);
5551 goto onError;
5553 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5556 /* Copy item, and maybe the separator. */
5557 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5558 res_p += itemlen;
5559 if (i < seqlen - 1) {
5560 Py_UNICODE_COPY(res_p, sep, seplen);
5561 res_p += seplen;
5563 Py_DECREF(item);
5564 res_used = new_res_used;
5567 /* Shrink res to match the used area; this probably can't fail,
5568 * but it's cheap to check.
5570 if (_PyUnicode_Resize(&res, res_used) < 0)
5571 goto onError;
5573 Done:
5574 Py_XDECREF(internal_separator);
5575 Py_DECREF(fseq);
5576 return (PyObject *)res;
5578 Overflow:
5579 PyErr_SetString(PyExc_OverflowError,
5580 "join() result is too long for a Python string");
5581 Py_DECREF(item);
5582 /* fall through */
5584 onError:
5585 Py_XDECREF(internal_separator);
5586 Py_DECREF(fseq);
5587 Py_XDECREF(res);
5588 return NULL;
5591 static
5592 PyUnicodeObject *pad(PyUnicodeObject *self,
5593 Py_ssize_t left,
5594 Py_ssize_t right,
5595 Py_UNICODE fill)
5597 PyUnicodeObject *u;
5599 if (left < 0)
5600 left = 0;
5601 if (right < 0)
5602 right = 0;
5604 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5605 Py_INCREF(self);
5606 return self;
5609 if (left > PY_SSIZE_T_MAX - self->length ||
5610 right > PY_SSIZE_T_MAX - (left + self->length)) {
5611 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5612 return NULL;
5614 u = _PyUnicode_New(left + self->length + right);
5615 if (u) {
5616 if (left)
5617 Py_UNICODE_FILL(u->str, fill, left);
5618 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5619 if (right)
5620 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5623 return u;
/* Append data[left:right] to the list being built.

   Relies on names in the expanding function: `str` (scratch
   PyObject *), `list` (the result list) and the label `onError`, which
   is jumped to when creating or appending the slice fails.  Wrapped in
   do { } while (0) so that it expands to exactly one statement instead
   of ending in a dangling if/else; invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5637 static
5638 PyObject *split_whitespace(PyUnicodeObject *self,
5639 PyObject *list,
5640 Py_ssize_t maxcount)
5642 register Py_ssize_t i;
5643 register Py_ssize_t j;
5644 Py_ssize_t len = self->length;
5645 PyObject *str;
5646 register const Py_UNICODE *buf = self->str;
5648 for (i = j = 0; i < len; ) {
5649 /* find a token */
5650 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5651 i++;
5652 j = i;
5653 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5654 i++;
5655 if (j < i) {
5656 if (maxcount-- <= 0)
5657 break;
5658 SPLIT_APPEND(buf, j, i);
5659 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5660 i++;
5661 j = i;
5664 if (j < len) {
5665 SPLIT_APPEND(buf, j, len);
5667 return list;
5669 onError:
5670 Py_DECREF(list);
5671 return NULL;
5674 PyObject *PyUnicode_Splitlines(PyObject *string,
5675 int keepends)
5677 register Py_ssize_t i;
5678 register Py_ssize_t j;
5679 Py_ssize_t len;
5680 PyObject *list;
5681 PyObject *str;
5682 Py_UNICODE *data;
5684 string = PyUnicode_FromObject(string);
5685 if (string == NULL)
5686 return NULL;
5687 data = PyUnicode_AS_UNICODE(string);
5688 len = PyUnicode_GET_SIZE(string);
5690 list = PyList_New(0);
5691 if (!list)
5692 goto onError;
5694 for (i = j = 0; i < len; ) {
5695 Py_ssize_t eol;
5697 /* Find a line and append it */
5698 while (i < len && !BLOOM_LINEBREAK(data[i]))
5699 i++;
5701 /* Skip the line break reading CRLF as one line break */
5702 eol = i;
5703 if (i < len) {
5704 if (data[i] == '\r' && i + 1 < len &&
5705 data[i+1] == '\n')
5706 i += 2;
5707 else
5708 i++;
5709 if (keepends)
5710 eol = i;
5712 SPLIT_APPEND(data, j, eol);
5713 j = i;
5715 if (j < len) {
5716 SPLIT_APPEND(data, j, len);
5719 Py_DECREF(string);
5720 return list;
5722 onError:
5723 Py_XDECREF(list);
5724 Py_DECREF(string);
5725 return NULL;
5728 static
5729 PyObject *split_char(PyUnicodeObject *self,
5730 PyObject *list,
5731 Py_UNICODE ch,
5732 Py_ssize_t maxcount)
5734 register Py_ssize_t i;
5735 register Py_ssize_t j;
5736 Py_ssize_t len = self->length;
5737 PyObject *str;
5738 register const Py_UNICODE *buf = self->str;
5740 for (i = j = 0; i < len; ) {
5741 if (buf[i] == ch) {
5742 if (maxcount-- <= 0)
5743 break;
5744 SPLIT_APPEND(buf, j, i);
5745 i = j = i + 1;
5746 } else
5747 i++;
5749 if (j <= len) {
5750 SPLIT_APPEND(buf, j, len);
5752 return list;
5754 onError:
5755 Py_DECREF(list);
5756 return NULL;
5759 static
5760 PyObject *split_substring(PyUnicodeObject *self,
5761 PyObject *list,
5762 PyUnicodeObject *substring,
5763 Py_ssize_t maxcount)
5765 register Py_ssize_t i;
5766 register Py_ssize_t j;
5767 Py_ssize_t len = self->length;
5768 Py_ssize_t sublen = substring->length;
5769 PyObject *str;
5771 for (i = j = 0; i <= len - sublen; ) {
5772 if (Py_UNICODE_MATCH(self, i, substring)) {
5773 if (maxcount-- <= 0)
5774 break;
5775 SPLIT_APPEND(self->str, j, i);
5776 i = j = i + sublen;
5777 } else
5778 i++;
5780 if (j <= len) {
5781 SPLIT_APPEND(self->str, j, len);
5783 return list;
5785 onError:
5786 Py_DECREF(list);
5787 return NULL;
5790 static
5791 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5792 PyObject *list,
5793 Py_ssize_t maxcount)
5795 register Py_ssize_t i;
5796 register Py_ssize_t j;
5797 Py_ssize_t len = self->length;
5798 PyObject *str;
5799 register const Py_UNICODE *buf = self->str;
5801 for (i = j = len - 1; i >= 0; ) {
5802 /* find a token */
5803 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5804 i--;
5805 j = i;
5806 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5807 i--;
5808 if (j > i) {
5809 if (maxcount-- <= 0)
5810 break;
5811 SPLIT_APPEND(buf, i + 1, j + 1);
5812 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5813 i--;
5814 j = i;
5817 if (j >= 0) {
5818 SPLIT_APPEND(buf, 0, j + 1);
5820 if (PyList_Reverse(list) < 0)
5821 goto onError;
5822 return list;
5824 onError:
5825 Py_DECREF(list);
5826 return NULL;
5829 static
5830 PyObject *rsplit_char(PyUnicodeObject *self,
5831 PyObject *list,
5832 Py_UNICODE ch,
5833 Py_ssize_t maxcount)
5835 register Py_ssize_t i;
5836 register Py_ssize_t j;
5837 Py_ssize_t len = self->length;
5838 PyObject *str;
5839 register const Py_UNICODE *buf = self->str;
5841 for (i = j = len - 1; i >= 0; ) {
5842 if (buf[i] == ch) {
5843 if (maxcount-- <= 0)
5844 break;
5845 SPLIT_APPEND(buf, i + 1, j + 1);
5846 j = i = i - 1;
5847 } else
5848 i--;
5850 if (j >= -1) {
5851 SPLIT_APPEND(buf, 0, j + 1);
5853 if (PyList_Reverse(list) < 0)
5854 goto onError;
5855 return list;
5857 onError:
5858 Py_DECREF(list);
5859 return NULL;
5862 static
5863 PyObject *rsplit_substring(PyUnicodeObject *self,
5864 PyObject *list,
5865 PyUnicodeObject *substring,
5866 Py_ssize_t maxcount)
5868 register Py_ssize_t i;
5869 register Py_ssize_t j;
5870 Py_ssize_t len = self->length;
5871 Py_ssize_t sublen = substring->length;
5872 PyObject *str;
5874 for (i = len - sublen, j = len; i >= 0; ) {
5875 if (Py_UNICODE_MATCH(self, i, substring)) {
5876 if (maxcount-- <= 0)
5877 break;
5878 SPLIT_APPEND(self->str, i + sublen, j);
5879 j = i;
5880 i -= sublen;
5881 } else
5882 i--;
5884 if (j >= 0) {
5885 SPLIT_APPEND(self->str, 0, j);
5887 if (PyList_Reverse(list) < 0)
5888 goto onError;
5889 return list;
5891 onError:
5892 Py_DECREF(list);
5893 return NULL;
5896 #undef SPLIT_APPEND
5898 static
5899 PyObject *split(PyUnicodeObject *self,
5900 PyUnicodeObject *substring,
5901 Py_ssize_t maxcount)
5903 PyObject *list;
5905 if (maxcount < 0)
5906 maxcount = PY_SSIZE_T_MAX;
5908 list = PyList_New(0);
5909 if (!list)
5910 return NULL;
5912 if (substring == NULL)
5913 return split_whitespace(self,list,maxcount);
5915 else if (substring->length == 1)
5916 return split_char(self,list,substring->str[0],maxcount);
5918 else if (substring->length == 0) {
5919 Py_DECREF(list);
5920 PyErr_SetString(PyExc_ValueError, "empty separator");
5921 return NULL;
5923 else
5924 return split_substring(self,list,substring,maxcount);
5927 static
5928 PyObject *rsplit(PyUnicodeObject *self,
5929 PyUnicodeObject *substring,
5930 Py_ssize_t maxcount)
5932 PyObject *list;
5934 if (maxcount < 0)
5935 maxcount = PY_SSIZE_T_MAX;
5937 list = PyList_New(0);
5938 if (!list)
5939 return NULL;
5941 if (substring == NULL)
5942 return rsplit_whitespace(self,list,maxcount);
5944 else if (substring->length == 1)
5945 return rsplit_char(self,list,substring->str[0],maxcount);
5947 else if (substring->length == 0) {
5948 Py_DECREF(list);
5949 PyErr_SetString(PyExc_ValueError, "empty separator");
5950 return NULL;
5952 else
5953 return rsplit_substring(self,list,substring,maxcount);
5956 static
5957 PyObject *replace(PyUnicodeObject *self,
5958 PyUnicodeObject *str1,
5959 PyUnicodeObject *str2,
5960 Py_ssize_t maxcount)
5962 PyUnicodeObject *u;
5964 if (maxcount < 0)
5965 maxcount = PY_SSIZE_T_MAX;
5967 if (str1->length == str2->length) {
5968 /* same length */
5969 Py_ssize_t i;
5970 if (str1->length == 1) {
5971 /* replace characters */
5972 Py_UNICODE u1, u2;
5973 if (!findchar(self->str, self->length, str1->str[0]))
5974 goto nothing;
5975 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5976 if (!u)
5977 return NULL;
5978 Py_UNICODE_COPY(u->str, self->str, self->length);
5979 u1 = str1->str[0];
5980 u2 = str2->str[0];
5981 for (i = 0; i < u->length; i++)
5982 if (u->str[i] == u1) {
5983 if (--maxcount < 0)
5984 break;
5985 u->str[i] = u2;
5987 } else {
5988 i = fastsearch(
5989 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5991 if (i < 0)
5992 goto nothing;
5993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5994 if (!u)
5995 return NULL;
5996 Py_UNICODE_COPY(u->str, self->str, self->length);
5997 while (i <= self->length - str1->length)
5998 if (Py_UNICODE_MATCH(self, i, str1)) {
5999 if (--maxcount < 0)
6000 break;
6001 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6002 i += str1->length;
6003 } else
6004 i++;
6006 } else {
6008 Py_ssize_t n, i, j, e;
6009 Py_ssize_t product, new_size, delta;
6010 Py_UNICODE *p;
6012 /* replace strings */
6013 n = stringlib_count(self->str, self->length, str1->str, str1->length);
6014 if (n > maxcount)
6015 n = maxcount;
6016 if (n == 0)
6017 goto nothing;
6018 /* new_size = self->length + n * (str2->length - str1->length)); */
6019 delta = (str2->length - str1->length);
6020 if (delta == 0) {
6021 new_size = self->length;
6022 } else {
6023 product = n * (str2->length - str1->length);
6024 if ((product / (str2->length - str1->length)) != n) {
6025 PyErr_SetString(PyExc_OverflowError,
6026 "replace string is too long");
6027 return NULL;
6029 new_size = self->length + product;
6030 if (new_size < 0) {
6031 PyErr_SetString(PyExc_OverflowError,
6032 "replace string is too long");
6033 return NULL;
6036 u = _PyUnicode_New(new_size);
6037 if (!u)
6038 return NULL;
6039 i = 0;
6040 p = u->str;
6041 e = self->length - str1->length;
6042 if (str1->length > 0) {
6043 while (n-- > 0) {
6044 /* look for next match */
6045 j = i;
6046 while (j <= e) {
6047 if (Py_UNICODE_MATCH(self, j, str1))
6048 break;
6049 j++;
6051 if (j > i) {
6052 if (j > e)
6053 break;
6054 /* copy unchanged part [i:j] */
6055 Py_UNICODE_COPY(p, self->str+i, j-i);
6056 p += j - i;
6058 /* copy substitution string */
6059 if (str2->length > 0) {
6060 Py_UNICODE_COPY(p, str2->str, str2->length);
6061 p += str2->length;
6063 i = j + str1->length;
6065 if (i < self->length)
6066 /* copy tail [i:] */
6067 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6068 } else {
6069 /* interleave */
6070 while (n > 0) {
6071 Py_UNICODE_COPY(p, str2->str, str2->length);
6072 p += str2->length;
6073 if (--n <= 0)
6074 break;
6075 *p++ = self->str[i++];
6077 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6080 return (PyObject *) u;
6082 nothing:
6083 /* nothing to replace; return original string (when possible) */
6084 if (PyUnicode_CheckExact(self)) {
6085 Py_INCREF(self);
6086 return (PyObject *) self;
6088 return PyUnicode_FromUnicode(self->str, self->length);
6091 /* --- Unicode Object Methods --------------------------------------------- */
6093 PyDoc_STRVAR(title__doc__,
6094 "S.title() -> unicode\n\
6096 Return a titlecased version of S, i.e. words start with title case\n\
6097 characters, all remaining cased characters have lower case.");
6099 static PyObject*
6100 unicode_title(PyUnicodeObject *self)
6102 return fixup(self, fixtitle);
6105 PyDoc_STRVAR(capitalize__doc__,
6106 "S.capitalize() -> unicode\n\
6108 Return a capitalized version of S, i.e. make the first character\n\
6109 have upper case.");
6111 static PyObject*
6112 unicode_capitalize(PyUnicodeObject *self)
6114 return fixup(self, fixcapitalize);
#if 0
/* Disabled code: not compiled. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word and join the words
   again with single blanks.  Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                               fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6155 /* Argument converter. Coerces to a single unicode character */
6157 static int
6158 convert_uc(PyObject *obj, void *addr)
6160 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6161 PyObject *uniobj;
6162 Py_UNICODE *unistr;
6164 uniobj = PyUnicode_FromObject(obj);
6165 if (uniobj == NULL) {
6166 PyErr_SetString(PyExc_TypeError,
6167 "The fill character cannot be converted to Unicode");
6168 return 0;
6170 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6171 PyErr_SetString(PyExc_TypeError,
6172 "The fill character must be exactly one character long");
6173 Py_DECREF(uniobj);
6174 return 0;
6176 unistr = PyUnicode_AS_UNICODE(uniobj);
6177 *fillcharloc = unistr[0];
6178 Py_DECREF(uniobj);
6179 return 1;
6182 PyDoc_STRVAR(center__doc__,
6183 "S.center(width[, fillchar]) -> unicode\n\
6185 Return S centered in a Unicode string of length width. Padding is\n\
6186 done using the specified fill character (default is a space)");
6188 static PyObject *
6189 unicode_center(PyUnicodeObject *self, PyObject *args)
6191 Py_ssize_t marg, left;
6192 Py_ssize_t width;
6193 Py_UNICODE fillchar = ' ';
6195 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6196 return NULL;
6198 if (self->length >= width && PyUnicode_CheckExact(self)) {
6199 Py_INCREF(self);
6200 return (PyObject*) self;
6203 marg = width - self->length;
6204 left = marg / 2 + (marg & width & 1);
6206 return (PyObject*) pad(self, left, marg - left, fillchar);
6209 #if 0
6211 /* This code should go into some future Unicode collation support
6212 module. The basic comparison should compare ordinals on a naive
6213 basis (this is what Java does and thus JPython too). */
6215 /* speedy UTF-16 code point order comparison */
6216 /* gleaned from: */
6217 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6219 static short utf16Fixup[32] =
6221 0, 0, 0, 0, 0, 0, 0, 0,
6222 0, 0, 0, 0, 0, 0, 0, 0,
6223 0, 0, 0, 0, 0, 0, 0, 0,
6224 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6227 static int
6228 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6230 Py_ssize_t len1, len2;
6232 Py_UNICODE *s1 = str1->str;
6233 Py_UNICODE *s2 = str2->str;
6235 len1 = str1->length;
6236 len2 = str2->length;
6238 while (len1 > 0 && len2 > 0) {
6239 Py_UNICODE c1, c2;
6241 c1 = *s1++;
6242 c2 = *s2++;
6244 if (c1 > (1<<11) * 26)
6245 c1 += utf16Fixup[c1>>11];
6246 if (c2 > (1<<11) * 26)
6247 c2 += utf16Fixup[c2>>11];
6248 /* now c1 and c2 are in UTF-32-compatible order */
6250 if (c1 != c2)
6251 return (c1 < c2) ? -1 : 1;
6253 len1--; len2--;
6256 return (len1 < len2) ? -1 : (len1 != len2);
6259 #else
6261 static int
6262 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6264 register Py_ssize_t len1, len2;
6266 Py_UNICODE *s1 = str1->str;
6267 Py_UNICODE *s2 = str2->str;
6269 len1 = str1->length;
6270 len2 = str2->length;
6272 while (len1 > 0 && len2 > 0) {
6273 Py_UNICODE c1, c2;
6275 c1 = *s1++;
6276 c2 = *s2++;
6278 if (c1 != c2)
6279 return (c1 < c2) ? -1 : 1;
6281 len1--; len2--;
6284 return (len1 < len2) ? -1 : (len1 != len2);
6287 #endif
6289 int PyUnicode_Compare(PyObject *left,
6290 PyObject *right)
6292 PyUnicodeObject *u = NULL, *v = NULL;
6293 int result;
6295 /* Coerce the two arguments */
6296 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6297 if (u == NULL)
6298 goto onError;
6299 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6300 if (v == NULL)
6301 goto onError;
6303 /* Shortcut for empty or interned objects */
6304 if (v == u) {
6305 Py_DECREF(u);
6306 Py_DECREF(v);
6307 return 0;
6310 result = unicode_compare(u, v);
6312 Py_DECREF(u);
6313 Py_DECREF(v);
6314 return result;
6316 onError:
6317 Py_XDECREF(u);
6318 Py_XDECREF(v);
6319 return -1;
6322 PyObject *PyUnicode_RichCompare(PyObject *left,
6323 PyObject *right,
6324 int op)
6326 int result;
6328 result = PyUnicode_Compare(left, right);
6329 if (result == -1 && PyErr_Occurred())
6330 goto onError;
6332 /* Convert the return value to a Boolean */
6333 switch (op) {
6334 case Py_EQ:
6335 result = (result == 0);
6336 break;
6337 case Py_NE:
6338 result = (result != 0);
6339 break;
6340 case Py_LE:
6341 result = (result <= 0);
6342 break;
6343 case Py_GE:
6344 result = (result >= 0);
6345 break;
6346 case Py_LT:
6347 result = (result == -1);
6348 break;
6349 case Py_GT:
6350 result = (result == 1);
6351 break;
6353 return PyBool_FromLong(result);
6355 onError:
6357 /* Standard case
6359 Type errors mean that PyUnicode_FromObject() could not convert
6360 one of the arguments (usually the right hand side) to Unicode,
6361 ie. we can't handle the comparison request. However, it is
6362 possible that the other object knows a comparison method, which
6363 is why we return Py_NotImplemented to give the other object a
6364 chance.
6367 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6368 PyErr_Clear();
6369 Py_INCREF(Py_NotImplemented);
6370 return Py_NotImplemented;
6372 if (op != Py_EQ && op != Py_NE)
6373 return NULL;
6375 /* Equality comparison.
6377 This is a special case: we silence any PyExc_UnicodeDecodeError
6378 and instead turn it into a PyErr_UnicodeWarning.
6381 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6382 return NULL;
6383 PyErr_Clear();
6384 if (PyErr_Warn(PyExc_UnicodeWarning,
6385 (op == Py_EQ) ?
6386 "Unicode equal comparison "
6387 "failed to convert both arguments to Unicode - "
6388 "interpreting them as being unequal" :
6389 "Unicode unequal comparison "
6390 "failed to convert both arguments to Unicode - "
6391 "interpreting them as being unequal"
6392 ) < 0)
6393 return NULL;
6394 result = (op == Py_NE);
6395 return PyBool_FromLong(result);
6398 int PyUnicode_Contains(PyObject *container,
6399 PyObject *element)
6401 PyObject *str, *sub;
6402 int result;
6404 /* Coerce the two arguments */
6405 sub = PyUnicode_FromObject(element);
6406 if (!sub) {
6407 PyErr_SetString(PyExc_TypeError,
6408 "'in <string>' requires string as left operand");
6409 return -1;
6412 str = PyUnicode_FromObject(container);
6413 if (!str) {
6414 Py_DECREF(sub);
6415 return -1;
6418 result = stringlib_contains_obj(str, sub);
6420 Py_DECREF(str);
6421 Py_DECREF(sub);
6423 return result;
6426 /* Concat to string or Unicode object giving a new Unicode object. */
6428 PyObject *PyUnicode_Concat(PyObject *left,
6429 PyObject *right)
6431 PyUnicodeObject *u = NULL, *v = NULL, *w;
6433 /* Coerce the two arguments */
6434 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6435 if (u == NULL)
6436 goto onError;
6437 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6438 if (v == NULL)
6439 goto onError;
6441 /* Shortcuts */
6442 if (v == unicode_empty) {
6443 Py_DECREF(v);
6444 return (PyObject *)u;
6446 if (u == unicode_empty) {
6447 Py_DECREF(u);
6448 return (PyObject *)v;
6451 /* Concat the two Unicode strings */
6452 w = _PyUnicode_New(u->length + v->length);
6453 if (w == NULL)
6454 goto onError;
6455 Py_UNICODE_COPY(w->str, u->str, u->length);
6456 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6458 Py_DECREF(u);
6459 Py_DECREF(v);
6460 return (PyObject *)w;
6462 onError:
6463 Py_XDECREF(u);
6464 Py_XDECREF(v);
6465 return NULL;
6468 PyDoc_STRVAR(count__doc__,
6469 "S.count(sub[, start[, end]]) -> int\n\
6471 Return the number of non-overlapping occurrences of substring sub in\n\
6472 Unicode string S[start:end]. Optional arguments start and end are\n\
6473 interpreted as in slice notation.");
6475 static PyObject *
6476 unicode_count(PyUnicodeObject *self, PyObject *args)
6478 PyUnicodeObject *substring;
6479 Py_ssize_t start = 0;
6480 Py_ssize_t end = PY_SSIZE_T_MAX;
6481 PyObject *result;
6483 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6484 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6485 return NULL;
6487 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6488 (PyObject *)substring);
6489 if (substring == NULL)
6490 return NULL;
6492 FIX_START_END(self);
6494 result = PyInt_FromSsize_t(
6495 stringlib_count(self->str + start, end - start,
6496 substring->str, substring->length)
6499 Py_DECREF(substring);
6501 return result;
6504 PyDoc_STRVAR(encode__doc__,
6505 "S.encode([encoding[,errors]]) -> string or unicode\n\
6507 Encodes S using the codec registered for encoding. encoding defaults\n\
6508 to the default encoding. errors may be given to set a different error\n\
6509 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6510 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6511 'xmlcharrefreplace' as well as any other name registered with\n\
6512 codecs.register_error that can handle UnicodeEncodeErrors.");
6514 static PyObject *
6515 unicode_encode(PyUnicodeObject *self, PyObject *args)
6517 char *encoding = NULL;
6518 char *errors = NULL;
6519 PyObject *v;
6521 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6522 return NULL;
6523 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6524 if (v == NULL)
6525 goto onError;
6526 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6527 PyErr_Format(PyExc_TypeError,
6528 "encoder did not return a string/unicode object "
6529 "(type=%.400s)",
6530 Py_TYPE(v)->tp_name);
6531 Py_DECREF(v);
6532 return NULL;
6534 return v;
6536 onError:
6537 return NULL;
6540 PyDoc_STRVAR(decode__doc__,
6541 "S.decode([encoding[,errors]]) -> string or unicode\n\
6543 Decodes S using the codec registered for encoding. encoding defaults\n\
6544 to the default encoding. errors may be given to set a different error\n\
6545 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6546 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6547 as well as any other name registerd with codecs.register_error that is\n\
6548 able to handle UnicodeDecodeErrors.");
6550 static PyObject *
6551 unicode_decode(PyUnicodeObject *self, PyObject *args)
6553 char *encoding = NULL;
6554 char *errors = NULL;
6555 PyObject *v;
6557 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6558 return NULL;
6559 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6560 if (v == NULL)
6561 goto onError;
6562 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6563 PyErr_Format(PyExc_TypeError,
6564 "decoder did not return a string/unicode object "
6565 "(type=%.400s)",
6566 Py_TYPE(v)->tp_name);
6567 Py_DECREF(v);
6568 return NULL;
6570 return v;
6572 onError:
6573 return NULL;
6576 PyDoc_STRVAR(expandtabs__doc__,
6577 "S.expandtabs([tabsize]) -> unicode\n\
6579 Return a copy of S where all tab characters are expanded using spaces.\n\
6580 If tabsize is not given, a tab size of 8 characters is assumed.");
6582 static PyObject*
6583 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6585 Py_UNICODE *e;
6586 Py_UNICODE *p;
6587 Py_UNICODE *q;
6588 Py_UNICODE *qe;
6589 Py_ssize_t i, j, incr;
6590 PyUnicodeObject *u;
6591 int tabsize = 8;
6593 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6594 return NULL;
6596 /* First pass: determine size of output string */
6597 i = 0; /* chars up to and including most recent \n or \r */
6598 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6599 e = self->str + self->length; /* end of input */
6600 for (p = self->str; p < e; p++)
6601 if (*p == '\t') {
6602 if (tabsize > 0) {
6603 incr = tabsize - (j % tabsize); /* cannot overflow */
6604 if (j > PY_SSIZE_T_MAX - incr)
6605 goto overflow1;
6606 j += incr;
6609 else {
6610 if (j > PY_SSIZE_T_MAX - 1)
6611 goto overflow1;
6612 j++;
6613 if (*p == '\n' || *p == '\r') {
6614 if (i > PY_SSIZE_T_MAX - j)
6615 goto overflow1;
6616 i += j;
6617 j = 0;
6621 if (i > PY_SSIZE_T_MAX - j)
6622 goto overflow1;
6624 /* Second pass: create output string and fill it */
6625 u = _PyUnicode_New(i + j);
6626 if (!u)
6627 return NULL;
6629 j = 0; /* same as in first pass */
6630 q = u->str; /* next output char */
6631 qe = u->str + u->length; /* end of output */
6633 for (p = self->str; p < e; p++)
6634 if (*p == '\t') {
6635 if (tabsize > 0) {
6636 i = tabsize - (j % tabsize);
6637 j += i;
6638 while (i--) {
6639 if (q >= qe)
6640 goto overflow2;
6641 *q++ = ' ';
6645 else {
6646 if (q >= qe)
6647 goto overflow2;
6648 *q++ = *p;
6649 j++;
6650 if (*p == '\n' || *p == '\r')
6651 j = 0;
6654 return (PyObject*) u;
6656 overflow2:
6657 Py_DECREF(u);
6658 overflow1:
6659 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6660 return NULL;
6663 PyDoc_STRVAR(find__doc__,
6664 "S.find(sub [,start [,end]]) -> int\n\
6666 Return the lowest index in S where substring sub is found,\n\
6667 such that sub is contained within s[start:end]. Optional\n\
6668 arguments start and end are interpreted as in slice notation.\n\
6670 Return -1 on failure.");
6672 static PyObject *
6673 unicode_find(PyUnicodeObject *self, PyObject *args)
6675 PyObject *substring;
6676 Py_ssize_t start;
6677 Py_ssize_t end;
6678 Py_ssize_t result;
6680 if (!_ParseTupleFinds(args, &substring, &start, &end))
6681 return NULL;
6683 result = stringlib_find_slice(
6684 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6685 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6686 start, end
6689 Py_DECREF(substring);
6691 return PyInt_FromSsize_t(result);
6694 static PyObject *
6695 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6697 if (index < 0 || index >= self->length) {
6698 PyErr_SetString(PyExc_IndexError, "string index out of range");
6699 return NULL;
6702 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6705 static long
6706 unicode_hash(PyUnicodeObject *self)
6708 /* Since Unicode objects compare equal to their ASCII string
6709 counterparts, they should use the individual character values
6710 as basis for their hash value. This is needed to assure that
6711 strings and Unicode objects behave in the same way as
6712 dictionary keys. */
6714 register Py_ssize_t len;
6715 register Py_UNICODE *p;
6716 register long x;
6718 if (self->hash != -1)
6719 return self->hash;
6720 len = PyUnicode_GET_SIZE(self);
6721 p = PyUnicode_AS_UNICODE(self);
6722 x = *p << 7;
6723 while (--len >= 0)
6724 x = (1000003*x) ^ *p++;
6725 x ^= PyUnicode_GET_SIZE(self);
6726 if (x == -1)
6727 x = -2;
6728 self->hash = x;
6729 return x;
6732 PyDoc_STRVAR(index__doc__,
6733 "S.index(sub [,start [,end]]) -> int\n\
6735 Like S.find() but raise ValueError when the substring is not found.");
6737 static PyObject *
6738 unicode_index(PyUnicodeObject *self, PyObject *args)
6740 Py_ssize_t result;
6741 PyObject *substring;
6742 Py_ssize_t start;
6743 Py_ssize_t end;
6745 if (!_ParseTupleFinds(args, &substring, &start, &end))
6746 return NULL;
6748 result = stringlib_find_slice(
6749 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6750 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6751 start, end
6754 Py_DECREF(substring);
6756 if (result < 0) {
6757 PyErr_SetString(PyExc_ValueError, "substring not found");
6758 return NULL;
6761 return PyInt_FromSsize_t(result);
6764 PyDoc_STRVAR(islower__doc__,
6765 "S.islower() -> bool\n\
6767 Return True if all cased characters in S are lowercase and there is\n\
6768 at least one cased character in S, False otherwise.");
6770 static PyObject*
6771 unicode_islower(PyUnicodeObject *self)
6773 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6774 register const Py_UNICODE *e;
6775 int cased;
6777 /* Shortcut for single character strings */
6778 if (PyUnicode_GET_SIZE(self) == 1)
6779 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6781 /* Special case for empty strings */
6782 if (PyUnicode_GET_SIZE(self) == 0)
6783 return PyBool_FromLong(0);
6785 e = p + PyUnicode_GET_SIZE(self);
6786 cased = 0;
6787 for (; p < e; p++) {
6788 register const Py_UNICODE ch = *p;
6790 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6791 return PyBool_FromLong(0);
6792 else if (!cased && Py_UNICODE_ISLOWER(ch))
6793 cased = 1;
6795 return PyBool_FromLong(cased);
6798 PyDoc_STRVAR(isupper__doc__,
6799 "S.isupper() -> bool\n\
6801 Return True if all cased characters in S are uppercase and there is\n\
6802 at least one cased character in S, False otherwise.");
6804 static PyObject*
6805 unicode_isupper(PyUnicodeObject *self)
6807 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6808 register const Py_UNICODE *e;
6809 int cased;
6811 /* Shortcut for single character strings */
6812 if (PyUnicode_GET_SIZE(self) == 1)
6813 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6815 /* Special case for empty strings */
6816 if (PyUnicode_GET_SIZE(self) == 0)
6817 return PyBool_FromLong(0);
6819 e = p + PyUnicode_GET_SIZE(self);
6820 cased = 0;
6821 for (; p < e; p++) {
6822 register const Py_UNICODE ch = *p;
6824 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6825 return PyBool_FromLong(0);
6826 else if (!cased && Py_UNICODE_ISUPPER(ch))
6827 cased = 1;
6829 return PyBool_FromLong(cased);
6832 PyDoc_STRVAR(istitle__doc__,
6833 "S.istitle() -> bool\n\
6835 Return True if S is a titlecased string and there is at least one\n\
6836 character in S, i.e. upper- and titlecase characters may only\n\
6837 follow uncased characters and lowercase characters only cased ones.\n\
6838 Return False otherwise.");
6840 static PyObject*
6841 unicode_istitle(PyUnicodeObject *self)
6843 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6844 register const Py_UNICODE *e;
6845 int cased, previous_is_cased;
6847 /* Shortcut for single character strings */
6848 if (PyUnicode_GET_SIZE(self) == 1)
6849 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6850 (Py_UNICODE_ISUPPER(*p) != 0));
6852 /* Special case for empty strings */
6853 if (PyUnicode_GET_SIZE(self) == 0)
6854 return PyBool_FromLong(0);
6856 e = p + PyUnicode_GET_SIZE(self);
6857 cased = 0;
6858 previous_is_cased = 0;
6859 for (; p < e; p++) {
6860 register const Py_UNICODE ch = *p;
6862 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6863 if (previous_is_cased)
6864 return PyBool_FromLong(0);
6865 previous_is_cased = 1;
6866 cased = 1;
6868 else if (Py_UNICODE_ISLOWER(ch)) {
6869 if (!previous_is_cased)
6870 return PyBool_FromLong(0);
6871 previous_is_cased = 1;
6872 cased = 1;
6874 else
6875 previous_is_cased = 0;
6877 return PyBool_FromLong(cased);
6880 PyDoc_STRVAR(isspace__doc__,
6881 "S.isspace() -> bool\n\
6883 Return True if all characters in S are whitespace\n\
6884 and there is at least one character in S, False otherwise.");
6886 static PyObject*
6887 unicode_isspace(PyUnicodeObject *self)
6889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6890 register const Py_UNICODE *e;
6892 /* Shortcut for single character strings */
6893 if (PyUnicode_GET_SIZE(self) == 1 &&
6894 Py_UNICODE_ISSPACE(*p))
6895 return PyBool_FromLong(1);
6897 /* Special case for empty strings */
6898 if (PyUnicode_GET_SIZE(self) == 0)
6899 return PyBool_FromLong(0);
6901 e = p + PyUnicode_GET_SIZE(self);
6902 for (; p < e; p++) {
6903 if (!Py_UNICODE_ISSPACE(*p))
6904 return PyBool_FromLong(0);
6906 return PyBool_FromLong(1);
6909 PyDoc_STRVAR(isalpha__doc__,
6910 "S.isalpha() -> bool\n\
6912 Return True if all characters in S are alphabetic\n\
6913 and there is at least one character in S, False otherwise.");
6915 static PyObject*
6916 unicode_isalpha(PyUnicodeObject *self)
6918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6919 register const Py_UNICODE *e;
6921 /* Shortcut for single character strings */
6922 if (PyUnicode_GET_SIZE(self) == 1 &&
6923 Py_UNICODE_ISALPHA(*p))
6924 return PyBool_FromLong(1);
6926 /* Special case for empty strings */
6927 if (PyUnicode_GET_SIZE(self) == 0)
6928 return PyBool_FromLong(0);
6930 e = p + PyUnicode_GET_SIZE(self);
6931 for (; p < e; p++) {
6932 if (!Py_UNICODE_ISALPHA(*p))
6933 return PyBool_FromLong(0);
6935 return PyBool_FromLong(1);
6938 PyDoc_STRVAR(isalnum__doc__,
6939 "S.isalnum() -> bool\n\
6941 Return True if all characters in S are alphanumeric\n\
6942 and there is at least one character in S, False otherwise.");
6944 static PyObject*
6945 unicode_isalnum(PyUnicodeObject *self)
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
6952 Py_UNICODE_ISALNUM(*p))
6953 return PyBool_FromLong(1);
6955 /* Special case for empty strings */
6956 if (PyUnicode_GET_SIZE(self) == 0)
6957 return PyBool_FromLong(0);
6959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
6961 if (!Py_UNICODE_ISALNUM(*p))
6962 return PyBool_FromLong(0);
6964 return PyBool_FromLong(1);
6967 PyDoc_STRVAR(isdecimal__doc__,
6968 "S.isdecimal() -> bool\n\
6970 Return True if there are only decimal characters in S,\n\
6971 False otherwise.");
6973 static PyObject*
6974 unicode_isdecimal(PyUnicodeObject *self)
6976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977 register const Py_UNICODE *e;
6979 /* Shortcut for single character strings */
6980 if (PyUnicode_GET_SIZE(self) == 1 &&
6981 Py_UNICODE_ISDECIMAL(*p))
6982 return PyBool_FromLong(1);
6984 /* Special case for empty strings */
6985 if (PyUnicode_GET_SIZE(self) == 0)
6986 return PyBool_FromLong(0);
6988 e = p + PyUnicode_GET_SIZE(self);
6989 for (; p < e; p++) {
6990 if (!Py_UNICODE_ISDECIMAL(*p))
6991 return PyBool_FromLong(0);
6993 return PyBool_FromLong(1);
6996 PyDoc_STRVAR(isdigit__doc__,
6997 "S.isdigit() -> bool\n\
6999 Return True if all characters in S are digits\n\
7000 and there is at least one character in S, False otherwise.");
7002 static PyObject*
7003 unicode_isdigit(PyUnicodeObject *self)
7005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7006 register const Py_UNICODE *e;
7008 /* Shortcut for single character strings */
7009 if (PyUnicode_GET_SIZE(self) == 1 &&
7010 Py_UNICODE_ISDIGIT(*p))
7011 return PyBool_FromLong(1);
7013 /* Special case for empty strings */
7014 if (PyUnicode_GET_SIZE(self) == 0)
7015 return PyBool_FromLong(0);
7017 e = p + PyUnicode_GET_SIZE(self);
7018 for (; p < e; p++) {
7019 if (!Py_UNICODE_ISDIGIT(*p))
7020 return PyBool_FromLong(0);
7022 return PyBool_FromLong(1);
7025 PyDoc_STRVAR(isnumeric__doc__,
7026 "S.isnumeric() -> bool\n\
7028 Return True if there are only numeric characters in S,\n\
7029 False otherwise.");
7031 static PyObject*
7032 unicode_isnumeric(PyUnicodeObject *self)
7034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7035 register const Py_UNICODE *e;
7037 /* Shortcut for single character strings */
7038 if (PyUnicode_GET_SIZE(self) == 1 &&
7039 Py_UNICODE_ISNUMERIC(*p))
7040 return PyBool_FromLong(1);
7042 /* Special case for empty strings */
7043 if (PyUnicode_GET_SIZE(self) == 0)
7044 return PyBool_FromLong(0);
7046 e = p + PyUnicode_GET_SIZE(self);
7047 for (; p < e; p++) {
7048 if (!Py_UNICODE_ISNUMERIC(*p))
7049 return PyBool_FromLong(0);
7051 return PyBool_FromLong(1);
7054 PyDoc_STRVAR(join__doc__,
7055 "S.join(sequence) -> unicode\n\
7057 Return a string which is the concatenation of the strings in the\n\
7058 sequence. The separator between elements is S.");
7060 static PyObject*
7061 unicode_join(PyObject *self, PyObject *data)
7063 return PyUnicode_Join(self, data);
7066 static Py_ssize_t
7067 unicode_length(PyUnicodeObject *self)
7069 return self->length;
7072 PyDoc_STRVAR(ljust__doc__,
7073 "S.ljust(width[, fillchar]) -> int\n\
7075 Return S left-justified in a Unicode string of length width. Padding is\n\
7076 done using the specified fill character (default is a space).");
7078 static PyObject *
7079 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7081 Py_ssize_t width;
7082 Py_UNICODE fillchar = ' ';
7084 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7085 return NULL;
7087 if (self->length >= width && PyUnicode_CheckExact(self)) {
7088 Py_INCREF(self);
7089 return (PyObject*) self;
7092 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7095 PyDoc_STRVAR(lower__doc__,
7096 "S.lower() -> unicode\n\
7098 Return a copy of the string S converted to lowercase.");
7100 static PyObject*
7101 unicode_lower(PyUnicodeObject *self)
7103 return fixup(self, fixlower);
/* Strip modes understood by do_strip(), do_argstrip() and
   _PyUnicode_XStrip(). */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7115 /* externally visible for str.strip(unicode) */
7116 PyObject *
7117 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7119 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7120 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7121 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7122 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7123 Py_ssize_t i, j;
7125 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7127 i = 0;
7128 if (striptype != RIGHTSTRIP) {
7129 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7130 i++;
7134 j = len;
7135 if (striptype != LEFTSTRIP) {
7136 do {
7137 j--;
7138 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7139 j++;
7142 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7143 Py_INCREF(self);
7144 return (PyObject*)self;
7146 else
7147 return PyUnicode_FromUnicode(s+i, j-i);
7151 static PyObject *
7152 do_strip(PyUnicodeObject *self, int striptype)
7154 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7155 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7157 i = 0;
7158 if (striptype != RIGHTSTRIP) {
7159 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7160 i++;
7164 j = len;
7165 if (striptype != LEFTSTRIP) {
7166 do {
7167 j--;
7168 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7169 j++;
7172 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7173 Py_INCREF(self);
7174 return (PyObject*)self;
7176 else
7177 return PyUnicode_FromUnicode(s+i, j-i);
7181 static PyObject *
7182 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7184 PyObject *sep = NULL;
7186 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7187 return NULL;
7189 if (sep != NULL && sep != Py_None) {
7190 if (PyUnicode_Check(sep))
7191 return _PyUnicode_XStrip(self, striptype, sep);
7192 else if (PyString_Check(sep)) {
7193 PyObject *res;
7194 sep = PyUnicode_FromObject(sep);
7195 if (sep==NULL)
7196 return NULL;
7197 res = _PyUnicode_XStrip(self, striptype, sep);
7198 Py_DECREF(sep);
7199 return res;
7201 else {
7202 PyErr_Format(PyExc_TypeError,
7203 "%s arg must be None, unicode or str",
7204 STRIPNAME(striptype));
7205 return NULL;
7209 return do_strip(self, striptype);
7213 PyDoc_STRVAR(strip__doc__,
7214 "S.strip([chars]) -> unicode\n\
7216 Return a copy of the string S with leading and trailing\n\
7217 whitespace removed.\n\
7218 If chars is given and not None, remove characters in chars instead.\n\
7219 If chars is a str, it will be converted to unicode before stripping");
7221 static PyObject *
7222 unicode_strip(PyUnicodeObject *self, PyObject *args)
7224 if (PyTuple_GET_SIZE(args) == 0)
7225 return do_strip(self, BOTHSTRIP); /* Common case */
7226 else
7227 return do_argstrip(self, BOTHSTRIP, args);
7231 PyDoc_STRVAR(lstrip__doc__,
7232 "S.lstrip([chars]) -> unicode\n\
7234 Return a copy of the string S with leading whitespace removed.\n\
7235 If chars is given and not None, remove characters in chars instead.\n\
7236 If chars is a str, it will be converted to unicode before stripping");
7238 static PyObject *
7239 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7241 if (PyTuple_GET_SIZE(args) == 0)
7242 return do_strip(self, LEFTSTRIP); /* Common case */
7243 else
7244 return do_argstrip(self, LEFTSTRIP, args);
7248 PyDoc_STRVAR(rstrip__doc__,
7249 "S.rstrip([chars]) -> unicode\n\
7251 Return a copy of the string S with trailing whitespace removed.\n\
7252 If chars is given and not None, remove characters in chars instead.\n\
7253 If chars is a str, it will be converted to unicode before stripping");
7255 static PyObject *
7256 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7258 if (PyTuple_GET_SIZE(args) == 0)
7259 return do_strip(self, RIGHTSTRIP); /* Common case */
7260 else
7261 return do_argstrip(self, RIGHTSTRIP, args);
7265 static PyObject*
7266 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7268 PyUnicodeObject *u;
7269 Py_UNICODE *p;
7270 Py_ssize_t nchars;
7271 size_t nbytes;
7273 if (len < 0)
7274 len = 0;
7276 if (len == 1 && PyUnicode_CheckExact(str)) {
7277 /* no repeat, return original string */
7278 Py_INCREF(str);
7279 return (PyObject*) str;
7282 /* ensure # of chars needed doesn't overflow int and # of bytes
7283 * needed doesn't overflow size_t
7285 nchars = len * str->length;
7286 if (len && nchars / len != str->length) {
7287 PyErr_SetString(PyExc_OverflowError,
7288 "repeated string is too long");
7289 return NULL;
7291 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7292 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7293 PyErr_SetString(PyExc_OverflowError,
7294 "repeated string is too long");
7295 return NULL;
7297 u = _PyUnicode_New(nchars);
7298 if (!u)
7299 return NULL;
7301 p = u->str;
7303 if (str->length == 1 && len > 0) {
7304 Py_UNICODE_FILL(p, str->str[0], len);
7305 } else {
7306 Py_ssize_t done = 0; /* number of characters copied this far */
7307 if (done < nchars) {
7308 Py_UNICODE_COPY(p, str->str, str->length);
7309 done = str->length;
7311 while (done < nchars) {
7312 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7313 Py_UNICODE_COPY(p+done, p, n);
7314 done += n;
7318 return (PyObject*) u;
7321 PyObject *PyUnicode_Replace(PyObject *obj,
7322 PyObject *subobj,
7323 PyObject *replobj,
7324 Py_ssize_t maxcount)
7326 PyObject *self;
7327 PyObject *str1;
7328 PyObject *str2;
7329 PyObject *result;
7331 self = PyUnicode_FromObject(obj);
7332 if (self == NULL)
7333 return NULL;
7334 str1 = PyUnicode_FromObject(subobj);
7335 if (str1 == NULL) {
7336 Py_DECREF(self);
7337 return NULL;
7339 str2 = PyUnicode_FromObject(replobj);
7340 if (str2 == NULL) {
7341 Py_DECREF(self);
7342 Py_DECREF(str1);
7343 return NULL;
7345 result = replace((PyUnicodeObject *)self,
7346 (PyUnicodeObject *)str1,
7347 (PyUnicodeObject *)str2,
7348 maxcount);
7349 Py_DECREF(self);
7350 Py_DECREF(str1);
7351 Py_DECREF(str2);
7352 return result;
7355 PyDoc_STRVAR(replace__doc__,
7356 "S.replace (old, new[, count]) -> unicode\n\
7358 Return a copy of S with all occurrences of substring\n\
7359 old replaced by new. If the optional argument count is\n\
7360 given, only the first count occurrences are replaced.");
7362 static PyObject*
7363 unicode_replace(PyUnicodeObject *self, PyObject *args)
7365 PyUnicodeObject *str1;
7366 PyUnicodeObject *str2;
7367 Py_ssize_t maxcount = -1;
7368 PyObject *result;
7370 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7371 return NULL;
7372 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7373 if (str1 == NULL)
7374 return NULL;
7375 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7376 if (str2 == NULL) {
7377 Py_DECREF(str1);
7378 return NULL;
7381 result = replace(self, str1, str2, maxcount);
7383 Py_DECREF(str1);
7384 Py_DECREF(str2);
7385 return result;
7388 static
7389 PyObject *unicode_repr(PyObject *unicode)
7391 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7392 PyUnicode_GET_SIZE(unicode),
7396 PyDoc_STRVAR(rfind__doc__,
7397 "S.rfind(sub [,start [,end]]) -> int\n\
7399 Return the highest index in S where substring sub is found,\n\
7400 such that sub is contained within s[start:end]. Optional\n\
7401 arguments start and end are interpreted as in slice notation.\n\
7403 Return -1 on failure.");
7405 static PyObject *
7406 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7408 PyObject *substring;
7409 Py_ssize_t start;
7410 Py_ssize_t end;
7411 Py_ssize_t result;
7413 if (!_ParseTupleFinds(args, &substring, &start, &end))
7414 return NULL;
7416 result = stringlib_rfind_slice(
7417 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7418 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7419 start, end
7422 Py_DECREF(substring);
7424 return PyInt_FromSsize_t(result);
7427 PyDoc_STRVAR(rindex__doc__,
7428 "S.rindex(sub [,start [,end]]) -> int\n\
7430 Like S.rfind() but raise ValueError when the substring is not found.");
7432 static PyObject *
7433 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7435 PyObject *substring;
7436 Py_ssize_t start;
7437 Py_ssize_t end;
7438 Py_ssize_t result;
7440 if (!_ParseTupleFinds(args, &substring, &start, &end))
7441 return NULL;
7443 result = stringlib_rfind_slice(
7444 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7445 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7446 start, end
7449 Py_DECREF(substring);
7451 if (result < 0) {
7452 PyErr_SetString(PyExc_ValueError, "substring not found");
7453 return NULL;
7455 return PyInt_FromSsize_t(result);
7458 PyDoc_STRVAR(rjust__doc__,
7459 "S.rjust(width[, fillchar]) -> unicode\n\
7461 Return S right-justified in a Unicode string of length width. Padding is\n\
7462 done using the specified fill character (default is a space).");
7464 static PyObject *
7465 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7467 Py_ssize_t width;
7468 Py_UNICODE fillchar = ' ';
7470 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7471 return NULL;
7473 if (self->length >= width && PyUnicode_CheckExact(self)) {
7474 Py_INCREF(self);
7475 return (PyObject*) self;
7478 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7481 static PyObject*
7482 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7484 /* standard clamping */
7485 if (start < 0)
7486 start = 0;
7487 if (end < 0)
7488 end = 0;
7489 if (end > self->length)
7490 end = self->length;
7491 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7492 /* full slice, return original string */
7493 Py_INCREF(self);
7494 return (PyObject*) self;
7496 if (start > end)
7497 start = end;
7498 /* copy slice */
7499 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7500 end - start);
7503 PyObject *PyUnicode_Split(PyObject *s,
7504 PyObject *sep,
7505 Py_ssize_t maxsplit)
7507 PyObject *result;
7509 s = PyUnicode_FromObject(s);
7510 if (s == NULL)
7511 return NULL;
7512 if (sep != NULL) {
7513 sep = PyUnicode_FromObject(sep);
7514 if (sep == NULL) {
7515 Py_DECREF(s);
7516 return NULL;
7520 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7522 Py_DECREF(s);
7523 Py_XDECREF(sep);
7524 return result;
7527 PyDoc_STRVAR(split__doc__,
7528 "S.split([sep [,maxsplit]]) -> list of strings\n\
7530 Return a list of the words in S, using sep as the\n\
7531 delimiter string. If maxsplit is given, at most maxsplit\n\
7532 splits are done. If sep is not specified or is None, any\n\
7533 whitespace string is a separator and empty strings are\n\
7534 removed from the result.");
7536 static PyObject*
7537 unicode_split(PyUnicodeObject *self, PyObject *args)
7539 PyObject *substring = Py_None;
7540 Py_ssize_t maxcount = -1;
7542 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7543 return NULL;
7545 if (substring == Py_None)
7546 return split(self, NULL, maxcount);
7547 else if (PyUnicode_Check(substring))
7548 return split(self, (PyUnicodeObject *)substring, maxcount);
7549 else
7550 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7553 PyObject *
7554 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7556 PyObject* str_obj;
7557 PyObject* sep_obj;
7558 PyObject* out;
7560 str_obj = PyUnicode_FromObject(str_in);
7561 if (!str_obj)
7562 return NULL;
7563 sep_obj = PyUnicode_FromObject(sep_in);
7564 if (!sep_obj) {
7565 Py_DECREF(str_obj);
7566 return NULL;
7569 out = stringlib_partition(
7570 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7571 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7574 Py_DECREF(sep_obj);
7575 Py_DECREF(str_obj);
7577 return out;
7581 PyObject *
7582 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7584 PyObject* str_obj;
7585 PyObject* sep_obj;
7586 PyObject* out;
7588 str_obj = PyUnicode_FromObject(str_in);
7589 if (!str_obj)
7590 return NULL;
7591 sep_obj = PyUnicode_FromObject(sep_in);
7592 if (!sep_obj) {
7593 Py_DECREF(str_obj);
7594 return NULL;
7597 out = stringlib_rpartition(
7598 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7599 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7602 Py_DECREF(sep_obj);
7603 Py_DECREF(str_obj);
7605 return out;
7608 PyDoc_STRVAR(partition__doc__,
7609 "S.partition(sep) -> (head, sep, tail)\n\
7611 Search for the separator sep in S, and return the part before it,\n\
7612 the separator itself, and the part after it. If the separator is not\n\
7613 found, return S and two empty strings.");
7615 static PyObject*
7616 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7618 return PyUnicode_Partition((PyObject *)self, separator);
7621 PyDoc_STRVAR(rpartition__doc__,
7622 "S.rpartition(sep) -> (tail, sep, head)\n\
7624 Search for the separator sep in S, starting at the end of S, and return\n\
7625 the part before it, the separator itself, and the part after it. If the\n\
7626 separator is not found, return two empty strings and S.");
7628 static PyObject*
7629 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7631 return PyUnicode_RPartition((PyObject *)self, separator);
7634 PyObject *PyUnicode_RSplit(PyObject *s,
7635 PyObject *sep,
7636 Py_ssize_t maxsplit)
7638 PyObject *result;
7640 s = PyUnicode_FromObject(s);
7641 if (s == NULL)
7642 return NULL;
7643 if (sep != NULL) {
7644 sep = PyUnicode_FromObject(sep);
7645 if (sep == NULL) {
7646 Py_DECREF(s);
7647 return NULL;
7651 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7653 Py_DECREF(s);
7654 Py_XDECREF(sep);
7655 return result;
7658 PyDoc_STRVAR(rsplit__doc__,
7659 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7661 Return a list of the words in S, using sep as the\n\
7662 delimiter string, starting at the end of the string and\n\
7663 working to the front. If maxsplit is given, at most maxsplit\n\
7664 splits are done. If sep is not specified, any whitespace string\n\
7665 is a separator.");
7667 static PyObject*
7668 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7670 PyObject *substring = Py_None;
7671 Py_ssize_t maxcount = -1;
7673 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7674 return NULL;
7676 if (substring == Py_None)
7677 return rsplit(self, NULL, maxcount);
7678 else if (PyUnicode_Check(substring))
7679 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7680 else
7681 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7684 PyDoc_STRVAR(splitlines__doc__,
7685 "S.splitlines([keepends]) -> list of strings\n\
7687 Return a list of the lines in S, breaking at line boundaries.\n\
7688 Line breaks are not included in the resulting list unless keepends\n\
7689 is given and true.");
7691 static PyObject*
7692 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7694 int keepends = 0;
7696 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7697 return NULL;
7699 return PyUnicode_Splitlines((PyObject *)self, keepends);
7702 static
7703 PyObject *unicode_str(PyUnicodeObject *self)
7705 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7708 PyDoc_STRVAR(swapcase__doc__,
7709 "S.swapcase() -> unicode\n\
7711 Return a copy of S with uppercase characters converted to lowercase\n\
7712 and vice versa.");
7714 static PyObject*
7715 unicode_swapcase(PyUnicodeObject *self)
7717 return fixup(self, fixswapcase);
7720 PyDoc_STRVAR(translate__doc__,
7721 "S.translate(table) -> unicode\n\
7723 Return a copy of the string S, where all characters have been mapped\n\
7724 through the given translation table, which must be a mapping of\n\
7725 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7726 Unmapped characters are left untouched. Characters mapped to None\n\
7727 are deleted.");
7729 static PyObject*
7730 unicode_translate(PyUnicodeObject *self, PyObject *table)
7732 return PyUnicode_TranslateCharmap(self->str,
7733 self->length,
7734 table,
7735 "ignore");
7738 PyDoc_STRVAR(upper__doc__,
7739 "S.upper() -> unicode\n\
7741 Return a copy of S converted to uppercase.");
7743 static PyObject*
7744 unicode_upper(PyUnicodeObject *self)
7746 return fixup(self, fixupper);
7749 PyDoc_STRVAR(zfill__doc__,
7750 "S.zfill(width) -> unicode\n\
7752 Pad a numeric string S with zeros on the left, to fill a field\n\
7753 of the specified width. The string S is never truncated.");
7755 static PyObject *
7756 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7758 Py_ssize_t fill;
7759 PyUnicodeObject *u;
7761 Py_ssize_t width;
7762 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7763 return NULL;
7765 if (self->length >= width) {
7766 if (PyUnicode_CheckExact(self)) {
7767 Py_INCREF(self);
7768 return (PyObject*) self;
7770 else
7771 return PyUnicode_FromUnicode(
7772 PyUnicode_AS_UNICODE(self),
7773 PyUnicode_GET_SIZE(self)
7777 fill = width - self->length;
7779 u = pad(self, fill, 0, '0');
7781 if (u == NULL)
7782 return NULL;
7784 if (u->str[fill] == '+' || u->str[fill] == '-') {
7785 /* move sign to beginning of string */
7786 u->str[0] = u->str[fill];
7787 u->str[fill] = '0';
7790 return (PyObject*) u;
#if 0
/* Compiled out.  Returns the current value of `numfree` as a Python
   int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long cached = numfree;

    return PyInt_FromLong(cached);
}
#endif
7801 PyDoc_STRVAR(startswith__doc__,
7802 "S.startswith(prefix[, start[, end]]) -> bool\n\
7804 Return True if S starts with the specified prefix, False otherwise.\n\
7805 With optional start, test S beginning at that position.\n\
7806 With optional end, stop comparing S at that position.\n\
7807 prefix can also be a tuple of strings to try.");
7809 static PyObject *
7810 unicode_startswith(PyUnicodeObject *self,
7811 PyObject *args)
7813 PyObject *subobj;
7814 PyUnicodeObject *substring;
7815 Py_ssize_t start = 0;
7816 Py_ssize_t end = PY_SSIZE_T_MAX;
7817 int result;
7819 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7820 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7821 return NULL;
7822 if (PyTuple_Check(subobj)) {
7823 Py_ssize_t i;
7824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7825 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7826 PyTuple_GET_ITEM(subobj, i));
7827 if (substring == NULL)
7828 return NULL;
7829 result = tailmatch(self, substring, start, end, -1);
7830 Py_DECREF(substring);
7831 if (result) {
7832 Py_RETURN_TRUE;
7835 /* nothing matched */
7836 Py_RETURN_FALSE;
7838 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7839 if (substring == NULL)
7840 return NULL;
7841 result = tailmatch(self, substring, start, end, -1);
7842 Py_DECREF(substring);
7843 return PyBool_FromLong(result);
7847 PyDoc_STRVAR(endswith__doc__,
7848 "S.endswith(suffix[, start[, end]]) -> bool\n\
7850 Return True if S ends with the specified suffix, False otherwise.\n\
7851 With optional start, test S beginning at that position.\n\
7852 With optional end, stop comparing S at that position.\n\
7853 suffix can also be a tuple of strings to try.");
7855 static PyObject *
7856 unicode_endswith(PyUnicodeObject *self,
7857 PyObject *args)
7859 PyObject *subobj;
7860 PyUnicodeObject *substring;
7861 Py_ssize_t start = 0;
7862 Py_ssize_t end = PY_SSIZE_T_MAX;
7863 int result;
7865 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7866 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7867 return NULL;
7868 if (PyTuple_Check(subobj)) {
7869 Py_ssize_t i;
7870 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7871 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7872 PyTuple_GET_ITEM(subobj, i));
7873 if (substring == NULL)
7874 return NULL;
7875 result = tailmatch(self, substring, start, end, +1);
7876 Py_DECREF(substring);
7877 if (result) {
7878 Py_RETURN_TRUE;
7881 Py_RETURN_FALSE;
7883 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7884 if (substring == NULL)
7885 return NULL;
7887 result = tailmatch(self, substring, start, end, +1);
7888 Py_DECREF(substring);
7889 return PyBool_FromLong(result);
7893 /* Implements do_string_format, which is unicode because of stringlib */
7894 #include "stringlib/string_format.h"
7896 PyDoc_STRVAR(format__doc__,
7897 "S.format(*args, **kwargs) -> unicode\n\
7901 static PyObject *
7902 unicode__format__(PyObject *self, PyObject *args)
7904 PyObject *format_spec;
7905 PyObject *result = NULL;
7906 PyObject *tmp = NULL;
7908 /* If 2.x, convert format_spec to the same type as value */
7909 /* This is to allow things like u''.format('') */
7910 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7911 goto done;
7912 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7913 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7914 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7915 goto done;
7917 tmp = PyObject_Unicode(format_spec);
7918 if (tmp == NULL)
7919 goto done;
7920 format_spec = tmp;
7922 result = _PyUnicode_FormatAdvanced(self,
7923 PyUnicode_AS_UNICODE(format_spec),
7924 PyUnicode_GET_SIZE(format_spec));
7925 done:
7926 Py_XDECREF(tmp);
7927 return result;
7930 PyDoc_STRVAR(p_format__doc__,
7931 "S.__format__(format_spec) -> unicode\n\
7935 static PyObject *
7936 unicode__sizeof__(PyUnicodeObject *v)
7938 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7939 sizeof(Py_UNICODE) * (v->length + 1));
7942 PyDoc_STRVAR(sizeof__doc__,
7943 "S.__sizeof__() -> size of S in memory, in bytes\n\
7947 static PyObject *
7948 unicode_getnewargs(PyUnicodeObject *v)
7950 return Py_BuildValue("(u#)", v->str, v->length);
/* Method table: each entry gives the Python-visible method name, the C
   function implementing it, the calling-convention flags (METH_*) and,
   where one is provided, the docstring.  The table ends with a
   {NULL, NULL} sentinel entry. */
7954 static PyMethodDef unicode_methods[] = {
7956 /* Order is according to common usage: often used methods should
7957 appear first, since lookup is done sequentially. */
7959 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7960 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7961 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7962 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7963 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7964 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7965 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7966 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7967 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7968 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7969 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7970 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7971 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7972 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7973 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7974 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7975 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7976 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7977 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7978 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7979 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7980 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7981 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7982 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7983 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7984 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7985 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7986 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7987 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7988 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7989 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7990 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7991 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7992 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7993 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7994 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7995 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7996 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7997 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7998 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7999 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8000 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8001 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8002 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8003 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8004 #if 0
8005 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8006 #endif
8008 #if 0
8009 /* This one is just used for debugging the implementation. */
8010 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
8011 #endif
8013 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
8014 {NULL, NULL}
8017 static PyObject *
8018 unicode_mod(PyObject *v, PyObject *w)
8020 if (!PyUnicode_Check(v)) {
8021 Py_INCREF(Py_NotImplemented);
8022 return Py_NotImplemented;
8024 return PyUnicode_Format(v, w);
/* Number-protocol slot table.  Of the slots listed here only
   nb_remainder is filled in (with unicode_mod); the others are 0. */
8027 static PyNumberMethods unicode_as_number = {
8028 0, /*nb_add*/
8029 0, /*nb_subtract*/
8030 0, /*nb_multiply*/
8031 0, /*nb_divide*/
8032 unicode_mod, /*nb_remainder*/
/* Sequence-protocol slot table: length, concatenation, repetition, item
   access, slicing and containment are provided; the two assignment slots
   are 0. */
8035 static PySequenceMethods unicode_as_sequence = {
8036 (lenfunc) unicode_length, /* sq_length */
8037 PyUnicode_Concat, /* sq_concat */
8038 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8039 (ssizeargfunc) unicode_getitem, /* sq_item */
8040 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8041 0, /* sq_ass_item */
8042 0, /* sq_ass_slice */
8043 PyUnicode_Contains, /* sq_contains */
8046 static PyObject*
8047 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8049 if (PyIndex_Check(item)) {
8050 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8051 if (i == -1 && PyErr_Occurred())
8052 return NULL;
8053 if (i < 0)
8054 i += PyUnicode_GET_SIZE(self);
8055 return unicode_getitem(self, i);
8056 } else if (PySlice_Check(item)) {
8057 Py_ssize_t start, stop, step, slicelength, cur, i;
8058 Py_UNICODE* source_buf;
8059 Py_UNICODE* result_buf;
8060 PyObject* result;
8062 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8063 &start, &stop, &step, &slicelength) < 0) {
8064 return NULL;
8067 if (slicelength <= 0) {
8068 return PyUnicode_FromUnicode(NULL, 0);
8069 } else if (start == 0 && step == 1 && slicelength == self->length &&
8070 PyUnicode_CheckExact(self)) {
8071 Py_INCREF(self);
8072 return (PyObject *)self;
8073 } else if (step == 1) {
8074 return PyUnicode_FromUnicode(self->str + start, slicelength);
8075 } else {
8076 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8077 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8078 sizeof(Py_UNICODE));
8080 if (result_buf == NULL)
8081 return PyErr_NoMemory();
8083 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8084 result_buf[i] = source_buf[cur];
8087 result = PyUnicode_FromUnicode(result_buf, slicelength);
8088 PyObject_FREE(result_buf);
8089 return result;
8091 } else {
8092 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8093 return NULL;
/* Mapping-protocol slot table: length and subscript are provided; the
   subscript-assignment slot is 0. */
8097 static PyMappingMethods unicode_as_mapping = {
8098 (lenfunc)unicode_length, /* mp_length */
8099 (binaryfunc)unicode_subscript, /* mp_subscript */
8100 (objobjargproc)0, /* mp_ass_subscript */
8103 static Py_ssize_t
8104 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8105 Py_ssize_t index,
8106 const void **ptr)
8108 if (index != 0) {
8109 PyErr_SetString(PyExc_SystemError,
8110 "accessing non-existent unicode segment");
8111 return -1;
8113 *ptr = (void *) self->str;
8114 return PyUnicode_GET_DATA_SIZE(self);
8117 static Py_ssize_t
8118 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8119 const void **ptr)
8121 PyErr_SetString(PyExc_TypeError,
8122 "cannot use unicode as modifiable buffer");
8123 return -1;
8126 static int
8127 unicode_buffer_getsegcount(PyUnicodeObject *self,
8128 Py_ssize_t *lenp)
8130 if (lenp)
8131 *lenp = PyUnicode_GET_DATA_SIZE(self);
8132 return 1;
8135 static Py_ssize_t
8136 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8137 Py_ssize_t index,
8138 const void **ptr)
8140 PyObject *str;
8142 if (index != 0) {
8143 PyErr_SetString(PyExc_SystemError,
8144 "accessing non-existent unicode segment");
8145 return -1;
8147 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8148 if (str == NULL)
8149 return -1;
8150 *ptr = (void *) PyString_AS_STRING(str);
8151 return PyString_GET_SIZE(str);
8154 /* Helpers for PyUnicode_Format() */
8156 static PyObject *
8157 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8159 Py_ssize_t argidx = *p_argidx;
8160 if (argidx < arglen) {
8161 (*p_argidx)++;
8162 if (arglen < 0)
8163 return args;
8164 else
8165 return PyTuple_GetItem(args, argidx);
8167 PyErr_SetString(PyExc_TypeError,
8168 "not enough arguments for format string");
8169 return NULL;
/* Bit flags for the %-format conversion flags, set while parsing a
   format specifier: '-' -> F_LJUST, '+' -> F_SIGN, ' ' -> F_BLANK,
   '#' -> F_ALT, '0' -> F_ZERO. */
8172 #define F_LJUST (1<<0)
8173 #define F_SIGN (1<<1)
8174 #define F_BLANK (1<<2)
8175 #define F_ALT (1<<3)
8176 #define F_ZERO (1<<4)
8178 static Py_ssize_t
8179 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8181 register Py_ssize_t i;
8182 Py_ssize_t len = strlen(charbuffer);
8183 for (i = len - 1; i >= 0; i--)
8184 buffer[i] = (Py_UNICODE) charbuffer[i];
8186 return len;
8189 static int
8190 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8192 Py_ssize_t result;
8194 PyOS_ascii_formatd((char *)buffer, len, format, x);
8195 result = strtounicode(buffer, (char *)buffer);
8196 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8199 static int
8200 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8202 Py_ssize_t result;
8204 PyOS_snprintf((char *)buffer, len, format, x);
8205 result = strtounicode(buffer, (char *)buffer);
8206 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8209 /* XXX To save some code duplication, formatfloat/long/int could have been
8210 shared with stringobject.c, converting from 8-bit to Unicode after the
8211 formatting is done. */
8213 static int
8214 formatfloat(Py_UNICODE *buf,
8215 size_t buflen,
8216 int flags,
8217 int prec,
8218 int type,
8219 PyObject *v)
8221 /* fmt = '%#.' + `prec` + `type`
8222 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8223 char fmt[20];
8224 double x;
8226 x = PyFloat_AsDouble(v);
8227 if (x == -1.0 && PyErr_Occurred())
8228 return -1;
8229 if (prec < 0)
8230 prec = 6;
8231 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8232 type = 'g';
8233 /* Worst case length calc to ensure no buffer overrun:
8235 'g' formats:
8236 fmt = %#.<prec>g
8237 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8238 for any double rep.)
8239 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8241 'f' formats:
8242 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8243 len = 1 + 50 + 1 + prec = 52 + prec
8245 If prec=0 the effective precision is 1 (the leading digit is
8246 always given), therefore increase the length by one.
8249 if (((type == 'g' || type == 'G') &&
8250 buflen <= (size_t)10 + (size_t)prec) ||
8251 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8252 PyErr_SetString(PyExc_OverflowError,
8253 "formatted float is too long (precision too large?)");
8254 return -1;
8256 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8257 (flags&F_ALT) ? "#" : "",
8258 prec, type);
8259 return doubletounicode(buf, buflen, fmt, x);
8262 static PyObject*
8263 formatlong(PyObject *val, int flags, int prec, int type)
8265 char *buf;
8266 int i, len;
8267 PyObject *str; /* temporary string object. */
8268 PyUnicodeObject *result;
8270 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8271 if (!str)
8272 return NULL;
8273 result = _PyUnicode_New(len);
8274 if (!result) {
8275 Py_DECREF(str);
8276 return NULL;
8278 for (i = 0; i < len; i++)
8279 result->str[i] = buf[i];
8280 result->str[len] = 0;
8281 Py_DECREF(str);
8282 return (PyObject*)result;
8285 static int
8286 formatint(Py_UNICODE *buf,
8287 size_t buflen,
8288 int flags,
8289 int prec,
8290 int type,
8291 PyObject *v)
8293 /* fmt = '%#.' + `prec` + 'l' + `type`
8294 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8295 * + 1 + 1
8296 * = 24
8298 char fmt[64]; /* plenty big enough! */
8299 char *sign;
8300 long x;
8302 x = PyInt_AsLong(v);
8303 if (x == -1 && PyErr_Occurred())
8304 return -1;
8305 if (x < 0 && type == 'u') {
8306 type = 'd';
8308 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8309 sign = "-";
8310 else
8311 sign = "";
8312 if (prec < 0)
8313 prec = 1;
8315 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8316 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8318 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8319 PyErr_SetString(PyExc_OverflowError,
8320 "formatted integer is too long (precision too large?)");
8321 return -1;
8324 if ((flags & F_ALT) &&
8325 (type == 'x' || type == 'X')) {
8326 /* When converting under %#x or %#X, there are a number
8327 * of issues that cause pain:
8328 * - when 0 is being converted, the C standard leaves off
8329 * the '0x' or '0X', which is inconsistent with other
8330 * %#x/%#X conversions and inconsistent with Python's
8331 * hex() function
8332 * - there are platforms that violate the standard and
8333 * convert 0 with the '0x' or '0X'
8334 * (Metrowerks, Compaq Tru64)
8335 * - there are platforms that give '0x' when converting
8336 * under %#X, but convert 0 in accordance with the
8337 * standard (OS/2 EMX)
8339 * We can achieve the desired consistency by inserting our
8340 * own '0x' or '0X' prefix, and substituting %x/%X in place
8341 * of %#x/%#X.
8343 * Note that this is the same approach as used in
8344 * formatint() in stringobject.c
8346 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8347 sign, type, prec, type);
8349 else {
8350 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8351 sign, (flags&F_ALT) ? "#" : "",
8352 prec, type);
8354 if (sign[0])
8355 return longtounicode(buf, buflen, fmt, -x);
8356 else
8357 return longtounicode(buf, buflen, fmt, x);
8360 static int
8361 formatchar(Py_UNICODE *buf,
8362 size_t buflen,
8363 PyObject *v)
8365 /* presume that the buffer is at least 2 characters long */
8366 if (PyUnicode_Check(v)) {
8367 if (PyUnicode_GET_SIZE(v) != 1)
8368 goto onError;
8369 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8372 else if (PyString_Check(v)) {
8373 if (PyString_GET_SIZE(v) != 1)
8374 goto onError;
8375 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8378 else {
8379 /* Integer input truncated to a character */
8380 long x;
8381 x = PyInt_AsLong(v);
8382 if (x == -1 && PyErr_Occurred())
8383 goto onError;
8384 #ifdef Py_UNICODE_WIDE
8385 if (x < 0 || x > 0x10ffff) {
8386 PyErr_SetString(PyExc_OverflowError,
8387 "%c arg not in range(0x110000) "
8388 "(wide Python build)");
8389 return -1;
8391 #else
8392 if (x < 0 || x > 0xffff) {
8393 PyErr_SetString(PyExc_OverflowError,
8394 "%c arg not in range(0x10000) "
8395 "(narrow Python build)");
8396 return -1;
8398 #endif
8399 buf[0] = (Py_UNICODE) x;
8401 buf[1] = '\0';
8402 return 1;
8404 onError:
8405 PyErr_SetString(PyExc_TypeError,
8406 "%c requires int or char");
8407 return -1;
8410 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8412 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8413 chars are formatted. XXX This is a magic number. Each formatting
8414 routine does bounds checking to ensure no overflow, but a better
8415 solution may be to malloc a buffer of appropriate size for each
8416 format. For now, the current solution is sufficient.
8418 #define FORMATBUFLEN (size_t)120
8420 PyObject *PyUnicode_Format(PyObject *format,
8421 PyObject *args)
8423 Py_UNICODE *fmt, *res;
8424 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8425 int args_owned = 0;
8426 PyUnicodeObject *result = NULL;
8427 PyObject *dict = NULL;
8428 PyObject *uformat;
8430 if (format == NULL || args == NULL) {
8431 PyErr_BadInternalCall();
8432 return NULL;
8434 uformat = PyUnicode_FromObject(format);
8435 if (uformat == NULL)
8436 return NULL;
8437 fmt = PyUnicode_AS_UNICODE(uformat);
8438 fmtcnt = PyUnicode_GET_SIZE(uformat);
8440 reslen = rescnt = fmtcnt + 100;
8441 result = _PyUnicode_New(reslen);
8442 if (result == NULL)
8443 goto onError;
8444 res = PyUnicode_AS_UNICODE(result);
8446 if (PyTuple_Check(args)) {
8447 arglen = PyTuple_Size(args);
8448 argidx = 0;
8450 else {
8451 arglen = -1;
8452 argidx = -2;
8454 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8455 !PyObject_TypeCheck(args, &PyBaseString_Type))
8456 dict = args;
8458 while (--fmtcnt >= 0) {
8459 if (*fmt != '%') {
8460 if (--rescnt < 0) {
8461 rescnt = fmtcnt + 100;
8462 reslen += rescnt;
8463 if (_PyUnicode_Resize(&result, reslen) < 0)
8464 goto onError;
8465 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8466 --rescnt;
8468 *res++ = *fmt++;
8470 else {
8471 /* Got a format specifier */
8472 int flags = 0;
8473 Py_ssize_t width = -1;
8474 int prec = -1;
8475 Py_UNICODE c = '\0';
8476 Py_UNICODE fill;
8477 int isnumok;
8478 PyObject *v = NULL;
8479 PyObject *temp = NULL;
8480 Py_UNICODE *pbuf;
8481 Py_UNICODE sign;
8482 Py_ssize_t len;
8483 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8485 fmt++;
8486 if (*fmt == '(') {
8487 Py_UNICODE *keystart;
8488 Py_ssize_t keylen;
8489 PyObject *key;
8490 int pcount = 1;
8492 if (dict == NULL) {
8493 PyErr_SetString(PyExc_TypeError,
8494 "format requires a mapping");
8495 goto onError;
8497 ++fmt;
8498 --fmtcnt;
8499 keystart = fmt;
8500 /* Skip over balanced parentheses */
8501 while (pcount > 0 && --fmtcnt >= 0) {
8502 if (*fmt == ')')
8503 --pcount;
8504 else if (*fmt == '(')
8505 ++pcount;
8506 fmt++;
8508 keylen = fmt - keystart - 1;
8509 if (fmtcnt < 0 || pcount > 0) {
8510 PyErr_SetString(PyExc_ValueError,
8511 "incomplete format key");
8512 goto onError;
8514 #if 0
8515 /* keys are converted to strings using UTF-8 and
8516 then looked up since Python uses strings to hold
8517 variables names etc. in its namespaces and we
8518 wouldn't want to break common idioms. */
8519 key = PyUnicode_EncodeUTF8(keystart,
8520 keylen,
8521 NULL);
8522 #else
8523 key = PyUnicode_FromUnicode(keystart, keylen);
8524 #endif
8525 if (key == NULL)
8526 goto onError;
8527 if (args_owned) {
8528 Py_DECREF(args);
8529 args_owned = 0;
8531 args = PyObject_GetItem(dict, key);
8532 Py_DECREF(key);
8533 if (args == NULL) {
8534 goto onError;
8536 args_owned = 1;
8537 arglen = -1;
8538 argidx = -2;
8540 while (--fmtcnt >= 0) {
8541 switch (c = *fmt++) {
8542 case '-': flags |= F_LJUST; continue;
8543 case '+': flags |= F_SIGN; continue;
8544 case ' ': flags |= F_BLANK; continue;
8545 case '#': flags |= F_ALT; continue;
8546 case '0': flags |= F_ZERO; continue;
8548 break;
8550 if (c == '*') {
8551 v = getnextarg(args, arglen, &argidx);
8552 if (v == NULL)
8553 goto onError;
8554 if (!PyInt_Check(v)) {
8555 PyErr_SetString(PyExc_TypeError,
8556 "* wants int");
8557 goto onError;
8559 width = PyInt_AsLong(v);
8560 if (width < 0) {
8561 flags |= F_LJUST;
8562 width = -width;
8564 if (--fmtcnt >= 0)
8565 c = *fmt++;
8567 else if (c >= '0' && c <= '9') {
8568 width = c - '0';
8569 while (--fmtcnt >= 0) {
8570 c = *fmt++;
8571 if (c < '0' || c > '9')
8572 break;
8573 if ((width*10) / 10 != width) {
8574 PyErr_SetString(PyExc_ValueError,
8575 "width too big");
8576 goto onError;
8578 width = width*10 + (c - '0');
8581 if (c == '.') {
8582 prec = 0;
8583 if (--fmtcnt >= 0)
8584 c = *fmt++;
8585 if (c == '*') {
8586 v = getnextarg(args, arglen, &argidx);
8587 if (v == NULL)
8588 goto onError;
8589 if (!PyInt_Check(v)) {
8590 PyErr_SetString(PyExc_TypeError,
8591 "* wants int");
8592 goto onError;
8594 prec = PyInt_AsLong(v);
8595 if (prec < 0)
8596 prec = 0;
8597 if (--fmtcnt >= 0)
8598 c = *fmt++;
8600 else if (c >= '0' && c <= '9') {
8601 prec = c - '0';
8602 while (--fmtcnt >= 0) {
8603 c = Py_CHARMASK(*fmt++);
8604 if (c < '0' || c > '9')
8605 break;
8606 if ((prec*10) / 10 != prec) {
8607 PyErr_SetString(PyExc_ValueError,
8608 "prec too big");
8609 goto onError;
8611 prec = prec*10 + (c - '0');
8614 } /* prec */
8615 if (fmtcnt >= 0) {
8616 if (c == 'h' || c == 'l' || c == 'L') {
8617 if (--fmtcnt >= 0)
8618 c = *fmt++;
8621 if (fmtcnt < 0) {
8622 PyErr_SetString(PyExc_ValueError,
8623 "incomplete format");
8624 goto onError;
8626 if (c != '%') {
8627 v = getnextarg(args, arglen, &argidx);
8628 if (v == NULL)
8629 goto onError;
8631 sign = 0;
8632 fill = ' ';
8633 switch (c) {
8635 case '%':
8636 pbuf = formatbuf;
8637 /* presume that buffer length is at least 1 */
8638 pbuf[0] = '%';
8639 len = 1;
8640 break;
8642 case 's':
8643 case 'r':
8644 if (PyUnicode_Check(v) && c == 's') {
8645 temp = v;
8646 Py_INCREF(temp);
8648 else {
8649 PyObject *unicode;
8650 if (c == 's')
8651 temp = PyObject_Unicode(v);
8652 else
8653 temp = PyObject_Repr(v);
8654 if (temp == NULL)
8655 goto onError;
8656 if (PyUnicode_Check(temp))
8657 /* nothing to do */;
8658 else if (PyString_Check(temp)) {
8659 /* convert to string to Unicode */
8660 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8661 PyString_GET_SIZE(temp),
8662 NULL,
8663 "strict");
8664 Py_DECREF(temp);
8665 temp = unicode;
8666 if (temp == NULL)
8667 goto onError;
8669 else {
8670 Py_DECREF(temp);
8671 PyErr_SetString(PyExc_TypeError,
8672 "%s argument has non-string str()");
8673 goto onError;
8676 pbuf = PyUnicode_AS_UNICODE(temp);
8677 len = PyUnicode_GET_SIZE(temp);
8678 if (prec >= 0 && len > prec)
8679 len = prec;
8680 break;
8682 case 'i':
8683 case 'd':
8684 case 'u':
8685 case 'o':
8686 case 'x':
8687 case 'X':
8688 if (c == 'i')
8689 c = 'd';
8690 isnumok = 0;
8691 if (PyNumber_Check(v)) {
8692 PyObject *iobj=NULL;
8694 if (PyInt_Check(v) || (PyLong_Check(v))) {
8695 iobj = v;
8696 Py_INCREF(iobj);
8698 else {
8699 iobj = PyNumber_Int(v);
8700 if (iobj==NULL) iobj = PyNumber_Long(v);
8702 if (iobj!=NULL) {
8703 if (PyInt_Check(iobj)) {
8704 isnumok = 1;
8705 pbuf = formatbuf;
8706 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8707 flags, prec, c, iobj);
8708 Py_DECREF(iobj);
8709 if (len < 0)
8710 goto onError;
8711 sign = 1;
8713 else if (PyLong_Check(iobj)) {
8714 isnumok = 1;
8715 temp = formatlong(iobj, flags, prec, c);
8716 Py_DECREF(iobj);
8717 if (!temp)
8718 goto onError;
8719 pbuf = PyUnicode_AS_UNICODE(temp);
8720 len = PyUnicode_GET_SIZE(temp);
8721 sign = 1;
8723 else {
8724 Py_DECREF(iobj);
8728 if (!isnumok) {
8729 PyErr_Format(PyExc_TypeError,
8730 "%%%c format: a number is required, "
8731 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8732 goto onError;
8734 if (flags & F_ZERO)
8735 fill = '0';
8736 break;
8738 case 'e':
8739 case 'E':
8740 case 'f':
8741 case 'F':
8742 case 'g':
8743 case 'G':
8744 if (c == 'F')
8745 c = 'f';
8746 pbuf = formatbuf;
8747 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8748 flags, prec, c, v);
8749 if (len < 0)
8750 goto onError;
8751 sign = 1;
8752 if (flags & F_ZERO)
8753 fill = '0';
8754 break;
8756 case 'c':
8757 pbuf = formatbuf;
8758 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8759 if (len < 0)
8760 goto onError;
8761 break;
8763 default:
8764 PyErr_Format(PyExc_ValueError,
8765 "unsupported format character '%c' (0x%x) "
8766 "at index %zd",
8767 (31<=c && c<=126) ? (char)c : '?',
8768 (int)c,
8769 (Py_ssize_t)(fmt - 1 -
8770 PyUnicode_AS_UNICODE(uformat)));
8771 goto onError;
8773 if (sign) {
8774 if (*pbuf == '-' || *pbuf == '+') {
8775 sign = *pbuf++;
8776 len--;
8778 else if (flags & F_SIGN)
8779 sign = '+';
8780 else if (flags & F_BLANK)
8781 sign = ' ';
8782 else
8783 sign = 0;
8785 if (width < len)
8786 width = len;
8787 if (rescnt - (sign != 0) < width) {
8788 reslen -= rescnt;
8789 rescnt = width + fmtcnt + 100;
8790 reslen += rescnt;
8791 if (reslen < 0) {
8792 Py_XDECREF(temp);
8793 PyErr_NoMemory();
8794 goto onError;
8796 if (_PyUnicode_Resize(&result, reslen) < 0) {
8797 Py_XDECREF(temp);
8798 goto onError;
8800 res = PyUnicode_AS_UNICODE(result)
8801 + reslen - rescnt;
8803 if (sign) {
8804 if (fill != ' ')
8805 *res++ = sign;
8806 rescnt--;
8807 if (width > len)
8808 width--;
8810 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8811 assert(pbuf[0] == '0');
8812 assert(pbuf[1] == c);
8813 if (fill != ' ') {
8814 *res++ = *pbuf++;
8815 *res++ = *pbuf++;
8817 rescnt -= 2;
8818 width -= 2;
8819 if (width < 0)
8820 width = 0;
8821 len -= 2;
8823 if (width > len && !(flags & F_LJUST)) {
8824 do {
8825 --rescnt;
8826 *res++ = fill;
8827 } while (--width > len);
8829 if (fill == ' ') {
8830 if (sign)
8831 *res++ = sign;
8832 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8833 assert(pbuf[0] == '0');
8834 assert(pbuf[1] == c);
8835 *res++ = *pbuf++;
8836 *res++ = *pbuf++;
8839 Py_UNICODE_COPY(res, pbuf, len);
8840 res += len;
8841 rescnt -= len;
8842 while (--width >= len) {
8843 --rescnt;
8844 *res++ = ' ';
8846 if (dict && (argidx < arglen) && c != '%') {
8847 PyErr_SetString(PyExc_TypeError,
8848 "not all arguments converted during string formatting");
8849 Py_XDECREF(temp);
8850 goto onError;
8852 Py_XDECREF(temp);
8853 } /* '%' */
8854 } /* until end */
8855 if (argidx < arglen && !dict) {
8856 PyErr_SetString(PyExc_TypeError,
8857 "not all arguments converted during string formatting");
8858 goto onError;
8861 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8862 goto onError;
8863 if (args_owned) {
8864 Py_DECREF(args);
8866 Py_DECREF(uformat);
8867 return (PyObject *)result;
8869 onError:
8870 Py_XDECREF(result);
8871 Py_DECREF(uformat);
8872 if (args_owned) {
8873 Py_DECREF(args);
8875 return NULL;
8878 static PyBufferProcs unicode_as_buffer = {
8879 (readbufferproc) unicode_buffer_getreadbuf,
8880 (writebufferproc) unicode_buffer_getwritebuf,
8881 (segcountproc) unicode_buffer_getsegcount,
8882 (charbufferproc) unicode_buffer_getcharbuf,
8885 static PyObject *
8886 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8888 static PyObject *
8889 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8891 PyObject *x = NULL;
8892 static char *kwlist[] = {"string", "encoding", "errors", 0};
8893 char *encoding = NULL;
8894 char *errors = NULL;
8896 if (type != &PyUnicode_Type)
8897 return unicode_subtype_new(type, args, kwds);
8898 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8899 kwlist, &x, &encoding, &errors))
8900 return NULL;
8901 if (x == NULL)
8902 return (PyObject *)_PyUnicode_New(0);
8903 if (encoding == NULL && errors == NULL)
8904 return PyObject_Unicode(x);
8905 else
8906 return PyUnicode_FromEncodedObject(x, encoding, errors);
8909 static PyObject *
8910 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8912 PyUnicodeObject *tmp, *pnew;
8913 Py_ssize_t n;
8915 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8916 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8917 if (tmp == NULL)
8918 return NULL;
8919 assert(PyUnicode_Check(tmp));
8920 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8921 if (pnew == NULL) {
8922 Py_DECREF(tmp);
8923 return NULL;
8925 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8926 if (pnew->str == NULL) {
8927 _Py_ForgetReference((PyObject *)pnew);
8928 PyObject_Del(pnew);
8929 Py_DECREF(tmp);
8930 return PyErr_NoMemory();
8932 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8933 pnew->length = n;
8934 pnew->hash = tmp->hash;
8935 Py_DECREF(tmp);
8936 return (PyObject *)pnew;
8939 PyDoc_STRVAR(unicode_doc,
8940 "unicode(string [, encoding[, errors]]) -> object\n\
8942 Create a new Unicode object from the given encoded string.\n\
8943 encoding defaults to the current default string encoding.\n\
8944 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8946 PyTypeObject PyUnicode_Type = {
8947 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8948 "unicode", /* tp_name */
8949 sizeof(PyUnicodeObject), /* tp_size */
8950 0, /* tp_itemsize */
8951 /* Slots */
8952 (destructor)unicode_dealloc, /* tp_dealloc */
8953 0, /* tp_print */
8954 0, /* tp_getattr */
8955 0, /* tp_setattr */
8956 0, /* tp_compare */
8957 unicode_repr, /* tp_repr */
8958 &unicode_as_number, /* tp_as_number */
8959 &unicode_as_sequence, /* tp_as_sequence */
8960 &unicode_as_mapping, /* tp_as_mapping */
8961 (hashfunc) unicode_hash, /* tp_hash*/
8962 0, /* tp_call*/
8963 (reprfunc) unicode_str, /* tp_str */
8964 PyObject_GenericGetAttr, /* tp_getattro */
8965 0, /* tp_setattro */
8966 &unicode_as_buffer, /* tp_as_buffer */
8967 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8968 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8969 unicode_doc, /* tp_doc */
8970 0, /* tp_traverse */
8971 0, /* tp_clear */
8972 PyUnicode_RichCompare, /* tp_richcompare */
8973 0, /* tp_weaklistoffset */
8974 0, /* tp_iter */
8975 0, /* tp_iternext */
8976 unicode_methods, /* tp_methods */
8977 0, /* tp_members */
8978 0, /* tp_getset */
8979 &PyBaseString_Type, /* tp_base */
8980 0, /* tp_dict */
8981 0, /* tp_descr_get */
8982 0, /* tp_descr_set */
8983 0, /* tp_dictoffset */
8984 0, /* tp_init */
8985 0, /* tp_alloc */
8986 unicode_new, /* tp_new */
8987 PyObject_Del, /* tp_free */
8990 /* Initialize the Unicode implementation */
8992 void _PyUnicode_Init(void)
8994 int i;
8996 /* XXX - move this array to unicodectype.c ? */
8997 Py_UNICODE linebreak[] = {
8998 0x000A, /* LINE FEED */
8999 0x000D, /* CARRIAGE RETURN */
9000 0x001C, /* FILE SEPARATOR */
9001 0x001D, /* GROUP SEPARATOR */
9002 0x001E, /* RECORD SEPARATOR */
9003 0x0085, /* NEXT LINE */
9004 0x2028, /* LINE SEPARATOR */
9005 0x2029, /* PARAGRAPH SEPARATOR */
9008 /* Init the implementation */
9009 free_list = NULL;
9010 numfree = 0;
9011 unicode_empty = _PyUnicode_New(0);
9012 if (!unicode_empty)
9013 return;
9015 strcpy(unicode_default_encoding, "ascii");
9016 for (i = 0; i < 256; i++)
9017 unicode_latin1[i] = NULL;
9018 if (PyType_Ready(&PyUnicode_Type) < 0)
9019 Py_FatalError("Can't initialize 'unicode'");
9021 /* initialize the linebreak bloom filter */
9022 bloom_linebreak = make_bloom_mask(
9023 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9026 PyType_Ready(&EncodingMapType);
9029 /* Finalize the Unicode implementation */
9032 PyUnicode_ClearFreeList(void)
9034 int freelist_size = numfree;
9035 PyUnicodeObject *u;
9037 for (u = free_list; u != NULL;) {
9038 PyUnicodeObject *v = u;
9039 u = *(PyUnicodeObject **)u;
9040 if (v->str)
9041 PyObject_DEL(v->str);
9042 Py_XDECREF(v->defenc);
9043 PyObject_Del(v);
9044 numfree--;
9046 free_list = NULL;
9047 assert(numfree == 0);
9048 return freelist_size;
9051 void
9052 _PyUnicode_Fini(void)
9054 int i;
9056 Py_XDECREF(unicode_empty);
9057 unicode_empty = NULL;
9059 for (i = 0; i < 256; i++) {
9060 if (unicode_latin1[i]) {
9061 Py_DECREF(unicode_latin1[i]);
9062 unicode_latin1[i] = NULL;
9065 (void)PyUnicode_ClearFreeList();
9068 #ifdef __cplusplus
9070 #endif
9074 Local variables:
9075 c-basic-offset: 4
9076 indent-tabs-mode: nil
9077 End: