Merged revisions 83951 via svnmerge from
[python/dscho.git] / Objects / unicodeobject.c
blob028b42d8ebf837bba959259e36815f08d2557677
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44 #include "bytes_methods.h"
46 #include "unicodeobject.h"
47 #include "ucnhash.h"
49 #ifdef MS_WINDOWS
50 #include <windows.h>
51 #endif
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
84 /* --- Globals ------------------------------------------------------------
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
96 /* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
101 Another way to look at this is that to say that the actual reference
102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
104 static PyObject *interned;
106 /* Free list for Unicode objects */
107 static PyUnicodeObject *free_list;
108 static int numfree;
110 /* The empty Unicode object is shared to improve performance. */
111 static PyUnicodeObject *unicode_empty;
113 /* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115 static PyUnicodeObject *unicode_latin1[256];
117 /* Default encoding to use and assume when NULL is passed as encoding
118 parameter; it is fixed to "utf-8". Always use the
119 PyUnicode_GetDefaultEncoding() API to access this global.
121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
122 hard coded default!
124 static const char unicode_default_encoding[] = "utf-8";
/* Fast detection of the most frequent whitespace characters.
   Indexed by a 7-bit ASCII code point; an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * HORIZONTAL TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * VERTICAL TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
157 static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
162 static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
/* Same for linebreaks: indexed by a 7-bit ASCII code point, an entry is 1
   when that character is a line break. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195 Py_UNICODE
196 PyUnicode_GetMax(void)
198 #ifdef Py_UNICODE_WIDE
199 return 0x10FFFF;
200 #else
201 /* This is actually an illegal character, so it should
202 not be passed to unichr. */
203 return 0xFFFF;
204 #endif
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test the bit selected by the low 5 bits of ch.  The shift is done on an
   unsigned long: shifting a plain int 1 by 31 overflows a signed int and,
   once widened to a 64-bit mask type, would also match the upper bits. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
225 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227 /* calculate simple bloom-style bitmask for a given unicode string */
229 long mask;
230 Py_ssize_t i;
232 mask = 0;
233 for (i = 0; i < len; i++)
234 mask |= (1 << (ptr[i] & 0x1F));
236 return mask;
239 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241 Py_ssize_t i;
243 for (i = 0; i < setlen; i++)
244 if (set[i] == chr)
245 return 1;
247 return 0;
/* Membership test: cheap bloom check first, exact scan only on a hit.
   The whole expansion is parenthesised so that the macro can be negated
   or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253 /* --- Unicode Object ----------------------------------------------------- */
255 static
256 int unicode_resize(register PyUnicodeObject *unicode,
257 Py_ssize_t length)
259 void *oldstr;
261 /* Shortcut if there's nothing much to do. */
262 if (unicode->length == length)
263 goto reset;
265 /* Resizing shared object (unicode_empty or single character
266 objects) in-place is not allowed. Use PyUnicode_Resize()
267 instead ! */
269 if (unicode == unicode_empty ||
270 (unicode->length == 1 &&
271 unicode->str[0] < 256U &&
272 unicode_latin1[unicode->str[0]] == unicode)) {
273 PyErr_SetString(PyExc_SystemError,
274 "can't resize shared str objects");
275 return -1;
278 /* We allocate one more byte to make sure the string is Ux0000 terminated.
279 The overallocation is also used by fastsearch, which assumes that it's
280 safe to look at str[length] (without making any assumptions about what
281 it contains). */
283 oldstr = unicode->str;
284 unicode->str = PyObject_REALLOC(unicode->str,
285 sizeof(Py_UNICODE) * (length + 1));
286 if (!unicode->str) {
287 unicode->str = (Py_UNICODE *)oldstr;
288 PyErr_NoMemory();
289 return -1;
291 unicode->str[length] = 0;
292 unicode->length = length;
294 reset:
295 /* Reset the object caches */
296 if (unicode->defenc) {
297 Py_CLEAR(unicode->defenc);
299 unicode->hash = -1;
301 return 0;
304 /* We allocate one more byte to make sure the string is
305 Ux0000 terminated; some code (e.g. new_identifier)
306 relies on that.
308 XXX This allocator could further be enhanced by assuring that the
309 free list never reduces its size below 1.
313 static
314 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
316 register PyUnicodeObject *unicode;
318 /* Optimization for empty strings */
319 if (length == 0 && unicode_empty != NULL) {
320 Py_INCREF(unicode_empty);
321 return unicode_empty;
324 /* Ensure we won't overflow the size. */
325 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
326 return (PyUnicodeObject *)PyErr_NoMemory();
329 /* Unicode freelist & memory allocation */
330 if (free_list) {
331 unicode = free_list;
332 free_list = *(PyUnicodeObject **)unicode;
333 numfree--;
334 if (unicode->str) {
335 /* Keep-Alive optimization: we only upsize the buffer,
336 never downsize it. */
337 if ((unicode->length < length) &&
338 unicode_resize(unicode, length) < 0) {
339 PyObject_DEL(unicode->str);
340 unicode->str = NULL;
343 else {
344 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
347 PyObject_INIT(unicode, &PyUnicode_Type);
349 else {
350 size_t new_size;
351 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
352 if (unicode == NULL)
353 return NULL;
354 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
355 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
358 if (!unicode->str) {
359 PyErr_NoMemory();
360 goto onError;
362 /* Initialize the first element to guard against cases where
363 * the caller fails before initializing str -- unicode_resize()
364 * reads str[0], and the Keep-Alive optimization can keep memory
365 * allocated for str alive across a call to unicode_dealloc(unicode).
366 * We don't want unicode_resize to read uninitialized memory in
367 * that case.
369 unicode->str[0] = 0;
370 unicode->str[length] = 0;
371 unicode->length = length;
372 unicode->hash = -1;
373 unicode->state = 0;
374 unicode->defenc = NULL;
375 return unicode;
377 onError:
378 /* XXX UNREF/NEWREF interface should be more symmetrical */
379 _Py_DEC_REFTOTAL;
380 _Py_ForgetReference((PyObject *)unicode);
381 PyObject_Del(unicode);
382 return NULL;
385 static
386 void unicode_dealloc(register PyUnicodeObject *unicode)
388 switch (PyUnicode_CHECK_INTERNED(unicode)) {
389 case SSTATE_NOT_INTERNED:
390 break;
392 case SSTATE_INTERNED_MORTAL:
393 /* revive dead object temporarily for DelItem */
394 Py_REFCNT(unicode) = 3;
395 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
396 Py_FatalError(
397 "deletion of interned string failed");
398 break;
400 case SSTATE_INTERNED_IMMORTAL:
401 Py_FatalError("Immortal interned string died.");
403 default:
404 Py_FatalError("Inconsistent interned string state.");
407 if (PyUnicode_CheckExact(unicode) &&
408 numfree < PyUnicode_MAXFREELIST) {
409 /* Keep-Alive optimization */
410 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
411 PyObject_DEL(unicode->str);
412 unicode->str = NULL;
413 unicode->length = 0;
415 if (unicode->defenc) {
416 Py_CLEAR(unicode->defenc);
418 /* Add to free list */
419 *(PyUnicodeObject **)unicode = free_list;
420 free_list = unicode;
421 numfree++;
423 else {
424 PyObject_DEL(unicode->str);
425 Py_XDECREF(unicode->defenc);
426 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
430 static
431 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
433 register PyUnicodeObject *v;
435 /* Argument checks */
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
440 v = *unicode;
441 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
442 PyErr_BadInternalCall();
443 return -1;
446 /* Resizing unicode_empty and single character objects is not
447 possible since these are being shared. We simply return a fresh
448 copy with the same Unicode content. */
449 if (v->length != length &&
450 (v == unicode_empty || v->length == 1)) {
451 PyUnicodeObject *w = _PyUnicode_New(length);
452 if (w == NULL)
453 return -1;
454 Py_UNICODE_COPY(w->str, v->str,
455 length < v->length ? length : v->length);
456 Py_DECREF(*unicode);
457 *unicode = w;
458 return 0;
461 /* Note that we don't have to modify *unicode for unshared Unicode
462 objects, since we can modify them in-place. */
463 return unicode_resize(v, length);
466 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
471 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
472 Py_ssize_t size)
474 PyUnicodeObject *unicode;
476 /* If the Unicode data is known at construction time, we can apply
477 some optimizations which share commonly used objects. */
478 if (u != NULL) {
480 /* Optimization for empty strings */
481 if (size == 0 && unicode_empty != NULL) {
482 Py_INCREF(unicode_empty);
483 return (PyObject *)unicode_empty;
486 /* Single character Unicode objects in the Latin-1 range are
487 shared when using this constructor */
488 if (size == 1 && *u < 256) {
489 unicode = unicode_latin1[*u];
490 if (!unicode) {
491 unicode = _PyUnicode_New(1);
492 if (!unicode)
493 return NULL;
494 unicode->str[0] = *u;
495 unicode_latin1[*u] = unicode;
497 Py_INCREF(unicode);
498 return (PyObject *)unicode;
502 unicode = _PyUnicode_New(size);
503 if (!unicode)
504 return NULL;
506 /* Copy the Unicode data into the new object */
507 if (u != NULL)
508 Py_UNICODE_COPY(unicode->str, u, size);
510 return (PyObject *)unicode;
513 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
515 PyUnicodeObject *unicode;
517 if (size < 0) {
518 PyErr_SetString(PyExc_SystemError,
519 "Negative size passed to PyUnicode_FromStringAndSize");
520 return NULL;
523 /* If the Unicode data is known at construction time, we can apply
524 some optimizations which share commonly used objects.
525 Also, this means the input must be UTF-8, so fall back to the
526 UTF-8 decoder at the end. */
527 if (u != NULL) {
529 /* Optimization for empty strings */
530 if (size == 0 && unicode_empty != NULL) {
531 Py_INCREF(unicode_empty);
532 return (PyObject *)unicode_empty;
535 /* Single characters are shared when using this constructor.
536 Restrict to ASCII, since the input must be UTF-8. */
537 if (size == 1 && Py_CHARMASK(*u) < 128) {
538 unicode = unicode_latin1[Py_CHARMASK(*u)];
539 if (!unicode) {
540 unicode = _PyUnicode_New(1);
541 if (!unicode)
542 return NULL;
543 unicode->str[0] = Py_CHARMASK(*u);
544 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 Py_INCREF(unicode);
547 return (PyObject *)unicode;
550 return PyUnicode_DecodeUTF8(u, size, NULL);
553 unicode = _PyUnicode_New(size);
554 if (!unicode)
555 return NULL;
557 return (PyObject *)unicode;
560 PyObject *PyUnicode_FromString(const char *u)
562 size_t size = strlen(u);
563 if (size > PY_SSIZE_T_MAX) {
564 PyErr_SetString(PyExc_OverflowError, "input too long");
565 return NULL;
568 return PyUnicode_FromStringAndSize(u, size);
571 #ifdef HAVE_WCHAR_H
573 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
574 # define CONVERT_WCHAR_TO_SURROGATES
575 #endif
577 #ifdef CONVERT_WCHAR_TO_SURROGATES
579 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
580 to convert from UTF32 to UTF16. */
582 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
583 Py_ssize_t size)
585 PyUnicodeObject *unicode;
586 register Py_ssize_t i;
587 Py_ssize_t alloc;
588 const wchar_t *orig_w;
590 if (w == NULL) {
591 if (size == 0)
592 return PyUnicode_FromStringAndSize(NULL, 0);
593 PyErr_BadInternalCall();
594 return NULL;
597 if (size == -1) {
598 size = wcslen(w);
601 alloc = size;
602 orig_w = w;
603 for (i = size; i > 0; i--) {
604 if (*w > 0xFFFF)
605 alloc++;
606 w++;
608 w = orig_w;
609 unicode = _PyUnicode_New(alloc);
610 if (!unicode)
611 return NULL;
613 /* Copy the wchar_t data into the new object */
615 register Py_UNICODE *u;
616 u = PyUnicode_AS_UNICODE(unicode);
617 for (i = size; i > 0; i--) {
618 if (*w > 0xFFFF) {
619 wchar_t ordinal = *w++;
620 ordinal -= 0x10000;
621 *u++ = 0xD800 | (ordinal >> 10);
622 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 else
625 *u++ = *w++;
628 return (PyObject *)unicode;
631 #else
633 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
634 Py_ssize_t size)
636 PyUnicodeObject *unicode;
638 if (w == NULL) {
639 if (size == 0)
640 return PyUnicode_FromStringAndSize(NULL, 0);
641 PyErr_BadInternalCall();
642 return NULL;
645 if (size == -1) {
646 size = wcslen(w);
649 unicode = _PyUnicode_New(size);
650 if (!unicode)
651 return NULL;
653 /* Copy the wchar_t data into the new object */
654 #ifdef HAVE_USABLE_WCHAR_T
655 memcpy(unicode->str, w, size * sizeof(wchar_t));
656 #else
658 register Py_UNICODE *u;
659 register Py_ssize_t i;
660 u = PyUnicode_AS_UNICODE(unicode);
661 for (i = size; i > 0; i--)
662 *u++ = *w++;
664 #endif
666 return (PyObject *)unicode;
669 #endif /* CONVERT_WCHAR_TO_SURROGATES */
671 #undef CONVERT_WCHAR_TO_SURROGATES
673 static void
674 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
676 *fmt++ = '%';
677 if (width) {
678 if (zeropad)
679 *fmt++ = '0';
680 fmt += sprintf(fmt, "%d", width);
682 if (precision)
683 fmt += sprintf(fmt, ".%d", precision);
684 if (longflag)
685 *fmt++ = 'l';
686 else if (size_tflag) {
687 char *f = PY_FORMAT_SIZE_T;
688 while (*f)
689 *fmt++ = *f++;
691 *fmt++ = c;
692 *fmt = '\0';
695 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
697 PyObject *
698 PyUnicode_FromFormatV(const char *format, va_list vargs)
700 va_list count;
701 Py_ssize_t callcount = 0;
702 PyObject **callresults = NULL;
703 PyObject **callresult = NULL;
704 Py_ssize_t n = 0;
705 int width = 0;
706 int precision = 0;
707 int zeropad;
708 const char* f;
709 Py_UNICODE *s;
710 PyObject *string;
711 /* used by sprintf */
712 char buffer[21];
713 /* use abuffer instead of buffer, if we need more space
714 * (which can happen if there's a format specifier with width). */
715 char *abuffer = NULL;
716 char *realbuffer;
717 Py_ssize_t abuffersize = 0;
718 char fmt[60]; /* should be enough for %0width.precisionld */
719 const char *copy;
721 #ifdef VA_LIST_IS_ARRAY
722 Py_MEMCPY(count, vargs, sizeof(va_list));
723 #else
724 #ifdef __va_copy
725 __va_copy(count, vargs);
726 #else
727 count = vargs;
728 #endif
729 #endif
730 /* step 1: count the number of %S/%R/%A/%s format specifications
731 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
732 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
733 * result in an array) */
734 for (f = format; *f; f++) {
735 if (*f == '%') {
736 if (*(f+1)=='%')
737 continue;
738 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
739 ++callcount;
740 while (ISDIGIT((unsigned)*f))
741 width = (width*10) + *f++ - '0';
742 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
744 if (*f == 's')
745 ++callcount;
748 /* step 2: allocate memory for the results of
749 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
750 if (callcount) {
751 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
752 if (!callresults) {
753 PyErr_NoMemory();
754 return NULL;
756 callresult = callresults;
758 /* step 3: figure out how large a buffer we need */
759 for (f = format; *f; f++) {
760 if (*f == '%') {
761 const char* p = f;
762 width = 0;
763 while (ISDIGIT((unsigned)*f))
764 width = (width*10) + *f++ - '0';
765 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
768 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
769 * they don't affect the amount of space we reserve.
771 if ((*f == 'l' || *f == 'z') &&
772 (f[1] == 'd' || f[1] == 'u'))
773 ++f;
775 switch (*f) {
776 case 'c':
777 (void)va_arg(count, int);
778 /* fall through... */
779 case '%':
780 n++;
781 break;
782 case 'd': case 'u': case 'i': case 'x':
783 (void) va_arg(count, int);
784 /* 20 bytes is enough to hold a 64-bit
785 integer. Decimal takes the most space.
786 This isn't enough for octal.
787 If a width is specified we need more
788 (which we allocate later). */
789 if (width < 20)
790 width = 20;
791 n += width;
792 if (abuffersize < width)
793 abuffersize = width;
794 break;
795 case 's':
797 /* UTF-8 */
798 const char *s = va_arg(count, const char*);
799 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
800 if (!str)
801 goto fail;
802 n += PyUnicode_GET_SIZE(str);
803 /* Remember the str and switch to the next slot */
804 *callresult++ = str;
805 break;
807 case 'U':
809 PyObject *obj = va_arg(count, PyObject *);
810 assert(obj && PyUnicode_Check(obj));
811 n += PyUnicode_GET_SIZE(obj);
812 break;
814 case 'V':
816 PyObject *obj = va_arg(count, PyObject *);
817 const char *str = va_arg(count, const char *);
818 assert(obj || str);
819 assert(!obj || PyUnicode_Check(obj));
820 if (obj)
821 n += PyUnicode_GET_SIZE(obj);
822 else
823 n += strlen(str);
824 break;
826 case 'S':
828 PyObject *obj = va_arg(count, PyObject *);
829 PyObject *str;
830 assert(obj);
831 str = PyObject_Str(obj);
832 if (!str)
833 goto fail;
834 n += PyUnicode_GET_SIZE(str);
835 /* Remember the str and switch to the next slot */
836 *callresult++ = str;
837 break;
839 case 'R':
841 PyObject *obj = va_arg(count, PyObject *);
842 PyObject *repr;
843 assert(obj);
844 repr = PyObject_Repr(obj);
845 if (!repr)
846 goto fail;
847 n += PyUnicode_GET_SIZE(repr);
848 /* Remember the repr and switch to the next slot */
849 *callresult++ = repr;
850 break;
852 case 'A':
854 PyObject *obj = va_arg(count, PyObject *);
855 PyObject *ascii;
856 assert(obj);
857 ascii = PyObject_ASCII(obj);
858 if (!ascii)
859 goto fail;
860 n += PyUnicode_GET_SIZE(ascii);
861 /* Remember the repr and switch to the next slot */
862 *callresult++ = ascii;
863 break;
865 case 'p':
866 (void) va_arg(count, int);
867 /* maximum 64-bit pointer representation:
868 * 0xffffffffffffffff
869 * so 19 characters is enough.
870 * XXX I count 18 -- what's the extra for?
872 n += 19;
873 break;
874 default:
875 /* if we stumble upon an unknown
876 formatting code, copy the rest of
877 the format string to the output
878 string. (we cannot just skip the
879 code, since there's no way to know
880 what's in the argument list) */
881 n += strlen(p);
882 goto expand;
884 } else
885 n++;
887 expand:
888 if (abuffersize > 20) {
889 abuffer = PyObject_Malloc(abuffersize);
890 if (!abuffer) {
891 PyErr_NoMemory();
892 goto fail;
894 realbuffer = abuffer;
896 else
897 realbuffer = buffer;
898 /* step 4: fill the buffer */
899 /* Since we've analyzed how much space we need for the worst case,
900 we don't have to resize the string.
901 There can be no errors beyond this point. */
902 string = PyUnicode_FromUnicode(NULL, n);
903 if (!string)
904 goto fail;
906 s = PyUnicode_AS_UNICODE(string);
907 callresult = callresults;
909 for (f = format; *f; f++) {
910 if (*f == '%') {
911 const char* p = f++;
912 int longflag = 0;
913 int size_tflag = 0;
914 zeropad = (*f == '0');
915 /* parse the width.precision part */
916 width = 0;
917 while (ISDIGIT((unsigned)*f))
918 width = (width*10) + *f++ - '0';
919 precision = 0;
920 if (*f == '.') {
921 f++;
922 while (ISDIGIT((unsigned)*f))
923 precision = (precision*10) + *f++ - '0';
925 /* handle the long flag, but only for %ld and %lu.
926 others can be added when necessary. */
927 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
928 longflag = 1;
929 ++f;
931 /* handle the size_t flag. */
932 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
933 size_tflag = 1;
934 ++f;
937 switch (*f) {
938 case 'c':
939 *s++ = va_arg(vargs, int);
940 break;
941 case 'd':
942 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
943 if (longflag)
944 sprintf(realbuffer, fmt, va_arg(vargs, long));
945 else if (size_tflag)
946 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
947 else
948 sprintf(realbuffer, fmt, va_arg(vargs, int));
949 appendstring(realbuffer);
950 break;
951 case 'u':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
959 appendstring(realbuffer);
960 break;
961 case 'i':
962 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
963 sprintf(realbuffer, fmt, va_arg(vargs, int));
964 appendstring(realbuffer);
965 break;
966 case 'x':
967 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
968 sprintf(realbuffer, fmt, va_arg(vargs, int));
969 appendstring(realbuffer);
970 break;
971 case 's':
973 /* unused, since we already have the result */
974 (void) va_arg(vargs, char *);
975 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
976 PyUnicode_GET_SIZE(*callresult));
977 s += PyUnicode_GET_SIZE(*callresult);
978 /* We're done with the unicode()/repr() => forget it */
979 Py_DECREF(*callresult);
980 /* switch to next unicode()/repr() result */
981 ++callresult;
982 break;
984 case 'U':
986 PyObject *obj = va_arg(vargs, PyObject *);
987 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
988 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
989 s += size;
990 break;
992 case 'V':
994 PyObject *obj = va_arg(vargs, PyObject *);
995 const char *str = va_arg(vargs, const char *);
996 if (obj) {
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 } else {
1001 appendstring(str);
1003 break;
1005 case 'S':
1006 case 'R':
1008 Py_UNICODE *ucopy;
1009 Py_ssize_t usize;
1010 Py_ssize_t upos;
1011 /* unused, since we already have the result */
1012 (void) va_arg(vargs, PyObject *);
1013 ucopy = PyUnicode_AS_UNICODE(*callresult);
1014 usize = PyUnicode_GET_SIZE(*callresult);
1015 for (upos = 0; upos<usize;)
1016 *s++ = ucopy[upos++];
1017 /* We're done with the unicode()/repr() => forget it */
1018 Py_DECREF(*callresult);
1019 /* switch to next unicode()/repr() result */
1020 ++callresult;
1021 break;
1023 case 'p':
1024 sprintf(buffer, "%p", va_arg(vargs, void*));
1025 /* %p is ill-defined: ensure leading 0x. */
1026 if (buffer[1] == 'X')
1027 buffer[1] = 'x';
1028 else if (buffer[1] != 'x') {
1029 memmove(buffer+2, buffer, strlen(buffer)+1);
1030 buffer[0] = '0';
1031 buffer[1] = 'x';
1033 appendstring(buffer);
1034 break;
1035 case '%':
1036 *s++ = '%';
1037 break;
1038 default:
1039 appendstring(p);
1040 goto end;
1042 } else
1043 *s++ = *f;
1046 end:
1047 if (callresults)
1048 PyObject_Free(callresults);
1049 if (abuffer)
1050 PyObject_Free(abuffer);
1051 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1052 return string;
1053 fail:
1054 if (callresults) {
1055 PyObject **callresult2 = callresults;
1056 while (callresult2 < callresult) {
1057 Py_DECREF(*callresult2);
1058 ++callresult2;
1060 PyObject_Free(callresults);
1062 if (abuffer)
1063 PyObject_Free(abuffer);
1064 return NULL;
1067 #undef appendstring
1069 PyObject *
1070 PyUnicode_FromFormat(const char *format, ...)
1072 PyObject* ret;
1073 va_list vargs;
1075 #ifdef HAVE_STDARG_PROTOTYPES
1076 va_start(vargs, format);
1077 #else
1078 va_start(vargs);
1079 #endif
1080 ret = PyUnicode_FromFormatV(format, vargs);
1081 va_end(vargs);
1082 return ret;
1085 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1086 wchar_t *w,
1087 Py_ssize_t size)
1089 if (unicode == NULL) {
1090 PyErr_BadInternalCall();
1091 return -1;
1094 /* If possible, try to copy the 0-termination as well */
1095 if (size > PyUnicode_GET_SIZE(unicode))
1096 size = PyUnicode_GET_SIZE(unicode) + 1;
1098 #ifdef HAVE_USABLE_WCHAR_T
1099 memcpy(w, unicode->str, size * sizeof(wchar_t));
1100 #else
1102 register Py_UNICODE *u;
1103 register Py_ssize_t i;
1104 u = PyUnicode_AS_UNICODE(unicode);
1105 for (i = size; i > 0; i--)
1106 *w++ = *u++;
1108 #endif
1110 if (size > PyUnicode_GET_SIZE(unicode))
1111 return PyUnicode_GET_SIZE(unicode);
1112 else
1113 return size;
1116 #endif
1118 PyObject *PyUnicode_FromOrdinal(int ordinal)
1120 Py_UNICODE s[2];
1122 if (ordinal < 0 || ordinal > 0x10ffff) {
1123 PyErr_SetString(PyExc_ValueError,
1124 "chr() arg not in range(0x110000)");
1125 return NULL;
1128 #ifndef Py_UNICODE_WIDE
1129 if (ordinal > 0xffff) {
1130 ordinal -= 0x10000;
1131 s[0] = 0xD800 | (ordinal >> 10);
1132 s[1] = 0xDC00 | (ordinal & 0x3FF);
1133 return PyUnicode_FromUnicode(s, 2);
1135 #endif
1137 s[0] = (Py_UNICODE)ordinal;
1138 return PyUnicode_FromUnicode(s, 1);
1141 PyObject *PyUnicode_FromObject(register PyObject *obj)
1143 /* XXX Perhaps we should make this API an alias of
1144 PyObject_Str() instead ?! */
1145 if (PyUnicode_CheckExact(obj)) {
1146 Py_INCREF(obj);
1147 return obj;
1149 if (PyUnicode_Check(obj)) {
1150 /* For a Unicode subtype that's not a Unicode object,
1151 return a true Unicode object with the same data. */
1152 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1153 PyUnicode_GET_SIZE(obj));
1155 PyErr_Format(PyExc_TypeError,
1156 "Can't convert '%.100s' object to str implicitly",
1157 Py_TYPE(obj)->tp_name);
1158 return NULL;
1161 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1162 const char *encoding,
1163 const char *errors)
1165 const char *s = NULL;
1166 Py_ssize_t len;
1167 PyObject *v;
1169 if (obj == NULL) {
1170 PyErr_BadInternalCall();
1171 return NULL;
1174 if (PyUnicode_Check(obj)) {
1175 PyErr_SetString(PyExc_TypeError,
1176 "decoding str is not supported");
1177 return NULL;
1180 /* Coerce object */
1181 if (PyBytes_Check(obj)) {
1182 s = PyBytes_AS_STRING(obj);
1183 len = PyBytes_GET_SIZE(obj);
1185 else if (PyByteArray_Check(obj)) {
1186 s = PyByteArray_AS_STRING(obj);
1187 len = PyByteArray_GET_SIZE(obj);
1189 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1190 /* Overwrite the error message with something more useful in
1191 case of a TypeError. */
1192 if (PyErr_ExceptionMatches(PyExc_TypeError))
1193 PyErr_Format(PyExc_TypeError,
1194 "coercing to str: need string or buffer, "
1195 "%.80s found",
1196 Py_TYPE(obj)->tp_name);
1197 goto onError;
1200 /* Convert to Unicode */
1201 if (len == 0) {
1202 Py_INCREF(unicode_empty);
1203 v = (PyObject *)unicode_empty;
1205 else
1206 v = PyUnicode_Decode(s, len, encoding, errors);
1208 return v;
1210 onError:
1211 return NULL;
1214 PyObject *PyUnicode_Decode(const char *s,
1215 Py_ssize_t size,
1216 const char *encoding,
1217 const char *errors)
1219 PyObject *buffer = NULL, *unicode;
1220 Py_buffer info;
1221 char lower[20]; /* Enough for any encoding name we recognize */
1222 char *l;
1223 const char *e;
1225 if (encoding == NULL)
1226 encoding = PyUnicode_GetDefaultEncoding();
1228 /* Convert encoding to lower case and replace '_' with '-' in order to
1229 catch e.g. UTF_8 */
1230 e = encoding;
1231 l = lower;
1232 while (*e && l < &lower[(sizeof lower) - 2]) {
1233 if (ISUPPER(*e)) {
1234 *l++ = TOLOWER(*e++);
1236 else if (*e == '_') {
1237 *l++ = '-';
1238 e++;
1240 else {
1241 *l++ = *e++;
1244 *l = '\0';
1246 /* Shortcuts for common default encodings */
1247 if (strcmp(lower, "utf-8") == 0)
1248 return PyUnicode_DecodeUTF8(s, size, errors);
1249 else if ((strcmp(lower, "latin-1") == 0) ||
1250 (strcmp(lower, "iso-8859-1") == 0))
1251 return PyUnicode_DecodeLatin1(s, size, errors);
1252 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1253 else if (strcmp(lower, "mbcs") == 0)
1254 return PyUnicode_DecodeMBCS(s, size, errors);
1255 #endif
1256 else if (strcmp(lower, "ascii") == 0)
1257 return PyUnicode_DecodeASCII(s, size, errors);
1258 else if (strcmp(lower, "utf-16") == 0)
1259 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1260 else if (strcmp(lower, "utf-32") == 0)
1261 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1263 /* Decode via the codec registry */
1264 buffer = NULL;
1265 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1266 goto onError;
1267 buffer = PyMemoryView_FromBuffer(&info);
1268 if (buffer == NULL)
1269 goto onError;
1270 unicode = PyCodec_Decode(buffer, encoding, errors);
1271 if (unicode == NULL)
1272 goto onError;
1273 if (!PyUnicode_Check(unicode)) {
1274 PyErr_Format(PyExc_TypeError,
1275 "decoder did not return a str object (type=%.400s)",
1276 Py_TYPE(unicode)->tp_name);
1277 Py_DECREF(unicode);
1278 goto onError;
1280 Py_DECREF(buffer);
1281 return unicode;
1283 onError:
1284 Py_XDECREF(buffer);
1285 return NULL;
1288 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1289 const char *encoding,
1290 const char *errors)
1292 PyObject *v;
1294 if (!PyUnicode_Check(unicode)) {
1295 PyErr_BadArgument();
1296 goto onError;
1299 if (encoding == NULL)
1300 encoding = PyUnicode_GetDefaultEncoding();
1302 /* Decode via the codec registry */
1303 v = PyCodec_Decode(unicode, encoding, errors);
1304 if (v == NULL)
1305 goto onError;
1306 return v;
1308 onError:
1309 return NULL;
1312 PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1313 const char *encoding,
1314 const char *errors)
1316 PyObject *v;
1318 if (!PyUnicode_Check(unicode)) {
1319 PyErr_BadArgument();
1320 goto onError;
1323 if (encoding == NULL)
1324 encoding = PyUnicode_GetDefaultEncoding();
1326 /* Decode via the codec registry */
1327 v = PyCodec_Decode(unicode, encoding, errors);
1328 if (v == NULL)
1329 goto onError;
1330 if (!PyUnicode_Check(v)) {
1331 PyErr_Format(PyExc_TypeError,
1332 "decoder did not return a str object (type=%.400s)",
1333 Py_TYPE(v)->tp_name);
1334 Py_DECREF(v);
1335 goto onError;
1337 return v;
1339 onError:
1340 return NULL;
1343 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1344 Py_ssize_t size,
1345 const char *encoding,
1346 const char *errors)
1348 PyObject *v, *unicode;
1350 unicode = PyUnicode_FromUnicode(s, size);
1351 if (unicode == NULL)
1352 return NULL;
1353 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1354 Py_DECREF(unicode);
1355 return v;
1358 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1359 const char *encoding,
1360 const char *errors)
1362 PyObject *v;
1364 if (!PyUnicode_Check(unicode)) {
1365 PyErr_BadArgument();
1366 goto onError;
1369 if (encoding == NULL)
1370 encoding = PyUnicode_GetDefaultEncoding();
1372 /* Encode via the codec registry */
1373 v = PyCodec_Encode(unicode, encoding, errors);
1374 if (v == NULL)
1375 goto onError;
1376 return v;
1378 onError:
1379 return NULL;
1382 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1383 const char *encoding,
1384 const char *errors)
1386 PyObject *v;
1388 if (!PyUnicode_Check(unicode)) {
1389 PyErr_BadArgument();
1390 return NULL;
1393 if (encoding == NULL)
1394 encoding = PyUnicode_GetDefaultEncoding();
1396 /* Shortcuts for common default encodings */
1397 if (errors == NULL) {
1398 if (strcmp(encoding, "utf-8") == 0)
1399 return PyUnicode_AsUTF8String(unicode);
1400 else if (strcmp(encoding, "latin-1") == 0)
1401 return PyUnicode_AsLatin1String(unicode);
1402 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1403 else if (strcmp(encoding, "mbcs") == 0)
1404 return PyUnicode_AsMBCSString(unicode);
1405 #endif
1406 else if (strcmp(encoding, "ascii") == 0)
1407 return PyUnicode_AsASCIIString(unicode);
1408 /* During bootstrap, we may need to find the encodings
1409 package, to load the file system encoding, and require the
1410 file system encoding in order to load the encodings
1411 package.
1413 Break out of this dependency by assuming that the path to
1414 the encodings module is ASCII-only. XXX could try wcstombs
1415 instead, if the file system encoding is the locale's
1416 encoding. */
1417 else if (Py_FileSystemDefaultEncoding &&
1418 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1419 !PyThreadState_GET()->interp->codecs_initialized)
1420 return PyUnicode_AsASCIIString(unicode);
1423 /* Encode via the codec registry */
1424 v = PyCodec_Encode(unicode, encoding, errors);
1425 if (v == NULL)
1426 return NULL;
1428 /* The normal path */
1429 if (PyBytes_Check(v))
1430 return v;
1432 /* If the codec returns a buffer, raise a warning and convert to bytes */
1433 if (PyByteArray_Check(v)) {
1434 char msg[100];
1435 PyObject *b;
1436 PyOS_snprintf(msg, sizeof(msg),
1437 "encoder %s returned buffer instead of bytes",
1438 encoding);
1439 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1440 Py_DECREF(v);
1441 return NULL;
1444 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1445 Py_DECREF(v);
1446 return b;
1449 PyErr_Format(PyExc_TypeError,
1450 "encoder did not return a bytes object (type=%.400s)",
1451 Py_TYPE(v)->tp_name);
1452 Py_DECREF(v);
1453 return NULL;
1456 PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1457 const char *encoding,
1458 const char *errors)
1460 PyObject *v;
1462 if (!PyUnicode_Check(unicode)) {
1463 PyErr_BadArgument();
1464 goto onError;
1467 if (encoding == NULL)
1468 encoding = PyUnicode_GetDefaultEncoding();
1470 /* Encode via the codec registry */
1471 v = PyCodec_Encode(unicode, encoding, errors);
1472 if (v == NULL)
1473 goto onError;
1474 if (!PyUnicode_Check(v)) {
1475 PyErr_Format(PyExc_TypeError,
1476 "encoder did not return an str object (type=%.400s)",
1477 Py_TYPE(v)->tp_name);
1478 Py_DECREF(v);
1479 goto onError;
1481 return v;
1483 onError:
1484 return NULL;
1487 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1488 const char *errors)
1490 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1491 if (v)
1492 return v;
1493 if (errors != NULL)
1494 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1495 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1496 PyUnicode_GET_SIZE(unicode),
1497 NULL);
1498 if (!v)
1499 return NULL;
1500 ((PyUnicodeObject *)unicode)->defenc = v;
1501 return v;
1504 PyObject*
1505 PyUnicode_DecodeFSDefault(const char *s) {
1506 Py_ssize_t size = (Py_ssize_t)strlen(s);
1507 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1510 PyObject*
1511 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1513 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1514 can be undefined. If it is case, decode using UTF-8. The following assumes
1515 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1516 bootstrapping process where the codecs aren't ready yet.
1518 if (Py_FileSystemDefaultEncoding) {
1519 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1520 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1521 return PyUnicode_DecodeMBCS(s, size, "replace");
1523 #elif defined(__APPLE__)
1524 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1525 return PyUnicode_DecodeUTF8(s, size, "replace");
1527 #endif
1528 return PyUnicode_Decode(s, size,
1529 Py_FileSystemDefaultEncoding,
1530 "replace");
1532 else {
1533 return PyUnicode_DecodeUTF8(s, size, "replace");
1537 /* Convert the argument to a bytes object, according to the file
1538 system encoding */
1541 PyUnicode_FSConverter(PyObject* arg, void* addr)
1543 PyObject *output = NULL;
1544 Py_ssize_t size;
1545 void *data;
1546 if (arg == NULL) {
1547 Py_DECREF(*(PyObject**)addr);
1548 return 1;
1550 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1551 output = arg;
1552 Py_INCREF(output);
1554 else {
1555 arg = PyUnicode_FromObject(arg);
1556 if (!arg)
1557 return 0;
1558 output = PyUnicode_AsEncodedObject(arg,
1559 Py_FileSystemDefaultEncoding,
1560 "surrogateescape");
1561 Py_DECREF(arg);
1562 if (!output)
1563 return 0;
1564 if (!PyBytes_Check(output)) {
1565 Py_DECREF(output);
1566 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1567 return 0;
1570 if (PyBytes_Check(output)) {
1571 size = PyBytes_GET_SIZE(output);
1572 data = PyBytes_AS_STRING(output);
1574 else {
1575 size = PyByteArray_GET_SIZE(output);
1576 data = PyByteArray_AS_STRING(output);
1578 if (size != strlen(data)) {
1579 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1580 Py_DECREF(output);
1581 return 0;
1583 *(PyObject**)addr = output;
1584 return Py_CLEANUP_SUPPORTED;
1588 char*
1589 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1591 PyObject *bytes;
1592 if (!PyUnicode_Check(unicode)) {
1593 PyErr_BadArgument();
1594 return NULL;
1596 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1597 if (bytes == NULL)
1598 return NULL;
1599 if (psize != NULL)
1600 *psize = PyBytes_GET_SIZE(bytes);
1601 return PyBytes_AS_STRING(bytes);
1604 char*
1605 _PyUnicode_AsString(PyObject *unicode)
1607 return _PyUnicode_AsStringAndSize(unicode, NULL);
1610 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1612 if (!PyUnicode_Check(unicode)) {
1613 PyErr_BadArgument();
1614 goto onError;
1616 return PyUnicode_AS_UNICODE(unicode);
1618 onError:
1619 return NULL;
1622 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1624 if (!PyUnicode_Check(unicode)) {
1625 PyErr_BadArgument();
1626 goto onError;
1628 return PyUnicode_GET_SIZE(unicode);
1630 onError:
1631 return -1;
1634 const char *PyUnicode_GetDefaultEncoding(void)
1636 return unicode_default_encoding;
1639 int PyUnicode_SetDefaultEncoding(const char *encoding)
1641 if (strcmp(encoding, unicode_default_encoding) != 0) {
1642 PyErr_Format(PyExc_ValueError,
1643 "Can only set default encoding to %s",
1644 unicode_default_encoding);
1645 return -1;
1647 return 0;
1650 /* error handling callback helper:
1651 build arguments, call the callback and check the arguments,
1652 if no exception occurred, copy the replacement to the output
1653 and adjust various state variables.
1654 return 0 on success, -1 on error
1657 static
1658 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1659 const char *encoding, const char *reason,
1660 const char **input, const char **inend, Py_ssize_t *startinpos,
1661 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1662 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1664 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1666 PyObject *restuple = NULL;
1667 PyObject *repunicode = NULL;
1668 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1669 Py_ssize_t insize;
1670 Py_ssize_t requiredsize;
1671 Py_ssize_t newpos;
1672 Py_UNICODE *repptr;
1673 PyObject *inputobj = NULL;
1674 Py_ssize_t repsize;
1675 int res = -1;
1677 if (*errorHandler == NULL) {
1678 *errorHandler = PyCodec_LookupError(errors);
1679 if (*errorHandler == NULL)
1680 goto onError;
1683 if (*exceptionObject == NULL) {
1684 *exceptionObject = PyUnicodeDecodeError_Create(
1685 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1686 if (*exceptionObject == NULL)
1687 goto onError;
1689 else {
1690 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1691 goto onError;
1692 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1693 goto onError;
1694 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1695 goto onError;
1698 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1699 if (restuple == NULL)
1700 goto onError;
1701 if (!PyTuple_Check(restuple)) {
1702 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1703 goto onError;
1705 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1706 goto onError;
1708 /* Copy back the bytes variables, which might have been modified by the
1709 callback */
1710 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1711 if (!inputobj)
1712 goto onError;
1713 if (!PyBytes_Check(inputobj)) {
1714 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1716 *input = PyBytes_AS_STRING(inputobj);
1717 insize = PyBytes_GET_SIZE(inputobj);
1718 *inend = *input + insize;
1719 /* we can DECREF safely, as the exception has another reference,
1720 so the object won't go away. */
1721 Py_DECREF(inputobj);
1723 if (newpos<0)
1724 newpos = insize+newpos;
1725 if (newpos<0 || newpos>insize) {
1726 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1727 goto onError;
1730 /* need more space? (at least enough for what we
1731 have+the replacement+the rest of the string (starting
1732 at the new input position), so we won't have to check space
1733 when there are no errors in the rest of the string) */
1734 repptr = PyUnicode_AS_UNICODE(repunicode);
1735 repsize = PyUnicode_GET_SIZE(repunicode);
1736 requiredsize = *outpos + repsize + insize-newpos;
1737 if (requiredsize > outsize) {
1738 if (requiredsize<2*outsize)
1739 requiredsize = 2*outsize;
1740 if (_PyUnicode_Resize(output, requiredsize) < 0)
1741 goto onError;
1742 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1744 *endinpos = newpos;
1745 *inptr = *input + newpos;
1746 Py_UNICODE_COPY(*outptr, repptr, repsize);
1747 *outptr += repsize;
1748 *outpos += repsize;
1750 /* we made it! */
1751 res = 0;
1753 onError:
1754 Py_XDECREF(restuple);
1755 return res;
1758 /* --- UTF-7 Codec -------------------------------------------------------- */
1760 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1762 /* Three simple macros defining base-64. */
/* NOTE: all macros in this group expand their argument more than once, so
   the argument must be free of side effects. */
1764 /* Is c a base-64 character? */
/* True for A-Z, a-z, 0-9, '+' and '/'. */
1766 #define IS_BASE64(c) \
1767 (((c) >= 'A' && (c) <= 'Z') || \
1768 ((c) >= 'a' && (c) <= 'z') || \
1769 ((c) >= '0' && (c) <= '9') || \
1770 (c) == '+' || (c) == '/')
1772 /* given that c is a base-64 character, what is its base-64 value? */
/* Mapping: 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else -> 63 (the macro does not validate c itself). */
1774 #define FROM_BASE64(c) \
1775 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
1776 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
1777 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
1778 (c) == '+' ? 62 : 63)
1780 /* What is the base-64 character of the bottom 6 bits of n? */
/* Implemented as a lookup into the 64-character alphabet string; bits above
   the low six are masked off with 0x3f. */
1782 #define TO_BASE64(n) \
1783 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1785 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1786 * decoded as itself. We are permissive on decoding; the only ASCII
1787 * byte not decoding to itself is the + which begins a base64
1788 * string. */
/* Bytes above 127 are never direct. */
1790 #define DECODE_DIRECT(c) \
1791 ((c) <= 127 && (c) != '+')
1793 /* The UTF-7 encoder treats ASCII characters differently according to
1794 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1795 * the above). See RFC2152. This array identifies these different
1796 * sets:
1797 * 0 : "Set D"
1798 * alphanumeric and '(),-./:?
1799 * 1 : "Set O"
1800 * !"#$%&*;<=>@[]^_`{|}
1801 * 2 : "whitespace"
1802 * ht nl cr sp
1803 * 3 : special (must be base64 encoded)
1804 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1807 static
1808 char utf7_category[128] = {
1809 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
/* Layout: one entry per ASCII code 0-127, sixteen entries per row; each row
   is preceded by a comment naming the characters it covers.  Entry values
   are the category numbers 0-3 (Set D, Set O, whitespace, special). */
1810 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1811 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1812 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1813 /* sp ! " # $ % & ' ( ) * + , - . / */
1814 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1815 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1816 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1817 /* @ A B C D E F G H I J K L M N O */
1818 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1819 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1820 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1821 /* ` a b c d e f g h i j k l m n o */
1822 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1823 /* p q r s t u v w x y z { | } ~ del */
1824 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
1827 /* ENCODE_DIRECT: this character should be encoded as itself. The
1828 * answer depends on whether we are encoding set O as itself, and also
1829 * on whether we are encoding whitespace as itself. RFC2152 makes it
1830 * clear that the answers to these questions vary between
1831 * applications, so this code needs to be flexible. */
/* c        -- character to test; only 1..127 can ever be direct (NUL and
               anything >= 128 are excluded before the table is indexed).
   directO  -- non-zero: category 1 ("Set O") characters are direct.
   directWS -- non-zero: category 2 (whitespace) characters are direct.
   Category 0 ("Set D") characters are always direct.  c is expanded several
   times, so it must have no side effects. */
1833 #define ENCODE_DIRECT(c, directO, directWS) \
1834 ((c) < 128 && (c) > 0 && \
1835 ((utf7_category[(c)] == 0) || \
1836 (directWS && (utf7_category[(c)] == 2)) || \
1837 (directO && (utf7_category[(c)] == 1))))
1839 PyObject *PyUnicode_DecodeUTF7(const char *s,
1840 Py_ssize_t size,
1841 const char *errors)
1843 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1846 /* The decoder. The only state we preserve is our read position,
1847 * i.e. how many characters we have consumed. So if we end in the
1848 * middle of a shift sequence we have to back off the read position
1849 * and the output to the beginning of the sequence, otherwise we lose
1850 * all the shift state (seen bits, number of bits seen, high
1851 * surrogate). */
/* s, size  -- input bytes.
   errors   -- error handler name, passed to
               unicode_decode_call_errorhandler().
   consumed -- if non-NULL, receives the number of input bytes decoded; an
               unfinished shift sequence at the end of the input is then not
               an error but is left unconsumed.  If NULL, such a sequence is
               reported as "unterminated shift sequence" when it holds a
               pending surrogate or left-over bits.
   Returns a new unicode object, or NULL (partial result released) when the
   error handler or the final resize fails. */
1853 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1854 Py_ssize_t size,
1855 const char *errors,
1856 Py_ssize_t *consumed)
1858 const char *starts = s;
1859 Py_ssize_t startinpos;
1860 Py_ssize_t endinpos;
1861 Py_ssize_t outpos;
1862 const char *e;
1863 PyUnicodeObject *unicode;
1864 Py_UNICODE *p;
1865 const char *errmsg = "";
1866 int inShift = 0;
1867 Py_UNICODE *shiftOutStart;
1868 unsigned int base64bits = 0;
1869 unsigned long base64buffer = 0;
1870 Py_UNICODE surrogate = 0;
1871 PyObject *errorHandler = NULL;
1872 PyObject *exc = NULL;
/* Allocate `size` output units up front; the object is resized to the
   number of units actually written just before returning. */
1874 unicode = _PyUnicode_New(size);
1875 if (!unicode)
1876 return NULL;
1877 if (size == 0) {
1878 if (consumed)
1879 *consumed = 0;
1880 return (PyObject *)unicode;
1883 p = unicode->str;
1884 shiftOutStart = p;
1885 e = s + size;
1887 while (s < e) {
1888 Py_UNICODE ch;
/* restart: is also jumped to from the end-of-input error handling below
   when the error handler moved s back inside the input. */
1889 restart:
1890 ch = (unsigned char) *s;
1892 if (inShift) { /* in a base-64 section */
1893 if (IS_BASE64(ch)) { /* consume a base-64 character */
/* Each base-64 character contributes 6 bits to base64buffer. */
1894 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1895 base64bits += 6;
1896 s++;
1897 if (base64bits >= 16) {
1898 /* we have enough bits for a UTF-16 value */
1899 Py_UNICODE outCh = (Py_UNICODE)
1900 (base64buffer >> (base64bits-16));
1901 base64bits -= 16;
1902 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1903 if (surrogate) {
1904 /* expecting a second surrogate */
1905 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* Wide builds combine the pair into one code point; narrow builds store
   both surrogates. */
1906 #ifdef Py_UNICODE_WIDE
1907 *p++ = (((surrogate & 0x3FF)<<10)
1908 | (outCh & 0x3FF)) + 0x10000;
1909 #else
1910 *p++ = surrogate;
1911 *p++ = outCh;
1912 #endif
1913 surrogate = 0;
1915 else {
1916 surrogate = 0;
1917 errmsg = "second surrogate missing";
1918 goto utf7Error;
1921 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1922 /* first surrogate */
1923 surrogate = outCh;
1925 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1926 errmsg = "unexpected second surrogate";
1927 goto utf7Error;
1929 else {
1930 *p++ = outCh;
1934 else { /* now leaving a base-64 section */
1935 inShift = 0;
1936 s++;
1937 if (surrogate) {
1938 errmsg = "second surrogate missing at end of shift sequence";
1939 goto utf7Error;
1941 if (base64bits > 0) { /* left-over bits */
1942 if (base64bits >= 6) {
1943 /* We've seen at least one base-64 character */
1944 errmsg = "partial character in shift sequence";
1945 goto utf7Error;
1947 else {
1948 /* Some bits remain; they should be zero */
1949 if (base64buffer != 0) {
1950 errmsg = "non-zero padding bits in shift sequence";
1951 goto utf7Error;
1955 if (ch != '-') {
1956 /* '-' is absorbed; other terminating
1957 characters are preserved */
1958 *p++ = ch;
1962 else if ( ch == '+' ) {
/* Remember where the '+' is: it is the start of any error span reported
   for this shift sequence and the resume point in stateful mode. */
1963 startinpos = s-starts;
1964 s++; /* consume '+' */
1965 if (s < e && *s == '-') { /* '+-' encodes '+' */
1966 s++;
1967 *p++ = '+';
1969 else { /* begin base64-encoded section */
1970 inShift = 1;
1971 shiftOutStart = p;
1972 base64bits = 0;
1975 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1976 *p++ = ch;
1977 s++;
1979 else {
1980 startinpos = s-starts;
1981 s++;
1982 errmsg = "unexpected special character";
1983 goto utf7Error;
1985 continue;
/* Common error exit: report the span [startinpos, s) to the error handler.
   The helper receives the addresses of starts, e, s, unicode and p and may
   update all of them. */
1986 utf7Error:
1987 outpos = p-PyUnicode_AS_UNICODE(unicode);
1988 endinpos = s-starts;
1989 if (unicode_decode_call_errorhandler(
1990 errors, &errorHandler,
1991 "utf7", errmsg,
1992 &starts, &e, &startinpos, &endinpos, &exc, &s,
1993 &unicode, &outpos, &p))
1994 goto onError;
1997 /* end of string */
1999 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2000 /* if we're in an inconsistent state, that's an error */
2001 if (surrogate ||
2002 (base64bits >= 6) ||
2003 (base64bits > 0 && base64buffer != 0)) {
2004 outpos = p-PyUnicode_AS_UNICODE(unicode);
2005 endinpos = size;
2006 if (unicode_decode_call_errorhandler(
2007 errors, &errorHandler,
2008 "utf7", "unterminated shift sequence",
2009 &starts, &e, &startinpos, &endinpos, &exc, &s,
2010 &unicode, &outpos, &p))
2011 goto onError;
/* The handler may have asked to resume before the end of the input. */
2012 if (s < e)
2013 goto restart;
2017 /* return state */
2018 if (consumed) {
2019 if (inShift) {
/* Unfinished shift sequence: discard its output and report only the bytes
   before its '+' as consumed. */
2020 p = shiftOutStart; /* back off output */
2021 *consumed = startinpos;
2023 else {
2024 *consumed = s-starts;
/* Shrink the result to the number of units actually written. */
2028 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2029 goto onError;
2031 Py_XDECREF(errorHandler);
2032 Py_XDECREF(exc);
2033 return (PyObject *)unicode;
2035 onError:
2036 Py_XDECREF(errorHandler);
2037 Py_XDECREF(exc);
2038 Py_DECREF(unicode);
2039 return NULL;
2043 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2044 Py_ssize_t size,
2045 int base64SetO,
2046 int base64WhiteSpace,
2047 const char *errors)
2049 PyObject *v;
2050 /* It might be possible to tighten this worst case */
2051 Py_ssize_t allocated = 8 * size;
2052 int inShift = 0;
2053 Py_ssize_t i = 0;
2054 unsigned int base64bits = 0;
2055 unsigned long base64buffer = 0;
2056 char * out;
2057 char * start;
2059 if (size == 0)
2060 return PyBytes_FromStringAndSize(NULL, 0);
2062 if (allocated / 8 != size)
2063 return PyErr_NoMemory();
2065 v = PyBytes_FromStringAndSize(NULL, allocated);
2066 if (v == NULL)
2067 return NULL;
2069 start = out = PyBytes_AS_STRING(v);
2070 for (;i < size; ++i) {
2071 Py_UNICODE ch = s[i];
2073 if (inShift) {
2074 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2075 /* shifting out */
2076 if (base64bits) { /* output remaining bits */
2077 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2078 base64buffer = 0;
2079 base64bits = 0;
2081 inShift = 0;
2082 /* Characters not in the BASE64 set implicitly unshift the sequence
2083 so no '-' is required, except if the character is itself a '-' */
2084 if (IS_BASE64(ch) || ch == '-') {
2085 *out++ = '-';
2087 *out++ = (char) ch;
2089 else {
2090 goto encode_char;
2093 else { /* not in a shift sequence */
2094 if (ch == '+') {
2095 *out++ = '+';
2096 *out++ = '-';
2098 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2099 *out++ = (char) ch;
2101 else {
2102 *out++ = '+';
2103 inShift = 1;
2104 goto encode_char;
2107 continue;
2108 encode_char:
2109 #ifdef Py_UNICODE_WIDE
2110 if (ch >= 0x10000) {
2111 /* code first surrogate */
2112 base64bits += 16;
2113 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2114 while (base64bits >= 6) {
2115 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2116 base64bits -= 6;
2118 /* prepare second surrogate */
2119 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2121 #endif
2122 base64bits += 16;
2123 base64buffer = (base64buffer << 16) | ch;
2124 while (base64bits >= 6) {
2125 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2126 base64bits -= 6;
2129 if (base64bits)
2130 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2131 if (inShift)
2132 *out++ = '-';
2133 if (_PyBytes_Resize(&v, out - start) < 0)
2134 return NULL;
2135 return v;
2138 #undef IS_BASE64
2139 #undef FROM_BASE64
2140 #undef TO_BASE64
2141 #undef DECODE_DIRECT
2142 #undef ENCODE_DIRECT
2144 /* --- UTF-8 Codec -------------------------------------------------------- */
2146 static
2147 char utf8_code_length[256] = {
2148 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
2149 illegal prefix. See RFC 3629 for details */
/* Indexed by the first byte of a sequence, sixteen entries per row.
   As the row comments show: 00-7F -> 1, 80-BF (continuation bytes) -> 0,
   C0-C1 -> 0, C2-DF -> 2, E0-EF -> 3, F0-F4 -> 4, F5-FF -> 0. */
2150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
2151 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2157 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
2158 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
2159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2160 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2161 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
2162 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
2163 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
2164 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
2165 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
2168 PyObject *PyUnicode_DecodeUTF8(const char *s,
2169 Py_ssize_t size,
2170 const char *errors)
2172 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2175 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
/* `addr & LONG_PTR_MASK` is zero for a long-aligned address and
   `addr & ~LONG_PTR_MASK` rounds an address down to such a boundary.
   NOTE(review): the expansion is a cast expression without enclosing
   parentheses; that is fine for the `&` and `~` uses in this section, but
   take care before using it in other expressions. */
2176 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2178 /* Mask to quickly check whether a C 'long' contains a
2179 non-ASCII, UTF8-encoded char. */
/* The mask has the top bit of every byte set, so `word & ASCII_CHAR_MASK`
   is non-zero as soon as any byte of the word is >= 0x80.  Only 4- and
   8-byte longs are supported; any other size is a compile-time error. */
2180 #if (SIZEOF_LONG == 8)
2181 # define ASCII_CHAR_MASK 0x8080808080808080L
2182 #elif (SIZEOF_LONG == 4)
2183 # define ASCII_CHAR_MASK 0x80808080L
2184 #else
2185 # error C 'long' size should be either 4 or 8!
2186 #endif
/* Decode size bytes of UTF-8 at s into a new unicode object.

   errors   -- error handler name, passed to
               unicode_decode_call_errorhandler().
   consumed -- if non-NULL, a sequence truncated by the end of the input is
               not an error: decoding stops in front of it and *consumed
               receives the number of bytes decoded.  If NULL, truncation is
               reported as "unexpected end of data".

   Returns NULL (partial result released) when the error handler or the
   final resize fails. */
2188 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2189 Py_ssize_t size,
2190 const char *errors,
2191 Py_ssize_t *consumed)
2193 const char *starts = s;
2194 int n;
2195 int k;
2196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
2199 const char *e, *aligned_end;
2200 PyUnicodeObject *unicode;
2201 Py_UNICODE *p;
2202 const char *errmsg = "";
2203 PyObject *errorHandler = NULL;
2204 PyObject *exc = NULL;
2206 /* Note: size will always be longer than the resulting Unicode
2207 character count */
2208 unicode = _PyUnicode_New(size);
2209 if (!unicode)
2210 return NULL;
2211 if (size == 0) {
2212 if (consumed)
2213 *consumed = 0;
2214 return (PyObject *)unicode;
2217 /* Unpack UTF-8 encoded data */
2218 p = unicode->str;
2219 e = s + size;
/* e rounded down to a long boundary: upper bound for the word-at-a-time
   ASCII loop below. */
2220 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2222 while (s < e) {
2223 Py_UCS4 ch = (unsigned char)*s;
2225 if (ch < 0x80) {
2226 /* Fast path for runs of ASCII characters. Given that common UTF-8
2227 input will consist of an overwhelming majority of ASCII
2228 characters, we try to optimize for this case by checking
2229 as many characters as a C 'long' can contain.
2230 First, check if we can do an aligned read, as most CPUs have
2231 a penalty for unaligned reads.
2233 if (!((size_t) s & LONG_PTR_MASK)) {
2234 /* Help register allocation */
2235 register const char *_s = s;
2236 register Py_UNICODE *_p = p;
2237 while (_s < aligned_end) {
2238 /* Read a whole long at a time (either 4 or 8 bytes),
2239 and do a fast unrolled copy if it only contains ASCII
2240 characters. */
2241 unsigned long data = *(unsigned long *) _s;
2242 if (data & ASCII_CHAR_MASK)
2243 break;
2244 _p[0] = (unsigned char) _s[0];
2245 _p[1] = (unsigned char) _s[1];
2246 _p[2] = (unsigned char) _s[2];
2247 _p[3] = (unsigned char) _s[3];
2248 #if (SIZEOF_LONG == 8)
2249 _p[4] = (unsigned char) _s[4];
2250 _p[5] = (unsigned char) _s[5];
2251 _p[6] = (unsigned char) _s[6];
2252 _p[7] = (unsigned char) _s[7];
2253 #endif
2254 _s += SIZEOF_LONG;
2255 _p += SIZEOF_LONG;
2257 s = _s;
2258 p = _p;
2259 if (s == e)
2260 break;
/* The fast loop stopped (non-ASCII byte in the word, or fewer than a full
   long left): reload ch and fall back to byte-wise handling. */
2261 ch = (unsigned char)*s;
2265 if (ch < 0x80) {
2266 *p++ = (Py_UNICODE)ch;
2267 s++;
2268 continue;
/* Expected sequence length according to the lead byte (0 = invalid). */
2271 n = utf8_code_length[ch];
/* Sequence would run past the end of the input: in stateful mode stop here
   and leave it unconsumed; otherwise report an error covering the lead byte
   plus any continuation bytes that are present. */
2273 if (s + n > e) {
2274 if (consumed)
2275 break;
2276 else {
2277 errmsg = "unexpected end of data";
2278 startinpos = s-starts;
2279 endinpos = startinpos+1;
2280 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2281 endinpos++;
2282 goto utf8Error;
2286 switch (n) {
2288 case 0:
2289 errmsg = "invalid start byte";
2290 startinpos = s-starts;
2291 endinpos = startinpos+1;
2292 goto utf8Error;
/* Length 1 is only assigned to bytes below 0x80, which were already handled
   above, so this case is a consistency check. */
2294 case 1:
2295 errmsg = "internal error";
2296 startinpos = s-starts;
2297 endinpos = startinpos+1;
2298 goto utf8Error;
2300 case 2:
2301 if ((s[1] & 0xc0) != 0x80) {
2302 errmsg = "invalid continuation byte";
2303 startinpos = s-starts;
2304 endinpos = startinpos + 1;
2305 goto utf8Error;
2307 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2308 assert ((ch > 0x007F) && (ch <= 0x07FF));
2309 *p++ = (Py_UNICODE)ch;
2310 break;
2312 case 3:
2313 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2314 will result in surrogates in range d800-dfff. Surrogates are
2315 not valid UTF-8 so they are rejected.
2316 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2317 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
/* The E0 test additionally rejects second bytes below A0, i.e. values that
   would decode to less than 0x800. */
2318 if ((s[1] & 0xc0) != 0x80 ||
2319 (s[2] & 0xc0) != 0x80 ||
2320 ((unsigned char)s[0] == 0xE0 &&
2321 (unsigned char)s[1] < 0xA0) ||
2322 ((unsigned char)s[0] == 0xED &&
2323 (unsigned char)s[1] > 0x9F)) {
2324 errmsg = "invalid continuation byte";
2325 startinpos = s-starts;
2326 endinpos = startinpos + 1;
2328 /* if s[1] first two bits are 1 and 0, then the invalid
2329 continuation byte is s[2], so increment endinpos by 1,
2330 if not, s[1] is invalid and endinpos doesn't need to
2331 be incremented. */
2332 if ((s[1] & 0xC0) == 0x80)
2333 endinpos++;
2334 goto utf8Error;
2336 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2337 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2338 *p++ = (Py_UNICODE)ch;
2339 break;
2341 case 4:
/* Besides malformed continuation bytes, reject F0 followed by a byte below
   90 (value under 0x10000) and F4 followed by a byte above 8F (value above
   0x10FFFF).  endinpos is extended over each leading continuation byte that
   is well-formed. */
2342 if ((s[1] & 0xc0) != 0x80 ||
2343 (s[2] & 0xc0) != 0x80 ||
2344 (s[3] & 0xc0) != 0x80 ||
2345 ((unsigned char)s[0] == 0xF0 &&
2346 (unsigned char)s[1] < 0x90) ||
2347 ((unsigned char)s[0] == 0xF4 &&
2348 (unsigned char)s[1] > 0x8F)) {
2349 errmsg = "invalid continuation byte";
2350 startinpos = s-starts;
2351 endinpos = startinpos + 1;
2352 if ((s[1] & 0xC0) == 0x80) {
2353 endinpos++;
2354 if ((s[2] & 0xC0) == 0x80)
2355 endinpos++;
2357 goto utf8Error;
2359 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2360 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2361 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2363 #ifdef Py_UNICODE_WIDE
2364 *p++ = (Py_UNICODE)ch;
2365 #else
2366 /* compute and append the two surrogates: */
2368 /* translate from 10000..10FFFF to 0..FFFF */
2369 ch -= 0x10000;
2371 /* high surrogate = top 10 bits added to D800 */
2372 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2374 /* low surrogate = bottom 10 bits added to DC00 */
2375 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2376 #endif
2377 break;
2379 s += n;
2380 continue;
/* Common error exit.  The helper receives the addresses of starts, e, s,
   unicode and p and may update all of them. */
2382 utf8Error:
2383 outpos = p-PyUnicode_AS_UNICODE(unicode);
2384 if (unicode_decode_call_errorhandler(
2385 errors, &errorHandler,
2386 "utf8", errmsg,
2387 &starts, &e, &startinpos, &endinpos, &exc, &s,
2388 &unicode, &outpos, &p))
2389 goto onError;
/* e may have been replaced by the error handler: recompute the bound. */
2390 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2392 if (consumed)
2393 *consumed = s-starts;
2395 /* Adjust length */
2396 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2397 goto onError;
2399 Py_XDECREF(errorHandler);
2400 Py_XDECREF(exc);
2401 return (PyObject *)unicode;
2403 onError:
2404 Py_XDECREF(errorHandler);
2405 Py_XDECREF(exc);
2406 Py_DECREF(unicode);
2407 return NULL;
2410 #undef ASCII_CHAR_MASK
2413 /* Allocation strategy: if the string is short, convert into a stack buffer
2414 and allocate exactly as much space needed at the end. Else allocate the
2415 maximum possible needed (4 result bytes per Unicode character), and return
2416 the excess memory at the end.
2418 PyObject *
2419 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2420 Py_ssize_t size,
2421 const char *errors)
2423 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2425 Py_ssize_t i; /* index into s of next input byte */
2426 PyObject *result; /* result string object */
2427 char *p; /* next free byte in output buffer */
2428 Py_ssize_t nallocated; /* number of result bytes allocated */
2429 Py_ssize_t nneeded; /* number of result bytes needed */
2430 char stackbuf[MAX_SHORT_UNICHARS * 4];
2431 PyObject *errorHandler = NULL;
2432 PyObject *exc = NULL;
2434 assert(s != NULL);
2435 assert(size >= 0);
2437 if (size <= MAX_SHORT_UNICHARS) {
2438 /* Write into the stack buffer; nallocated can't overflow.
2439 * At the end, we'll allocate exactly as much heap space as it
2440 * turns out we need.
2442 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2443 result = NULL; /* will allocate after we're done */
2444 p = stackbuf;
2446 else {
2447 /* Overallocate on the heap, and give the excess back at the end. */
2448 nallocated = size * 4;
2449 if (nallocated / 4 != size) /* overflow! */
2450 return PyErr_NoMemory();
2451 result = PyBytes_FromStringAndSize(NULL, nallocated);
2452 if (result == NULL)
2453 return NULL;
2454 p = PyBytes_AS_STRING(result);
2457 for (i = 0; i < size;) {
2458 Py_UCS4 ch = s[i++];
2460 if (ch < 0x80)
2461 /* Encode ASCII */
2462 *p++ = (char) ch;
2464 else if (ch < 0x0800) {
2465 /* Encode Latin-1 */
2466 *p++ = (char)(0xc0 | (ch >> 6));
2467 *p++ = (char)(0x80 | (ch & 0x3f));
2468 } else if (0xD800 <= ch && ch <= 0xDFFF) {
2469 #ifndef Py_UNICODE_WIDE
2470 /* Special case: check for high and low surrogate */
2471 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2472 Py_UCS4 ch2 = s[i];
2473 /* Combine the two surrogates to form a UCS4 value */
2474 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2475 i++;
2477 /* Encode UCS4 Unicode ordinals */
2478 *p++ = (char)(0xf0 | (ch >> 18));
2479 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2480 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2481 *p++ = (char)(0x80 | (ch & 0x3f));
2482 } else {
2483 #endif
2484 Py_ssize_t newpos;
2485 PyObject *rep;
2486 Py_ssize_t repsize, k;
2487 rep = unicode_encode_call_errorhandler
2488 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2489 s, size, &exc, i-1, i, &newpos);
2490 if (!rep)
2491 goto error;
2493 if (PyBytes_Check(rep))
2494 repsize = PyBytes_GET_SIZE(rep);
2495 else
2496 repsize = PyUnicode_GET_SIZE(rep);
2498 if (repsize > 4) {
2499 Py_ssize_t offset;
2501 if (result == NULL)
2502 offset = p - stackbuf;
2503 else
2504 offset = p - PyBytes_AS_STRING(result);
2506 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2507 /* integer overflow */
2508 PyErr_NoMemory();
2509 goto error;
2511 nallocated += repsize - 4;
2512 if (result != NULL) {
2513 if (_PyBytes_Resize(&result, nallocated) < 0)
2514 goto error;
2515 } else {
2516 result = PyBytes_FromStringAndSize(NULL, nallocated);
2517 if (result == NULL)
2518 goto error;
2519 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2521 p = PyBytes_AS_STRING(result) + offset;
2524 if (PyBytes_Check(rep)) {
2525 char *prep = PyBytes_AS_STRING(rep);
2526 for(k = repsize; k > 0; k--)
2527 *p++ = *prep++;
2528 } else /* rep is unicode */ {
2529 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2530 Py_UNICODE c;
2532 for(k=0; k<repsize; k++) {
2533 c = prep[k];
2534 if (0x80 <= c) {
2535 raise_encode_exception(&exc, "utf-8", s, size,
2536 i-1, i, "surrogates not allowed");
2537 goto error;
2539 *p++ = (char)prep[k];
2542 Py_DECREF(rep);
2543 #ifndef Py_UNICODE_WIDE
2545 #endif
2546 } else if (ch < 0x10000) {
2547 *p++ = (char)(0xe0 | (ch >> 12));
2548 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2549 *p++ = (char)(0x80 | (ch & 0x3f));
2550 } else /* ch >= 0x10000 */ {
2551 /* Encode UCS4 Unicode ordinals */
2552 *p++ = (char)(0xf0 | (ch >> 18));
2553 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2554 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2555 *p++ = (char)(0x80 | (ch & 0x3f));
2559 if (result == NULL) {
2560 /* This was stack allocated. */
2561 nneeded = p - stackbuf;
2562 assert(nneeded <= nallocated);
2563 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2565 else {
2566 /* Cut back to size actually needed. */
2567 nneeded = p - PyBytes_AS_STRING(result);
2568 assert(nneeded <= nallocated);
2569 _PyBytes_Resize(&result, nneeded);
2571 Py_XDECREF(errorHandler);
2572 Py_XDECREF(exc);
2573 return result;
2574 error:
2575 Py_XDECREF(errorHandler);
2576 Py_XDECREF(exc);
2577 Py_XDECREF(result);
2578 return NULL;
2580 #undef MAX_SHORT_UNICHARS
2583 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2585 if (!PyUnicode_Check(unicode)) {
2586 PyErr_BadArgument();
2587 return NULL;
2589 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2590 PyUnicode_GET_SIZE(unicode),
2591 NULL);
2594 /* --- UTF-32 Codec ------------------------------------------------------- */
2596 PyObject *
2597 PyUnicode_DecodeUTF32(const char *s,
2598 Py_ssize_t size,
2599 const char *errors,
2600 int *byteorder)
2602 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2605 PyObject *
2606 PyUnicode_DecodeUTF32Stateful(const char *s,
2607 Py_ssize_t size,
2608 const char *errors,
2609 int *byteorder,
2610 Py_ssize_t *consumed)
2612 const char *starts = s;
2613 Py_ssize_t startinpos;
2614 Py_ssize_t endinpos;
2615 Py_ssize_t outpos;
2616 PyUnicodeObject *unicode;
2617 Py_UNICODE *p;
2618 #ifndef Py_UNICODE_WIDE
2619 int pairs = 0;
2620 #else
2621 const int pairs = 0;
2622 #endif
2623 const unsigned char *q, *e, *qq;
2624 int bo = 0; /* assume native ordering by default */
2625 const char *errmsg = "";
2626 /* Offsets from q for retrieving bytes in the right order. */
2627 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2628 int iorder[] = {0, 1, 2, 3};
2629 #else
2630 int iorder[] = {3, 2, 1, 0};
2631 #endif
2632 PyObject *errorHandler = NULL;
2633 PyObject *exc = NULL;
2635 q = (unsigned char *)s;
2636 e = q + size;
2638 if (byteorder)
2639 bo = *byteorder;
2641 /* Check for BOM marks (U+FEFF) in the input and adjust current
2642 byte order setting accordingly. In native mode, the leading BOM
2643 mark is skipped, in all other modes, it is copied to the output
2644 stream as-is (giving a ZWNBSP character). */
2645 if (bo == 0) {
2646 if (size >= 4) {
2647 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2648 (q[iorder[1]] << 8) | q[iorder[0]];
2649 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2650 if (bom == 0x0000FEFF) {
2651 q += 4;
2652 bo = -1;
2654 else if (bom == 0xFFFE0000) {
2655 q += 4;
2656 bo = 1;
2658 #else
2659 if (bom == 0x0000FEFF) {
2660 q += 4;
2661 bo = 1;
2663 else if (bom == 0xFFFE0000) {
2664 q += 4;
2665 bo = -1;
2667 #endif
2671 if (bo == -1) {
2672 /* force LE */
2673 iorder[0] = 0;
2674 iorder[1] = 1;
2675 iorder[2] = 2;
2676 iorder[3] = 3;
2678 else if (bo == 1) {
2679 /* force BE */
2680 iorder[0] = 3;
2681 iorder[1] = 2;
2682 iorder[2] = 1;
2683 iorder[3] = 0;
2686 /* On narrow builds we split characters outside the BMP into two
2687 codepoints => count how much extra space we need. */
2688 #ifndef Py_UNICODE_WIDE
2689 for (qq = q; qq < e; qq += 4)
2690 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2691 pairs++;
2692 #endif
2694 /* This might be one to much, because of a BOM */
2695 unicode = _PyUnicode_New((size+3)/4+pairs);
2696 if (!unicode)
2697 return NULL;
2698 if (size == 0)
2699 return (PyObject *)unicode;
2701 /* Unpack UTF-32 encoded data */
2702 p = unicode->str;
2704 while (q < e) {
2705 Py_UCS4 ch;
2706 /* remaining bytes at the end? (size should be divisible by 4) */
2707 if (e-q<4) {
2708 if (consumed)
2709 break;
2710 errmsg = "truncated data";
2711 startinpos = ((const char *)q)-starts;
2712 endinpos = ((const char *)e)-starts;
2713 goto utf32Error;
2714 /* The remaining input chars are ignored if the callback
2715 chooses to skip the input */
2717 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2718 (q[iorder[1]] << 8) | q[iorder[0]];
2720 if (ch >= 0x110000)
2722 errmsg = "codepoint not in range(0x110000)";
2723 startinpos = ((const char *)q)-starts;
2724 endinpos = startinpos+4;
2725 goto utf32Error;
2727 #ifndef Py_UNICODE_WIDE
2728 if (ch >= 0x10000)
2730 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2731 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2733 else
2734 #endif
2735 *p++ = ch;
2736 q += 4;
2737 continue;
2738 utf32Error:
2739 outpos = p-PyUnicode_AS_UNICODE(unicode);
2740 if (unicode_decode_call_errorhandler(
2741 errors, &errorHandler,
2742 "utf32", errmsg,
2743 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2744 &unicode, &outpos, &p))
2745 goto onError;
2748 if (byteorder)
2749 *byteorder = bo;
2751 if (consumed)
2752 *consumed = (const char *)q-starts;
2754 /* Adjust length */
2755 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2756 goto onError;
2758 Py_XDECREF(errorHandler);
2759 Py_XDECREF(exc);
2760 return (PyObject *)unicode;
2762 onError:
2763 Py_DECREF(unicode);
2764 Py_XDECREF(errorHandler);
2765 Py_XDECREF(exc);
2766 return NULL;
/* Encode `size` Py_UNICODE units starting at `s` as UTF-32 and return the
   result as a new bytes object.

   byteorder == 0 writes a U+FEFF mark first and uses the compile-time
   default byte offsets; -1 forces little-endian and 1 forces big-endian,
   both without a mark.  On builds without Py_UNICODE_WIDE a high surrogate
   directly followed by a low surrogate is combined into one code point.
   `errors` is not referenced anywhere in the body.

   Returns NULL via PyErr_NoMemory() when the byte count overflows, or NULL
   when PyBytes_FromStringAndSize() fails. */
2769 PyObject *
2770 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2771 Py_ssize_t size,
2772 const char *errors,
2773 int byteorder)
2775 PyObject *v;
2776 unsigned char *p;
2777 Py_ssize_t nsize, bytesize;
2778 #ifndef Py_UNICODE_WIDE
2779 Py_ssize_t i, pairs;
2780 #else
2781 const int pairs = 0;
2782 #endif
2783 /* Offsets from p for storing byte pairs in the right order. */
2784 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2785 int iorder[] = {0, 1, 2, 3};
2786 #else
2787 int iorder[] = {3, 2, 1, 0};
2788 #endif
/* STORECHAR writes the four bytes of CH at p in the order given by iorder
   and advances p by four. */
2790 #define STORECHAR(CH) \
2791 do { \
2792 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2793 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2794 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2795 p[iorder[0]] = (CH) & 0xff; \
2796 p += 4; \
2797 } while(0)
2799 /* In narrow builds we can output surrogate pairs as one codepoint,
2800 so we need less space. */
2801 #ifndef Py_UNICODE_WIDE
2802 for (i = pairs = 0; i < size-1; i++)
2803 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2804 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2805 pairs++;
2806 #endif
/* One output unit per input unit, minus one per surrogate pair, plus one
   for the mark when byteorder == 0. */
2807 nsize = (size - pairs + (byteorder == 0));
2808 bytesize = nsize * 4;
/* Dividing back detects overflow of the multiplication above. */
2809 if (bytesize / 4 != nsize)
2810 return PyErr_NoMemory();
2811 v = PyBytes_FromStringAndSize(NULL, bytesize);
2812 if (v == NULL)
2813 return NULL;
2815 p = (unsigned char *)PyBytes_AS_STRING(v);
/* The mark is stored before iorder can be overridden below; it is only
   written when byteorder == 0, where no override happens. */
2816 if (byteorder == 0)
2817 STORECHAR(0xFEFF);
2818 if (size == 0)
2819 goto done;
2821 if (byteorder == -1) {
2822 /* force LE */
2823 iorder[0] = 0;
2824 iorder[1] = 1;
2825 iorder[2] = 2;
2826 iorder[3] = 3;
2828 else if (byteorder == 1) {
2829 /* force BE */
2830 iorder[0] = 3;
2831 iorder[1] = 2;
2832 iorder[2] = 1;
2833 iorder[3] = 0;
2836 while (size-- > 0) {
2837 Py_UCS4 ch = *s++;
2838 #ifndef Py_UNICODE_WIDE
/* size > 0 guarantees a following unit exists before it is inspected. */
2839 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2840 Py_UCS4 ch2 = *s;
2841 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2842 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2843 s++;
2844 size--;
2847 #endif
2848 STORECHAR(ch);
2851 done:
2852 return v;
2853 #undef STORECHAR
/* Encode the Unicode object `unicode` through PyUnicode_EncodeUTF32(),
   passing NULL for `errors`.  When `unicode` fails PyUnicode_Check(),
   PyErr_BadArgument() is called and NULL is returned.

   NOTE(review): in this listing the call stops after "NULL,"; the final
   byteorder argument and the closing lines are not visible here. Confirm
   them against the upstream file. */
2856 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2858 if (!PyUnicode_Check(unicode)) {
2859 PyErr_BadArgument();
2860 return NULL;
2862 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2863 PyUnicode_GET_SIZE(unicode),
2864 NULL,
2868 /* --- UTF-16 Codec ------------------------------------------------------- */
2870 PyObject *
2871 PyUnicode_DecodeUTF16(const char *s,
2872 Py_ssize_t size,
2873 const char *errors,
2874 int *byteorder)
2876 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2879 /* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters.  This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   Each mask has one bit set per 16-bit unit of the long: 0x8000 in the
   native mask, 0x0080 in the swapped mask.
*/
2886 #if (SIZEOF_LONG == 8)
2887 # define FAST_CHAR_MASK 0x8000800080008000L
2888 # define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2889 #elif (SIZEOF_LONG == 4)
2890 # define FAST_CHAR_MASK 0x80008000L
2891 # define SWAPPED_FAST_CHAR_MASK 0x00800080L
2892 #else
2893 # error C 'long' size should be either 4 or 8!
2894 #endif
/* Decode `size` bytes of UTF-16 data at `s` into a new Unicode object.

   byteorder  Optional in/out.  On entry *byteorder selects the byte order:
              0 lets a leading U+FEFF mark decide (the mark is skipped; with
              no mark the compile-time default offsets stay in effect), -1
              forces little-endian, 1 forces big-endian.  On success the
              order actually used is written back.
   consumed   When non-NULL, a single left-over byte at the end is not
              reported as "truncated data", and *consumed receives the
              number of bytes decoded.
   errors     Passed through to unicode_decode_call_errorhandler().

   Returns the new object, or NULL on allocation failure, error-handler
   failure or resize failure.  For size == 0 the empty object is returned
   immediately and neither *byteorder nor *consumed is written. */
2896 PyObject *
2897 PyUnicode_DecodeUTF16Stateful(const char *s,
2898 Py_ssize_t size,
2899 const char *errors,
2900 int *byteorder,
2901 Py_ssize_t *consumed)
2903 const char *starts = s;
2904 Py_ssize_t startinpos;
2905 Py_ssize_t endinpos;
2906 Py_ssize_t outpos;
2907 PyUnicodeObject *unicode;
2908 Py_UNICODE *p;
2909 const unsigned char *q, *e, *aligned_end;
2910 int bo = 0; /* assume native ordering by default */
2911 int native_ordering = 0;
2912 const char *errmsg = "";
2913 /* Offsets from q for retrieving byte pairs in the right order. */
2914 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2915 int ihi = 1, ilo = 0;
2916 #else
2917 int ihi = 0, ilo = 1;
2918 #endif
2919 PyObject *errorHandler = NULL;
2920 PyObject *exc = NULL;
2922 /* Note: size will always be longer than the resulting Unicode
2923 character count */
2924 unicode = _PyUnicode_New(size);
2925 if (!unicode)
2926 return NULL;
2927 if (size == 0)
2928 return (PyObject *)unicode;
2930 /* Unpack UTF-16 encoded data */
2931 p = unicode->str;
2932 q = (unsigned char *)s;
/* e addresses the LAST input byte (not one past it), so "q < e" below
   means at least two bytes remain. */
2933 e = q + size - 1;
2935 if (byteorder)
2936 bo = *byteorder;
2938 /* Check for BOM marks (U+FEFF) in the input and adjust current
2939 byte order setting accordingly. In native mode, the leading BOM
2940 mark is skipped, in all other modes, it is copied to the output
2941 stream as-is (giving a ZWNBSP character). */
2942 if (bo == 0) {
2943 if (size >= 2) {
2944 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2945 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2946 if (bom == 0xFEFF) {
2947 q += 2;
2948 bo = -1;
2950 else if (bom == 0xFFFE) {
2951 q += 2;
2952 bo = 1;
2954 #else
2955 if (bom == 0xFEFF) {
2956 q += 2;
2957 bo = 1;
2959 else if (bom == 0xFFFE) {
2960 q += 2;
2961 bo = -1;
2963 #endif
2967 if (bo == -1) {
2968 /* force LE */
2969 ihi = 1;
2970 ilo = 0;
2972 else if (bo == 1) {
2973 /* force BE */
2974 ihi = 0;
2975 ilo = 1;
/* native_ordering is true when the selected offsets match the ones this
   build starts with for its own byte order. */
2977 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2978 native_ordering = ilo < ihi;
2979 #else
2980 native_ordering = ilo > ihi;
2981 #endif
/* aligned_end is e with the LONG_PTR_MASK bits cleared; it bounds the
   long-at-a-time loops below.  LONG_PTR_MASK is defined outside this
   function. */
2983 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
2984 while (q < e) {
2985 Py_UNICODE ch;
2986 /* First check for possible aligned read of a C 'long'. Unaligned
2987 reads are more expensive, better to defer to another iteration. */
2988 if (!((size_t) q & LONG_PTR_MASK)) {
2989 /* Fast path for runs of non-surrogate chars. */
2990 register const unsigned char *_q = q;
2991 Py_UNICODE *_p = p;
2992 if (native_ordering) {
2993 /* Native ordering is simple: as long as the input cannot
2994 possibly contain a surrogate char, do an unrolled copy
2995 of several 16-bit code points to the target object.
2996 The non-surrogate check is done on several input bytes
2997 at a time (as many as a C 'long' can contain). */
2998 while (_q < aligned_end) {
2999 unsigned long data = * (unsigned long *) _q;
/* Any unit with its top bit set (>= 0x8000) leaves the fast path, whether
   or not it is actually a surrogate. */
3000 if (data & FAST_CHAR_MASK)
3001 break;
3002 _p[0] = ((unsigned short *) _q)[0];
3003 _p[1] = ((unsigned short *) _q)[1];
3004 #if (SIZEOF_LONG == 8)
3005 _p[2] = ((unsigned short *) _q)[2];
3006 _p[3] = ((unsigned short *) _q)[3];
3007 #endif
3008 _q += SIZEOF_LONG;
3009 _p += SIZEOF_LONG / 2;
3012 else {
3013 /* Byteswapped ordering is similar, but we must decompose
3014 the copy bytewise, and take care of zero'ing out the
3015 upper bytes if the target object is in 32-bit units
3016 (that is, in UCS-4 builds). */
3017 while (_q < aligned_end) {
3018 unsigned long data = * (unsigned long *) _q;
3019 if (data & SWAPPED_FAST_CHAR_MASK)
3020 break;
3021 /* Zero upper bytes in UCS-4 builds */
3022 #if (Py_UNICODE_SIZE > 2)
3023 _p[0] = 0;
3024 _p[1] = 0;
3025 #if (SIZEOF_LONG == 8)
3026 _p[2] = 0;
3027 _p[3] = 0;
3028 #endif
3029 #endif
3030 /* Issue #4916; UCS-4 builds on big endian machines must
3031 fill the two last bytes of each 4-byte unit. */
3032 #if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3033 # define OFF 2
3034 #else
3035 # define OFF 0
3036 #endif
/* Each pair of input bytes is stored swapped into the output unit's
   low-order two bytes. */
3037 ((unsigned char *) _p)[OFF + 1] = _q[0];
3038 ((unsigned char *) _p)[OFF + 0] = _q[1];
3039 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3040 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3041 #if (SIZEOF_LONG == 8)
3042 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3043 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3044 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3045 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3046 #endif
3047 #undef OFF
3048 _q += SIZEOF_LONG;
3049 _p += SIZEOF_LONG / 2;
/* Publish the fast path's progress, then stop if it used up the input. */
3052 p = _p;
3053 q = _q;
3054 if (q >= e)
3055 break;
/* Slow path: decode one 16-bit unit by hand. */
3057 ch = (q[ihi] << 8) | q[ilo];
3059 q += 2;
3061 if (ch < 0xD800 || ch > 0xDFFF) {
3062 *p++ = ch;
3063 continue;
3066 /* UTF-16 code pair: */
/* q > e here means fewer than two bytes follow the surrogate just read. */
3067 if (q > e) {
3068 errmsg = "unexpected end of data";
3069 startinpos = (((const char *)q) - 2) - starts;
3070 endinpos = ((const char *)e) + 1 - starts;
3071 goto utf16Error;
3073 if (0xD800 <= ch && ch <= 0xDBFF) {
3074 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3075 q += 2;
3076 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3077 #ifndef Py_UNICODE_WIDE
3078 *p++ = ch;
3079 *p++ = ch2;
3080 #else
3081 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3082 #endif
3083 continue;
3085 else {
3086 errmsg = "illegal UTF-16 surrogate";
3087 startinpos = (((const char *)q)-4)-starts;
3088 endinpos = startinpos+2;
3089 goto utf16Error;
/* Reached only for a low surrogate (0xDC00..0xDFFF) that was not preceded
   by a high surrogate. */
3093 errmsg = "illegal encoding";
3094 startinpos = (((const char *)q)-2)-starts;
3095 endinpos = startinpos+2;
3096 /* Fall through to report the error */
3098 utf16Error:
3099 outpos = p - PyUnicode_AS_UNICODE(unicode);
3100 if (unicode_decode_call_errorhandler(
3101 errors,
3102 &errorHandler,
3103 "utf16", errmsg,
3104 &starts,
3105 (const char **)&e,
3106 &startinpos,
3107 &endinpos,
3108 &exc,
3109 (const char **)&q,
3110 &unicode,
3111 &outpos,
3112 &p))
3113 goto onError;
3115 /* remaining byte at the end? (size should be even) */
/* e == q: exactly one input byte is left over. */
3116 if (e == q) {
3117 if (!consumed) {
3118 errmsg = "truncated data";
3119 startinpos = ((const char *)q) - starts;
3120 endinpos = ((const char *)e) + 1 - starts;
3121 outpos = p - PyUnicode_AS_UNICODE(unicode);
3122 if (unicode_decode_call_errorhandler(
3123 errors,
3124 &errorHandler,
3125 "utf16", errmsg,
3126 &starts,
3127 (const char **)&e,
3128 &startinpos,
3129 &endinpos,
3130 &exc,
3131 (const char **)&q,
3132 &unicode,
3133 &outpos,
3134 &p))
3135 goto onError;
3136 /* The remaining input chars are ignored if the callback
3137 chooses to skip the input */
3141 if (byteorder)
3142 *byteorder = bo;
3144 if (consumed)
3145 *consumed = (const char *)q-starts;
3147 /* Adjust length */
3148 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3149 goto onError;
3151 Py_XDECREF(errorHandler);
3152 Py_XDECREF(exc);
3153 return (PyObject *)unicode;
3155 onError:
3156 Py_DECREF(unicode);
3157 Py_XDECREF(errorHandler);
3158 Py_XDECREF(exc);
3159 return NULL;
3162 #undef FAST_CHAR_MASK
3163 #undef SWAPPED_FAST_CHAR_MASK
/* Encode `size` Py_UNICODE units starting at `s` as UTF-16 and return the
   result as a new bytes object.

   byteorder == 0 writes a U+FEFF mark first and uses the compile-time
   default byte offsets; -1 forces little-endian and 1 forces big-endian,
   both without a mark.  On Py_UNICODE_WIDE builds every code point
   >= 0x10000 is written as a surrogate pair.  `errors` is not referenced
   anywhere in the body.

   Returns NULL via PyErr_NoMemory() when the size computation would
   overflow, or NULL when PyBytes_FromStringAndSize() fails. */
3165 PyObject *
3166 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3167 Py_ssize_t size,
3168 const char *errors,
3169 int byteorder)
3171 PyObject *v;
3172 unsigned char *p;
3173 Py_ssize_t nsize, bytesize;
3174 #ifdef Py_UNICODE_WIDE
3175 Py_ssize_t i, pairs;
3176 #else
3177 const int pairs = 0;
3178 #endif
3179 /* Offsets from p for storing byte pairs in the right order. */
3180 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
3181 int ihi = 1, ilo = 0;
3182 #else
3183 int ihi = 0, ilo = 1;
3184 #endif
/* STORECHAR writes the two bytes of CH at p using the ihi/ilo offsets and
   advances p by two. */
3186 #define STORECHAR(CH) \
3187 do { \
3188 p[ihi] = ((CH) >> 8) & 0xff; \
3189 p[ilo] = (CH) & 0xff; \
3190 p += 2; \
3191 } while(0)
/* Wide builds: count the code points that need a second output unit. */
3193 #ifdef Py_UNICODE_WIDE
3194 for (i = pairs = 0; i < size; i++)
3195 if (s[i] >= 0x10000)
3196 pairs++;
3197 #endif
3198 /* 2 * (size + pairs + (byteorder == 0)) */
/* NOTE(review): size is a Py_ssize_t, so the first comparison below looks
   redundant; the second one is the effective guard for the addition.
   Confirm before removing. */
3199 if (size > PY_SSIZE_T_MAX ||
3200 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3201 return PyErr_NoMemory();
3202 nsize = size + pairs + (byteorder == 0);
3203 bytesize = nsize * 2;
/* Dividing back detects overflow of the multiplication above. */
3204 if (bytesize / 2 != nsize)
3205 return PyErr_NoMemory();
3206 v = PyBytes_FromStringAndSize(NULL, bytesize);
3207 if (v == NULL)
3208 return NULL;
3210 p = (unsigned char *)PyBytes_AS_STRING(v);
3211 if (byteorder == 0)
3212 STORECHAR(0xFEFF);
3213 if (size == 0)
3214 goto done;
3216 if (byteorder == -1) {
3217 /* force LE */
3218 ihi = 1;
3219 ilo = 0;
3221 else if (byteorder == 1) {
3222 /* force BE */
3223 ihi = 0;
3224 ilo = 1;
3227 while (size-- > 0) {
3228 Py_UNICODE ch = *s++;
/* ch2 stays 0 unless the code point is split below; a non-zero ch2 is
   written as the second unit. */
3229 Py_UNICODE ch2 = 0;
3230 #ifdef Py_UNICODE_WIDE
3231 if (ch >= 0x10000) {
3232 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3233 ch = 0xD800 | ((ch-0x10000) >> 10);
3235 #endif
3236 STORECHAR(ch);
3237 if (ch2)
3238 STORECHAR(ch2);
3241 done:
3242 return v;
3243 #undef STORECHAR
/* Encode the Unicode object `unicode` through PyUnicode_EncodeUTF16(),
   passing NULL for `errors`.  When `unicode` fails PyUnicode_Check(),
   PyErr_BadArgument() is called and NULL is returned.

   NOTE(review): in this listing the call stops after "NULL,"; the final
   byteorder argument and the closing lines are not visible here. Confirm
   them against the upstream file. */
3246 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3248 if (!PyUnicode_Check(unicode)) {
3249 PyErr_BadArgument();
3250 return NULL;
3252 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3253 PyUnicode_GET_SIZE(unicode),
3254 NULL,
3258 /* --- Unicode Escape Codec ----------------------------------------------- */
/* Cached pointer to the name-lookup C API obtained through
   PyCapsule_Import(); it is filled in the first time a \N escape is
   decoded. */
3260 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
/* Decode `size` bytes at `s`, written in unicode-escape syntax, into a new
   Unicode object.

   Bytes other than a backslash are stored as the code point with the same
   value.  After a backslash the following are handled: a newline (dropped),
   the single-character escapes listed in the switch, one to three octal
   digits, \xXX, \uXXXX, \UXXXXXXXX and \N{name}.  Any other character is
   stored together with its backslash.  Malformed escapes are passed to
   unicode_decode_call_errorhandler() with `errors`.

   Returns the new object, or NULL on allocation failure, error-handler
   failure, resize failure, or when the name-lookup capsule cannot be
   imported for a \N escape. */
3262 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3263 Py_ssize_t size,
3264 const char *errors)
3266 const char *starts = s;
3267 Py_ssize_t startinpos;
3268 Py_ssize_t endinpos;
3269 Py_ssize_t outpos;
3270 int i;
3271 PyUnicodeObject *v;
3272 Py_UNICODE *p;
3273 const char *end;
3274 char* message;
3275 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3276 PyObject *errorHandler = NULL;
3277 PyObject *exc = NULL;
3279 /* Escaped strings will always be longer than the resulting
3280 Unicode string, so we start with size here and then reduce the
3281 length after conversion to the true value.
3282 (but if the error callback returns a long replacement string
3283 we'll have to allocate more space) */
3284 v = _PyUnicode_New(size);
3285 if (v == NULL)
3286 goto onError;
3287 if (size == 0)
3288 return (PyObject *)v;
3290 p = PyUnicode_AS_UNICODE(v);
3291 end = s + size;
3293 while (s < end) {
3294 unsigned char c;
3295 Py_UNICODE x;
3296 int digits;
3298 /* Non-escape characters are interpreted as Unicode ordinals */
3299 if (*s != '\\') {
3300 *p++ = (unsigned char) *s++;
3301 continue;
3304 startinpos = s-starts;
3305 /* \ - Escapes */
3306 s++;
/* NOTE(review): when the backslash is the last input byte, the read below
   touches the byte at `end` before the s > end test replaces c.  This
   presumably relies on the buffer having a readable terminator; verify
   against callers. */
3307 c = *s++;
3308 if (s > end)
3309 c = '\0'; /* Invalid after \ */
3310 switch (c) {
3312 /* \x escapes */
3313 case '\n': break;
3314 case '\\': *p++ = '\\'; break;
3315 case '\'': *p++ = '\''; break;
3316 case '\"': *p++ = '\"'; break;
3317 case 'b': *p++ = '\b'; break;
3318 case 'f': *p++ = '\014'; break; /* FF */
3319 case 't': *p++ = '\t'; break;
3320 case 'n': *p++ = '\n'; break;
3321 case 'r': *p++ = '\r'; break;
3322 case 'v': *p++ = '\013'; break; /* VT */
3323 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3325 /* \OOO (octal) escapes */
3326 case '0': case '1': case '2': case '3':
3327 case '4': case '5': case '6': case '7':
/* s[-1] is the first octal digit; up to two more are consumed. */
3328 x = s[-1] - '0';
3329 if (s < end && '0' <= *s && *s <= '7') {
3330 x = (x<<3) + *s++ - '0';
3331 if (s < end && '0' <= *s && *s <= '7')
3332 x = (x<<3) + *s++ - '0';
3334 *p++ = x;
3335 break;
3337 /* hex escapes */
3338 /* \xXX */
3339 case 'x':
3340 digits = 2;
3341 message = "truncated \\xXX escape";
3342 goto hexescape;
3344 /* \uXXXX */
3345 case 'u':
3346 digits = 4;
3347 message = "truncated \\uXXXX escape";
3348 goto hexescape;
3350 /* \UXXXXXXXX */
3351 case 'U':
3352 digits = 8;
3353 message = "truncated \\UXXXXXXXX escape";
/* Shared tail for the three hex forms: `digits` and `message` were set by
   the case that jumped (or fell through) here. */
3354 hexescape:
3355 chr = 0;
3356 outpos = p-PyUnicode_AS_UNICODE(v);
3357 if (s+digits>end) {
3358 endinpos = size;
3359 if (unicode_decode_call_errorhandler(
3360 errors, &errorHandler,
3361 "unicodeescape", "end of string in escape sequence",
3362 &starts, &end, &startinpos, &endinpos, &exc, &s,
3363 &v, &outpos, &p))
3364 goto onError;
3365 goto nextByte;
3367 for (i = 0; i < digits; ++i) {
3368 c = (unsigned char) s[i];
3369 if (!ISXDIGIT(c)) {
3370 endinpos = (s+i+1)-starts;
3371 if (unicode_decode_call_errorhandler(
3372 errors, &errorHandler,
3373 "unicodeescape", message,
3374 &starts, &end, &startinpos, &endinpos, &exc, &s,
3375 &v, &outpos, &p))
3376 goto onError;
3377 goto nextByte;
/* Shift the accumulated value up one hex digit, then add the new digit. */
3379 chr = (chr<<4) & ~0xF;
3380 if (c >= '0' && c <= '9')
3381 chr += c - '0';
3382 else if (c >= 'a' && c <= 'f')
3383 chr += 10 + c - 'a';
3384 else
3385 chr += 10 + c - 'A';
3387 s += i;
3388 if (chr == 0xffffffff && PyErr_Occurred())
3389 /* _decoding_error will have already written into the
3390 target buffer. */
3391 break;
/* Also the jump target for a successfully resolved \N{name}. */
3392 store:
3393 /* when we get here, chr is a 32-bit unicode character */
3394 if (chr <= 0xffff)
3395 /* UCS-2 character */
3396 *p++ = (Py_UNICODE) chr;
3397 else if (chr <= 0x10ffff) {
3398 /* UCS-4 character. Either store directly, or as
3399 surrogate pair. */
3400 #ifdef Py_UNICODE_WIDE
3401 *p++ = chr;
3402 #else
3403 chr -= 0x10000L;
3404 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3405 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3406 #endif
3407 } else {
3408 endinpos = s-starts;
3409 outpos = p-PyUnicode_AS_UNICODE(v);
3410 if (unicode_decode_call_errorhandler(
3411 errors, &errorHandler,
3412 "unicodeescape", "illegal Unicode character",
3413 &starts, &end, &startinpos, &endinpos, &exc, &s,
3414 &v, &outpos, &p))
3415 goto onError;
3417 break;
3419 /* \N{name} */
3420 case 'N':
3421 message = "malformed \\N character escape";
3422 if (ucnhash_CAPI == NULL) {
3423 /* load the unicode data module */
3424 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3425 if (ucnhash_CAPI == NULL)
3426 goto ucnhashError;
/* NOTE(review): the test below and the loop condition that follows both
   dereference s before (or without) comparing it to end, so the byte at
   `end` can be read.  Confirm that every caller supplies a readable byte
   there. */
3428 if (*s == '{') {
3429 const char *start = s+1;
3430 /* look for the closing brace */
3431 while (*s != '}' && s < end)
3432 s++;
3433 if (s > start && s < end && *s == '}') {
3434 /* found a name. look it up in the unicode database */
3435 message = "unknown Unicode character name";
3436 s++;
3437 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3438 goto store;
3441 endinpos = s-starts;
3442 outpos = p-PyUnicode_AS_UNICODE(v);
3443 if (unicode_decode_call_errorhandler(
3444 errors, &errorHandler,
3445 "unicodeescape", message,
3446 &starts, &end, &startinpos, &endinpos, &exc, &s,
3447 &v, &outpos, &p))
3448 goto onError;
3449 break;
3451 default:
/* s > end: the backslash was the final byte (c was forced to 0 above). */
3452 if (s > end) {
3453 message = "\\ at end of string";
3454 s--;
3455 endinpos = s-starts;
3456 outpos = p-PyUnicode_AS_UNICODE(v);
3457 if (unicode_decode_call_errorhandler(
3458 errors, &errorHandler,
3459 "unicodeescape", message,
3460 &starts, &end, &startinpos, &endinpos, &exc, &s,
3461 &v, &outpos, &p))
3462 goto onError;
3464 else {
/* Unknown escape: keep the backslash and the character after it. */
3465 *p++ = '\\';
3466 *p++ = (unsigned char)s[-1];
3468 break;
3470 nextByte:
3473 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3474 goto onError;
3475 Py_XDECREF(errorHandler);
3476 Py_XDECREF(exc);
3477 return (PyObject *)v;
3479 ucnhashError:
3480 PyErr_SetString(
3481 PyExc_UnicodeError,
3482 "\\N escapes not supported (can't load unicodedata module)"
3484 Py_XDECREF(v);
3485 Py_XDECREF(errorHandler);
3486 Py_XDECREF(exc);
3487 return NULL;
3489 onError:
3490 Py_XDECREF(v);
3491 Py_XDECREF(errorHandler);
3492 Py_XDECREF(exc);
3493 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.
*/
3503 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3504 Py_ssize_t size,
3505 Py_UNICODE ch)
3507 /* like wcschr, but doesn't stop at NULL characters */
3509 while (size-- > 0) {
3510 if (*s == ch)
3511 return s;
3512 s++;
3515 return NULL;
3518 static const char *hexdigits = "0123456789abcdef";
3520 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3521 Py_ssize_t size)
3523 PyObject *repr;
3524 char *p;
3526 #ifdef Py_UNICODE_WIDE
3527 const Py_ssize_t expandsize = 10;
3528 #else
3529 const Py_ssize_t expandsize = 6;
3530 #endif
3532 /* XXX(nnorwitz): rather than over-allocating, it would be
3533 better to choose a different scheme. Perhaps scan the
3534 first N-chars of the string and allocate based on that size.
3536 /* Initial allocation is based on the longest-possible unichr
3537 escape.
3539 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3540 unichr, so in this case it's the longest unichr escape. In
3541 narrow (UTF-16) builds this is five chars per source unichr
3542 since there are two unichrs in the surrogate pair, so in narrow
3543 (UTF-16) builds it's not the longest unichr escape.
3545 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3546 so in the narrow (UTF-16) build case it's the longest unichr
3547 escape.
3550 if (size == 0)
3551 return PyBytes_FromStringAndSize(NULL, 0);
3553 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3554 return PyErr_NoMemory();
3556 repr = PyBytes_FromStringAndSize(NULL,
3558 + expandsize*size
3559 + 1);
3560 if (repr == NULL)
3561 return NULL;
3563 p = PyBytes_AS_STRING(repr);
3565 while (size-- > 0) {
3566 Py_UNICODE ch = *s++;
3568 /* Escape backslashes */
3569 if (ch == '\\') {
3570 *p++ = '\\';
3571 *p++ = (char) ch;
3572 continue;
3575 #ifdef Py_UNICODE_WIDE
3576 /* Map 21-bit characters to '\U00xxxxxx' */
3577 else if (ch >= 0x10000) {
3578 *p++ = '\\';
3579 *p++ = 'U';
3580 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3581 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3582 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3583 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3584 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3585 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3586 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3587 *p++ = hexdigits[ch & 0x0000000F];
3588 continue;
3590 #else
3591 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3592 else if (ch >= 0xD800 && ch < 0xDC00) {
3593 Py_UNICODE ch2;
3594 Py_UCS4 ucs;
3596 ch2 = *s++;
3597 size--;
3598 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3599 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3600 *p++ = '\\';
3601 *p++ = 'U';
3602 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3603 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3604 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3605 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3606 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3607 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3608 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3609 *p++ = hexdigits[ucs & 0x0000000F];
3610 continue;
3612 /* Fall through: isolated surrogates are copied as-is */
3613 s--;
3614 size++;
3616 #endif
3618 /* Map 16-bit characters to '\uxxxx' */
3619 if (ch >= 256) {
3620 *p++ = '\\';
3621 *p++ = 'u';
3622 *p++ = hexdigits[(ch >> 12) & 0x000F];
3623 *p++ = hexdigits[(ch >> 8) & 0x000F];
3624 *p++ = hexdigits[(ch >> 4) & 0x000F];
3625 *p++ = hexdigits[ch & 0x000F];
3628 /* Map special whitespace to '\t', \n', '\r' */
3629 else if (ch == '\t') {
3630 *p++ = '\\';
3631 *p++ = 't';
3633 else if (ch == '\n') {
3634 *p++ = '\\';
3635 *p++ = 'n';
3637 else if (ch == '\r') {
3638 *p++ = '\\';
3639 *p++ = 'r';
3642 /* Map non-printable US ASCII to '\xhh' */
3643 else if (ch < ' ' || ch >= 0x7F) {
3644 *p++ = '\\';
3645 *p++ = 'x';
3646 *p++ = hexdigits[(ch >> 4) & 0x000F];
3647 *p++ = hexdigits[ch & 0x000F];
3650 /* Copy everything else as-is */
3651 else
3652 *p++ = (char) ch;
3655 assert(p - PyBytes_AS_STRING(repr) > 0);
3656 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3657 return NULL;
3658 return repr;
3661 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3663 PyObject *s;
3664 if (!PyUnicode_Check(unicode)) {
3665 PyErr_BadArgument();
3666 return NULL;
3668 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3669 PyUnicode_GET_SIZE(unicode));
3670 return s;
3673 /* --- Raw Unicode Escape Codec ------------------------------------------- */
/* Decode `size` bytes at `s` as raw-unicode-escape data.

   Bytes other than a backslash are stored as their unsigned-char ordinal.
   A run of backslashes is copied to the output verbatim; when the run has
   odd length and is followed by 'u' or 'U', the last backslash is dropped
   and 4 ('u') or 8 ('U') hex digits are parsed into a code point.

   Errors (a non-hex digit inside an escape, or a value above 0x10ffff) are
   routed through unicode_decode_call_errorhandler() using `errors`.

   Returns the new unicode object, or NULL after releasing the partially
   built result on the onError path. */
3675 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3676 Py_ssize_t size,
3677 const char *errors)
3679 const char *starts = s;
3680 Py_ssize_t startinpos;
3681 Py_ssize_t endinpos;
3682 Py_ssize_t outpos;
3683 PyUnicodeObject *v;
3684 Py_UNICODE *p;
3685 const char *end;
3686 const char *bs;
3687 PyObject *errorHandler = NULL;
3688 PyObject *exc = NULL;
3690 /* Escaped strings will always be longer than the resulting
3691 Unicode string, so we start with size here and then reduce the
3692 length after conversion to the true value. (But decoding error
3693 handler might have to resize the string) */
3694 v = _PyUnicode_New(size);
3695 if (v == NULL)
3696 goto onError;
3697 if (size == 0)
3698 return (PyObject *)v;
3699 p = PyUnicode_AS_UNICODE(v);
3700 end = s + size;
3701 while (s < end) {
3702 unsigned char c;
3703 Py_UCS4 x;
3704 int i;
3705 int count;
3707 /* Non-escape characters are interpreted as Unicode ordinals */
3708 if (*s != '\\') {
3709 *p++ = (unsigned char)*s++;
3710 continue;
3712 startinpos = s-starts;
3714 /* \u-escapes are only interpreted iff the number of leading
3715 backslashes is odd */
3716 bs = s;
/* Copy the whole run of backslashes to the output first. */
3717 for (;s < end;) {
3718 if (*s != '\\')
3719 break;
3720 *p++ = (unsigned char)*s++;
/* Even-length run, end of input, or no 'u'/'U' after the run: the
   backslashes copied above stay in the output as literal characters. */
3722 if (((s - bs) & 1) == 0 ||
3723 s >= end ||
3724 (*s != 'u' && *s != 'U')) {
3725 continue;
/* Take back the last backslash copied above: it introduces the escape. */
3727 p--;
3728 count = *s=='u' ? 4 : 8;
3729 s++;
3731 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3732 outpos = p-PyUnicode_AS_UNICODE(v);
/* NOTE(review): this loop reads *s without comparing s against end;
   it looks like it relies on a non-hex byte (presumably a terminating
   NUL) following the input -- confirm against callers. */
3733 for (x = 0, i = 0; i < count; ++i, ++s) {
3734 c = (unsigned char)*s;
3735 if (!ISXDIGIT(c)) {
3736 endinpos = s-starts;
3737 if (unicode_decode_call_errorhandler(
3738 errors, &errorHandler,
3739 "rawunicodeescape", "truncated \\uXXXX",
3740 &starts, &end, &startinpos, &endinpos, &exc, &s,
3741 &v, &outpos, &p))
3742 goto onError;
3743 goto nextByte;
/* Shift the accumulator up by one hex digit, then add the new digit. */
3745 x = (x<<4) & ~0xF;
3746 if (c >= '0' && c <= '9')
3747 x += c - '0';
3748 else if (c >= 'a' && c <= 'f')
3749 x += 10 + c - 'a';
3750 else
3751 x += 10 + c - 'A';
3753 if (x <= 0xffff)
3754 /* UCS-2 character */
3755 *p++ = (Py_UNICODE) x;
3756 else if (x <= 0x10ffff) {
3757 /* UCS-4 character. Either store directly, or as
3758 surrogate pair. */
3759 #ifdef Py_UNICODE_WIDE
3760 *p++ = (Py_UNICODE) x;
3761 #else
3762 x -= 0x10000L;
3763 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3764 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3765 #endif
3766 } else {
/* Value above 0x10ffff: report it through the error handler. */
3767 endinpos = s-starts;
3768 outpos = p-PyUnicode_AS_UNICODE(v);
3769 if (unicode_decode_call_errorhandler(
3770 errors, &errorHandler,
3771 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3772 &starts, &end, &startinpos, &endinpos, &exc, &s,
3773 &v, &outpos, &p))
3774 goto onError;
/* Jump target used after the error handler dealt with a bad hex digit. */
3776 nextByte:
/* Shrink the result to the number of code units actually written. */
3779 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3780 goto onError;
3781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
3783 return (PyObject *)v;
3785 onError:
3786 Py_XDECREF(v);
3787 Py_XDECREF(errorHandler);
3788 Py_XDECREF(exc);
3789 return NULL;
3792 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3793 Py_ssize_t size)
3795 PyObject *repr;
3796 char *p;
3797 char *q;
3799 #ifdef Py_UNICODE_WIDE
3800 const Py_ssize_t expandsize = 10;
3801 #else
3802 const Py_ssize_t expandsize = 6;
3803 #endif
3805 if (size > PY_SSIZE_T_MAX / expandsize)
3806 return PyErr_NoMemory();
3808 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
3809 if (repr == NULL)
3810 return NULL;
3811 if (size == 0)
3812 return repr;
3814 p = q = PyBytes_AS_STRING(repr);
3815 while (size-- > 0) {
3816 Py_UNICODE ch = *s++;
3817 #ifdef Py_UNICODE_WIDE
3818 /* Map 32-bit characters to '\Uxxxxxxxx' */
3819 if (ch >= 0x10000) {
3820 *p++ = '\\';
3821 *p++ = 'U';
3822 *p++ = hexdigits[(ch >> 28) & 0xf];
3823 *p++ = hexdigits[(ch >> 24) & 0xf];
3824 *p++ = hexdigits[(ch >> 20) & 0xf];
3825 *p++ = hexdigits[(ch >> 16) & 0xf];
3826 *p++ = hexdigits[(ch >> 12) & 0xf];
3827 *p++ = hexdigits[(ch >> 8) & 0xf];
3828 *p++ = hexdigits[(ch >> 4) & 0xf];
3829 *p++ = hexdigits[ch & 15];
3831 else
3832 #else
3833 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3834 if (ch >= 0xD800 && ch < 0xDC00) {
3835 Py_UNICODE ch2;
3836 Py_UCS4 ucs;
3838 ch2 = *s++;
3839 size--;
3840 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3841 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3842 *p++ = '\\';
3843 *p++ = 'U';
3844 *p++ = hexdigits[(ucs >> 28) & 0xf];
3845 *p++ = hexdigits[(ucs >> 24) & 0xf];
3846 *p++ = hexdigits[(ucs >> 20) & 0xf];
3847 *p++ = hexdigits[(ucs >> 16) & 0xf];
3848 *p++ = hexdigits[(ucs >> 12) & 0xf];
3849 *p++ = hexdigits[(ucs >> 8) & 0xf];
3850 *p++ = hexdigits[(ucs >> 4) & 0xf];
3851 *p++ = hexdigits[ucs & 0xf];
3852 continue;
3854 /* Fall through: isolated surrogates are copied as-is */
3855 s--;
3856 size++;
3858 #endif
3859 /* Map 16-bit characters to '\uxxxx' */
3860 if (ch >= 256) {
3861 *p++ = '\\';
3862 *p++ = 'u';
3863 *p++ = hexdigits[(ch >> 12) & 0xf];
3864 *p++ = hexdigits[(ch >> 8) & 0xf];
3865 *p++ = hexdigits[(ch >> 4) & 0xf];
3866 *p++ = hexdigits[ch & 15];
3868 /* Copy everything else as-is */
3869 else
3870 *p++ = (char) ch;
3872 size = p - q;
3874 assert(size > 0);
3875 if (_PyBytes_Resize(&repr, size) < 0)
3876 return NULL;
3877 return repr;
3880 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3882 PyObject *s;
3883 if (!PyUnicode_Check(unicode)) {
3884 PyErr_BadArgument();
3885 return NULL;
3887 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3888 PyUnicode_GET_SIZE(unicode));
3890 return s;
3893 /* --- Unicode Internal Codec ------------------------------------------- */
3895 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3896 Py_ssize_t size,
3897 const char *errors)
3899 const char *starts = s;
3900 Py_ssize_t startinpos;
3901 Py_ssize_t endinpos;
3902 Py_ssize_t outpos;
3903 PyUnicodeObject *v;
3904 Py_UNICODE *p;
3905 const char *end;
3906 const char *reason;
3907 PyObject *errorHandler = NULL;
3908 PyObject *exc = NULL;
3910 #ifdef Py_UNICODE_WIDE
3911 Py_UNICODE unimax = PyUnicode_GetMax();
3912 #endif
3914 /* XXX overflow detection missing */
3915 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3916 if (v == NULL)
3917 goto onError;
3918 if (PyUnicode_GetSize((PyObject *)v) == 0)
3919 return (PyObject *)v;
3920 p = PyUnicode_AS_UNICODE(v);
3921 end = s + size;
3923 while (s < end) {
3924 memcpy(p, s, sizeof(Py_UNICODE));
3925 /* We have to sanity check the raw data, otherwise doom looms for
3926 some malformed UCS-4 data. */
3927 if (
3928 #ifdef Py_UNICODE_WIDE
3929 *p > unimax || *p < 0 ||
3930 #endif
3931 end-s < Py_UNICODE_SIZE
3934 startinpos = s - starts;
3935 if (end-s < Py_UNICODE_SIZE) {
3936 endinpos = end-starts;
3937 reason = "truncated input";
3939 else {
3940 endinpos = s - starts + Py_UNICODE_SIZE;
3941 reason = "illegal code point (> 0x10FFFF)";
3943 outpos = p - PyUnicode_AS_UNICODE(v);
3944 if (unicode_decode_call_errorhandler(
3945 errors, &errorHandler,
3946 "unicode_internal", reason,
3947 &starts, &end, &startinpos, &endinpos, &exc, &s,
3948 &v, &outpos, &p)) {
3949 goto onError;
3952 else {
3953 p++;
3954 s += Py_UNICODE_SIZE;
3958 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3959 goto onError;
3960 Py_XDECREF(errorHandler);
3961 Py_XDECREF(exc);
3962 return (PyObject *)v;
3964 onError:
3965 Py_XDECREF(v);
3966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
3968 return NULL;
3971 /* --- Latin-1 Codec ------------------------------------------------------ */
3973 PyObject *PyUnicode_DecodeLatin1(const char *s,
3974 Py_ssize_t size,
3975 const char *errors)
3977 PyUnicodeObject *v;
3978 Py_UNICODE *p;
3979 const char *e, *unrolled_end;
3981 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3982 if (size == 1) {
3983 Py_UNICODE r = *(unsigned char*)s;
3984 return PyUnicode_FromUnicode(&r, 1);
3987 v = _PyUnicode_New(size);
3988 if (v == NULL)
3989 goto onError;
3990 if (size == 0)
3991 return (PyObject *)v;
3992 p = PyUnicode_AS_UNICODE(v);
3993 e = s + size;
3994 /* Unrolling the copy makes it much faster by reducing the looping
3995 overhead. This is similar to what many memcpy() implementations do. */
3996 unrolled_end = e - 4;
3997 while (s < unrolled_end) {
3998 p[0] = (unsigned char) s[0];
3999 p[1] = (unsigned char) s[1];
4000 p[2] = (unsigned char) s[2];
4001 p[3] = (unsigned char) s[3];
4002 s += 4;
4003 p += 4;
4005 while (s < e)
4006 *p++ = (unsigned char) *s++;
4007 return (PyObject *)v;
4009 onError:
4010 Py_XDECREF(v);
4011 return NULL;
4014 /* create or adjust a UnicodeEncodeError */
4015 static void make_encode_exception(PyObject **exceptionObject,
4016 const char *encoding,
4017 const Py_UNICODE *unicode, Py_ssize_t size,
4018 Py_ssize_t startpos, Py_ssize_t endpos,
4019 const char *reason)
4021 if (*exceptionObject == NULL) {
4022 *exceptionObject = PyUnicodeEncodeError_Create(
4023 encoding, unicode, size, startpos, endpos, reason);
4025 else {
4026 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4027 goto onError;
4028 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4029 goto onError;
4030 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4031 goto onError;
4032 return;
4033 onError:
4034 Py_DECREF(*exceptionObject);
4035 *exceptionObject = NULL;
4039 /* raises a UnicodeEncodeError */
4040 static void raise_encode_exception(PyObject **exceptionObject,
4041 const char *encoding,
4042 const Py_UNICODE *unicode, Py_ssize_t size,
4043 Py_ssize_t startpos, Py_ssize_t endpos,
4044 const char *reason)
4046 make_encode_exception(exceptionObject,
4047 encoding, unicode, size, startpos, endpos, reason);
4048 if (*exceptionObject != NULL)
4049 PyCodec_StrictErrors(*exceptionObject);
4052 /* error handling callback helper:
4053 build arguments, call the callback and check the arguments,
4054 put the result into newpos and return the replacement string, which
4055 has to be freed by the caller */
4056 static PyObject *unicode_encode_call_errorhandler(const char *errors,
4057 PyObject **errorHandler,
4058 const char *encoding, const char *reason,
4059 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4060 Py_ssize_t startpos, Py_ssize_t endpos,
4061 Py_ssize_t *newpos)
4063 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4065 PyObject *restuple;
4066 PyObject *resunicode;
4068 if (*errorHandler == NULL) {
4069 *errorHandler = PyCodec_LookupError(errors);
4070 if (*errorHandler == NULL)
4071 return NULL;
4074 make_encode_exception(exceptionObject,
4075 encoding, unicode, size, startpos, endpos, reason);
4076 if (*exceptionObject == NULL)
4077 return NULL;
4079 restuple = PyObject_CallFunctionObjArgs(
4080 *errorHandler, *exceptionObject, NULL);
4081 if (restuple == NULL)
4082 return NULL;
4083 if (!PyTuple_Check(restuple)) {
4084 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4085 Py_DECREF(restuple);
4086 return NULL;
4088 if (!PyArg_ParseTuple(restuple, argparse,
4089 &resunicode, newpos)) {
4090 Py_DECREF(restuple);
4091 return NULL;
4093 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4094 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4095 Py_DECREF(restuple);
4096 return NULL;
4098 if (*newpos<0)
4099 *newpos = size+*newpos;
4100 if (*newpos<0 || *newpos>size) {
4101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4102 Py_DECREF(restuple);
4103 return NULL;
4105 Py_INCREF(resunicode);
4106 Py_DECREF(restuple);
4107 return resunicode;
/* Shared encoder for the one-byte-per-character codecs.

   Encodes `size` code units starting at `p`.  A code unit smaller than
   `limit` is written as a single byte; the wrappers in this file pass 256
   ("latin-1") and 128 ("ascii").  Each maximal run of code units >= limit
   is handled according to `errors`:

     NULL / "strict"       raise UnicodeEncodeError
     "replace"             one '?' per unencodable code unit
     "ignore"              the run is dropped
     "xmlcharrefreplace"   "&#N;" per code unit, N being its decimal value
     anything else         handler obtained through
                           unicode_encode_call_errorhandler(); it may return
                           bytes (copied to the output as is) or str (which
                           must itself be encodable under `limit`)

   Returns a new bytes object, or NULL on error. */
4110 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
4111 Py_ssize_t size,
4112 const char *errors,
4113 int limit)
4115 /* output object */
4116 PyObject *res;
4117 /* pointers to the beginning and end+1 of input */
4118 const Py_UNICODE *startp = p;
4119 const Py_UNICODE *endp = p + size;
4120 /* pointer to the beginning of the unencodable characters */
4121 /* const Py_UNICODE *badp = NULL; */
4122 /* pointer into the output */
4123 char *str;
4124 /* current output position */
4125 Py_ssize_t ressize;
4126 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4127 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4128 PyObject *errorHandler = NULL;
4129 PyObject *exc = NULL;
4130 /* the following variable is used for caching string comparisons
4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132 int known_errorHandler = -1;
4134 /* allocate enough for a simple encoding without
4135 replacements, if we need more, we'll resize */
4136 if (size == 0)
4137 return PyBytes_FromStringAndSize(NULL, 0);
4138 res = PyBytes_FromStringAndSize(NULL, size);
4139 if (res == NULL)
4140 return NULL;
4141 str = PyBytes_AS_STRING(res);
4142 ressize = size;
4144 while (p<endp) {
4145 Py_UNICODE c = *p;
4147 /* can we encode this? */
4148 if (c<limit) {
4149 /* no overflow check, because we know that the space is enough */
4150 *str++ = (char)c;
4151 ++p;
4153 else {
4154 Py_ssize_t unicodepos = p-startp;
4155 Py_ssize_t requiredsize;
4156 PyObject *repunicode;
4157 Py_ssize_t repsize;
4158 Py_ssize_t newpos;
4159 Py_ssize_t respos;
4160 Py_UNICODE *uni2;
4161 /* startpos for collecting unencodable chars */
4162 const Py_UNICODE *collstart = p;
4163 const Py_UNICODE *collend = p;
4164 /* find all unencodable characters */
4165 while ((collend < endp) && ((*collend)>=limit))
4166 ++collend;
4167 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4168 if (known_errorHandler==-1) {
4169 if ((errors==NULL) || (!strcmp(errors, "strict")))
4170 known_errorHandler = 1;
4171 else if (!strcmp(errors, "replace"))
4172 known_errorHandler = 2;
4173 else if (!strcmp(errors, "ignore"))
4174 known_errorHandler = 3;
4175 else if (!strcmp(errors, "xmlcharrefreplace"))
4176 known_errorHandler = 4;
4177 else
4178 known_errorHandler = 0;
4180 switch (known_errorHandler) {
4181 case 1: /* strict */
4182 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4183 goto onError;
4184 case 2: /* replace */
/* One '?' per unencodable code unit; the initial allocation of `size`
   bytes already has room for them. */
4185 while (collstart++<collend)
4186 *str++ = '?'; /* fall through */
4187 case 3: /* ignore */
4188 p = collend;
4189 break;
4190 case 4: /* xmlcharrefreplace */
4191 respos = str - PyBytes_AS_STRING(res);
4192 /* determine replacement size (temporarily (mis)uses p) */
/* Each reference costs 2 bytes for "&#", the decimal digits, and 1 byte
   for ";". */
4193 for (p = collstart, repsize = 0; p < collend; ++p) {
4194 if (*p<10)
4195 repsize += 2+1+1;
4196 else if (*p<100)
4197 repsize += 2+2+1;
4198 else if (*p<1000)
4199 repsize += 2+3+1;
4200 else if (*p<10000)
4201 repsize += 2+4+1;
4202 #ifndef Py_UNICODE_WIDE
4203 else
4204 repsize += 2+5+1;
4205 #else
4206 else if (*p<100000)
4207 repsize += 2+5+1;
4208 else if (*p<1000000)
4209 repsize += 2+6+1;
4210 else
4211 repsize += 2+7+1;
4212 #endif
/* Space needed: what is already written + the replacement + one byte for
   every input code unit after the run.  Grow to at least double the
   current size when growing at all. */
4214 requiredsize = respos+repsize+(endp-collend);
4215 if (requiredsize > ressize) {
4216 if (requiredsize<2*ressize)
4217 requiredsize = 2*ressize;
4218 if (_PyBytes_Resize(&res, requiredsize))
4219 goto onError;
4220 str = PyBytes_AS_STRING(res) + respos;
4221 ressize = requiredsize;
4223 /* generate replacement (temporarily (mis)uses p) */
/* NOTE(review): sprintf() also writes a terminating NUL after each
   reference, one byte beyond what repsize accounts for; presumably this
   relies on the bytes object's own trailing terminator byte -- confirm. */
4224 for (p = collstart; p < collend; ++p) {
4225 str += sprintf(str, "&#%d;", (int)*p);
4227 p = collend;
4228 break;
4229 default:
4230 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4231 encoding, reason, startp, size, &exc,
4232 collstart-startp, collend-startp, &newpos);
4233 if (repunicode == NULL)
4234 goto onError;
4235 if (PyBytes_Check(repunicode)) {
4236 /* Directly copy bytes result to output. */
4237 repsize = PyBytes_Size(repunicode);
4238 if (repsize > 1) {
4239 /* Make room for all additional bytes. */
/* The buffer grows by repsize-1 bytes here. */
4240 respos = str - PyBytes_AS_STRING(res);
4241 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4242 Py_DECREF(repunicode);
4243 goto onError;
4245 str = PyBytes_AS_STRING(res) + respos;
4246 ressize += repsize-1;
4248 memcpy(str, PyBytes_AsString(repunicode), repsize);
4249 str += repsize;
/* Resume at the position chosen by the error handler. */
4250 p = startp + newpos;
4251 Py_DECREF(repunicode);
4252 break;
4254 /* need more space? (at least enough for what we
4255 have+the replacement+the rest of the string, so
4256 we won't have to check space for encodable characters) */
4257 respos = str - PyBytes_AS_STRING(res);
4258 repsize = PyUnicode_GET_SIZE(repunicode);
4259 requiredsize = respos+repsize+(endp-collend);
4260 if (requiredsize > ressize) {
4261 if (requiredsize<2*ressize)
4262 requiredsize = 2*ressize;
4263 if (_PyBytes_Resize(&res, requiredsize)) {
4264 Py_DECREF(repunicode);
4265 goto onError;
4267 str = PyBytes_AS_STRING(res) + respos;
4268 ressize = requiredsize;
4270 /* check if there is anything unencodable in the replacement
4271 and copy it to the output */
4272 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4273 c = *uni2;
4274 if (c >= limit) {
/* The error is reported at the first code unit of the original run
   (unicodepos), not at a position inside the replacement. */
4275 raise_encode_exception(&exc, encoding, startp, size,
4276 unicodepos, unicodepos+1, reason);
4277 Py_DECREF(repunicode);
4278 goto onError;
4280 *str = (char)c;
4282 p = startp + newpos;
4283 Py_DECREF(repunicode);
4287 /* Resize if we allocated too much */
4288 size = str - PyBytes_AS_STRING(res);
4289 if (size < ressize) { /* If this fails res will be NULL */
4290 assert(size >= 0);
4291 if (_PyBytes_Resize(&res, size) < 0)
4292 goto onError;
4295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
4297 return res;
4299 onError:
4300 Py_XDECREF(res);
4301 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc);
4303 return NULL;
4306 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4307 Py_ssize_t size,
4308 const char *errors)
4310 return unicode_encode_ucs1(p, size, errors, 256);
4313 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4315 if (!PyUnicode_Check(unicode)) {
4316 PyErr_BadArgument();
4317 return NULL;
4319 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4320 PyUnicode_GET_SIZE(unicode),
4321 NULL);
4324 /* --- 7-bit ASCII Codec -------------------------------------------------- */
4326 PyObject *PyUnicode_DecodeASCII(const char *s,
4327 Py_ssize_t size,
4328 const char *errors)
4330 const char *starts = s;
4331 PyUnicodeObject *v;
4332 Py_UNICODE *p;
4333 Py_ssize_t startinpos;
4334 Py_ssize_t endinpos;
4335 Py_ssize_t outpos;
4336 const char *e;
4337 PyObject *errorHandler = NULL;
4338 PyObject *exc = NULL;
4340 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4341 if (size == 1 && *(unsigned char*)s < 128) {
4342 Py_UNICODE r = *(unsigned char*)s;
4343 return PyUnicode_FromUnicode(&r, 1);
4346 v = _PyUnicode_New(size);
4347 if (v == NULL)
4348 goto onError;
4349 if (size == 0)
4350 return (PyObject *)v;
4351 p = PyUnicode_AS_UNICODE(v);
4352 e = s + size;
4353 while (s < e) {
4354 register unsigned char c = (unsigned char)*s;
4355 if (c < 128) {
4356 *p++ = c;
4357 ++s;
4359 else {
4360 startinpos = s-starts;
4361 endinpos = startinpos + 1;
4362 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4363 if (unicode_decode_call_errorhandler(
4364 errors, &errorHandler,
4365 "ascii", "ordinal not in range(128)",
4366 &starts, &e, &startinpos, &endinpos, &exc, &s,
4367 &v, &outpos, &p))
4368 goto onError;
4371 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4372 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4373 goto onError;
4374 Py_XDECREF(errorHandler);
4375 Py_XDECREF(exc);
4376 return (PyObject *)v;
4378 onError:
4379 Py_XDECREF(v);
4380 Py_XDECREF(errorHandler);
4381 Py_XDECREF(exc);
4382 return NULL;
4385 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4386 Py_ssize_t size,
4387 const char *errors)
4389 return unicode_encode_ucs1(p, size, errors, 128);
4392 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4394 if (!PyUnicode_Check(unicode)) {
4395 PyErr_BadArgument();
4396 return NULL;
4398 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4399 PyUnicode_GET_SIZE(unicode),
4400 NULL);
4403 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4405 /* --- MBCS codecs for Windows -------------------------------------------- */
4407 #if SIZEOF_INT < SIZEOF_SIZE_T
4408 #define NEED_RETRY
4409 #endif
4411 /* XXX This code is limited to "true" double-byte encodings, as
4412 a) it assumes an incomplete character consists of a single byte, and
4413 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4414 encodings, see IsDBCSLeadByteEx documentation. */
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must satisfy IsDBCSLeadByte(); in addition, the character found
   by CharPrev() must either be the byte itself, not start with a lead
   byte, or begin exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4428 * Decode MBCS string into unicode object. If 'final' is set, converts
4429 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4431 static int decode_mbcs(PyUnicodeObject **v,
4432 const char *s, /* MBCS string */
4433 int size, /* sizeof MBCS string */
4434 int final)
4436 Py_UNICODE *p;
4437 Py_ssize_t n = 0;
4438 int usize = 0;
4440 assert(size >= 0);
4442 /* Skip trailing lead-byte unless 'final' is set */
4443 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4444 --size;
4446 /* First get the size of the result */
4447 if (size > 0) {
4448 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4449 if (usize == 0) {
4450 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4451 return -1;
4455 if (*v == NULL) {
4456 /* Create unicode object */
4457 *v = _PyUnicode_New(usize);
4458 if (*v == NULL)
4459 return -1;
4461 else {
4462 /* Extend unicode object */
4463 n = PyUnicode_GET_SIZE(*v);
4464 if (_PyUnicode_Resize(v, n + usize) < 0)
4465 return -1;
4468 /* Do the conversion */
4469 if (size > 0) {
4470 p = PyUnicode_AS_UNICODE(*v) + n;
4471 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4472 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4473 return -1;
4477 return size;
4480 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4481 Py_ssize_t size,
4482 const char *errors,
4483 Py_ssize_t *consumed)
4485 PyUnicodeObject *v = NULL;
4486 int done;
4488 if (consumed)
4489 *consumed = 0;
4491 #ifdef NEED_RETRY
4492 retry:
4493 if (size > INT_MAX)
4494 done = decode_mbcs(&v, s, INT_MAX, 0);
4495 else
4496 #endif
4497 done = decode_mbcs(&v, s, (int)size, !consumed);
4499 if (done < 0) {
4500 Py_XDECREF(v);
4501 return NULL;
4504 if (consumed)
4505 *consumed += done;
4507 #ifdef NEED_RETRY
4508 if (size > INT_MAX) {
4509 s += done;
4510 size -= done;
4511 goto retry;
4513 #endif
4515 return (PyObject *)v;
4518 PyObject *PyUnicode_DecodeMBCS(const char *s,
4519 Py_ssize_t size,
4520 const char *errors)
4522 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4526 * Convert unicode into string object (MBCS).
4527 * Returns 0 if succeed, -1 otherwise.
4529 static int encode_mbcs(PyObject **repr,
4530 const Py_UNICODE *p, /* unicode */
4531 int size) /* size of unicode */
4533 int mbcssize = 0;
4534 Py_ssize_t n = 0;
4536 assert(size >= 0);
4538 /* First get the size of the result */
4539 if (size > 0) {
4540 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4541 if (mbcssize == 0) {
4542 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4543 return -1;
4547 if (*repr == NULL) {
4548 /* Create string object */
4549 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4550 if (*repr == NULL)
4551 return -1;
4553 else {
4554 /* Extend string object */
4555 n = PyBytes_Size(*repr);
4556 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4557 return -1;
4560 /* Do the conversion */
4561 if (size > 0) {
4562 char *s = PyBytes_AS_STRING(*repr) + n;
4563 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4564 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4565 return -1;
4569 return 0;
4572 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4573 Py_ssize_t size,
4574 const char *errors)
4576 PyObject *repr = NULL;
4577 int ret;
4579 #ifdef NEED_RETRY
4580 retry:
4581 if (size > INT_MAX)
4582 ret = encode_mbcs(&repr, p, INT_MAX);
4583 else
4584 #endif
4585 ret = encode_mbcs(&repr, p, (int)size);
4587 if (ret < 0) {
4588 Py_XDECREF(repr);
4589 return NULL;
4592 #ifdef NEED_RETRY
4593 if (size > INT_MAX) {
4594 p += INT_MAX;
4595 size -= INT_MAX;
4596 goto retry;
4598 #endif
4600 return repr;
4603 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4605 if (!PyUnicode_Check(unicode)) {
4606 PyErr_BadArgument();
4607 return NULL;
4609 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4610 PyUnicode_GET_SIZE(unicode),
4611 NULL);
4614 #undef NEED_RETRY
4616 #endif /* MS_WINDOWS */
4618 /* --- Character Mapping Codec -------------------------------------------- */
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL      decode as Latin-1.
   exact unicode object used as a lookup table indexed by byte value; a
                        byte at or beyond the table length, or one that
                        maps to 0xfffe, is undefined.
   any other object     looked up with PyObject_GetItem() using the byte
                        value as an int key.  The result may be an int in
                        range(65536), None (undefined), or a str of any
                        length (an empty str drops the byte).  A
                        LookupError counts as undefined; any other result
                        type raises TypeError.

   Undefined bytes are reported to unicode_decode_call_errorhandler() with
   `errors`.  Returns the new unicode object, or NULL on error. */
4620 PyObject *PyUnicode_DecodeCharmap(const char *s,
4621 Py_ssize_t size,
4622 PyObject *mapping,
4623 const char *errors)
4625 const char *starts = s;
4626 Py_ssize_t startinpos;
4627 Py_ssize_t endinpos;
4628 Py_ssize_t outpos;
4629 const char *e;
4630 PyUnicodeObject *v;
4631 Py_UNICODE *p;
/* Spare output capacity currently reserved for 1-n mappings. */
4632 Py_ssize_t extrachars = 0;
4633 PyObject *errorHandler = NULL;
4634 PyObject *exc = NULL;
4635 Py_UNICODE *mapstring = NULL;
4636 Py_ssize_t maplen = 0;
4638 /* Default to Latin-1 */
4639 if (mapping == NULL)
4640 return PyUnicode_DecodeLatin1(s, size, errors);
4642 v = _PyUnicode_New(size);
4643 if (v == NULL)
4644 goto onError;
4645 if (size == 0)
4646 return (PyObject *)v;
4647 p = PyUnicode_AS_UNICODE(v);
4648 e = s + size;
/* Fast path: the mapping is a plain unicode string used as a table. */
4649 if (PyUnicode_CheckExact(mapping)) {
4650 mapstring = PyUnicode_AS_UNICODE(mapping);
4651 maplen = PyUnicode_GET_SIZE(mapping);
4652 while (s < e) {
4653 unsigned char ch = *s;
4654 Py_UNICODE x = 0xfffe; /* illegal value */
4656 if (ch < maplen)
4657 x = mapstring[ch];
4659 if (x == 0xfffe) {
4660 /* undefined mapping */
4661 outpos = p-PyUnicode_AS_UNICODE(v);
4662 startinpos = s-starts;
4663 endinpos = startinpos+1;
4664 if (unicode_decode_call_errorhandler(
4665 errors, &errorHandler,
4666 "charmap", "character maps to <undefined>",
4667 &starts, &e, &startinpos, &endinpos, &exc, &s,
4668 &v, &outpos, &p)) {
4669 goto onError;
4671 continue;
4673 *p++ = x;
4674 ++s;
/* General path: ask the mapping object for every byte. */
4677 else {
4678 while (s < e) {
4679 unsigned char ch = *s;
4680 PyObject *w, *x;
4682 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4683 w = PyLong_FromLong((long)ch);
4684 if (w == NULL)
4685 goto onError;
4686 x = PyObject_GetItem(mapping, w);
4687 Py_DECREF(w);
4688 if (x == NULL) {
4689 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4690 /* No mapping found means: mapping is undefined. */
4691 PyErr_Clear();
4692 x = Py_None;
4693 Py_INCREF(x);
4694 } else
4695 goto onError;
4698 /* Apply mapping */
4699 if (PyLong_Check(x)) {
4700 long value = PyLong_AS_LONG(x);
4701 if (value < 0 || value > 65535) {
4702 PyErr_SetString(PyExc_TypeError,
4703 "character mapping must be in range(65536)");
4704 Py_DECREF(x);
4705 goto onError;
4707 *p++ = (Py_UNICODE)value;
4709 else if (x == Py_None) {
4710 /* undefined mapping */
4711 outpos = p-PyUnicode_AS_UNICODE(v);
4712 startinpos = s-starts;
4713 endinpos = startinpos+1;
4714 if (unicode_decode_call_errorhandler(
4715 errors, &errorHandler,
4716 "charmap", "character maps to <undefined>",
4717 &starts, &e, &startinpos, &endinpos, &exc, &s,
4718 &v, &outpos, &p)) {
4719 Py_DECREF(x);
4720 goto onError;
4722 Py_DECREF(x);
4723 continue;
4725 else if (PyUnicode_Check(x)) {
4726 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4728 if (targetsize == 1)
4729 /* 1-1 mapping */
4730 *p++ = *PyUnicode_AS_UNICODE(x);
4732 else if (targetsize > 1) {
4733 /* 1-n mapping */
4734 if (targetsize > extrachars) {
4735 /* resize first */
/* Grow by the shortfall plus four times targetsize, and record the
   added space in extrachars. */
4736 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4737 Py_ssize_t needed = (targetsize - extrachars) + \
4738 (targetsize << 2);
4739 extrachars += needed;
4740 /* XXX overflow detection missing */
4741 if (_PyUnicode_Resize(&v,
4742 PyUnicode_GET_SIZE(v) + needed) < 0) {
4743 Py_DECREF(x);
4744 goto onError;
/* Recompute the write pointer from the saved offset after the resize. */
4746 p = PyUnicode_AS_UNICODE(v) + oldpos;
4748 Py_UNICODE_COPY(p,
4749 PyUnicode_AS_UNICODE(x),
4750 targetsize);
4751 p += targetsize;
4752 extrachars -= targetsize;
4754 /* 1-0 mapping: skip the character */
4756 else {
4757 /* wrong return value */
4758 PyErr_SetString(PyExc_TypeError,
4759 "character mapping must return integer, None or str");
4760 Py_DECREF(x);
4761 goto onError;
4763 Py_DECREF(x);
4764 ++s;
/* Trim the result when fewer code units were written than allocated. */
4767 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4768 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4769 goto onError;
4770 Py_XDECREF(errorHandler);
4771 Py_XDECREF(exc);
4772 return (PyObject *)v;
4774 onError:
4775 Py_XDECREF(errorHandler);
4776 Py_XDECREF(exc);
4777 Py_XDECREF(v);
4778 return NULL;
4781 /* Charmap encoding: the lookup table */
/* Three-level lookup table built by PyUnicode_BuildEncodingMap().

   level1 is indexed by (code unit >> 11).  level23 is a variable-length
   tail: the builder allocates sizeof(struct encoding_map) - 1 +
   16*count2 + 128*count3 bytes, places the level-2 table at level23 and
   the level-3 table at level23 + 16*count2. */
4783 struct encoding_map{
4784 PyObject_HEAD
/* First-level index; the builder fills unused entries with 0xFF. */
4785 unsigned char level1[32];
/* Number of 16-byte level-2 blocks and 128-byte level-3 blocks. */
4786 int count2, count3;
/* First byte of the variable-length level-2/level-3 storage. */
4787 unsigned char level23[1];
4790 static PyObject*
4791 encoding_map_size(PyObject *obj, PyObject* args)
4793 struct encoding_map *map = (struct encoding_map*)obj;
4794 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4795 128*map->count3);
/* Method table for EncodingMapType: a single no-argument method "size",
   implemented by encoding_map_size(), followed by the zeroed sentinel
   entry that ends the table. */
4798 static PyMethodDef encoding_map_methods[] = {
4799 {"size", encoding_map_size, METH_NOARGS,
4800 PyDoc_STR("Return the size (in bytes) of this object") },
4801 { 0 }
4804 static void
4805 encoding_map_dealloc(PyObject* o)
4807 PyObject_FREE(o);
/* Type object for struct encoding_map instances ("EncodingMap").

   The initializer is positional.  The only slots that are not zero are
   tp_name, tp_basicsize, tp_dealloc (encoding_map_dealloc), tp_flags
   (Py_TPFLAGS_DEFAULT) and tp_methods (encoding_map_methods).  tp_new is
   0; the visible code creates instances in PyUnicode_BuildEncodingMap()
   with PyObject_MALLOC() followed by PyObject_Init(). */
4810 static PyTypeObject EncodingMapType = {
4811 PyVarObject_HEAD_INIT(NULL, 0)
4812 "EncodingMap", /*tp_name*/
4813 sizeof(struct encoding_map), /*tp_basicsize*/
4814 0, /*tp_itemsize*/
4815 /* methods */
4816 encoding_map_dealloc, /*tp_dealloc*/
4817 0, /*tp_print*/
4818 0, /*tp_getattr*/
4819 0, /*tp_setattr*/
4820 0, /*tp_reserved*/
4821 0, /*tp_repr*/
4822 0, /*tp_as_number*/
4823 0, /*tp_as_sequence*/
4824 0, /*tp_as_mapping*/
4825 0, /*tp_hash*/
4826 0, /*tp_call*/
4827 0, /*tp_str*/
4828 0, /*tp_getattro*/
4829 0, /*tp_setattro*/
4830 0, /*tp_as_buffer*/
4831 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4832 0, /*tp_doc*/
4833 0, /*tp_traverse*/
4834 0, /*tp_clear*/
4835 0, /*tp_richcompare*/
4836 0, /*tp_weaklistoffset*/
4837 0, /*tp_iter*/
4838 0, /*tp_iternext*/
4839 encoding_map_methods, /*tp_methods*/
4840 0, /*tp_members*/
4841 0, /*tp_getset*/
4842 0, /*tp_base*/
4843 0, /*tp_dict*/
4844 0, /*tp_descr_get*/
4845 0, /*tp_descr_set*/
4846 0, /*tp_dictoffset*/
4847 0, /*tp_init*/
4848 0, /*tp_alloc*/
4849 0, /*tp_new*/
4850 0, /*tp_free*/
4851 0, /*tp_is_gc*/
4854 PyObject*
4855 PyUnicode_BuildEncodingMap(PyObject* string)
4857 Py_UNICODE *decode;
4858 PyObject *result;
4859 struct encoding_map *mresult;
4860 int i;
4861 int need_dict = 0;
4862 unsigned char level1[32];
4863 unsigned char level2[512];
4864 unsigned char *mlevel1, *mlevel2, *mlevel3;
4865 int count2 = 0, count3 = 0;
4867 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4868 PyErr_BadArgument();
4869 return NULL;
4871 decode = PyUnicode_AS_UNICODE(string);
4872 memset(level1, 0xFF, sizeof level1);
4873 memset(level2, 0xFF, sizeof level2);
4875 /* If there isn't a one-to-one mapping of NULL to \0,
4876 or if there are non-BMP characters, we need to use
4877 a mapping dictionary. */
4878 if (decode[0] != 0)
4879 need_dict = 1;
4880 for (i = 1; i < 256; i++) {
4881 int l1, l2;
4882 if (decode[i] == 0
4883 #ifdef Py_UNICODE_WIDE
4884 || decode[i] > 0xFFFF
4885 #endif
4887 need_dict = 1;
4888 break;
4890 if (decode[i] == 0xFFFE)
4891 /* unmapped character */
4892 continue;
4893 l1 = decode[i] >> 11;
4894 l2 = decode[i] >> 7;
4895 if (level1[l1] == 0xFF)
4896 level1[l1] = count2++;
4897 if (level2[l2] == 0xFF)
4898 level2[l2] = count3++;
4901 if (count2 >= 0xFF || count3 >= 0xFF)
4902 need_dict = 1;
4904 if (need_dict) {
4905 PyObject *result = PyDict_New();
4906 PyObject *key, *value;
4907 if (!result)
4908 return NULL;
4909 for (i = 0; i < 256; i++) {
4910 key = value = NULL;
4911 key = PyLong_FromLong(decode[i]);
4912 value = PyLong_FromLong(i);
4913 if (!key || !value)
4914 goto failed1;
4915 if (PyDict_SetItem(result, key, value) == -1)
4916 goto failed1;
4917 Py_DECREF(key);
4918 Py_DECREF(value);
4920 return result;
4921 failed1:
4922 Py_XDECREF(key);
4923 Py_XDECREF(value);
4924 Py_DECREF(result);
4925 return NULL;
4928 /* Create a three-level trie */
4929 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4930 16*count2 + 128*count3 - 1);
4931 if (!result)
4932 return PyErr_NoMemory();
4933 PyObject_Init(result, &EncodingMapType);
4934 mresult = (struct encoding_map*)result;
4935 mresult->count2 = count2;
4936 mresult->count3 = count3;
4937 mlevel1 = mresult->level1;
4938 mlevel2 = mresult->level23;
4939 mlevel3 = mresult->level23 + 16*count2;
4940 memcpy(mlevel1, level1, 32);
4941 memset(mlevel2, 0xFF, 16*count2);
4942 memset(mlevel3, 0, 128*count3);
4943 count3 = 0;
4944 for (i = 1; i < 256; i++) {
4945 int o1, o2, o3, i2, i3;
4946 if (decode[i] == 0xFFFE)
4947 /* unmapped character */
4948 continue;
4949 o1 = decode[i]>>11;
4950 o2 = (decode[i]>>7) & 0xF;
4951 i2 = 16*mlevel1[o1] + o2;
4952 if (mlevel2[i2] == 0xFF)
4953 mlevel2[i2] = count3++;
4954 o3 = decode[i] & 0x7F;
4955 i3 = 128*mlevel2[i2] + o3;
4956 mlevel3[i3] = i;
4958 return result;
4961 static int
4962 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4964 struct encoding_map *map = (struct encoding_map*)mapping;
4965 int l1 = c>>11;
4966 int l2 = (c>>7) & 0xF;
4967 int l3 = c & 0x7F;
4968 int i;
4970 #ifdef Py_UNICODE_WIDE
4971 if (c > 0xFFFF) {
4972 return -1;
4974 #endif
4975 if (c == 0)
4976 return 0;
4977 /* level 1*/
4978 i = map->level1[l1];
4979 if (i == 0xFF) {
4980 return -1;
4982 /* level 2*/
4983 i = map->level23[16*i+l2];
4984 if (i == 0xFF) {
4985 return -1;
4987 /* level 3 */
4988 i = map->level23[16*map->count2 + 128*i + l3];
4989 if (i == 0) {
4990 return -1;
4992 return i;
4995 /* Lookup the character ch in the mapping. If the character
4996 can't be found, Py_None is returned (or NULL, if another
4997 error occurred). */
4998 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5000 PyObject *w = PyLong_FromLong((long)c);
5001 PyObject *x;
5003 if (w == NULL)
5004 return NULL;
5005 x = PyObject_GetItem(mapping, w);
5006 Py_DECREF(w);
5007 if (x == NULL) {
5008 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5009 /* No mapping found means: mapping is undefined. */
5010 PyErr_Clear();
5011 x = Py_None;
5012 Py_INCREF(x);
5013 return x;
5014 } else
5015 return NULL;
5017 else if (x == Py_None)
5018 return x;
5019 else if (PyLong_Check(x)) {
5020 long value = PyLong_AS_LONG(x);
5021 if (value < 0 || value > 255) {
5022 PyErr_SetString(PyExc_TypeError,
5023 "character mapping must be in range(256)");
5024 Py_DECREF(x);
5025 return NULL;
5027 return x;
5029 else if (PyBytes_Check(x))
5030 return x;
5031 else {
5032 /* wrong return value */
5033 PyErr_Format(PyExc_TypeError,
5034 "character mapping must return integer, bytes or None, not %.400s",
5035 x->ob_type->tp_name);
5036 Py_DECREF(x);
5037 return NULL;
5041 static int
5042 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5044 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5045 /* exponentially overallocate to minimize reallocations */
5046 if (requiredsize < 2*outsize)
5047 requiredsize = 2*outsize;
5048 if (_PyBytes_Resize(outobj, requiredsize))
5049 return -1;
5050 return 0;
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
5056 /* lookup the character, put the result in the output string and adjust
5057 various state variables. Resize the output bytes object if not enough
5058 space is available. Return a new reference to the object that
5059 was put in the output buffer, or Py_None, if the mapping was undefined
5060 (in which case no character was written) or NULL, if a
5061 reallocation error occurred. The caller must decref the result */
5062 static
5063 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5064 PyObject **outobj, Py_ssize_t *outpos)
5066 PyObject *rep;
5067 char *outstart;
5068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5070 if (Py_TYPE(mapping) == &EncodingMapType) {
5071 int res = encoding_map_lookup(c, mapping);
5072 Py_ssize_t requiredsize = *outpos+1;
5073 if (res == -1)
5074 return enc_FAILED;
5075 if (outsize<requiredsize)
5076 if (charmapencode_resize(outobj, outpos, requiredsize))
5077 return enc_EXCEPTION;
5078 outstart = PyBytes_AS_STRING(*outobj);
5079 outstart[(*outpos)++] = (char)res;
5080 return enc_SUCCESS;
5083 rep = charmapencode_lookup(c, mapping);
5084 if (rep==NULL)
5085 return enc_EXCEPTION;
5086 else if (rep==Py_None) {
5087 Py_DECREF(rep);
5088 return enc_FAILED;
5089 } else {
5090 if (PyLong_Check(rep)) {
5091 Py_ssize_t requiredsize = *outpos+1;
5092 if (outsize<requiredsize)
5093 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5094 Py_DECREF(rep);
5095 return enc_EXCEPTION;
5097 outstart = PyBytes_AS_STRING(*outobj);
5098 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5100 else {
5101 const char *repchars = PyBytes_AS_STRING(rep);
5102 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5103 Py_ssize_t requiredsize = *outpos+repsize;
5104 if (outsize<requiredsize)
5105 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5106 Py_DECREF(rep);
5107 return enc_EXCEPTION;
5109 outstart = PyBytes_AS_STRING(*outobj);
5110 memcpy(outstart + *outpos, repchars, repsize);
5111 *outpos += repsize;
5114 Py_DECREF(rep);
5115 return enc_SUCCESS;
5118 /* handle an error in PyUnicode_EncodeCharmap
5119 Return 0 on success, -1 on error */
5120 static
5121 int charmap_encoding_error(
5122 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5123 PyObject **exceptionObject,
5124 int *known_errorHandler, PyObject **errorHandler, const char *errors,
5125 PyObject **res, Py_ssize_t *respos)
5127 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5128 Py_ssize_t repsize;
5129 Py_ssize_t newpos;
5130 Py_UNICODE *uni2;
5131 /* startpos for collecting unencodable chars */
5132 Py_ssize_t collstartpos = *inpos;
5133 Py_ssize_t collendpos = *inpos+1;
5134 Py_ssize_t collpos;
5135 char *encoding = "charmap";
5136 char *reason = "character maps to <undefined>";
5137 charmapencode_result x;
5139 /* find all unencodable characters */
5140 while (collendpos < size) {
5141 PyObject *rep;
5142 if (Py_TYPE(mapping) == &EncodingMapType) {
5143 int res = encoding_map_lookup(p[collendpos], mapping);
5144 if (res != -1)
5145 break;
5146 ++collendpos;
5147 continue;
5150 rep = charmapencode_lookup(p[collendpos], mapping);
5151 if (rep==NULL)
5152 return -1;
5153 else if (rep!=Py_None) {
5154 Py_DECREF(rep);
5155 break;
5157 Py_DECREF(rep);
5158 ++collendpos;
5160 /* cache callback name lookup
5161 * (if not done yet, i.e. it's the first error) */
5162 if (*known_errorHandler==-1) {
5163 if ((errors==NULL) || (!strcmp(errors, "strict")))
5164 *known_errorHandler = 1;
5165 else if (!strcmp(errors, "replace"))
5166 *known_errorHandler = 2;
5167 else if (!strcmp(errors, "ignore"))
5168 *known_errorHandler = 3;
5169 else if (!strcmp(errors, "xmlcharrefreplace"))
5170 *known_errorHandler = 4;
5171 else
5172 *known_errorHandler = 0;
5174 switch (*known_errorHandler) {
5175 case 1: /* strict */
5176 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5177 return -1;
5178 case 2: /* replace */
5179 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5180 x = charmapencode_output('?', mapping, res, respos);
5181 if (x==enc_EXCEPTION) {
5182 return -1;
5184 else if (x==enc_FAILED) {
5185 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5186 return -1;
5189 /* fall through */
5190 case 3: /* ignore */
5191 *inpos = collendpos;
5192 break;
5193 case 4: /* xmlcharrefreplace */
5194 /* generate replacement (temporarily (mis)uses p) */
5195 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5196 char buffer[2+29+1+1];
5197 char *cp;
5198 sprintf(buffer, "&#%d;", (int)p[collpos]);
5199 for (cp = buffer; *cp; ++cp) {
5200 x = charmapencode_output(*cp, mapping, res, respos);
5201 if (x==enc_EXCEPTION)
5202 return -1;
5203 else if (x==enc_FAILED) {
5204 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5205 return -1;
5209 *inpos = collendpos;
5210 break;
5211 default:
5212 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5213 encoding, reason, p, size, exceptionObject,
5214 collstartpos, collendpos, &newpos);
5215 if (repunicode == NULL)
5216 return -1;
5217 if (PyBytes_Check(repunicode)) {
5218 /* Directly copy bytes result to output. */
5219 Py_ssize_t outsize = PyBytes_Size(*res);
5220 Py_ssize_t requiredsize;
5221 repsize = PyBytes_Size(repunicode);
5222 requiredsize = *respos + repsize;
5223 if (requiredsize > outsize)
5224 /* Make room for all additional bytes. */
5225 if (charmapencode_resize(res, respos, requiredsize)) {
5226 Py_DECREF(repunicode);
5227 return -1;
5229 memcpy(PyBytes_AsString(*res) + *respos,
5230 PyBytes_AsString(repunicode), repsize);
5231 *respos += repsize;
5232 *inpos = newpos;
5233 Py_DECREF(repunicode);
5234 break;
5236 /* generate replacement */
5237 repsize = PyUnicode_GET_SIZE(repunicode);
5238 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5239 x = charmapencode_output(*uni2, mapping, res, respos);
5240 if (x==enc_EXCEPTION) {
5241 return -1;
5243 else if (x==enc_FAILED) {
5244 Py_DECREF(repunicode);
5245 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5246 return -1;
5249 *inpos = newpos;
5250 Py_DECREF(repunicode);
5252 return 0;
5255 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5256 Py_ssize_t size,
5257 PyObject *mapping,
5258 const char *errors)
5260 /* output object */
5261 PyObject *res = NULL;
5262 /* current input position */
5263 Py_ssize_t inpos = 0;
5264 /* current output position */
5265 Py_ssize_t respos = 0;
5266 PyObject *errorHandler = NULL;
5267 PyObject *exc = NULL;
5268 /* the following variable is used for caching string comparisons
5269 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5270 * 3=ignore, 4=xmlcharrefreplace */
5271 int known_errorHandler = -1;
5273 /* Default to Latin-1 */
5274 if (mapping == NULL)
5275 return PyUnicode_EncodeLatin1(p, size, errors);
5277 /* allocate enough for a simple encoding without
5278 replacements, if we need more, we'll resize */
5279 res = PyBytes_FromStringAndSize(NULL, size);
5280 if (res == NULL)
5281 goto onError;
5282 if (size == 0)
5283 return res;
5285 while (inpos<size) {
5286 /* try to encode it */
5287 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5288 if (x==enc_EXCEPTION) /* error */
5289 goto onError;
5290 if (x==enc_FAILED) { /* unencodable character */
5291 if (charmap_encoding_error(p, size, &inpos, mapping,
5292 &exc,
5293 &known_errorHandler, &errorHandler, errors,
5294 &res, &respos)) {
5295 goto onError;
5298 else
5299 /* done with this character => adjust input position */
5300 ++inpos;
5303 /* Resize if we allocated to much */
5304 if (respos<PyBytes_GET_SIZE(res))
5305 if (_PyBytes_Resize(&res, respos) < 0)
5306 goto onError;
5308 Py_XDECREF(exc);
5309 Py_XDECREF(errorHandler);
5310 return res;
5312 onError:
5313 Py_XDECREF(res);
5314 Py_XDECREF(exc);
5315 Py_XDECREF(errorHandler);
5316 return NULL;
5319 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5320 PyObject *mapping)
5322 if (!PyUnicode_Check(unicode) || mapping == NULL) {
5323 PyErr_BadArgument();
5324 return NULL;
5326 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5327 PyUnicode_GET_SIZE(unicode),
5328 mapping,
5329 NULL);
5332 /* create or adjust a UnicodeTranslateError */
5333 static void make_translate_exception(PyObject **exceptionObject,
5334 const Py_UNICODE *unicode, Py_ssize_t size,
5335 Py_ssize_t startpos, Py_ssize_t endpos,
5336 const char *reason)
5338 if (*exceptionObject == NULL) {
5339 *exceptionObject = PyUnicodeTranslateError_Create(
5340 unicode, size, startpos, endpos, reason);
5342 else {
5343 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5344 goto onError;
5345 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5346 goto onError;
5347 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5348 goto onError;
5349 return;
5350 onError:
5351 Py_DECREF(*exceptionObject);
5352 *exceptionObject = NULL;
5356 /* raises a UnicodeTranslateError */
5357 static void raise_translate_exception(PyObject **exceptionObject,
5358 const Py_UNICODE *unicode, Py_ssize_t size,
5359 Py_ssize_t startpos, Py_ssize_t endpos,
5360 const char *reason)
5362 make_translate_exception(exceptionObject,
5363 unicode, size, startpos, endpos, reason);
5364 if (*exceptionObject != NULL)
5365 PyCodec_StrictErrors(*exceptionObject);
5368 /* error handling callback helper:
5369 build arguments, call the callback and check the arguments,
5370 put the result into newpos and return the replacement string, which
5371 has to be freed by the caller */
5372 static PyObject *unicode_translate_call_errorhandler(const char *errors,
5373 PyObject **errorHandler,
5374 const char *reason,
5375 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5376 Py_ssize_t startpos, Py_ssize_t endpos,
5377 Py_ssize_t *newpos)
5379 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5381 Py_ssize_t i_newpos;
5382 PyObject *restuple;
5383 PyObject *resunicode;
5385 if (*errorHandler == NULL) {
5386 *errorHandler = PyCodec_LookupError(errors);
5387 if (*errorHandler == NULL)
5388 return NULL;
5391 make_translate_exception(exceptionObject,
5392 unicode, size, startpos, endpos, reason);
5393 if (*exceptionObject == NULL)
5394 return NULL;
5396 restuple = PyObject_CallFunctionObjArgs(
5397 *errorHandler, *exceptionObject, NULL);
5398 if (restuple == NULL)
5399 return NULL;
5400 if (!PyTuple_Check(restuple)) {
5401 PyErr_SetString(PyExc_TypeError, &argparse[4]);
5402 Py_DECREF(restuple);
5403 return NULL;
5405 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5406 &resunicode, &i_newpos)) {
5407 Py_DECREF(restuple);
5408 return NULL;
5410 if (i_newpos<0)
5411 *newpos = size+i_newpos;
5412 else
5413 *newpos = i_newpos;
5414 if (*newpos<0 || *newpos>size) {
5415 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5416 Py_DECREF(restuple);
5417 return NULL;
5419 Py_INCREF(resunicode);
5420 Py_DECREF(restuple);
5421 return resunicode;
5424 /* Lookup the character ch in the mapping and put the result in result,
5425 which must be decrefed by the caller.
5426 Return 0 on success, -1 on error */
5427 static
5428 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5430 PyObject *w = PyLong_FromLong((long)c);
5431 PyObject *x;
5433 if (w == NULL)
5434 return -1;
5435 x = PyObject_GetItem(mapping, w);
5436 Py_DECREF(w);
5437 if (x == NULL) {
5438 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5439 /* No mapping found means: use 1:1 mapping. */
5440 PyErr_Clear();
5441 *result = NULL;
5442 return 0;
5443 } else
5444 return -1;
5446 else if (x == Py_None) {
5447 *result = x;
5448 return 0;
5450 else if (PyLong_Check(x)) {
5451 long value = PyLong_AS_LONG(x);
5452 long max = PyUnicode_GetMax();
5453 if (value < 0 || value > max) {
5454 PyErr_Format(PyExc_TypeError,
5455 "character mapping must be in range(0x%x)", max+1);
5456 Py_DECREF(x);
5457 return -1;
5459 *result = x;
5460 return 0;
5462 else if (PyUnicode_Check(x)) {
5463 *result = x;
5464 return 0;
5466 else {
5467 /* wrong return value */
5468 PyErr_SetString(PyExc_TypeError,
5469 "character mapping must return integer, None or str");
5470 Py_DECREF(x);
5471 return -1;
5474 /* ensure that *outobj is at least requiredsize characters long,
5475 if not reallocate and adjust various state variables.
5476 Return 0 on success, -1 on error */
5477 static
5478 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5479 Py_ssize_t requiredsize)
5481 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5482 if (requiredsize > oldsize) {
5483 /* remember old output position */
5484 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5485 /* exponentially overallocate to minimize reallocations */
5486 if (requiredsize < 2 * oldsize)
5487 requiredsize = 2 * oldsize;
5488 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5489 return -1;
5490 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5492 return 0;
5494 /* lookup the character, put the result in the output string and adjust
5495 various state variables. Return a new reference to the object that
5496 was put in the output buffer in *result, or Py_None, if the mapping was
5497 undefined (in which case no character was written).
5498 The called must decref result.
5499 Return 0 on success, -1 on error. */
5500 static
5501 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5502 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5503 PyObject **res)
5505 if (charmaptranslate_lookup(*curinp, mapping, res))
5506 return -1;
5507 if (*res==NULL) {
5508 /* not found => default to 1:1 mapping */
5509 *(*outp)++ = *curinp;
5511 else if (*res==Py_None)
5513 else if (PyLong_Check(*res)) {
5514 /* no overflow check, because we know that the space is enough */
5515 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5517 else if (PyUnicode_Check(*res)) {
5518 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5519 if (repsize==1) {
5520 /* no overflow check, because we know that the space is enough */
5521 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5523 else if (repsize!=0) {
5524 /* more than one character */
5525 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5526 (insize - (curinp-startinp)) +
5527 repsize - 1;
5528 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5529 return -1;
5530 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5531 *outp += repsize;
5534 else
5535 return -1;
5536 return 0;
5539 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5540 Py_ssize_t size,
5541 PyObject *mapping,
5542 const char *errors)
5544 /* output object */
5545 PyObject *res = NULL;
5546 /* pointers to the beginning and end+1 of input */
5547 const Py_UNICODE *startp = p;
5548 const Py_UNICODE *endp = p + size;
5549 /* pointer into the output */
5550 Py_UNICODE *str;
5551 /* current output position */
5552 Py_ssize_t respos = 0;
5553 char *reason = "character maps to <undefined>";
5554 PyObject *errorHandler = NULL;
5555 PyObject *exc = NULL;
5556 /* the following variable is used for caching string comparisons
5557 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5558 * 3=ignore, 4=xmlcharrefreplace */
5559 int known_errorHandler = -1;
5561 if (mapping == NULL) {
5562 PyErr_BadArgument();
5563 return NULL;
5566 /* allocate enough for a simple 1:1 translation without
5567 replacements, if we need more, we'll resize */
5568 res = PyUnicode_FromUnicode(NULL, size);
5569 if (res == NULL)
5570 goto onError;
5571 if (size == 0)
5572 return res;
5573 str = PyUnicode_AS_UNICODE(res);
5575 while (p<endp) {
5576 /* try to encode it */
5577 PyObject *x = NULL;
5578 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5579 Py_XDECREF(x);
5580 goto onError;
5582 Py_XDECREF(x);
5583 if (x!=Py_None) /* it worked => adjust input pointer */
5584 ++p;
5585 else { /* untranslatable character */
5586 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5587 Py_ssize_t repsize;
5588 Py_ssize_t newpos;
5589 Py_UNICODE *uni2;
5590 /* startpos for collecting untranslatable chars */
5591 const Py_UNICODE *collstart = p;
5592 const Py_UNICODE *collend = p+1;
5593 const Py_UNICODE *coll;
5595 /* find all untranslatable characters */
5596 while (collend < endp) {
5597 if (charmaptranslate_lookup(*collend, mapping, &x))
5598 goto onError;
5599 Py_XDECREF(x);
5600 if (x!=Py_None)
5601 break;
5602 ++collend;
5604 /* cache callback name lookup
5605 * (if not done yet, i.e. it's the first error) */
5606 if (known_errorHandler==-1) {
5607 if ((errors==NULL) || (!strcmp(errors, "strict")))
5608 known_errorHandler = 1;
5609 else if (!strcmp(errors, "replace"))
5610 known_errorHandler = 2;
5611 else if (!strcmp(errors, "ignore"))
5612 known_errorHandler = 3;
5613 else if (!strcmp(errors, "xmlcharrefreplace"))
5614 known_errorHandler = 4;
5615 else
5616 known_errorHandler = 0;
5618 switch (known_errorHandler) {
5619 case 1: /* strict */
5620 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5621 goto onError;
5622 case 2: /* replace */
5623 /* No need to check for space, this is a 1:1 replacement */
5624 for (coll = collstart; coll<collend; ++coll)
5625 *str++ = '?';
5626 /* fall through */
5627 case 3: /* ignore */
5628 p = collend;
5629 break;
5630 case 4: /* xmlcharrefreplace */
5631 /* generate replacement (temporarily (mis)uses p) */
5632 for (p = collstart; p < collend; ++p) {
5633 char buffer[2+29+1+1];
5634 char *cp;
5635 sprintf(buffer, "&#%d;", (int)*p);
5636 if (charmaptranslate_makespace(&res, &str,
5637 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5638 goto onError;
5639 for (cp = buffer; *cp; ++cp)
5640 *str++ = *cp;
5642 p = collend;
5643 break;
5644 default:
5645 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5646 reason, startp, size, &exc,
5647 collstart-startp, collend-startp, &newpos);
5648 if (repunicode == NULL)
5649 goto onError;
5650 /* generate replacement */
5651 repsize = PyUnicode_GET_SIZE(repunicode);
5652 if (charmaptranslate_makespace(&res, &str,
5653 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5654 Py_DECREF(repunicode);
5655 goto onError;
5657 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5658 *str++ = *uni2;
5659 p = startp + newpos;
5660 Py_DECREF(repunicode);
5664 /* Resize if we allocated to much */
5665 respos = str-PyUnicode_AS_UNICODE(res);
5666 if (respos<PyUnicode_GET_SIZE(res)) {
5667 if (PyUnicode_Resize(&res, respos) < 0)
5668 goto onError;
5670 Py_XDECREF(exc);
5671 Py_XDECREF(errorHandler);
5672 return res;
5674 onError:
5675 Py_XDECREF(res);
5676 Py_XDECREF(exc);
5677 Py_XDECREF(errorHandler);
5678 return NULL;
5681 PyObject *PyUnicode_Translate(PyObject *str,
5682 PyObject *mapping,
5683 const char *errors)
5685 PyObject *result;
5687 str = PyUnicode_FromObject(str);
5688 if (str == NULL)
5689 goto onError;
5690 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5691 PyUnicode_GET_SIZE(str),
5692 mapping,
5693 errors);
5694 Py_DECREF(str);
5695 return result;
5697 onError:
5698 Py_XDECREF(str);
5699 return NULL;
5702 /* --- Decimal Encoder ---------------------------------------------------- */
5704 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5705 Py_ssize_t length,
5706 char *output,
5707 const char *errors)
5709 Py_UNICODE *p, *end;
5710 PyObject *errorHandler = NULL;
5711 PyObject *exc = NULL;
5712 const char *encoding = "decimal";
5713 const char *reason = "invalid decimal Unicode string";
5714 /* the following variable is used for caching string comparisons
5715 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5716 int known_errorHandler = -1;
5718 if (output == NULL) {
5719 PyErr_BadArgument();
5720 return -1;
5723 p = s;
5724 end = s + length;
5725 while (p < end) {
5726 register Py_UNICODE ch = *p;
5727 int decimal;
5728 PyObject *repunicode;
5729 Py_ssize_t repsize;
5730 Py_ssize_t newpos;
5731 Py_UNICODE *uni2;
5732 Py_UNICODE *collstart;
5733 Py_UNICODE *collend;
5735 if (Py_UNICODE_ISSPACE(ch)) {
5736 *output++ = ' ';
5737 ++p;
5738 continue;
5740 decimal = Py_UNICODE_TODECIMAL(ch);
5741 if (decimal >= 0) {
5742 *output++ = '0' + decimal;
5743 ++p;
5744 continue;
5746 if (0 < ch && ch < 256) {
5747 *output++ = (char)ch;
5748 ++p;
5749 continue;
5751 /* All other characters are considered unencodable */
5752 collstart = p;
5753 collend = p+1;
5754 while (collend < end) {
5755 if ((0 < *collend && *collend < 256) ||
5756 !Py_UNICODE_ISSPACE(*collend) ||
5757 Py_UNICODE_TODECIMAL(*collend))
5758 break;
5760 /* cache callback name lookup
5761 * (if not done yet, i.e. it's the first error) */
5762 if (known_errorHandler==-1) {
5763 if ((errors==NULL) || (!strcmp(errors, "strict")))
5764 known_errorHandler = 1;
5765 else if (!strcmp(errors, "replace"))
5766 known_errorHandler = 2;
5767 else if (!strcmp(errors, "ignore"))
5768 known_errorHandler = 3;
5769 else if (!strcmp(errors, "xmlcharrefreplace"))
5770 known_errorHandler = 4;
5771 else
5772 known_errorHandler = 0;
5774 switch (known_errorHandler) {
5775 case 1: /* strict */
5776 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5777 goto onError;
5778 case 2: /* replace */
5779 for (p = collstart; p < collend; ++p)
5780 *output++ = '?';
5781 /* fall through */
5782 case 3: /* ignore */
5783 p = collend;
5784 break;
5785 case 4: /* xmlcharrefreplace */
5786 /* generate replacement (temporarily (mis)uses p) */
5787 for (p = collstart; p < collend; ++p)
5788 output += sprintf(output, "&#%d;", (int)*p);
5789 p = collend;
5790 break;
5791 default:
5792 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5793 encoding, reason, s, length, &exc,
5794 collstart-s, collend-s, &newpos);
5795 if (repunicode == NULL)
5796 goto onError;
5797 if (!PyUnicode_Check(repunicode)) {
5798 /* Byte results not supported, since they have no decimal property. */
5799 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5800 Py_DECREF(repunicode);
5801 goto onError;
5803 /* generate replacement */
5804 repsize = PyUnicode_GET_SIZE(repunicode);
5805 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5806 Py_UNICODE ch = *uni2;
5807 if (Py_UNICODE_ISSPACE(ch))
5808 *output++ = ' ';
5809 else {
5810 decimal = Py_UNICODE_TODECIMAL(ch);
5811 if (decimal >= 0)
5812 *output++ = '0' + decimal;
5813 else if (0 < ch && ch < 256)
5814 *output++ = (char)ch;
5815 else {
5816 Py_DECREF(repunicode);
5817 raise_encode_exception(&exc, encoding,
5818 s, length, collstart-s, collend-s, reason);
5819 goto onError;
5823 p = s + newpos;
5824 Py_DECREF(repunicode);
5827 /* 0-terminate the output string */
5828 *output++ = '\0';
5829 Py_XDECREF(exc);
5830 Py_XDECREF(errorHandler);
5831 return 0;
5833 onError:
5834 Py_XDECREF(exc);
5835 Py_XDECREF(errorHandler);
5836 return -1;
5839 /* --- Helpers ------------------------------------------------------------ */
5841 #include "stringlib/unicodedefs.h"
5842 #include "stringlib/fastsearch.h"
5843 #include "stringlib/count.h"
5844 /* Include _ParseTupleFinds from find.h */
5845 #define FROM_UNICODE
5846 #include "stringlib/find.h"
5847 #include "stringlib/partition.h"
5849 #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5850 #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
5851 #include "stringlib/localeutil.h"
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope and
   clips them against (obj)->length: a negative value has the length added
   to it once and is then floored at 0; an `end` past the length is
   reduced to the length.

   Wrapped in do { ... } while (0) so that `FIX_START_END(x);` is a single
   statement and can be used as the body of an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5866 Py_ssize_t PyUnicode_Count(PyObject *str,
5867 PyObject *substr,
5868 Py_ssize_t start,
5869 Py_ssize_t end)
5871 Py_ssize_t result;
5872 PyUnicodeObject* str_obj;
5873 PyUnicodeObject* sub_obj;
5875 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5876 if (!str_obj)
5877 return -1;
5878 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5879 if (!sub_obj) {
5880 Py_DECREF(str_obj);
5881 return -1;
5884 FIX_START_END(str_obj);
5886 result = stringlib_count(
5887 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5890 Py_DECREF(sub_obj);
5891 Py_DECREF(str_obj);
5893 return result;
5896 Py_ssize_t PyUnicode_Find(PyObject *str,
5897 PyObject *sub,
5898 Py_ssize_t start,
5899 Py_ssize_t end,
5900 int direction)
5902 Py_ssize_t result;
5904 str = PyUnicode_FromObject(str);
5905 if (!str)
5906 return -2;
5907 sub = PyUnicode_FromObject(sub);
5908 if (!sub) {
5909 Py_DECREF(str);
5910 return -2;
5913 if (direction > 0)
5914 result = stringlib_find_slice(
5915 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5916 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5917 start, end
5919 else
5920 result = stringlib_rfind_slice(
5921 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5922 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5923 start, end
5926 Py_DECREF(str);
5927 Py_DECREF(sub);
5929 return result;
5932 static
5933 int tailmatch(PyUnicodeObject *self,
5934 PyUnicodeObject *substring,
5935 Py_ssize_t start,
5936 Py_ssize_t end,
5937 int direction)
5939 if (substring->length == 0)
5940 return 1;
5942 FIX_START_END(self);
5944 end -= substring->length;
5945 if (end < start)
5946 return 0;
5948 if (direction > 0) {
5949 if (Py_UNICODE_MATCH(self, end, substring))
5950 return 1;
5951 } else {
5952 if (Py_UNICODE_MATCH(self, start, substring))
5953 return 1;
5956 return 0;
5959 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5960 PyObject *substr,
5961 Py_ssize_t start,
5962 Py_ssize_t end,
5963 int direction)
5965 Py_ssize_t result;
5967 str = PyUnicode_FromObject(str);
5968 if (str == NULL)
5969 return -1;
5970 substr = PyUnicode_FromObject(substr);
5971 if (substr == NULL) {
5972 Py_DECREF(str);
5973 return -1;
5976 result = tailmatch((PyUnicodeObject *)str,
5977 (PyUnicodeObject *)substr,
5978 start, end, direction);
5979 Py_DECREF(str);
5980 Py_DECREF(substr);
5981 return result;
5984 /* Apply fixfct filter to the Unicode object self and return a
5985 reference to the modified object */
5987 static
5988 PyObject *fixup(PyUnicodeObject *self,
5989 int (*fixfct)(PyUnicodeObject *s))
5992 PyUnicodeObject *u;
5994 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5995 if (u == NULL)
5996 return NULL;
5998 Py_UNICODE_COPY(u->str, self->str, self->length);
6000 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6001 /* fixfct should return TRUE if it modified the buffer. If
6002 FALSE, return a reference to the original buffer instead
6003 (to save space, not time) */
6004 Py_INCREF(self);
6005 Py_DECREF(u);
6006 return (PyObject*) self;
6008 return (PyObject*) u;
6011 static
6012 int fixupper(PyUnicodeObject *self)
6014 Py_ssize_t len = self->length;
6015 Py_UNICODE *s = self->str;
6016 int status = 0;
6018 while (len-- > 0) {
6019 register Py_UNICODE ch;
6021 ch = Py_UNICODE_TOUPPER(*s);
6022 if (ch != *s) {
6023 status = 1;
6024 *s = ch;
6026 s++;
6029 return status;
6032 static
6033 int fixlower(PyUnicodeObject *self)
6035 Py_ssize_t len = self->length;
6036 Py_UNICODE *s = self->str;
6037 int status = 0;
6039 while (len-- > 0) {
6040 register Py_UNICODE ch;
6042 ch = Py_UNICODE_TOLOWER(*s);
6043 if (ch != *s) {
6044 status = 1;
6045 *s = ch;
6047 s++;
6050 return status;
6053 static
6054 int fixswapcase(PyUnicodeObject *self)
6056 Py_ssize_t len = self->length;
6057 Py_UNICODE *s = self->str;
6058 int status = 0;
6060 while (len-- > 0) {
6061 if (Py_UNICODE_ISUPPER(*s)) {
6062 *s = Py_UNICODE_TOLOWER(*s);
6063 status = 1;
6064 } else if (Py_UNICODE_ISLOWER(*s)) {
6065 *s = Py_UNICODE_TOUPPER(*s);
6066 status = 1;
6068 s++;
6071 return status;
6074 static
6075 int fixcapitalize(PyUnicodeObject *self)
6077 Py_ssize_t len = self->length;
6078 Py_UNICODE *s = self->str;
6079 int status = 0;
6081 if (len == 0)
6082 return 0;
6083 if (Py_UNICODE_ISLOWER(*s)) {
6084 *s = Py_UNICODE_TOUPPER(*s);
6085 status = 1;
6087 s++;
6088 while (--len > 0) {
6089 if (Py_UNICODE_ISUPPER(*s)) {
6090 *s = Py_UNICODE_TOLOWER(*s);
6091 status = 1;
6093 s++;
6095 return status;
6098 static
6099 int fixtitle(PyUnicodeObject *self)
6101 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6102 register Py_UNICODE *e;
6103 int previous_is_cased;
6105 /* Shortcut for single character strings */
6106 if (PyUnicode_GET_SIZE(self) == 1) {
6107 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6108 if (*p != ch) {
6109 *p = ch;
6110 return 1;
6112 else
6113 return 0;
6116 e = p + PyUnicode_GET_SIZE(self);
6117 previous_is_cased = 0;
6118 for (; p < e; p++) {
6119 register const Py_UNICODE ch = *p;
6121 if (previous_is_cased)
6122 *p = Py_UNICODE_TOLOWER(ch);
6123 else
6124 *p = Py_UNICODE_TOTITLE(ch);
6126 if (Py_UNICODE_ISLOWER(ch) ||
6127 Py_UNICODE_ISUPPER(ch) ||
6128 Py_UNICODE_ISTITLE(ch))
6129 previous_is_cased = 1;
6130 else
6131 previous_is_cased = 0;
6133 return 1;
6136 PyObject *
6137 PyUnicode_Join(PyObject *separator, PyObject *seq)
6139 const Py_UNICODE blank = ' ';
6140 const Py_UNICODE *sep = &blank;
6141 Py_ssize_t seplen = 1;
6142 PyUnicodeObject *res = NULL; /* the result */
6143 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6144 PyObject *fseq; /* PySequence_Fast(seq) */
6145 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6146 PyObject **items;
6147 PyObject *item;
6148 Py_ssize_t sz, i;
6150 fseq = PySequence_Fast(seq, "");
6151 if (fseq == NULL) {
6152 return NULL;
6155 /* NOTE: the following code can't call back into Python code,
6156 * so we are sure that fseq won't be mutated.
6159 seqlen = PySequence_Fast_GET_SIZE(fseq);
6160 /* If empty sequence, return u"". */
6161 if (seqlen == 0) {
6162 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6163 goto Done;
6165 items = PySequence_Fast_ITEMS(fseq);
6166 /* If singleton sequence with an exact Unicode, return that. */
6167 if (seqlen == 1) {
6168 item = items[0];
6169 if (PyUnicode_CheckExact(item)) {
6170 Py_INCREF(item);
6171 res = (PyUnicodeObject *)item;
6172 goto Done;
6175 else {
6176 /* Set up sep and seplen */
6177 if (separator == NULL) {
6178 sep = &blank;
6179 seplen = 1;
6181 else {
6182 if (!PyUnicode_Check(separator)) {
6183 PyErr_Format(PyExc_TypeError,
6184 "separator: expected str instance,"
6185 " %.80s found",
6186 Py_TYPE(separator)->tp_name);
6187 goto onError;
6189 sep = PyUnicode_AS_UNICODE(separator);
6190 seplen = PyUnicode_GET_SIZE(separator);
6194 /* There are at least two things to join, or else we have a subclass
6195 * of str in the sequence.
6196 * Do a pre-pass to figure out the total amount of space we'll
6197 * need (sz), and see whether all argument are strings.
6199 sz = 0;
6200 for (i = 0; i < seqlen; i++) {
6201 const Py_ssize_t old_sz = sz;
6202 item = items[i];
6203 if (!PyUnicode_Check(item)) {
6204 PyErr_Format(PyExc_TypeError,
6205 "sequence item %zd: expected str instance,"
6206 " %.80s found",
6207 i, Py_TYPE(item)->tp_name);
6208 goto onError;
6210 sz += PyUnicode_GET_SIZE(item);
6211 if (i != 0)
6212 sz += seplen;
6213 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6214 PyErr_SetString(PyExc_OverflowError,
6215 "join() result is too long for a Python string");
6216 goto onError;
6220 res = _PyUnicode_New(sz);
6221 if (res == NULL)
6222 goto onError;
6224 /* Catenate everything. */
6225 res_p = PyUnicode_AS_UNICODE(res);
6226 for (i = 0; i < seqlen; ++i) {
6227 Py_ssize_t itemlen;
6228 item = items[i];
6229 itemlen = PyUnicode_GET_SIZE(item);
6230 /* Copy item, and maybe the separator. */
6231 if (i) {
6232 Py_UNICODE_COPY(res_p, sep, seplen);
6233 res_p += seplen;
6235 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6236 res_p += itemlen;
6239 Done:
6240 Py_DECREF(fseq);
6241 return (PyObject *)res;
6243 onError:
6244 Py_DECREF(fseq);
6245 Py_XDECREF(res);
6246 return NULL;
6249 static
6250 PyUnicodeObject *pad(PyUnicodeObject *self,
6251 Py_ssize_t left,
6252 Py_ssize_t right,
6253 Py_UNICODE fill)
6255 PyUnicodeObject *u;
6257 if (left < 0)
6258 left = 0;
6259 if (right < 0)
6260 right = 0;
6262 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6263 Py_INCREF(self);
6264 return self;
6267 if (left > PY_SSIZE_T_MAX - self->length ||
6268 right > PY_SSIZE_T_MAX - (left + self->length)) {
6269 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6270 return NULL;
6272 u = _PyUnicode_New(left + self->length + right);
6273 if (u) {
6274 if (left)
6275 Py_UNICODE_FILL(u->str, fill, left);
6276 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6277 if (right)
6278 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6281 return u;
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expansion relies on the enclosing function providing a local
   `PyObject *str`, a `list` variable and an `onError` label; it jumps to
   onError if the slice cannot be created or appended.  Use it as a
   statement followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
6295 static
6296 PyObject *split_whitespace(PyUnicodeObject *self,
6297 PyObject *list,
6298 Py_ssize_t maxcount)
6300 register Py_ssize_t i;
6301 register Py_ssize_t j;
6302 Py_ssize_t len = self->length;
6303 PyObject *str;
6304 register const Py_UNICODE *buf = self->str;
6306 for (i = j = 0; i < len; ) {
6307 /* find a token */
6308 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6309 i++;
6310 j = i;
6311 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6312 i++;
6313 if (j < i) {
6314 if (maxcount-- <= 0)
6315 break;
6316 SPLIT_APPEND(buf, j, i);
6317 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6318 i++;
6319 j = i;
6322 if (j < len) {
6323 SPLIT_APPEND(buf, j, len);
6325 return list;
6327 onError:
6328 Py_DECREF(list);
6329 return NULL;
6332 PyObject *PyUnicode_Splitlines(PyObject *string,
6333 int keepends)
6335 register Py_ssize_t i;
6336 register Py_ssize_t j;
6337 Py_ssize_t len;
6338 PyObject *list;
6339 PyObject *str;
6340 Py_UNICODE *data;
6342 string = PyUnicode_FromObject(string);
6343 if (string == NULL)
6344 return NULL;
6345 data = PyUnicode_AS_UNICODE(string);
6346 len = PyUnicode_GET_SIZE(string);
6348 list = PyList_New(0);
6349 if (!list)
6350 goto onError;
6352 for (i = j = 0; i < len; ) {
6353 Py_ssize_t eol;
6355 /* Find a line and append it */
6356 while (i < len && !BLOOM_LINEBREAK(data[i]))
6357 i++;
6359 /* Skip the line break reading CRLF as one line break */
6360 eol = i;
6361 if (i < len) {
6362 if (data[i] == '\r' && i + 1 < len &&
6363 data[i+1] == '\n')
6364 i += 2;
6365 else
6366 i++;
6367 if (keepends)
6368 eol = i;
6370 SPLIT_APPEND(data, j, eol);
6371 j = i;
6373 if (j < len) {
6374 SPLIT_APPEND(data, j, len);
6377 Py_DECREF(string);
6378 return list;
6380 onError:
6381 Py_XDECREF(list);
6382 Py_DECREF(string);
6383 return NULL;
6386 static
6387 PyObject *split_char(PyUnicodeObject *self,
6388 PyObject *list,
6389 Py_UNICODE ch,
6390 Py_ssize_t maxcount)
6392 register Py_ssize_t i;
6393 register Py_ssize_t j;
6394 Py_ssize_t len = self->length;
6395 PyObject *str;
6396 register const Py_UNICODE *buf = self->str;
6398 for (i = j = 0; i < len; ) {
6399 if (buf[i] == ch) {
6400 if (maxcount-- <= 0)
6401 break;
6402 SPLIT_APPEND(buf, j, i);
6403 i = j = i + 1;
6404 } else
6405 i++;
6407 if (j <= len) {
6408 SPLIT_APPEND(buf, j, len);
6410 return list;
6412 onError:
6413 Py_DECREF(list);
6414 return NULL;
6417 static
6418 PyObject *split_substring(PyUnicodeObject *self,
6419 PyObject *list,
6420 PyUnicodeObject *substring,
6421 Py_ssize_t maxcount)
6423 register Py_ssize_t i;
6424 register Py_ssize_t j;
6425 Py_ssize_t len = self->length;
6426 Py_ssize_t sublen = substring->length;
6427 PyObject *str;
6429 for (i = j = 0; i <= len - sublen; ) {
6430 if (Py_UNICODE_MATCH(self, i, substring)) {
6431 if (maxcount-- <= 0)
6432 break;
6433 SPLIT_APPEND(self->str, j, i);
6434 i = j = i + sublen;
6435 } else
6436 i++;
6438 if (j <= len) {
6439 SPLIT_APPEND(self->str, j, len);
6441 return list;
6443 onError:
6444 Py_DECREF(list);
6445 return NULL;
6448 static
6449 PyObject *rsplit_whitespace(PyUnicodeObject *self,
6450 PyObject *list,
6451 Py_ssize_t maxcount)
6453 register Py_ssize_t i;
6454 register Py_ssize_t j;
6455 Py_ssize_t len = self->length;
6456 PyObject *str;
6457 register const Py_UNICODE *buf = self->str;
6459 for (i = j = len - 1; i >= 0; ) {
6460 /* find a token */
6461 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6462 i--;
6463 j = i;
6464 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6465 i--;
6466 if (j > i) {
6467 if (maxcount-- <= 0)
6468 break;
6469 SPLIT_APPEND(buf, i + 1, j + 1);
6470 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6471 i--;
6472 j = i;
6475 if (j >= 0) {
6476 SPLIT_APPEND(buf, 0, j + 1);
6478 if (PyList_Reverse(list) < 0)
6479 goto onError;
6480 return list;
6482 onError:
6483 Py_DECREF(list);
6484 return NULL;
6487 static
6488 PyObject *rsplit_char(PyUnicodeObject *self,
6489 PyObject *list,
6490 Py_UNICODE ch,
6491 Py_ssize_t maxcount)
6493 register Py_ssize_t i;
6494 register Py_ssize_t j;
6495 Py_ssize_t len = self->length;
6496 PyObject *str;
6497 register const Py_UNICODE *buf = self->str;
6499 for (i = j = len - 1; i >= 0; ) {
6500 if (buf[i] == ch) {
6501 if (maxcount-- <= 0)
6502 break;
6503 SPLIT_APPEND(buf, i + 1, j + 1);
6504 j = i = i - 1;
6505 } else
6506 i--;
6508 if (j >= -1) {
6509 SPLIT_APPEND(buf, 0, j + 1);
6511 if (PyList_Reverse(list) < 0)
6512 goto onError;
6513 return list;
6515 onError:
6516 Py_DECREF(list);
6517 return NULL;
6520 static
6521 PyObject *rsplit_substring(PyUnicodeObject *self,
6522 PyObject *list,
6523 PyUnicodeObject *substring,
6524 Py_ssize_t maxcount)
6526 register Py_ssize_t i;
6527 register Py_ssize_t j;
6528 Py_ssize_t len = self->length;
6529 Py_ssize_t sublen = substring->length;
6530 PyObject *str;
6532 for (i = len - sublen, j = len; i >= 0; ) {
6533 if (Py_UNICODE_MATCH(self, i, substring)) {
6534 if (maxcount-- <= 0)
6535 break;
6536 SPLIT_APPEND(self->str, i + sublen, j);
6537 j = i;
6538 i -= sublen;
6539 } else
6540 i--;
6542 if (j >= 0) {
6543 SPLIT_APPEND(self->str, 0, j);
6545 if (PyList_Reverse(list) < 0)
6546 goto onError;
6547 return list;
6549 onError:
6550 Py_DECREF(list);
6551 return NULL;
6554 #undef SPLIT_APPEND
6556 static
6557 PyObject *split(PyUnicodeObject *self,
6558 PyUnicodeObject *substring,
6559 Py_ssize_t maxcount)
6561 PyObject *list;
6563 if (maxcount < 0)
6564 maxcount = PY_SSIZE_T_MAX;
6566 list = PyList_New(0);
6567 if (!list)
6568 return NULL;
6570 if (substring == NULL)
6571 return split_whitespace(self,list,maxcount);
6573 else if (substring->length == 1)
6574 return split_char(self,list,substring->str[0],maxcount);
6576 else if (substring->length == 0) {
6577 Py_DECREF(list);
6578 PyErr_SetString(PyExc_ValueError, "empty separator");
6579 return NULL;
6581 else
6582 return split_substring(self,list,substring,maxcount);
6585 static
6586 PyObject *rsplit(PyUnicodeObject *self,
6587 PyUnicodeObject *substring,
6588 Py_ssize_t maxcount)
6590 PyObject *list;
6592 if (maxcount < 0)
6593 maxcount = PY_SSIZE_T_MAX;
6595 list = PyList_New(0);
6596 if (!list)
6597 return NULL;
6599 if (substring == NULL)
6600 return rsplit_whitespace(self,list,maxcount);
6602 else if (substring->length == 1)
6603 return rsplit_char(self,list,substring->str[0],maxcount);
6605 else if (substring->length == 0) {
6606 Py_DECREF(list);
6607 PyErr_SetString(PyExc_ValueError, "empty separator");
6608 return NULL;
6610 else
6611 return rsplit_substring(self,list,substring,maxcount);
6614 static
6615 PyObject *replace(PyUnicodeObject *self,
6616 PyUnicodeObject *str1,
6617 PyUnicodeObject *str2,
6618 Py_ssize_t maxcount)
6620 PyUnicodeObject *u;
6622 if (maxcount < 0)
6623 maxcount = PY_SSIZE_T_MAX;
6625 if (str1->length == str2->length) {
6626 /* same length */
6627 Py_ssize_t i;
6628 if (str1->length == 1) {
6629 /* replace characters */
6630 Py_UNICODE u1, u2;
6631 if (!findchar(self->str, self->length, str1->str[0]))
6632 goto nothing;
6633 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6634 if (!u)
6635 return NULL;
6636 Py_UNICODE_COPY(u->str, self->str, self->length);
6637 u1 = str1->str[0];
6638 u2 = str2->str[0];
6639 for (i = 0; i < u->length; i++)
6640 if (u->str[i] == u1) {
6641 if (--maxcount < 0)
6642 break;
6643 u->str[i] = u2;
6645 } else {
6646 i = fastsearch(
6647 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6649 if (i < 0)
6650 goto nothing;
6651 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6652 if (!u)
6653 return NULL;
6654 Py_UNICODE_COPY(u->str, self->str, self->length);
6655 while (i <= self->length - str1->length)
6656 if (Py_UNICODE_MATCH(self, i, str1)) {
6657 if (--maxcount < 0)
6658 break;
6659 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6660 i += str1->length;
6661 } else
6662 i++;
6664 } else {
6666 Py_ssize_t n, i, j, e;
6667 Py_ssize_t product, new_size, delta;
6668 Py_UNICODE *p;
6670 /* replace strings */
6671 n = stringlib_count(self->str, self->length, str1->str, str1->length);
6672 if (n > maxcount)
6673 n = maxcount;
6674 if (n == 0)
6675 goto nothing;
6676 /* new_size = self->length + n * (str2->length - str1->length)); */
6677 delta = (str2->length - str1->length);
6678 if (delta == 0) {
6679 new_size = self->length;
6680 } else {
6681 product = n * (str2->length - str1->length);
6682 if ((product / (str2->length - str1->length)) != n) {
6683 PyErr_SetString(PyExc_OverflowError,
6684 "replace string is too long");
6685 return NULL;
6687 new_size = self->length + product;
6688 if (new_size < 0) {
6689 PyErr_SetString(PyExc_OverflowError,
6690 "replace string is too long");
6691 return NULL;
6694 u = _PyUnicode_New(new_size);
6695 if (!u)
6696 return NULL;
6697 i = 0;
6698 p = u->str;
6699 e = self->length - str1->length;
6700 if (str1->length > 0) {
6701 while (n-- > 0) {
6702 /* look for next match */
6703 j = i;
6704 while (j <= e) {
6705 if (Py_UNICODE_MATCH(self, j, str1))
6706 break;
6707 j++;
6709 if (j > i) {
6710 if (j > e)
6711 break;
6712 /* copy unchanged part [i:j] */
6713 Py_UNICODE_COPY(p, self->str+i, j-i);
6714 p += j - i;
6716 /* copy substitution string */
6717 if (str2->length > 0) {
6718 Py_UNICODE_COPY(p, str2->str, str2->length);
6719 p += str2->length;
6721 i = j + str1->length;
6723 if (i < self->length)
6724 /* copy tail [i:] */
6725 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6726 } else {
6727 /* interleave */
6728 while (n > 0) {
6729 Py_UNICODE_COPY(p, str2->str, str2->length);
6730 p += str2->length;
6731 if (--n <= 0)
6732 break;
6733 *p++ = self->str[i++];
6735 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6738 return (PyObject *) u;
6740 nothing:
6741 /* nothing to replace; return original string (when possible) */
6742 if (PyUnicode_CheckExact(self)) {
6743 Py_INCREF(self);
6744 return (PyObject *) self;
6746 return PyUnicode_FromUnicode(self->str, self->length);
6749 /* --- Unicode Object Methods --------------------------------------------- */
6751 PyDoc_STRVAR(title__doc__,
6752 "S.title() -> str\n\
6754 Return a titlecased version of S, i.e. words start with title case\n\
6755 characters, all remaining cased characters have lower case.");
6757 static PyObject*
6758 unicode_title(PyUnicodeObject *self)
6760 return fixup(self, fixtitle);
6763 PyDoc_STRVAR(capitalize__doc__,
6764 "S.capitalize() -> str\n\
6766 Return a capitalized version of S, i.e. make the first character\n\
6767 have upper case and the rest lower case.");
6769 static PyObject*
6770 unicode_capitalize(PyUnicodeObject *self)
6772 return fixup(self, fixcapitalize);
#if 0
/* Disabled: this block is compiled out by the surrounding "#if 0". */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word and join the words
   with single blanks.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Replace the list slot with the capitalized word. */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

  onError:
    /* Reached on success too; item is NULL when fixup() failed. */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6813 /* Argument converter. Coerces to a single unicode character */
6815 static int
6816 convert_uc(PyObject *obj, void *addr)
6818 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6819 PyObject *uniobj;
6820 Py_UNICODE *unistr;
6822 uniobj = PyUnicode_FromObject(obj);
6823 if (uniobj == NULL) {
6824 PyErr_SetString(PyExc_TypeError,
6825 "The fill character cannot be converted to Unicode");
6826 return 0;
6828 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6829 PyErr_SetString(PyExc_TypeError,
6830 "The fill character must be exactly one character long");
6831 Py_DECREF(uniobj);
6832 return 0;
6834 unistr = PyUnicode_AS_UNICODE(uniobj);
6835 *fillcharloc = unistr[0];
6836 Py_DECREF(uniobj);
6837 return 1;
6840 PyDoc_STRVAR(center__doc__,
6841 "S.center(width[, fillchar]) -> str\n\
6843 Return S centered in a string of length width. Padding is\n\
6844 done using the specified fill character (default is a space)");
6846 static PyObject *
6847 unicode_center(PyUnicodeObject *self, PyObject *args)
6849 Py_ssize_t marg, left;
6850 Py_ssize_t width;
6851 Py_UNICODE fillchar = ' ';
6853 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6854 return NULL;
6856 if (self->length >= width && PyUnicode_CheckExact(self)) {
6857 Py_INCREF(self);
6858 return (PyObject*) self;
6861 marg = width - self->length;
6862 left = marg / 2 + (marg & width & 1);
6864 return (PyObject*) pad(self, left, marg - left, fillchar);
6867 #if 0
6869 /* This code should go into some future Unicode collation support
6870 module. The basic comparison should compare ordinals on a naive
6871 basis (this is what Java does and thus Jython too). */
6873 /* speedy UTF-16 code point order comparison */
6874 /* gleaned from: */
6875 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6877 static short utf16Fixup[32] =
6879 0, 0, 0, 0, 0, 0, 0, 0,
6880 0, 0, 0, 0, 0, 0, 0, 0,
6881 0, 0, 0, 0, 0, 0, 0, 0,
6882 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6885 static int
6886 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6888 Py_ssize_t len1, len2;
6890 Py_UNICODE *s1 = str1->str;
6891 Py_UNICODE *s2 = str2->str;
6893 len1 = str1->length;
6894 len2 = str2->length;
6896 while (len1 > 0 && len2 > 0) {
6897 Py_UNICODE c1, c2;
6899 c1 = *s1++;
6900 c2 = *s2++;
6902 if (c1 > (1<<11) * 26)
6903 c1 += utf16Fixup[c1>>11];
6904 if (c2 > (1<<11) * 26)
6905 c2 += utf16Fixup[c2>>11];
6906 /* now c1 and c2 are in UTF-32-compatible order */
6908 if (c1 != c2)
6909 return (c1 < c2) ? -1 : 1;
6911 len1--; len2--;
6914 return (len1 < len2) ? -1 : (len1 != len2);
6917 #else
6919 static int
6920 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6922 register Py_ssize_t len1, len2;
6924 Py_UNICODE *s1 = str1->str;
6925 Py_UNICODE *s2 = str2->str;
6927 len1 = str1->length;
6928 len2 = str2->length;
6930 while (len1 > 0 && len2 > 0) {
6931 Py_UNICODE c1, c2;
6933 c1 = *s1++;
6934 c2 = *s2++;
6936 if (c1 != c2)
6937 return (c1 < c2) ? -1 : 1;
6939 len1--; len2--;
6942 return (len1 < len2) ? -1 : (len1 != len2);
6945 #endif
6947 int PyUnicode_Compare(PyObject *left,
6948 PyObject *right)
6950 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6951 return unicode_compare((PyUnicodeObject *)left,
6952 (PyUnicodeObject *)right);
6953 PyErr_Format(PyExc_TypeError,
6954 "Can't compare %.100s and %.100s",
6955 left->ob_type->tp_name,
6956 right->ob_type->tp_name);
6957 return -1;
6961 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6963 int i;
6964 Py_UNICODE *id;
6965 assert(PyUnicode_Check(uni));
6966 id = PyUnicode_AS_UNICODE(uni);
6967 /* Compare Unicode string and source character set string */
6968 for (i = 0; id[i] && str[i]; i++)
6969 if (id[i] != str[i])
6970 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6971 /* This check keeps Python strings that end in '\0' from comparing equal
6972 to C strings identical up to that point. */
6973 if (PyUnicode_GET_SIZE(uni) != i)
6974 /* We'll say the Python string is longer. */
6975 return 1;
6976 if (id[i])
6977 return 1; /* uni is longer */
6978 if (str[i])
6979 return -1; /* str is longer */
6980 return 0;
6984 #define TEST_COND(cond) \
6985 ((cond) ? Py_True : Py_False)
6987 PyObject *PyUnicode_RichCompare(PyObject *left,
6988 PyObject *right,
6989 int op)
6991 int result;
6993 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6994 PyObject *v;
6995 if (((PyUnicodeObject *) left)->length !=
6996 ((PyUnicodeObject *) right)->length) {
6997 if (op == Py_EQ) {
6998 Py_INCREF(Py_False);
6999 return Py_False;
7001 if (op == Py_NE) {
7002 Py_INCREF(Py_True);
7003 return Py_True;
7006 if (left == right)
7007 result = 0;
7008 else
7009 result = unicode_compare((PyUnicodeObject *)left,
7010 (PyUnicodeObject *)right);
7012 /* Convert the return value to a Boolean */
7013 switch (op) {
7014 case Py_EQ:
7015 v = TEST_COND(result == 0);
7016 break;
7017 case Py_NE:
7018 v = TEST_COND(result != 0);
7019 break;
7020 case Py_LE:
7021 v = TEST_COND(result <= 0);
7022 break;
7023 case Py_GE:
7024 v = TEST_COND(result >= 0);
7025 break;
7026 case Py_LT:
7027 v = TEST_COND(result == -1);
7028 break;
7029 case Py_GT:
7030 v = TEST_COND(result == 1);
7031 break;
7032 default:
7033 PyErr_BadArgument();
7034 return NULL;
7036 Py_INCREF(v);
7037 return v;
7040 Py_INCREF(Py_NotImplemented);
7041 return Py_NotImplemented;
7044 int PyUnicode_Contains(PyObject *container,
7045 PyObject *element)
7047 PyObject *str, *sub;
7048 int result;
7050 /* Coerce the two arguments */
7051 sub = PyUnicode_FromObject(element);
7052 if (!sub) {
7053 PyErr_Format(PyExc_TypeError,
7054 "'in <string>' requires string as left operand, not %s",
7055 element->ob_type->tp_name);
7056 return -1;
7059 str = PyUnicode_FromObject(container);
7060 if (!str) {
7061 Py_DECREF(sub);
7062 return -1;
7065 result = stringlib_contains_obj(str, sub);
7067 Py_DECREF(str);
7068 Py_DECREF(sub);
7070 return result;
7073 /* Concat to string or Unicode object giving a new Unicode object. */
7075 PyObject *PyUnicode_Concat(PyObject *left,
7076 PyObject *right)
7078 PyUnicodeObject *u = NULL, *v = NULL, *w;
7080 /* Coerce the two arguments */
7081 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7082 if (u == NULL)
7083 goto onError;
7084 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7085 if (v == NULL)
7086 goto onError;
7088 /* Shortcuts */
7089 if (v == unicode_empty) {
7090 Py_DECREF(v);
7091 return (PyObject *)u;
7093 if (u == unicode_empty) {
7094 Py_DECREF(u);
7095 return (PyObject *)v;
7098 /* Concat the two Unicode strings */
7099 w = _PyUnicode_New(u->length + v->length);
7100 if (w == NULL)
7101 goto onError;
7102 Py_UNICODE_COPY(w->str, u->str, u->length);
7103 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7105 Py_DECREF(u);
7106 Py_DECREF(v);
7107 return (PyObject *)w;
7109 onError:
7110 Py_XDECREF(u);
7111 Py_XDECREF(v);
7112 return NULL;
7115 void
7116 PyUnicode_Append(PyObject **pleft, PyObject *right)
7118 PyObject *new;
7119 if (*pleft == NULL)
7120 return;
7121 if (right == NULL || !PyUnicode_Check(*pleft)) {
7122 Py_DECREF(*pleft);
7123 *pleft = NULL;
7124 return;
7126 new = PyUnicode_Concat(*pleft, right);
7127 Py_DECREF(*pleft);
7128 *pleft = new;
7131 void
7132 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7134 PyUnicode_Append(pleft, right);
7135 Py_XDECREF(right);
7138 PyDoc_STRVAR(count__doc__,
7139 "S.count(sub[, start[, end]]) -> int\n\
7141 Return the number of non-overlapping occurrences of substring sub in\n\
7142 string S[start:end]. Optional arguments start and end are\n\
7143 interpreted as in slice notation.");
7145 static PyObject *
7146 unicode_count(PyUnicodeObject *self, PyObject *args)
7148 PyUnicodeObject *substring;
7149 Py_ssize_t start = 0;
7150 Py_ssize_t end = PY_SSIZE_T_MAX;
7151 PyObject *result;
7153 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
7154 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7155 return NULL;
7157 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7158 (PyObject *)substring);
7159 if (substring == NULL)
7160 return NULL;
7162 FIX_START_END(self);
7164 result = PyLong_FromSsize_t(
7165 stringlib_count(self->str + start, end - start,
7166 substring->str, substring->length)
7169 Py_DECREF(substring);
7171 return result;
7174 PyDoc_STRVAR(encode__doc__,
7175 "S.encode([encoding[, errors]]) -> bytes\n\
7177 Encode S using the codec registered for encoding. encoding defaults\n\
7178 to the default encoding. errors may be given to set a different error\n\
7179 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7180 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7181 'xmlcharrefreplace' as well as any other name registered with\n\
7182 codecs.register_error that can handle UnicodeEncodeErrors.");
7184 static PyObject *
7185 unicode_encode(PyUnicodeObject *self, PyObject *args)
7187 char *encoding = NULL;
7188 char *errors = NULL;
7189 PyObject *v;
7191 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7192 return NULL;
7193 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7194 if (v == NULL)
7195 goto onError;
7196 if (!PyBytes_Check(v)) {
7197 PyErr_Format(PyExc_TypeError,
7198 "encoder did not return a bytes object "
7199 "(type=%.400s)",
7200 Py_TYPE(v)->tp_name);
7201 Py_DECREF(v);
7202 return NULL;
7204 return v;
7206 onError:
7207 return NULL;
7210 PyDoc_STRVAR(expandtabs__doc__,
7211 "S.expandtabs([tabsize]) -> str\n\
7213 Return a copy of S where all tab characters are expanded using spaces.\n\
7214 If tabsize is not given, a tab size of 8 characters is assumed.");
7216 static PyObject*
7217 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7219 Py_UNICODE *e;
7220 Py_UNICODE *p;
7221 Py_UNICODE *q;
7222 Py_UNICODE *qe;
7223 Py_ssize_t i, j, incr;
7224 PyUnicodeObject *u;
7225 int tabsize = 8;
7227 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7228 return NULL;
7230 /* First pass: determine size of output string */
7231 i = 0; /* chars up to and including most recent \n or \r */
7232 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7233 e = self->str + self->length; /* end of input */
7234 for (p = self->str; p < e; p++)
7235 if (*p == '\t') {
7236 if (tabsize > 0) {
7237 incr = tabsize - (j % tabsize); /* cannot overflow */
7238 if (j > PY_SSIZE_T_MAX - incr)
7239 goto overflow1;
7240 j += incr;
7243 else {
7244 if (j > PY_SSIZE_T_MAX - 1)
7245 goto overflow1;
7246 j++;
7247 if (*p == '\n' || *p == '\r') {
7248 if (i > PY_SSIZE_T_MAX - j)
7249 goto overflow1;
7250 i += j;
7251 j = 0;
7255 if (i > PY_SSIZE_T_MAX - j)
7256 goto overflow1;
7258 /* Second pass: create output string and fill it */
7259 u = _PyUnicode_New(i + j);
7260 if (!u)
7261 return NULL;
7263 j = 0; /* same as in first pass */
7264 q = u->str; /* next output char */
7265 qe = u->str + u->length; /* end of output */
7267 for (p = self->str; p < e; p++)
7268 if (*p == '\t') {
7269 if (tabsize > 0) {
7270 i = tabsize - (j % tabsize);
7271 j += i;
7272 while (i--) {
7273 if (q >= qe)
7274 goto overflow2;
7275 *q++ = ' ';
7279 else {
7280 if (q >= qe)
7281 goto overflow2;
7282 *q++ = *p;
7283 j++;
7284 if (*p == '\n' || *p == '\r')
7285 j = 0;
7288 return (PyObject*) u;
7290 overflow2:
7291 Py_DECREF(u);
7292 overflow1:
7293 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7294 return NULL;
7297 PyDoc_STRVAR(find__doc__,
7298 "S.find(sub[, start[, end]]) -> int\n\
7300 Return the lowest index in S where substring sub is found,\n\
7301 such that sub is contained within s[start:end]. Optional\n\
7302 arguments start and end are interpreted as in slice notation.\n\
7304 Return -1 on failure.");
7306 static PyObject *
7307 unicode_find(PyUnicodeObject *self, PyObject *args)
7309 PyObject *substring;
7310 Py_ssize_t start;
7311 Py_ssize_t end;
7312 Py_ssize_t result;
7314 if (!_ParseTupleFinds(args, &substring, &start, &end))
7315 return NULL;
7317 result = stringlib_find_slice(
7318 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7319 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7320 start, end
7323 Py_DECREF(substring);
7325 return PyLong_FromSsize_t(result);
7328 static PyObject *
7329 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7331 if (index < 0 || index >= self->length) {
7332 PyErr_SetString(PyExc_IndexError, "string index out of range");
7333 return NULL;
7336 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7339 /* Believe it or not, this produces the same value for ASCII strings
7340 as string_hash(). */
7341 static long
7342 unicode_hash(PyUnicodeObject *self)
7344 Py_ssize_t len;
7345 Py_UNICODE *p;
7346 long x;
7348 if (self->hash != -1)
7349 return self->hash;
7350 len = Py_SIZE(self);
7351 p = self->str;
7352 x = *p << 7;
7353 while (--len >= 0)
7354 x = (1000003*x) ^ *p++;
7355 x ^= Py_SIZE(self);
7356 if (x == -1)
7357 x = -2;
7358 self->hash = x;
7359 return x;
7362 PyDoc_STRVAR(index__doc__,
7363 "S.index(sub[, start[, end]]) -> int\n\
7365 Like S.find() but raise ValueError when the substring is not found.");
7367 static PyObject *
7368 unicode_index(PyUnicodeObject *self, PyObject *args)
7370 Py_ssize_t result;
7371 PyObject *substring;
7372 Py_ssize_t start;
7373 Py_ssize_t end;
7375 if (!_ParseTupleFinds(args, &substring, &start, &end))
7376 return NULL;
7378 result = stringlib_find_slice(
7379 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7380 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7381 start, end
7384 Py_DECREF(substring);
7386 if (result < 0) {
7387 PyErr_SetString(PyExc_ValueError, "substring not found");
7388 return NULL;
7391 return PyLong_FromSsize_t(result);
7394 PyDoc_STRVAR(islower__doc__,
7395 "S.islower() -> bool\n\
7397 Return True if all cased characters in S are lowercase and there is\n\
7398 at least one cased character in S, False otherwise.");
7400 static PyObject*
7401 unicode_islower(PyUnicodeObject *self)
7403 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7404 register const Py_UNICODE *e;
7405 int cased;
7407 /* Shortcut for single character strings */
7408 if (PyUnicode_GET_SIZE(self) == 1)
7409 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7411 /* Special case for empty strings */
7412 if (PyUnicode_GET_SIZE(self) == 0)
7413 return PyBool_FromLong(0);
7415 e = p + PyUnicode_GET_SIZE(self);
7416 cased = 0;
7417 for (; p < e; p++) {
7418 register const Py_UNICODE ch = *p;
7420 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7421 return PyBool_FromLong(0);
7422 else if (!cased && Py_UNICODE_ISLOWER(ch))
7423 cased = 1;
7425 return PyBool_FromLong(cased);
7428 PyDoc_STRVAR(isupper__doc__,
7429 "S.isupper() -> bool\n\
7431 Return True if all cased characters in S are uppercase and there is\n\
7432 at least one cased character in S, False otherwise.");
7434 static PyObject*
7435 unicode_isupper(PyUnicodeObject *self)
7437 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7438 register const Py_UNICODE *e;
7439 int cased;
7441 /* Shortcut for single character strings */
7442 if (PyUnicode_GET_SIZE(self) == 1)
7443 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7445 /* Special case for empty strings */
7446 if (PyUnicode_GET_SIZE(self) == 0)
7447 return PyBool_FromLong(0);
7449 e = p + PyUnicode_GET_SIZE(self);
7450 cased = 0;
7451 for (; p < e; p++) {
7452 register const Py_UNICODE ch = *p;
7454 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7455 return PyBool_FromLong(0);
7456 else if (!cased && Py_UNICODE_ISUPPER(ch))
7457 cased = 1;
7459 return PyBool_FromLong(cased);
7462 PyDoc_STRVAR(istitle__doc__,
7463 "S.istitle() -> bool\n\
7465 Return True if S is a titlecased string and there is at least one\n\
7466 character in S, i.e. upper- and titlecase characters may only\n\
7467 follow uncased characters and lowercase characters only cased ones.\n\
7468 Return False otherwise.");
7470 static PyObject*
7471 unicode_istitle(PyUnicodeObject *self)
7473 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7474 register const Py_UNICODE *e;
7475 int cased, previous_is_cased;
7477 /* Shortcut for single character strings */
7478 if (PyUnicode_GET_SIZE(self) == 1)
7479 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7480 (Py_UNICODE_ISUPPER(*p) != 0));
7482 /* Special case for empty strings */
7483 if (PyUnicode_GET_SIZE(self) == 0)
7484 return PyBool_FromLong(0);
7486 e = p + PyUnicode_GET_SIZE(self);
7487 cased = 0;
7488 previous_is_cased = 0;
7489 for (; p < e; p++) {
7490 register const Py_UNICODE ch = *p;
7492 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7493 if (previous_is_cased)
7494 return PyBool_FromLong(0);
7495 previous_is_cased = 1;
7496 cased = 1;
7498 else if (Py_UNICODE_ISLOWER(ch)) {
7499 if (!previous_is_cased)
7500 return PyBool_FromLong(0);
7501 previous_is_cased = 1;
7502 cased = 1;
7504 else
7505 previous_is_cased = 0;
7507 return PyBool_FromLong(cased);
7510 PyDoc_STRVAR(isspace__doc__,
7511 "S.isspace() -> bool\n\
7513 Return True if all characters in S are whitespace\n\
7514 and there is at least one character in S, False otherwise.");
7516 static PyObject*
7517 unicode_isspace(PyUnicodeObject *self)
7519 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7520 register const Py_UNICODE *e;
7522 /* Shortcut for single character strings */
7523 if (PyUnicode_GET_SIZE(self) == 1 &&
7524 Py_UNICODE_ISSPACE(*p))
7525 return PyBool_FromLong(1);
7527 /* Special case for empty strings */
7528 if (PyUnicode_GET_SIZE(self) == 0)
7529 return PyBool_FromLong(0);
7531 e = p + PyUnicode_GET_SIZE(self);
7532 for (; p < e; p++) {
7533 if (!Py_UNICODE_ISSPACE(*p))
7534 return PyBool_FromLong(0);
7536 return PyBool_FromLong(1);
7539 PyDoc_STRVAR(isalpha__doc__,
7540 "S.isalpha() -> bool\n\
7542 Return True if all characters in S are alphabetic\n\
7543 and there is at least one character in S, False otherwise.");
7545 static PyObject*
7546 unicode_isalpha(PyUnicodeObject *self)
7548 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7549 register const Py_UNICODE *e;
7551 /* Shortcut for single character strings */
7552 if (PyUnicode_GET_SIZE(self) == 1 &&
7553 Py_UNICODE_ISALPHA(*p))
7554 return PyBool_FromLong(1);
7556 /* Special case for empty strings */
7557 if (PyUnicode_GET_SIZE(self) == 0)
7558 return PyBool_FromLong(0);
7560 e = p + PyUnicode_GET_SIZE(self);
7561 for (; p < e; p++) {
7562 if (!Py_UNICODE_ISALPHA(*p))
7563 return PyBool_FromLong(0);
7565 return PyBool_FromLong(1);
7568 PyDoc_STRVAR(isalnum__doc__,
7569 "S.isalnum() -> bool\n\
7571 Return True if all characters in S are alphanumeric\n\
7572 and there is at least one character in S, False otherwise.");
7574 static PyObject*
7575 unicode_isalnum(PyUnicodeObject *self)
7577 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7578 register const Py_UNICODE *e;
7580 /* Shortcut for single character strings */
7581 if (PyUnicode_GET_SIZE(self) == 1 &&
7582 Py_UNICODE_ISALNUM(*p))
7583 return PyBool_FromLong(1);
7585 /* Special case for empty strings */
7586 if (PyUnicode_GET_SIZE(self) == 0)
7587 return PyBool_FromLong(0);
7589 e = p + PyUnicode_GET_SIZE(self);
7590 for (; p < e; p++) {
7591 if (!Py_UNICODE_ISALNUM(*p))
7592 return PyBool_FromLong(0);
7594 return PyBool_FromLong(1);
7597 PyDoc_STRVAR(isdecimal__doc__,
7598 "S.isdecimal() -> bool\n\
7600 Return True if there are only decimal characters in S,\n\
7601 False otherwise.");
7603 static PyObject*
7604 unicode_isdecimal(PyUnicodeObject *self)
7606 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7607 register const Py_UNICODE *e;
7609 /* Shortcut for single character strings */
7610 if (PyUnicode_GET_SIZE(self) == 1 &&
7611 Py_UNICODE_ISDECIMAL(*p))
7612 return PyBool_FromLong(1);
7614 /* Special case for empty strings */
7615 if (PyUnicode_GET_SIZE(self) == 0)
7616 return PyBool_FromLong(0);
7618 e = p + PyUnicode_GET_SIZE(self);
7619 for (; p < e; p++) {
7620 if (!Py_UNICODE_ISDECIMAL(*p))
7621 return PyBool_FromLong(0);
7623 return PyBool_FromLong(1);
7626 PyDoc_STRVAR(isdigit__doc__,
7627 "S.isdigit() -> bool\n\
7629 Return True if all characters in S are digits\n\
7630 and there is at least one character in S, False otherwise.");
7632 static PyObject*
7633 unicode_isdigit(PyUnicodeObject *self)
7635 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7636 register const Py_UNICODE *e;
7638 /* Shortcut for single character strings */
7639 if (PyUnicode_GET_SIZE(self) == 1 &&
7640 Py_UNICODE_ISDIGIT(*p))
7641 return PyBool_FromLong(1);
7643 /* Special case for empty strings */
7644 if (PyUnicode_GET_SIZE(self) == 0)
7645 return PyBool_FromLong(0);
7647 e = p + PyUnicode_GET_SIZE(self);
7648 for (; p < e; p++) {
7649 if (!Py_UNICODE_ISDIGIT(*p))
7650 return PyBool_FromLong(0);
7652 return PyBool_FromLong(1);
7655 PyDoc_STRVAR(isnumeric__doc__,
7656 "S.isnumeric() -> bool\n\
7658 Return True if there are only numeric characters in S,\n\
7659 False otherwise.");
7661 static PyObject*
7662 unicode_isnumeric(PyUnicodeObject *self)
7664 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7665 register const Py_UNICODE *e;
7667 /* Shortcut for single character strings */
7668 if (PyUnicode_GET_SIZE(self) == 1 &&
7669 Py_UNICODE_ISNUMERIC(*p))
7670 return PyBool_FromLong(1);
7672 /* Special case for empty strings */
7673 if (PyUnicode_GET_SIZE(self) == 0)
7674 return PyBool_FromLong(0);
7676 e = p + PyUnicode_GET_SIZE(self);
7677 for (; p < e; p++) {
7678 if (!Py_UNICODE_ISNUMERIC(*p))
7679 return PyBool_FromLong(0);
7681 return PyBool_FromLong(1);
7685 PyUnicode_IsIdentifier(PyObject *self)
7687 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7688 register const Py_UNICODE *e;
7690 /* Special case for empty strings */
7691 if (PyUnicode_GET_SIZE(self) == 0)
7692 return 0;
7694 /* PEP 3131 says that the first character must be in
7695 XID_Start and subsequent characters in XID_Continue,
7696 and for the ASCII range, the 2.x rules apply (i.e
7697 start with letters and underscore, continue with
7698 letters, digits, underscore). However, given the current
7699 definition of XID_Start and XID_Continue, it is sufficient
7700 to check just for these, except that _ must be allowed
7701 as starting an identifier. */
7702 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7703 return 0;
7705 e = p + PyUnicode_GET_SIZE(self);
7706 for (p++; p < e; p++) {
7707 if (!_PyUnicode_IsXidContinue(*p))
7708 return 0;
7710 return 1;
7713 PyDoc_STRVAR(isidentifier__doc__,
7714 "S.isidentifier() -> bool\n\
7716 Return True if S is a valid identifier according\n\
7717 to the language definition.");
7719 static PyObject*
7720 unicode_isidentifier(PyObject *self)
7722 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7725 PyDoc_STRVAR(isprintable__doc__,
7726 "S.isprintable() -> bool\n\
7728 Return True if all characters in S are considered\n\
7729 printable in repr() or S is empty, False otherwise.");
7731 static PyObject*
7732 unicode_isprintable(PyObject *self)
7734 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7735 register const Py_UNICODE *e;
7737 /* Shortcut for single character strings */
7738 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7739 Py_RETURN_TRUE;
7742 e = p + PyUnicode_GET_SIZE(self);
7743 for (; p < e; p++) {
7744 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7745 Py_RETURN_FALSE;
7748 Py_RETURN_TRUE;
7751 PyDoc_STRVAR(join__doc__,
7752 "S.join(iterable) -> str\n\
7754 Return a string which is the concatenation of the strings in the\n\
7755 iterable. The separator between elements is S.");
7757 static PyObject*
7758 unicode_join(PyObject *self, PyObject *data)
7760 return PyUnicode_Join(self, data);
7763 static Py_ssize_t
7764 unicode_length(PyUnicodeObject *self)
7766 return self->length;
7769 PyDoc_STRVAR(ljust__doc__,
7770 "S.ljust(width[, fillchar]) -> str\n\
7772 Return S left-justified in a Unicode string of length width. Padding is\n\
7773 done using the specified fill character (default is a space).");
7775 static PyObject *
7776 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7778 Py_ssize_t width;
7779 Py_UNICODE fillchar = ' ';
7781 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7782 return NULL;
7784 if (self->length >= width && PyUnicode_CheckExact(self)) {
7785 Py_INCREF(self);
7786 return (PyObject*) self;
7789 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7792 PyDoc_STRVAR(lower__doc__,
7793 "S.lower() -> str\n\
7795 Return a copy of the string S converted to lowercase.");
7797 static PyObject*
7798 unicode_lower(PyUnicodeObject *self)
7800 return fixup(self, fixlower);
/* Selector values for the strip helpers; each one is also the index of
   the matching entry in stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Skip the three-character "|O:" prefix to get the bare method name. */
#define STRIPNAME(i) (stripformat[i]+3)
7812 /* externally visible for str.strip(unicode) */
7813 PyObject *
7814 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7816 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7817 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7818 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7819 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7820 Py_ssize_t i, j;
7822 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7824 i = 0;
7825 if (striptype != RIGHTSTRIP) {
7826 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7827 i++;
7831 j = len;
7832 if (striptype != LEFTSTRIP) {
7833 do {
7834 j--;
7835 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7836 j++;
7839 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7840 Py_INCREF(self);
7841 return (PyObject*)self;
7843 else
7844 return PyUnicode_FromUnicode(s+i, j-i);
7848 static PyObject *
7849 do_strip(PyUnicodeObject *self, int striptype)
7851 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7852 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7854 i = 0;
7855 if (striptype != RIGHTSTRIP) {
7856 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7857 i++;
7861 j = len;
7862 if (striptype != LEFTSTRIP) {
7863 do {
7864 j--;
7865 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7866 j++;
7869 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7870 Py_INCREF(self);
7871 return (PyObject*)self;
7873 else
7874 return PyUnicode_FromUnicode(s+i, j-i);
7878 static PyObject *
7879 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7881 PyObject *sep = NULL;
7883 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7884 return NULL;
7886 if (sep != NULL && sep != Py_None) {
7887 if (PyUnicode_Check(sep))
7888 return _PyUnicode_XStrip(self, striptype, sep);
7889 else {
7890 PyErr_Format(PyExc_TypeError,
7891 "%s arg must be None or str",
7892 STRIPNAME(striptype));
7893 return NULL;
7897 return do_strip(self, striptype);
7901 PyDoc_STRVAR(strip__doc__,
7902 "S.strip([chars]) -> str\n\
7904 Return a copy of the string S with leading and trailing\n\
7905 whitespace removed.\n\
7906 If chars is given and not None, remove characters in chars instead.");
7908 static PyObject *
7909 unicode_strip(PyUnicodeObject *self, PyObject *args)
7911 if (PyTuple_GET_SIZE(args) == 0)
7912 return do_strip(self, BOTHSTRIP); /* Common case */
7913 else
7914 return do_argstrip(self, BOTHSTRIP, args);
7918 PyDoc_STRVAR(lstrip__doc__,
7919 "S.lstrip([chars]) -> str\n\
7921 Return a copy of the string S with leading whitespace removed.\n\
7922 If chars is given and not None, remove characters in chars instead.");
7924 static PyObject *
7925 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7927 if (PyTuple_GET_SIZE(args) == 0)
7928 return do_strip(self, LEFTSTRIP); /* Common case */
7929 else
7930 return do_argstrip(self, LEFTSTRIP, args);
7934 PyDoc_STRVAR(rstrip__doc__,
7935 "S.rstrip([chars]) -> str\n\
7937 Return a copy of the string S with trailing whitespace removed.\n\
7938 If chars is given and not None, remove characters in chars instead.");
7940 static PyObject *
7941 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7943 if (PyTuple_GET_SIZE(args) == 0)
7944 return do_strip(self, RIGHTSTRIP); /* Common case */
7945 else
7946 return do_argstrip(self, RIGHTSTRIP, args);
7950 static PyObject*
7951 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7953 PyUnicodeObject *u;
7954 Py_UNICODE *p;
7955 Py_ssize_t nchars;
7956 size_t nbytes;
7958 if (len < 1) {
7959 Py_INCREF(unicode_empty);
7960 return (PyObject *)unicode_empty;
7963 if (len == 1 && PyUnicode_CheckExact(str)) {
7964 /* no repeat, return original string */
7965 Py_INCREF(str);
7966 return (PyObject*) str;
7969 /* ensure # of chars needed doesn't overflow int and # of bytes
7970 * needed doesn't overflow size_t
7972 nchars = len * str->length;
7973 if (nchars / len != str->length) {
7974 PyErr_SetString(PyExc_OverflowError,
7975 "repeated string is too long");
7976 return NULL;
7978 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7979 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7980 PyErr_SetString(PyExc_OverflowError,
7981 "repeated string is too long");
7982 return NULL;
7984 u = _PyUnicode_New(nchars);
7985 if (!u)
7986 return NULL;
7988 p = u->str;
7990 if (str->length == 1) {
7991 Py_UNICODE_FILL(p, str->str[0], len);
7992 } else {
7993 Py_ssize_t done = str->length; /* number of characters copied this far */
7994 Py_UNICODE_COPY(p, str->str, str->length);
7995 while (done < nchars) {
7996 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7997 Py_UNICODE_COPY(p+done, p, n);
7998 done += n;
8002 return (PyObject*) u;
8005 PyObject *PyUnicode_Replace(PyObject *obj,
8006 PyObject *subobj,
8007 PyObject *replobj,
8008 Py_ssize_t maxcount)
8010 PyObject *self;
8011 PyObject *str1;
8012 PyObject *str2;
8013 PyObject *result;
8015 self = PyUnicode_FromObject(obj);
8016 if (self == NULL)
8017 return NULL;
8018 str1 = PyUnicode_FromObject(subobj);
8019 if (str1 == NULL) {
8020 Py_DECREF(self);
8021 return NULL;
8023 str2 = PyUnicode_FromObject(replobj);
8024 if (str2 == NULL) {
8025 Py_DECREF(self);
8026 Py_DECREF(str1);
8027 return NULL;
8029 result = replace((PyUnicodeObject *)self,
8030 (PyUnicodeObject *)str1,
8031 (PyUnicodeObject *)str2,
8032 maxcount);
8033 Py_DECREF(self);
8034 Py_DECREF(str1);
8035 Py_DECREF(str2);
8036 return result;
8039 PyDoc_STRVAR(replace__doc__,
8040 "S.replace(old, new[, count]) -> str\n\
8042 Return a copy of S with all occurrences of substring\n\
8043 old replaced by new. If the optional argument count is\n\
8044 given, only the first count occurrences are replaced.");
8046 static PyObject*
8047 unicode_replace(PyUnicodeObject *self, PyObject *args)
8049 PyUnicodeObject *str1;
8050 PyUnicodeObject *str2;
8051 Py_ssize_t maxcount = -1;
8052 PyObject *result;
8054 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8055 return NULL;
8056 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8057 if (str1 == NULL)
8058 return NULL;
8059 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8060 if (str2 == NULL) {
8061 Py_DECREF(str1);
8062 return NULL;
8065 result = replace(self, str1, str2, maxcount);
8067 Py_DECREF(str1);
8068 Py_DECREF(str2);
8069 return result;
8072 static
8073 PyObject *unicode_repr(PyObject *unicode)
8075 PyObject *repr;
8076 Py_UNICODE *p;
8077 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8078 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8080 /* XXX(nnorwitz): rather than over-allocating, it would be
8081 better to choose a different scheme. Perhaps scan the
8082 first N-chars of the string and allocate based on that size.
8084 /* Initial allocation is based on the longest-possible unichr
8085 escape.
8087 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8088 unichr, so in this case it's the longest unichr escape. In
8089 narrow (UTF-16) builds this is five chars per source unichr
8090 since there are two unichrs in the surrogate pair, so in narrow
8091 (UTF-16) builds it's not the longest unichr escape.
8093 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8094 so in the narrow (UTF-16) build case it's the longest unichr
8095 escape.
8098 repr = PyUnicode_FromUnicode(NULL,
8099 2 /* quotes */
8100 #ifdef Py_UNICODE_WIDE
8101 + 10*size
8102 #else
8103 + 6*size
8104 #endif
8105 + 1);
8106 if (repr == NULL)
8107 return NULL;
8109 p = PyUnicode_AS_UNICODE(repr);
8111 /* Add quote */
8112 *p++ = (findchar(s, size, '\'') &&
8113 !findchar(s, size, '"')) ? '"' : '\'';
8114 while (size-- > 0) {
8115 Py_UNICODE ch = *s++;
8117 /* Escape quotes and backslashes */
8118 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8119 *p++ = '\\';
8120 *p++ = ch;
8121 continue;
8124 /* Map special whitespace to '\t', \n', '\r' */
8125 if (ch == '\t') {
8126 *p++ = '\\';
8127 *p++ = 't';
8129 else if (ch == '\n') {
8130 *p++ = '\\';
8131 *p++ = 'n';
8133 else if (ch == '\r') {
8134 *p++ = '\\';
8135 *p++ = 'r';
8138 /* Map non-printable US ASCII to '\xhh' */
8139 else if (ch < ' ' || ch == 0x7F) {
8140 *p++ = '\\';
8141 *p++ = 'x';
8142 *p++ = hexdigits[(ch >> 4) & 0x000F];
8143 *p++ = hexdigits[ch & 0x000F];
8146 /* Copy ASCII characters as-is */
8147 else if (ch < 0x7F) {
8148 *p++ = ch;
8151 /* Non-ASCII characters */
8152 else {
8153 Py_UCS4 ucs = ch;
8155 #ifndef Py_UNICODE_WIDE
8156 Py_UNICODE ch2 = 0;
8157 /* Get code point from surrogate pair */
8158 if (size > 0) {
8159 ch2 = *s;
8160 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8161 && ch2 <= 0xDFFF) {
8162 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8163 + 0x00010000;
8164 s++;
8165 size--;
8168 #endif
8169 /* Map Unicode whitespace and control characters
8170 (categories Z* and C* except ASCII space)
8172 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8173 /* Map 8-bit characters to '\xhh' */
8174 if (ucs <= 0xff) {
8175 *p++ = '\\';
8176 *p++ = 'x';
8177 *p++ = hexdigits[(ch >> 4) & 0x000F];
8178 *p++ = hexdigits[ch & 0x000F];
8180 /* Map 21-bit characters to '\U00xxxxxx' */
8181 else if (ucs >= 0x10000) {
8182 *p++ = '\\';
8183 *p++ = 'U';
8184 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8185 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8186 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8187 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8188 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8189 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8190 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8191 *p++ = hexdigits[ucs & 0x0000000F];
8193 /* Map 16-bit characters to '\uxxxx' */
8194 else {
8195 *p++ = '\\';
8196 *p++ = 'u';
8197 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8198 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8199 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8200 *p++ = hexdigits[ucs & 0x000F];
8203 /* Copy characters as-is */
8204 else {
8205 *p++ = ch;
8206 #ifndef Py_UNICODE_WIDE
8207 if (ucs >= 0x10000)
8208 *p++ = ch2;
8209 #endif
8213 /* Add quote */
8214 *p++ = PyUnicode_AS_UNICODE(repr)[0];
8216 *p = '\0';
8217 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8218 return repr;
8221 PyDoc_STRVAR(rfind__doc__,
8222 "S.rfind(sub[, start[, end]]) -> int\n\
8224 Return the highest index in S where substring sub is found,\n\
8225 such that sub is contained within s[start:end]. Optional\n\
8226 arguments start and end are interpreted as in slice notation.\n\
8228 Return -1 on failure.");
8230 static PyObject *
8231 unicode_rfind(PyUnicodeObject *self, PyObject *args)
8233 PyObject *substring;
8234 Py_ssize_t start;
8235 Py_ssize_t end;
8236 Py_ssize_t result;
8238 if (!_ParseTupleFinds(args, &substring, &start, &end))
8239 return NULL;
8241 result = stringlib_rfind_slice(
8242 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8243 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8244 start, end
8247 Py_DECREF(substring);
8249 return PyLong_FromSsize_t(result);
8252 PyDoc_STRVAR(rindex__doc__,
8253 "S.rindex(sub[, start[, end]]) -> int\n\
8255 Like S.rfind() but raise ValueError when the substring is not found.");
8257 static PyObject *
8258 unicode_rindex(PyUnicodeObject *self, PyObject *args)
8260 PyObject *substring;
8261 Py_ssize_t start;
8262 Py_ssize_t end;
8263 Py_ssize_t result;
8265 if (!_ParseTupleFinds(args, &substring, &start, &end))
8266 return NULL;
8268 result = stringlib_rfind_slice(
8269 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8270 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8271 start, end
8274 Py_DECREF(substring);
8276 if (result < 0) {
8277 PyErr_SetString(PyExc_ValueError, "substring not found");
8278 return NULL;
8280 return PyLong_FromSsize_t(result);
8283 PyDoc_STRVAR(rjust__doc__,
8284 "S.rjust(width[, fillchar]) -> str\n\
8286 Return S right-justified in a string of length width. Padding is\n\
8287 done using the specified fill character (default is a space).");
8289 static PyObject *
8290 unicode_rjust(PyUnicodeObject *self, PyObject *args)
8292 Py_ssize_t width;
8293 Py_UNICODE fillchar = ' ';
8295 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8296 return NULL;
8298 if (self->length >= width && PyUnicode_CheckExact(self)) {
8299 Py_INCREF(self);
8300 return (PyObject*) self;
8303 return (PyObject*) pad(self, width - self->length, 0, fillchar);
8306 PyObject *PyUnicode_Split(PyObject *s,
8307 PyObject *sep,
8308 Py_ssize_t maxsplit)
8310 PyObject *result;
8312 s = PyUnicode_FromObject(s);
8313 if (s == NULL)
8314 return NULL;
8315 if (sep != NULL) {
8316 sep = PyUnicode_FromObject(sep);
8317 if (sep == NULL) {
8318 Py_DECREF(s);
8319 return NULL;
8323 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8325 Py_DECREF(s);
8326 Py_XDECREF(sep);
8327 return result;
8330 PyDoc_STRVAR(split__doc__,
8331 "S.split([sep[, maxsplit]]) -> list of strings\n\
8333 Return a list of the words in S, using sep as the\n\
8334 delimiter string. If maxsplit is given, at most maxsplit\n\
8335 splits are done. If sep is not specified or is None, any\n\
8336 whitespace string is a separator and empty strings are\n\
8337 removed from the result.");
8339 static PyObject*
8340 unicode_split(PyUnicodeObject *self, PyObject *args)
8342 PyObject *substring = Py_None;
8343 Py_ssize_t maxcount = -1;
8345 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8346 return NULL;
8348 if (substring == Py_None)
8349 return split(self, NULL, maxcount);
8350 else if (PyUnicode_Check(substring))
8351 return split(self, (PyUnicodeObject *)substring, maxcount);
8352 else
8353 return PyUnicode_Split((PyObject *)self, substring, maxcount);
8356 PyObject *
8357 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8359 PyObject* str_obj;
8360 PyObject* sep_obj;
8361 PyObject* out;
8363 str_obj = PyUnicode_FromObject(str_in);
8364 if (!str_obj)
8365 return NULL;
8366 sep_obj = PyUnicode_FromObject(sep_in);
8367 if (!sep_obj) {
8368 Py_DECREF(str_obj);
8369 return NULL;
8372 out = stringlib_partition(
8373 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8374 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8377 Py_DECREF(sep_obj);
8378 Py_DECREF(str_obj);
8380 return out;
8384 PyObject *
8385 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8387 PyObject* str_obj;
8388 PyObject* sep_obj;
8389 PyObject* out;
8391 str_obj = PyUnicode_FromObject(str_in);
8392 if (!str_obj)
8393 return NULL;
8394 sep_obj = PyUnicode_FromObject(sep_in);
8395 if (!sep_obj) {
8396 Py_DECREF(str_obj);
8397 return NULL;
8400 out = stringlib_rpartition(
8401 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8402 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8405 Py_DECREF(sep_obj);
8406 Py_DECREF(str_obj);
8408 return out;
8411 PyDoc_STRVAR(partition__doc__,
8412 "S.partition(sep) -> (head, sep, tail)\n\
8414 Search for the separator sep in S, and return the part before it,\n\
8415 the separator itself, and the part after it. If the separator is not\n\
8416 found, return S and two empty strings.");
8418 static PyObject*
8419 unicode_partition(PyUnicodeObject *self, PyObject *separator)
8421 return PyUnicode_Partition((PyObject *)self, separator);
8424 PyDoc_STRVAR(rpartition__doc__,
8425 "S.rpartition(sep) -> (head, sep, tail)\n\
8427 Search for the separator sep in S, starting at the end of S, and return\n\
8428 the part before it, the separator itself, and the part after it. If the\n\
8429 separator is not found, return two empty strings and S.");
8431 static PyObject*
8432 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8434 return PyUnicode_RPartition((PyObject *)self, separator);
8437 PyObject *PyUnicode_RSplit(PyObject *s,
8438 PyObject *sep,
8439 Py_ssize_t maxsplit)
8441 PyObject *result;
8443 s = PyUnicode_FromObject(s);
8444 if (s == NULL)
8445 return NULL;
8446 if (sep != NULL) {
8447 sep = PyUnicode_FromObject(sep);
8448 if (sep == NULL) {
8449 Py_DECREF(s);
8450 return NULL;
8454 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8456 Py_DECREF(s);
8457 Py_XDECREF(sep);
8458 return result;
8461 PyDoc_STRVAR(rsplit__doc__,
8462 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8464 Return a list of the words in S, using sep as the\n\
8465 delimiter string, starting at the end of the string and\n\
8466 working to the front. If maxsplit is given, at most maxsplit\n\
8467 splits are done. If sep is not specified, any whitespace string\n\
8468 is a separator.");
8470 static PyObject*
8471 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8473 PyObject *substring = Py_None;
8474 Py_ssize_t maxcount = -1;
8476 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8477 return NULL;
8479 if (substring == Py_None)
8480 return rsplit(self, NULL, maxcount);
8481 else if (PyUnicode_Check(substring))
8482 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8483 else
8484 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8487 PyDoc_STRVAR(splitlines__doc__,
8488 "S.splitlines([keepends]) -> list of strings\n\
8490 Return a list of the lines in S, breaking at line boundaries.\n\
8491 Line breaks are not included in the resulting list unless keepends\n\
8492 is given and true.");
8494 static PyObject*
8495 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8497 int keepends = 0;
8499 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8500 return NULL;
8502 return PyUnicode_Splitlines((PyObject *)self, keepends);
8505 static
8506 PyObject *unicode_str(PyObject *self)
8508 if (PyUnicode_CheckExact(self)) {
8509 Py_INCREF(self);
8510 return self;
8511 } else
8512 /* Subtype -- return genuine unicode string with the same value. */
8513 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8514 PyUnicode_GET_SIZE(self));
8517 PyDoc_STRVAR(swapcase__doc__,
8518 "S.swapcase() -> str\n\
8520 Return a copy of S with uppercase characters converted to lowercase\n\
8521 and vice versa.");
8523 static PyObject*
8524 unicode_swapcase(PyUnicodeObject *self)
8526 return fixup(self, fixswapcase);
8529 PyDoc_STRVAR(maketrans__doc__,
8530 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8532 Return a translation table usable for str.translate().\n\
8533 If there is only one argument, it must be a dictionary mapping Unicode\n\
8534 ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8535 Character keys will be then converted to ordinals.\n\
8536 If there are two arguments, they must be strings of equal length, and\n\
8537 in the resulting dictionary, each character in x will be mapped to the\n\
8538 character at the same position in y. If there is a third argument, it\n\
8539 must be a string, whose characters will be mapped to None in the result.");
8541 static PyObject*
8542 unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8544 PyObject *x, *y = NULL, *z = NULL;
8545 PyObject *new = NULL, *key, *value;
8546 Py_ssize_t i = 0;
8547 int res;
8549 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8550 return NULL;
8551 new = PyDict_New();
8552 if (!new)
8553 return NULL;
8554 if (y != NULL) {
8555 /* x must be a string too, of equal length */
8556 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8557 if (!PyUnicode_Check(x)) {
8558 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8559 "be a string if there is a second argument");
8560 goto err;
8562 if (PyUnicode_GET_SIZE(x) != ylen) {
8563 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8564 "arguments must have equal length");
8565 goto err;
8567 /* create entries for translating chars in x to those in y */
8568 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8569 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8570 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8571 if (!key || !value)
8572 goto err;
8573 res = PyDict_SetItem(new, key, value);
8574 Py_DECREF(key);
8575 Py_DECREF(value);
8576 if (res < 0)
8577 goto err;
8579 /* create entries for deleting chars in z */
8580 if (z != NULL) {
8581 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8582 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8583 if (!key)
8584 goto err;
8585 res = PyDict_SetItem(new, key, Py_None);
8586 Py_DECREF(key);
8587 if (res < 0)
8588 goto err;
8591 } else {
8592 /* x must be a dict */
8593 if (!PyDict_CheckExact(x)) {
8594 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8595 "to maketrans it must be a dict");
8596 goto err;
8598 /* copy entries into the new dict, converting string keys to int keys */
8599 while (PyDict_Next(x, &i, &key, &value)) {
8600 if (PyUnicode_Check(key)) {
8601 /* convert string keys to integer keys */
8602 PyObject *newkey;
8603 if (PyUnicode_GET_SIZE(key) != 1) {
8604 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8605 "table must be of length 1");
8606 goto err;
8608 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8609 if (!newkey)
8610 goto err;
8611 res = PyDict_SetItem(new, newkey, value);
8612 Py_DECREF(newkey);
8613 if (res < 0)
8614 goto err;
8615 } else if (PyLong_Check(key)) {
8616 /* just keep integer keys */
8617 if (PyDict_SetItem(new, key, value) < 0)
8618 goto err;
8619 } else {
8620 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8621 "be strings or integers");
8622 goto err;
8626 return new;
8627 err:
8628 Py_DECREF(new);
8629 return NULL;
8632 PyDoc_STRVAR(translate__doc__,
8633 "S.translate(table) -> str\n\
8635 Return a copy of the string S, where all characters have been mapped\n\
8636 through the given translation table, which must be a mapping of\n\
8637 Unicode ordinals to Unicode ordinals, strings, or None.\n\
8638 Unmapped characters are left untouched. Characters mapped to None\n\
8639 are deleted.");
8641 static PyObject*
8642 unicode_translate(PyUnicodeObject *self, PyObject *table)
8644 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8647 PyDoc_STRVAR(upper__doc__,
8648 "S.upper() -> str\n\
8650 Return a copy of S converted to uppercase.");
8652 static PyObject*
8653 unicode_upper(PyUnicodeObject *self)
8655 return fixup(self, fixupper);
8658 PyDoc_STRVAR(zfill__doc__,
8659 "S.zfill(width) -> str\n\
8661 Pad a numeric string S with zeros on the left, to fill a field\n\
8662 of the specified width. The string S is never truncated.");
8664 static PyObject *
8665 unicode_zfill(PyUnicodeObject *self, PyObject *args)
8667 Py_ssize_t fill;
8668 PyUnicodeObject *u;
8670 Py_ssize_t width;
8671 if (!PyArg_ParseTuple(args, "n:zfill", &width))
8672 return NULL;
8674 if (self->length >= width) {
8675 if (PyUnicode_CheckExact(self)) {
8676 Py_INCREF(self);
8677 return (PyObject*) self;
8679 else
8680 return PyUnicode_FromUnicode(
8681 PyUnicode_AS_UNICODE(self),
8682 PyUnicode_GET_SIZE(self)
8686 fill = width - self->length;
8688 u = pad(self, fill, 0, '0');
8690 if (u == NULL)
8691 return NULL;
8693 if (u->str[fill] == '+' || u->str[fill] == '-') {
8694 /* move sign to beginning of string */
8695 u->str[0] = u->str[fill];
8696 u->str[fill] = '0';
8699 return (PyObject*) u;
#if 0
/* Disabled debugging helper: reports the current value of the
   numfree counter as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8710 PyDoc_STRVAR(startswith__doc__,
8711 "S.startswith(prefix[, start[, end]]) -> bool\n\
8713 Return True if S starts with the specified prefix, False otherwise.\n\
8714 With optional start, test S beginning at that position.\n\
8715 With optional end, stop comparing S at that position.\n\
8716 prefix can also be a tuple of strings to try.");
8718 static PyObject *
8719 unicode_startswith(PyUnicodeObject *self,
8720 PyObject *args)
8722 PyObject *subobj;
8723 PyUnicodeObject *substring;
8724 Py_ssize_t start = 0;
8725 Py_ssize_t end = PY_SSIZE_T_MAX;
8726 int result;
8728 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8729 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8730 return NULL;
8731 if (PyTuple_Check(subobj)) {
8732 Py_ssize_t i;
8733 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8734 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8735 PyTuple_GET_ITEM(subobj, i));
8736 if (substring == NULL)
8737 return NULL;
8738 result = tailmatch(self, substring, start, end, -1);
8739 Py_DECREF(substring);
8740 if (result) {
8741 Py_RETURN_TRUE;
8744 /* nothing matched */
8745 Py_RETURN_FALSE;
8747 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8748 if (substring == NULL)
8749 return NULL;
8750 result = tailmatch(self, substring, start, end, -1);
8751 Py_DECREF(substring);
8752 return PyBool_FromLong(result);
8756 PyDoc_STRVAR(endswith__doc__,
8757 "S.endswith(suffix[, start[, end]]) -> bool\n\
8759 Return True if S ends with the specified suffix, False otherwise.\n\
8760 With optional start, test S beginning at that position.\n\
8761 With optional end, stop comparing S at that position.\n\
8762 suffix can also be a tuple of strings to try.");
8764 static PyObject *
8765 unicode_endswith(PyUnicodeObject *self,
8766 PyObject *args)
8768 PyObject *subobj;
8769 PyUnicodeObject *substring;
8770 Py_ssize_t start = 0;
8771 Py_ssize_t end = PY_SSIZE_T_MAX;
8772 int result;
8774 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8775 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8776 return NULL;
8777 if (PyTuple_Check(subobj)) {
8778 Py_ssize_t i;
8779 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8780 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8781 PyTuple_GET_ITEM(subobj, i));
8782 if (substring == NULL)
8783 return NULL;
8784 result = tailmatch(self, substring, start, end, +1);
8785 Py_DECREF(substring);
8786 if (result) {
8787 Py_RETURN_TRUE;
8790 Py_RETURN_FALSE;
8792 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8793 if (substring == NULL)
8794 return NULL;
8796 result = tailmatch(self, substring, start, end, +1);
8797 Py_DECREF(substring);
8798 return PyBool_FromLong(result);
8801 #include "stringlib/string_format.h"
8803 PyDoc_STRVAR(format__doc__,
8804 "S.format(*args, **kwargs) -> str\n\
8808 static PyObject *
8809 unicode__format__(PyObject* self, PyObject* args)
8811 PyObject *format_spec;
8813 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8814 return NULL;
8816 return _PyUnicode_FormatAdvanced(self,
8817 PyUnicode_AS_UNICODE(format_spec),
8818 PyUnicode_GET_SIZE(format_spec));
8821 PyDoc_STRVAR(p_format__doc__,
8822 "S.__format__(format_spec) -> str\n\
8826 static PyObject *
8827 unicode__sizeof__(PyUnicodeObject *v)
8829 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8830 sizeof(Py_UNICODE) * (v->length + 1));
8833 PyDoc_STRVAR(sizeof__doc__,
8834 "S.__sizeof__() -> size of S in memory, in bytes");
8836 static PyObject *
8837 unicode_getnewargs(PyUnicodeObject *v)
8839 return Py_BuildValue("(u#)", v->str, v->length);
/* Method table for the str type (installed as tp_methods of
   PyUnicode_Type).  Each entry gives the Python-visible name, the C
   implementation, the calling convention flags and the docstring.
   The table ends with a {NULL, NULL} sentinel. */
8843 static PyMethodDef unicode_methods[] = {
8845 /* Order is according to common usage: often used methods should
8846 appear first, since lookup is done sequentially. */
8848 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8849 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8850 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8851 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8852 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8853 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8854 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8855 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8856 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8857 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8858 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8859 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8860 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8861 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8862 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8863 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8864 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8865 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8866 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8867 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8868 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8869 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8870 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8871 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8872 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8873 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8874 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8875 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8876 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8877 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8878 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8879 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8880 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8881 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8882 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8883 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8884 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8885 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8886 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
8887 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8888 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8889 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
/* The two _formatter_* entries carry no docstring. */
8890 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8891 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
/* maketrans is the only entry flagged METH_STATIC. */
8892 {"maketrans", (PyCFunction) unicode_maketrans,
8893 METH_VARARGS | METH_STATIC, maketrans__doc__},
8894 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8895 #if 0
8896 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8897 #endif
8899 #if 0
8900 /* This one is just used for debugging the implementation. */
8901 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8902 #endif
8904 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
8905 {NULL, NULL}
8908 static PyObject *
8909 unicode_mod(PyObject *v, PyObject *w)
8911 if (!PyUnicode_Check(v)) {
8912 Py_INCREF(Py_NotImplemented);
8913 return Py_NotImplemented;
8915 return PyUnicode_Format(v, w);
/* Number protocol table for str (tp_as_number of PyUnicode_Type).
   Only the remainder slot is filled, so `%` on a str reaches
   unicode_mod(); the three slots before it are left empty. */
8918 static PyNumberMethods unicode_as_number = {
8919 0, /*nb_add*/
8920 0, /*nb_subtract*/
8921 0, /*nb_multiply*/
8922 unicode_mod, /*nb_remainder*/
/* Sequence protocol table for str (tp_as_sequence of PyUnicode_Type):
   length, concatenation, repetition, item access and containment are
   provided; the slice and item-assignment slots are left empty. */
8925 static PySequenceMethods unicode_as_sequence = {
8926 (lenfunc) unicode_length, /* sq_length */
8927 PyUnicode_Concat, /* sq_concat */
8928 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8929 (ssizeargfunc) unicode_getitem, /* sq_item */
8930 0, /* sq_slice */
8931 0, /* sq_ass_item */
8932 0, /* sq_ass_slice */
8933 PyUnicode_Contains, /* sq_contains */
8936 static PyObject*
8937 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8939 if (PyIndex_Check(item)) {
8940 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8941 if (i == -1 && PyErr_Occurred())
8942 return NULL;
8943 if (i < 0)
8944 i += PyUnicode_GET_SIZE(self);
8945 return unicode_getitem(self, i);
8946 } else if (PySlice_Check(item)) {
8947 Py_ssize_t start, stop, step, slicelength, cur, i;
8948 Py_UNICODE* source_buf;
8949 Py_UNICODE* result_buf;
8950 PyObject* result;
8952 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8953 &start, &stop, &step, &slicelength) < 0) {
8954 return NULL;
8957 if (slicelength <= 0) {
8958 return PyUnicode_FromUnicode(NULL, 0);
8959 } else if (start == 0 && step == 1 && slicelength == self->length &&
8960 PyUnicode_CheckExact(self)) {
8961 Py_INCREF(self);
8962 return (PyObject *)self;
8963 } else if (step == 1) {
8964 return PyUnicode_FromUnicode(self->str + start, slicelength);
8965 } else {
8966 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8967 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8968 sizeof(Py_UNICODE));
8970 if (result_buf == NULL)
8971 return PyErr_NoMemory();
8973 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8974 result_buf[i] = source_buf[cur];
8977 result = PyUnicode_FromUnicode(result_buf, slicelength);
8978 PyObject_FREE(result_buf);
8979 return result;
8981 } else {
8982 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8983 return NULL;
/* Mapping protocol table for str (tp_as_mapping of PyUnicode_Type):
   length and subscription are provided; item assignment is not. */
8987 static PyMappingMethods unicode_as_mapping = {
8988 (lenfunc)unicode_length, /* mp_length */
8989 (binaryfunc)unicode_subscript, /* mp_subscript */
8990 (objobjargproc)0, /* mp_ass_subscript */
8994 /* Helpers for PyUnicode_Format() */
8996 static PyObject *
8997 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8999 Py_ssize_t argidx = *p_argidx;
9000 if (argidx < arglen) {
9001 (*p_argidx)++;
9002 if (arglen < 0)
9003 return args;
9004 else
9005 return PyTuple_GetItem(args, argidx);
9007 PyErr_SetString(PyExc_TypeError,
9008 "not enough arguments for format string");
9009 return NULL;
9012 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
9014 static PyObject *
9015 formatfloat(PyObject *v, int flags, int prec, int type)
9017 char *p;
9018 PyObject *result;
9019 double x;
9021 x = PyFloat_AsDouble(v);
9022 if (x == -1.0 && PyErr_Occurred())
9023 return NULL;
9025 if (prec < 0)
9026 prec = 6;
9028 p = PyOS_double_to_string(x, type, prec,
9029 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9030 if (p == NULL)
9031 return NULL;
9032 result = PyUnicode_FromStringAndSize(p, strlen(p));
9033 PyMem_Free(p);
9034 return result;
9037 static PyObject*
9038 formatlong(PyObject *val, int flags, int prec, int type)
9040 char *buf;
9041 int len;
9042 PyObject *str; /* temporary string object. */
9043 PyObject *result;
9045 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9046 if (!str)
9047 return NULL;
9048 result = PyUnicode_FromStringAndSize(buf, len);
9049 Py_DECREF(str);
9050 return result;
9053 static int
9054 formatchar(Py_UNICODE *buf,
9055 size_t buflen,
9056 PyObject *v)
9058 /* presume that the buffer is at least 3 characters long */
9059 if (PyUnicode_Check(v)) {
9060 if (PyUnicode_GET_SIZE(v) == 1) {
9061 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9062 buf[1] = '\0';
9063 return 1;
9065 #ifndef Py_UNICODE_WIDE
9066 if (PyUnicode_GET_SIZE(v) == 2) {
9067 /* Decode a valid surrogate pair */
9068 int c0 = PyUnicode_AS_UNICODE(v)[0];
9069 int c1 = PyUnicode_AS_UNICODE(v)[1];
9070 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9071 0xDC00 <= c1 && c1 <= 0xDFFF) {
9072 buf[0] = c0;
9073 buf[1] = c1;
9074 buf[2] = '\0';
9075 return 2;
9078 #endif
9079 goto onError;
9081 else {
9082 /* Integer input truncated to a character */
9083 long x;
9084 x = PyLong_AsLong(v);
9085 if (x == -1 && PyErr_Occurred())
9086 goto onError;
9088 if (x < 0 || x > 0x10ffff) {
9089 PyErr_SetString(PyExc_OverflowError,
9090 "%c arg not in range(0x110000)");
9091 return -1;
9094 #ifndef Py_UNICODE_WIDE
9095 if (x > 0xffff) {
9096 x -= 0x10000;
9097 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9098 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9099 return 2;
9101 #endif
9102 buf[0] = (Py_UNICODE) x;
9103 buf[1] = '\0';
9104 return 1;
9107 onError:
9108 PyErr_SetString(PyExc_TypeError,
9109 "%c requires int or char");
9110 return -1;
9113 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
9114 FORMATBUFLEN is the length of the buffer in which chars are formatted. */
9116 #define FORMATBUFLEN (size_t)10
/* printf-style formatting of `format` with `args` (reached from the
   str nb_remainder slot through unicode_mod()).

   format - any object accepted by PyUnicode_FromObject()
   args   - a tuple of values, a single value, or a mapping; an object
            whose type has tp_as_mapping and that is neither a tuple
            nor a str is used as the mapping for %(key) lookups

   Returns a new reference to the formatted string, or NULL with an
   exception set.  Both arguments NULL-checked with
   PyErr_BadInternalCall(). */
9118 PyObject *PyUnicode_Format(PyObject *format,
9119 PyObject *args)
9121 Py_UNICODE *fmt, *res;
9122 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9123 int args_owned = 0;
9124 PyUnicodeObject *result = NULL;
9125 PyObject *dict = NULL;
9126 PyObject *uformat;
9128 if (format == NULL || args == NULL) {
9129 PyErr_BadInternalCall();
9130 return NULL;
9132 uformat = PyUnicode_FromObject(format);
9133 if (uformat == NULL)
9134 return NULL;
9135 fmt = PyUnicode_AS_UNICODE(uformat);
9136 fmtcnt = PyUnicode_GET_SIZE(uformat);
/* reslen is the allocated length of the result, rescnt the part of it
   not yet written; start with the format length plus 100 spare. */
9138 reslen = rescnt = fmtcnt + 100;
9139 result = _PyUnicode_New(reslen);
9140 if (result == NULL)
9141 goto onError;
9142 res = PyUnicode_AS_UNICODE(result);
9144 if (PyTuple_Check(args)) {
9145 arglen = PyTuple_Size(args);
9146 argidx = 0;
9148 else {
/* Non-tuple argument: with arglen -1 and argidx -2, getnextarg()
   returns args itself on the first call and fails afterwards. */
9149 arglen = -1;
9150 argidx = -2;
9152 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9153 !PyUnicode_Check(args))
9154 dict = args;
/* Main scan: one iteration per literal character or per complete
   conversion specifier. */
9156 while (--fmtcnt >= 0) {
9157 if (*fmt != '%') {
/* Literal character; grow the result first if it is full. */
9158 if (--rescnt < 0) {
9159 rescnt = fmtcnt + 100;
9160 reslen += rescnt;
9161 if (_PyUnicode_Resize(&result, reslen) < 0)
9162 goto onError;
9163 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9164 --rescnt;
9166 *res++ = *fmt++;
9168 else {
9169 /* Got a format specifier */
9170 int flags = 0;
9171 Py_ssize_t width = -1;
9172 int prec = -1;
9173 Py_UNICODE c = '\0';
9174 Py_UNICODE fill;
9175 int isnumok;
9176 PyObject *v = NULL;
9177 PyObject *temp = NULL;
9178 Py_UNICODE *pbuf;
9179 Py_UNICODE sign;
9180 Py_ssize_t len;
9181 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9183 fmt++;
/* Optional "(key)": look the key up in the mapping and use the value
   found as the argument for this conversion. */
9184 if (*fmt == '(') {
9185 Py_UNICODE *keystart;
9186 Py_ssize_t keylen;
9187 PyObject *key;
9188 int pcount = 1;
9190 if (dict == NULL) {
9191 PyErr_SetString(PyExc_TypeError,
9192 "format requires a mapping");
9193 goto onError;
9195 ++fmt;
9196 --fmtcnt;
9197 keystart = fmt;
9198 /* Skip over balanced parentheses */
9199 while (pcount > 0 && --fmtcnt >= 0) {
9200 if (*fmt == ')')
9201 --pcount;
9202 else if (*fmt == '(')
9203 ++pcount;
9204 fmt++;
9206 keylen = fmt - keystart - 1;
9207 if (fmtcnt < 0 || pcount > 0) {
9208 PyErr_SetString(PyExc_ValueError,
9209 "incomplete format key");
9210 goto onError;
9212 #if 0
9213 /* keys are converted to strings using UTF-8 and
9214 then looked up since Python uses strings to hold
9215 variables names etc. in its namespaces and we
9216 wouldn't want to break common idioms. */
9217 key = PyUnicode_EncodeUTF8(keystart,
9218 keylen,
9219 NULL);
9220 #else
9221 key = PyUnicode_FromUnicode(keystart, keylen);
9222 #endif
9223 if (key == NULL)
9224 goto onError;
/* Drop the value fetched for a previous %(key) before replacing it. */
9225 if (args_owned) {
9226 Py_DECREF(args);
9227 args_owned = 0;
9229 args = PyObject_GetItem(dict, key);
9230 Py_DECREF(key);
9231 if (args == NULL) {
9232 goto onError;
/* args now refers to an object this function owns; it is treated as a
   single non-tuple argument (same -1/-2 sentinels as above). */
9234 args_owned = 1;
9235 arglen = -1;
9236 argidx = -2;
/* Conversion flags: any run of '-', '+', ' ', '#', '0'. */
9238 while (--fmtcnt >= 0) {
9239 switch (c = *fmt++) {
9240 case '-': flags |= F_LJUST; continue;
9241 case '+': flags |= F_SIGN; continue;
9242 case ' ': flags |= F_BLANK; continue;
9243 case '#': flags |= F_ALT; continue;
9244 case '0': flags |= F_ZERO; continue;
9246 break;
/* Field width: '*' takes it from the next argument (a negative value
   means left-justify with the absolute width), otherwise decimal
   digits are accumulated. */
9248 if (c == '*') {
9249 v = getnextarg(args, arglen, &argidx);
9250 if (v == NULL)
9251 goto onError;
9252 if (!PyLong_Check(v)) {
9253 PyErr_SetString(PyExc_TypeError,
9254 "* wants int");
9255 goto onError;
9257 width = PyLong_AsLong(v);
9258 if (width == -1 && PyErr_Occurred())
9259 goto onError;
9260 if (width < 0) {
9261 flags |= F_LJUST;
9262 width = -width;
9264 if (--fmtcnt >= 0)
9265 c = *fmt++;
9267 else if (c >= '0' && c <= '9') {
9268 width = c - '0';
9269 while (--fmtcnt >= 0) {
9270 c = *fmt++;
9271 if (c < '0' || c > '9')
9272 break;
/* NOTE(review): this guard detects overflow only after the signed
   multiplication has happened, which C does not define; a bound
   check before multiplying would be safer -- confirm and fix. */
9273 if ((width*10) / 10 != width) {
9274 PyErr_SetString(PyExc_ValueError,
9275 "width too big");
9276 goto onError;
9278 width = width*10 + (c - '0');
/* Precision: ".*" takes it from the next argument (negative values
   become 0), otherwise decimal digits are accumulated. */
9281 if (c == '.') {
9282 prec = 0;
9283 if (--fmtcnt >= 0)
9284 c = *fmt++;
9285 if (c == '*') {
9286 v = getnextarg(args, arglen, &argidx);
9287 if (v == NULL)
9288 goto onError;
9289 if (!PyLong_Check(v)) {
9290 PyErr_SetString(PyExc_TypeError,
9291 "* wants int");
9292 goto onError;
/* NOTE(review): PyLong_AsLong() yields a long but prec is an int, so
   the assignment narrows where long is wider -- confirm intended. */
9294 prec = PyLong_AsLong(v);
9295 if (prec == -1 && PyErr_Occurred())
9296 goto onError;
9297 if (prec < 0)
9298 prec = 0;
9299 if (--fmtcnt >= 0)
9300 c = *fmt++;
9302 else if (c >= '0' && c <= '9') {
9303 prec = c - '0';
9304 while (--fmtcnt >= 0) {
9305 c = *fmt++;
9306 if (c < '0' || c > '9')
9307 break;
/* NOTE(review): same after-the-fact signed overflow test as for the
   width above. */
9308 if ((prec*10) / 10 != prec) {
9309 PyErr_SetString(PyExc_ValueError,
9310 "prec too big");
9311 goto onError;
9313 prec = prec*10 + (c - '0');
9316 } /* prec */
/* A length modifier 'h', 'l' or 'L' is accepted and skipped. */
9317 if (fmtcnt >= 0) {
9318 if (c == 'h' || c == 'l' || c == 'L') {
9319 if (--fmtcnt >= 0)
9320 c = *fmt++;
9323 if (fmtcnt < 0) {
9324 PyErr_SetString(PyExc_ValueError,
9325 "incomplete format");
9326 goto onError;
/* Every conversion except "%%" consumes one argument. */
9328 if (c != '%') {
9329 v = getnextarg(args, arglen, &argidx);
9330 if (v == NULL)
9331 goto onError;
/* Convert the argument.  Each case leaves the text in pbuf/len;
   temp, when set, owns the object backing pbuf.  sign is set to 1
   for the numeric conversions so the sign logic below runs. */
9333 sign = 0;
9334 fill = ' ';
9335 switch (c) {
9337 case '%':
9338 pbuf = formatbuf;
9339 /* presume that buffer length is at least 1 */
9340 pbuf[0] = '%';
9341 len = 1;
9342 break;
9344 case 's':
9345 case 'r':
9346 case 'a':
9347 if (PyUnicode_CheckExact(v) && c == 's') {
9348 temp = v;
9349 Py_INCREF(temp);
9351 else {
9352 if (c == 's')
9353 temp = PyObject_Str(v);
9354 else if (c == 'r')
9355 temp = PyObject_Repr(v);
9356 else
9357 temp = PyObject_ASCII(v);
9358 if (temp == NULL)
9359 goto onError;
9360 if (PyUnicode_Check(temp))
9361 /* nothing to do */;
9362 else {
9363 Py_DECREF(temp);
9364 PyErr_SetString(PyExc_TypeError,
9365 "%s argument has non-string str()");
9366 goto onError;
9369 pbuf = PyUnicode_AS_UNICODE(temp);
9370 len = PyUnicode_GET_SIZE(temp);
/* For these conversions a precision truncates the text. */
9371 if (prec >= 0 && len > prec)
9372 len = prec;
9373 break;
9375 case 'i':
9376 case 'd':
9377 case 'u':
9378 case 'o':
9379 case 'x':
9380 case 'X':
9381 if (c == 'i')
9382 c = 'd';
9383 isnumok = 0;
9384 if (PyNumber_Check(v)) {
9385 PyObject *iobj=NULL;
9387 if (PyLong_Check(v)) {
9388 iobj = v;
9389 Py_INCREF(iobj);
9391 else {
9392 iobj = PyNumber_Long(v);
9394 if (iobj!=NULL) {
9395 if (PyLong_Check(iobj)) {
9396 isnumok = 1;
9397 temp = formatlong(iobj, flags, prec, c);
9398 Py_DECREF(iobj);
9399 if (!temp)
9400 goto onError;
9401 pbuf = PyUnicode_AS_UNICODE(temp);
9402 len = PyUnicode_GET_SIZE(temp);
9403 sign = 1;
9405 else {
9406 Py_DECREF(iobj);
9410 if (!isnumok) {
9411 PyErr_Format(PyExc_TypeError,
9412 "%%%c format: a number is required, "
9413 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9414 goto onError;
9416 if (flags & F_ZERO)
9417 fill = '0';
9418 break;
9420 case 'e':
9421 case 'E':
9422 case 'f':
9423 case 'F':
9424 case 'g':
9425 case 'G':
9426 temp = formatfloat(v, flags, prec, c);
9427 if (!temp)
9428 goto onError;
9429 pbuf = PyUnicode_AS_UNICODE(temp);
9430 len = PyUnicode_GET_SIZE(temp);
9431 sign = 1;
9432 if (flags & F_ZERO)
9433 fill = '0';
9434 break;
9436 case 'c':
9437 pbuf = formatbuf;
9438 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9439 if (len < 0)
9440 goto onError;
9441 break;
9443 default:
9444 PyErr_Format(PyExc_ValueError,
9445 "unsupported format character '%c' (0x%x) "
9446 "at index %zd",
9447 (31<=c && c<=126) ? (char)c : '?',
9448 (int)c,
9449 (Py_ssize_t)(fmt - 1 -
9450 PyUnicode_AS_UNICODE(uformat)));
9451 goto onError;
/* Numeric output: peel an explicit leading sign off the text, or pick
   '+' / ' ' from the flags; sign ends up 0 when none is to be shown. */
9453 if (sign) {
9454 if (*pbuf == '-' || *pbuf == '+') {
9455 sign = *pbuf++;
9456 len--;
9458 else if (flags & F_SIGN)
9459 sign = '+';
9460 else if (flags & F_BLANK)
9461 sign = ' ';
9462 else
9463 sign = 0;
9465 if (width < len)
9466 width = len;
/* Make sure the result can hold the padded field (plus sign). */
9467 if (rescnt - (sign != 0) < width) {
9468 reslen -= rescnt;
9469 rescnt = width + fmtcnt + 100;
9470 reslen += rescnt;
9471 if (reslen < 0) {
9472 Py_XDECREF(temp);
9473 PyErr_NoMemory();
9474 goto onError;
9476 if (_PyUnicode_Resize(&result, reslen) < 0) {
9477 Py_XDECREF(temp);
9478 goto onError;
9480 res = PyUnicode_AS_UNICODE(result)
9481 + reslen - rescnt;
/* With a non-space fill the sign is written before the padding; with
   space fill it is written after it (see below). */
9483 if (sign) {
9484 if (fill != ' ')
9485 *res++ = sign;
9486 rescnt--;
9487 if (width > len)
9488 width--;
/* Alternate form for x/X/o: the two-character prefix at the start of
   pbuf is handled like the sign -- before the padding for non-space
   fill, after it otherwise. */
9490 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9491 assert(pbuf[0] == '0');
9492 assert(pbuf[1] == c);
9493 if (fill != ' ') {
9494 *res++ = *pbuf++;
9495 *res++ = *pbuf++;
9497 rescnt -= 2;
9498 width -= 2;
9499 if (width < 0)
9500 width = 0;
9501 len -= 2;
/* Left padding (right-justified field). */
9503 if (width > len && !(flags & F_LJUST)) {
9504 do {
9505 --rescnt;
9506 *res++ = fill;
9507 } while (--width > len);
9509 if (fill == ' ') {
9510 if (sign)
9511 *res++ = sign;
9512 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9513 assert(pbuf[0] == '0');
9514 assert(pbuf[1] == c);
9515 *res++ = *pbuf++;
9516 *res++ = *pbuf++;
/* The converted text itself, then right padding with spaces for a
   left-justified field. */
9519 Py_UNICODE_COPY(res, pbuf, len);
9520 res += len;
9521 rescnt -= len;
9522 while (--width >= len) {
9523 --rescnt;
9524 *res++ = ' ';
9526 if (dict && (argidx < arglen) && c != '%') {
9527 PyErr_SetString(PyExc_TypeError,
9528 "not all arguments converted during string formatting");
9529 Py_XDECREF(temp);
9530 goto onError;
9532 Py_XDECREF(temp);
9533 } /* '%' */
9534 } /* until end */
/* Without a mapping, leftover arguments are an error. */
9535 if (argidx < arglen && !dict) {
9536 PyErr_SetString(PyExc_TypeError,
9537 "not all arguments converted during string formatting");
9538 goto onError;
/* Trim the result to the characters actually written. */
9541 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9542 goto onError;
9543 if (args_owned) {
9544 Py_DECREF(args);
9546 Py_DECREF(uformat);
9547 return (PyObject *)result;
/* Common failure exit: release the partial result, the converted
   format and, if held, the value fetched for a %(key). */
9549 onError:
9550 Py_XDECREF(result);
9551 Py_DECREF(uformat);
9552 if (args_owned) {
9553 Py_DECREF(args);
9555 return NULL;
9558 static PyObject *
9559 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9561 static PyObject *
9562 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9564 PyObject *x = NULL;
9565 static char *kwlist[] = {"object", "encoding", "errors", 0};
9566 char *encoding = NULL;
9567 char *errors = NULL;
9569 if (type != &PyUnicode_Type)
9570 return unicode_subtype_new(type, args, kwds);
9571 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9572 kwlist, &x, &encoding, &errors))
9573 return NULL;
9574 if (x == NULL)
9575 return (PyObject *)_PyUnicode_New(0);
9576 if (encoding == NULL && errors == NULL)
9577 return PyObject_Str(x);
9578 else
9579 return PyUnicode_FromEncodedObject(x, encoding, errors);
9582 static PyObject *
9583 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9585 PyUnicodeObject *tmp, *pnew;
9586 Py_ssize_t n;
9588 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9589 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9590 if (tmp == NULL)
9591 return NULL;
9592 assert(PyUnicode_Check(tmp));
9593 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9594 if (pnew == NULL) {
9595 Py_DECREF(tmp);
9596 return NULL;
9598 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9599 if (pnew->str == NULL) {
9600 _Py_ForgetReference((PyObject *)pnew);
9601 PyObject_Del(pnew);
9602 Py_DECREF(tmp);
9603 return PyErr_NoMemory();
9605 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9606 pnew->length = n;
9607 pnew->hash = tmp->hash;
9608 Py_DECREF(tmp);
9609 return (PyObject *)pnew;
9612 PyDoc_STRVAR(unicode_doc,
9613 "str(string[, encoding[, errors]]) -> str\n\
9615 Create a new string object from the given encoded string.\n\
9616 encoding defaults to the current default string encoding.\n\
9617 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9619 static PyObject *unicode_iter(PyObject *seq);
/* Type object for str.  Wires the protocol tables (number, sequence,
   mapping), the method table, repr/str/hash, rich comparison,
   iteration and construction defined in this file into the type, with
   PyBaseObject_Type as its base.  Slots set to 0 are left empty here. */
9621 PyTypeObject PyUnicode_Type = {
9622 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9623 "str", /* tp_name */
9624 sizeof(PyUnicodeObject), /* tp_basicsize */
9625 0, /* tp_itemsize */
9626 /* Slots */
9627 (destructor)unicode_dealloc, /* tp_dealloc */
9628 0, /* tp_print */
9629 0, /* tp_getattr */
9630 0, /* tp_setattr */
9631 0, /* tp_reserved */
9632 unicode_repr, /* tp_repr */
9633 &unicode_as_number, /* tp_as_number */
9634 &unicode_as_sequence, /* tp_as_sequence */
9635 &unicode_as_mapping, /* tp_as_mapping */
9636 (hashfunc) unicode_hash, /* tp_hash*/
9637 0, /* tp_call*/
9638 (reprfunc) unicode_str, /* tp_str */
9639 PyObject_GenericGetAttr, /* tp_getattro */
9640 0, /* tp_setattro */
9641 0, /* tp_as_buffer */
9642 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9643 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
9644 unicode_doc, /* tp_doc */
9645 0, /* tp_traverse */
9646 0, /* tp_clear */
9647 PyUnicode_RichCompare, /* tp_richcompare */
9648 0, /* tp_weaklistoffset */
9649 unicode_iter, /* tp_iter */
9650 0, /* tp_iternext */
9651 unicode_methods, /* tp_methods */
9652 0, /* tp_members */
9653 0, /* tp_getset */
9654 &PyBaseObject_Type, /* tp_base */
9655 0, /* tp_dict */
9656 0, /* tp_descr_get */
9657 0, /* tp_descr_set */
9658 0, /* tp_dictoffset */
9659 0, /* tp_init */
9660 0, /* tp_alloc */
9661 unicode_new, /* tp_new */
9662 PyObject_Del, /* tp_free */
9665 /* Initialize the Unicode implementation */
9667 void _PyUnicode_Init(void)
9669 int i;
9671 /* XXX - move this array to unicodectype.c ? */
9672 Py_UNICODE linebreak[] = {
9673 0x000A, /* LINE FEED */
9674 0x000D, /* CARRIAGE RETURN */
9675 0x001C, /* FILE SEPARATOR */
9676 0x001D, /* GROUP SEPARATOR */
9677 0x001E, /* RECORD SEPARATOR */
9678 0x0085, /* NEXT LINE */
9679 0x2028, /* LINE SEPARATOR */
9680 0x2029, /* PARAGRAPH SEPARATOR */
9683 /* Init the implementation */
9684 free_list = NULL;
9685 numfree = 0;
9686 unicode_empty = _PyUnicode_New(0);
9687 if (!unicode_empty)
9688 return;
9690 for (i = 0; i < 256; i++)
9691 unicode_latin1[i] = NULL;
9692 if (PyType_Ready(&PyUnicode_Type) < 0)
9693 Py_FatalError("Can't initialize 'unicode'");
9695 /* initialize the linebreak bloom filter */
9696 bloom_linebreak = make_bloom_mask(
9697 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9700 PyType_Ready(&EncodingMapType);
9703 /* Finalize the Unicode implementation */
9706 PyUnicode_ClearFreeList(void)
9708 int freelist_size = numfree;
9709 PyUnicodeObject *u;
9711 for (u = free_list; u != NULL;) {
9712 PyUnicodeObject *v = u;
9713 u = *(PyUnicodeObject **)u;
9714 if (v->str)
9715 PyObject_DEL(v->str);
9716 Py_XDECREF(v->defenc);
9717 PyObject_Del(v);
9718 numfree--;
9720 free_list = NULL;
9721 assert(numfree == 0);
9722 return freelist_size;
9725 void
9726 _PyUnicode_Fini(void)
9728 int i;
9730 Py_XDECREF(unicode_empty);
9731 unicode_empty = NULL;
9733 for (i = 0; i < 256; i++) {
9734 if (unicode_latin1[i]) {
9735 Py_DECREF(unicode_latin1[i]);
9736 unicode_latin1[i] = NULL;
9739 (void)PyUnicode_ClearFreeList();
9742 void
9743 PyUnicode_InternInPlace(PyObject **p)
9745 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9746 PyObject *t;
9747 if (s == NULL || !PyUnicode_Check(s))
9748 Py_FatalError(
9749 "PyUnicode_InternInPlace: unicode strings only please!");
9750 /* If it's a subclass, we don't really know what putting
9751 it in the interned dict might do. */
9752 if (!PyUnicode_CheckExact(s))
9753 return;
9754 if (PyUnicode_CHECK_INTERNED(s))
9755 return;
9756 if (interned == NULL) {
9757 interned = PyDict_New();
9758 if (interned == NULL) {
9759 PyErr_Clear(); /* Don't leave an exception */
9760 return;
9763 /* It might be that the GetItem call fails even
9764 though the key is present in the dictionary,
9765 namely when this happens during a stack overflow. */
9766 Py_ALLOW_RECURSION
9767 t = PyDict_GetItem(interned, (PyObject *)s);
9768 Py_END_ALLOW_RECURSION
9770 if (t) {
9771 Py_INCREF(t);
9772 Py_DECREF(*p);
9773 *p = t;
9774 return;
9777 PyThreadState_GET()->recursion_critical = 1;
9778 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9779 PyErr_Clear();
9780 PyThreadState_GET()->recursion_critical = 0;
9781 return;
9783 PyThreadState_GET()->recursion_critical = 0;
9784 /* The two references in interned are not counted by refcnt.
9785 The deallocator will take care of this */
9786 Py_REFCNT(s) -= 2;
9787 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9790 void
9791 PyUnicode_InternImmortal(PyObject **p)
9793 PyUnicode_InternInPlace(p);
9794 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9795 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9796 Py_INCREF(*p);
9800 PyObject *
9801 PyUnicode_InternFromString(const char *cp)
9803 PyObject *s = PyUnicode_FromString(cp);
9804 if (s == NULL)
9805 return NULL;
9806 PyUnicode_InternInPlace(&s);
9807 return s;
9810 void _Py_ReleaseInternedUnicodeStrings(void)
9812 PyObject *keys;
9813 PyUnicodeObject *s;
9814 Py_ssize_t i, n;
9815 Py_ssize_t immortal_size = 0, mortal_size = 0;
9817 if (interned == NULL || !PyDict_Check(interned))
9818 return;
9819 keys = PyDict_Keys(interned);
9820 if (keys == NULL || !PyList_Check(keys)) {
9821 PyErr_Clear();
9822 return;
9825 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9826 detector, interned unicode strings are not forcibly deallocated;
9827 rather, we give them their stolen references back, and then clear
9828 and DECREF the interned dict. */
9830 n = PyList_GET_SIZE(keys);
9831 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9833 for (i = 0; i < n; i++) {
9834 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9835 switch (s->state) {
9836 case SSTATE_NOT_INTERNED:
9837 /* XXX Shouldn't happen */
9838 break;
9839 case SSTATE_INTERNED_IMMORTAL:
9840 Py_REFCNT(s) += 1;
9841 immortal_size += s->length;
9842 break;
9843 case SSTATE_INTERNED_MORTAL:
9844 Py_REFCNT(s) += 2;
9845 mortal_size += s->length;
9846 break;
9847 default:
9848 Py_FatalError("Inconsistent interned string state.");
9850 s->state = SSTATE_NOT_INTERNED;
9852 fprintf(stderr, "total size of all interned strings: "
9853 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9854 "mortal/immortal\n", mortal_size, immortal_size);
9855 Py_DECREF(keys);
9856 PyDict_Clear(interned);
9857 Py_DECREF(interned);
9858 interned = NULL;
9862 /********************* Unicode Iterator **************************/
/* Instance layout of the unicode string iterator.
   it_index is the position of the next character to hand out; it_seq is
   the string being iterated over (the iterator holds a reference to it). */
9864 typedef struct {
9865 PyObject_HEAD
9866 Py_ssize_t it_index;
9867 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9868 } unicodeiterobject;
9870 static void
9871 unicodeiter_dealloc(unicodeiterobject *it)
9873 _PyObject_GC_UNTRACK(it);
9874 Py_XDECREF(it->it_seq);
9875 PyObject_GC_Del(it);
9878 static int
9879 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9881 Py_VISIT(it->it_seq);
9882 return 0;
9885 static PyObject *
9886 unicodeiter_next(unicodeiterobject *it)
9888 PyUnicodeObject *seq;
9889 PyObject *item;
9891 assert(it != NULL);
9892 seq = it->it_seq;
9893 if (seq == NULL)
9894 return NULL;
9895 assert(PyUnicode_Check(seq));
9897 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9898 item = PyUnicode_FromUnicode(
9899 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9900 if (item != NULL)
9901 ++it->it_index;
9902 return item;
9905 Py_DECREF(seq);
9906 it->it_seq = NULL;
9907 return NULL;
9910 static PyObject *
9911 unicodeiter_len(unicodeiterobject *it)
9913 Py_ssize_t len = 0;
9914 if (it->it_seq)
9915 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9916 return PyLong_FromSsize_t(len);
/* Docstring shared by the __length_hint__ entry below. */
9919 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
/* Method table of the unicode iterator type: the only method is
   __length_hint__, implemented by unicodeiter_len() and taking no
   arguments.  The table ends with an all-NULL sentinel entry. */
9921 static PyMethodDef unicodeiter_methods[] = {
9922 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9923 length_hint_doc},
9924 {NULL, NULL} /* sentinel */
/* Type object for unicode string iterators ("str_iterator").
   Instances are unicodeiterobject structs and take part in cyclic GC
   (Py_TPFLAGS_HAVE_GC with unicodeiter_traverse).  The type is its own
   iterator (PyObject_SelfIter), yields items through unicodeiter_next and
   exposes the methods listed in unicodeiter_methods.  All slots given as 0
   are left unset. */
9927 PyTypeObject PyUnicodeIter_Type = {
9928 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9929 "str_iterator", /* tp_name */
9930 sizeof(unicodeiterobject), /* tp_basicsize */
9931 0, /* tp_itemsize */
9932 /* methods */
9933 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9934 0, /* tp_print */
9935 0, /* tp_getattr */
9936 0, /* tp_setattr */
9937 0, /* tp_reserved */
9938 0, /* tp_repr */
9939 0, /* tp_as_number */
9940 0, /* tp_as_sequence */
9941 0, /* tp_as_mapping */
9942 0, /* tp_hash */
9943 0, /* tp_call */
9944 0, /* tp_str */
9945 PyObject_GenericGetAttr, /* tp_getattro */
9946 0, /* tp_setattro */
9947 0, /* tp_as_buffer */
9948 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9949 0, /* tp_doc */
9950 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9951 0, /* tp_clear */
9952 0, /* tp_richcompare */
9953 0, /* tp_weaklistoffset */
9954 PyObject_SelfIter, /* tp_iter */
9955 (iternextfunc)unicodeiter_next, /* tp_iternext */
9956 unicodeiter_methods, /* tp_methods */
9960 static PyObject *
9961 unicode_iter(PyObject *seq)
9963 unicodeiterobject *it;
9965 if (!PyUnicode_Check(seq)) {
9966 PyErr_BadInternalCall();
9967 return NULL;
9969 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9970 if (it == NULL)
9971 return NULL;
9972 it->it_index = 0;
9973 Py_INCREF(seq);
9974 it->it_seq = (PyUnicodeObject *)seq;
9975 _PyObject_GC_TRACK(it);
9976 return (PyObject *)it;
9979 size_t
9980 Py_UNICODE_strlen(const Py_UNICODE *u)
9982 int res = 0;
9983 while(*u++)
9984 res++;
9985 return res;
9988 Py_UNICODE*
9989 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9991 Py_UNICODE *u = s1;
9992 while ((*u++ = *s2++));
9993 return s1;
9996 Py_UNICODE*
9997 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9999 Py_UNICODE *u = s1;
10000 while ((*u++ = *s2++))
10001 if (n-- == 0)
10002 break;
10003 return s1;
10007 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10009 while (*s1 && *s2 && *s1 == *s2)
10010 s1++, s2++;
10011 if (*s1 && *s2)
10012 return (*s1 < *s2) ? -1 : +1;
10013 if (*s1)
10014 return 1;
10015 if (*s2)
10016 return -1;
10017 return 0;
10020 Py_UNICODE*
10021 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10023 const Py_UNICODE *p;
10024 for (p = s; *p; p++)
10025 if (*p == c)
10026 return (Py_UNICODE*)p;
10027 return NULL;
10031 #ifdef __cplusplus
10033 #endif
10037 Local variables:
10038 c-basic-offset: 4
10039 indent-tabs-mode: nil
10040 End: