fix typos, mostly in comments
[python.git] / Objects / unicodeobject.c
blobcac6a2d0ae6cab72ec232bb348edacf03fb59219
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
26 permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WINDOWS
45 #include <windows.h>
46 #endif
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
75 #else
76 # define BYTEORDER_IS_LITTLE_ENDIAN
77 #endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject *unicode_freelist;
88 static int unicode_freelist_size;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject *unicode_empty;
93 /* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95 static PyUnicodeObject *unicode_latin1[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding[100];
106 Py_UNICODE
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
110 return 0x10FFFF;
111 #else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115 #endif
118 /* --- Unicode Object ----------------------------------------------------- */
120 static
121 int unicode_resize(register PyUnicodeObject *unicode,
122 int length)
124 void *oldstr;
126 /* Shortcut if there's nothing much to do. */
127 if (unicode->length == length)
128 goto reset;
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
140 (unsigned int)unicode->str[0] < 256U &&
141 unicode_latin1[unicode->str[0]] == unicode)) {
142 PyErr_SetString(PyExc_SystemError,
143 "can't resize shared unicode objects");
144 return -1;
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
156 unicode->str[length] = 0;
157 unicode->length = length;
159 reset:
160 /* Reset the object caches */
161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
165 unicode->hash = -1;
167 return 0;
170 /* We allocate one more byte to make sure the string is
171 Ux0000 terminated -- XXX is this needed ?
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
178 static
179 PyUnicodeObject *_PyUnicode_New(int length)
181 register PyUnicodeObject *unicode;
183 /* Optimization fo empty strings */
184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
192 unicode_freelist = *(PyUnicodeObject **)unicode;
193 unicode_freelist_size--;
194 if (unicode->str) {
195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
197 if ((unicode->length < length) &&
198 unicode_resize(unicode, length) < 0) {
199 PyMem_DEL(unicode->str);
200 goto onError;
203 else {
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
206 PyObject_INIT(unicode, &PyUnicode_Type);
208 else {
209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
215 if (!unicode->str) {
216 PyErr_NoMemory();
217 goto onError;
219 /* Initialize the first element to guard against cases where
220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
226 unicode->str[0] = 0;
227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
230 unicode->defenc = NULL;
231 return unicode;
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
235 PyObject_Del(unicode);
236 return NULL;
239 static
240 void unicode_dealloc(register PyUnicodeObject *unicode)
242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
246 PyMem_DEL(unicode->str);
247 unicode->str = NULL;
248 unicode->length = 0;
250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
254 /* Add to free list */
255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
259 else {
260 PyMem_DEL(unicode->str);
261 Py_XDECREF(unicode->defenc);
262 unicode->ob_type->tp_free((PyObject *)unicode);
266 int PyUnicode_Resize(PyObject **unicode, int length)
268 register PyUnicodeObject *v;
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
275 v = (PyUnicodeObject *)*unicode;
276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
277 PyErr_BadInternalCall();
278 return -1;
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
284 if (v->length != length &&
285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
291 Py_DECREF(*unicode);
292 *unicode = (PyObject *)w;
293 return 0;
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that accepts the address
   of a PyUnicodeObject* variable and casts it to PyObject**.  Both macro
   arguments are parenthesized so that expression arguments expand
   safely. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
305 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
308 PyUnicodeObject *unicode;
310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
326 if (!unicode)
327 return NULL;
328 unicode->str[0] = *u;
329 unicode_latin1[*u] = unicode;
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
342 Py_UNICODE_COPY(unicode->str, u, size);
344 return (PyObject *)unicode;
347 #ifdef HAVE_WCHAR_H
349 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
352 PyUnicodeObject *unicode;
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
363 /* Copy the wchar_t data into the new object */
364 #ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
366 #else
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i > 0; i--)
372 *u++ = *w++;
374 #endif
376 return (PyObject *)unicode;
379 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
388 /* If possible, try to copy the 0-termination as well */
389 if (size > PyUnicode_GET_SIZE(unicode))
390 size = PyUnicode_GET_SIZE(unicode) + 1;
392 #ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394 #else
396 register Py_UNICODE *u;
397 register int i;
398 u = PyUnicode_AS_UNICODE(unicode);
399 for (i = size; i > 0; i--)
400 *w++ = *u++;
402 #endif
404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
407 return size;
410 #endif
412 PyObject *PyUnicode_FromOrdinal(int ordinal)
414 Py_UNICODE s[1];
416 #ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
423 #else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
430 #endif
432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
436 PyObject *PyUnicode_FromObject(register PyObject *obj)
438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
453 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
457 const char *s = NULL;
458 int len;
459 PyObject *v;
461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
466 #if 0
467 /* For b/w compatibility we also accept Unicode objects provided
468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
480 return NULL;
482 return PyObject_Unicode(obj);
484 #else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
490 #endif
492 /* Coerce object */
493 if (PyString_Check(obj)) {
494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
501 PyErr_Format(PyExc_TypeError,
502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
504 obj->ob_type->tp_name);
505 goto onError;
508 /* Convert to Unicode */
509 if (len == 0) {
510 Py_INCREF(unicode_empty);
511 v = (PyObject *)unicode_empty;
513 else
514 v = PyUnicode_Decode(s, len, encoding, errors);
516 return v;
518 onError:
519 return NULL;
522 PyObject *PyUnicode_Decode(const char *s,
523 int size,
524 const char *encoding,
525 const char *errors)
527 PyObject *buffer = NULL, *unicode;
529 if (encoding == NULL)
530 encoding = PyUnicode_GetDefaultEncoding();
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
534 return PyUnicode_DecodeUTF8(s, size, errors);
535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
537 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540 #endif
541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
553 "decoder did not return an unicode object (type=%.400s)",
554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
558 Py_DECREF(buffer);
559 return unicode;
561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
566 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
570 PyObject *v;
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
586 onError:
587 return NULL;
590 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
591 int size,
592 const char *encoding,
593 const char *errors)
595 PyObject *v, *unicode;
597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
605 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
609 PyObject *v;
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
625 onError:
626 return NULL;
629 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
633 PyObject *v;
635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
640 if (encoding == NULL)
641 encoding = PyUnicode_GetDefaultEncoding();
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
646 return PyUnicode_AsUTF8String(unicode);
647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
649 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652 #endif
653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
663 "encoder did not return a string object (type=%.400s)",
664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
668 return v;
670 onError:
671 return NULL;
674 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
687 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
693 return PyUnicode_AS_UNICODE(unicode);
695 onError:
696 return NULL;
699 int PyUnicode_GetSize(PyObject *unicode)
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
705 return PyUnicode_GET_SIZE(unicode);
707 onError:
708 return -1;
711 const char *PyUnicode_GetDefaultEncoding(void)
713 return unicode_default_encoding;
716 int PyUnicode_SetDefaultEncoding(const char *encoding)
718 PyObject *v;
720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
727 encoding,
728 sizeof(unicode_default_encoding));
729 return 0;
731 onError:
732 return -1;
735 /* error handling callback helper:
736 build arguments, call the callback and check the arguments,
737 if no exception occurred, copy the replacement to the output
738 and adjust various state variables.
739 return 0 on success, -1 on error
742 static
743 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
745 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, int *outpos, Py_UNICODE **outptr)
748 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
752 int outsize = PyUnicode_GET_SIZE(*output);
753 int requiredsize;
754 int newpos;
755 Py_UNICODE *repptr;
756 int repsize;
757 int res = -1;
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
792 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
793 goto onError;
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
806 if (PyUnicode_Resize(output, requiredsize) < 0)
807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
818 onError:
819 Py_XDECREF(restuple);
820 return res;
823 /* --- UTF-7 Codec -------------------------------------------------------- */
825 /* see RFC2152 for details */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
   directly and has to go into a base64 shift sequence.  Class 2
   (whitespace) and class 3 (Set O) characters only count as special
   when the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
        ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a character of the base64 alphabet.
   Explicit ASCII range checks are used instead of isalnum(): c may be a
   Py_UNICODE value outside the unsigned char range, for which the
   <ctype.h> functions are undefined, and isalnum() depends on the
   current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of the base64 character c.  Only meaningful for
   characters accepted by B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   one base64 character through out (which is advanced) and reduce bits
   by 6. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract one 16-bit unit and reduce bits by 16.

   - If surrogate is set, an error has already been generated for the
     previous unit, so the current unit is dropped without further
     checks and the flag is cleared.
   - A unit in the range 0xDC00..0xDFFF is treated as part of a surrogate
     pair, which cannot be represented in a 16-bit character: surrogate
     is set, errmsg is assigned and control jumps to utf7Error.
   - Any other unit is stored through out, which is advanced.

   The expansion site must provide an `errmsg` variable and a
   `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
889 PyObject *PyUnicode_DecodeUTF7(const char *s,
890 int size,
891 const char *errors)
893 const char *starts = s;
894 int startinpos;
895 int endinpos;
896 int outpos;
897 const char *e;
898 PyUnicodeObject *unicode;
899 Py_UNICODE *p;
900 const char *errmsg = "";
901 int inShift = 0;
902 unsigned int bitsleft = 0;
903 unsigned long charsleft = 0;
904 int surrogate = 0;
905 PyObject *errorHandler = NULL;
906 PyObject *exc = NULL;
908 unicode = _PyUnicode_New(size);
909 if (!unicode)
910 return NULL;
911 if (size == 0)
912 return (PyObject *)unicode;
914 p = unicode->str;
915 e = s + size;
917 while (s < e) {
918 Py_UNICODE ch;
919 restart:
920 ch = *s;
922 if (inShift) {
923 if ((ch == '-') || !B64CHAR(ch)) {
924 inShift = 0;
925 s++;
927 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
928 if (bitsleft >= 6) {
929 /* The shift sequence has a partial character in it. If
930 bitsleft < 6 then we could just classify it as padding
931 but that is not the case here */
933 errmsg = "partial character in shift sequence";
934 goto utf7Error;
936 /* According to RFC2152 the remaining bits should be zero. We
937 choose to signal an error/insert a replacement character
938 here so indicate the potential of a misencoded character. */
940 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
941 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
942 errmsg = "non-zero padding bits in shift sequence";
943 goto utf7Error;
946 if (ch == '-') {
947 if ((s < e) && (*(s) == '-')) {
948 *p++ = '-';
949 inShift = 1;
951 } else if (SPECIAL(ch,0,0)) {
952 errmsg = "unexpected special character";
953 goto utf7Error;
954 } else {
955 *p++ = ch;
957 } else {
958 charsleft = (charsleft << 6) | UB64(ch);
959 bitsleft += 6;
960 s++;
961 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
964 else if ( ch == '+' ) {
965 startinpos = s-starts;
966 s++;
967 if (s < e && *s == '-') {
968 s++;
969 *p++ = '+';
970 } else
972 inShift = 1;
973 bitsleft = 0;
976 else if (SPECIAL(ch,0,0)) {
977 errmsg = "unexpected special character";
978 s++;
979 goto utf7Error;
981 else {
982 *p++ = ch;
983 s++;
985 continue;
986 utf7Error:
987 outpos = p-PyUnicode_AS_UNICODE(unicode);
988 endinpos = s-starts;
989 if (unicode_decode_call_errorhandler(
990 errors, &errorHandler,
991 "utf7", errmsg,
992 starts, size, &startinpos, &endinpos, &exc, &s,
993 (PyObject **)&unicode, &outpos, &p))
994 goto onError;
997 if (inShift) {
998 outpos = p-PyUnicode_AS_UNICODE(unicode);
999 endinpos = size;
1000 if (unicode_decode_call_errorhandler(
1001 errors, &errorHandler,
1002 "utf7", "unterminated shift sequence",
1003 starts, size, &startinpos, &endinpos, &exc, &s,
1004 (PyObject **)&unicode, &outpos, &p))
1005 goto onError;
1006 if (s < e)
1007 goto restart;
1010 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1011 goto onError;
1013 Py_XDECREF(errorHandler);
1014 Py_XDECREF(exc);
1015 return (PyObject *)unicode;
1017 onError:
1018 Py_XDECREF(errorHandler);
1019 Py_XDECREF(exc);
1020 Py_DECREF(unicode);
1021 return NULL;
1025 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1026 int size,
1027 int encodeSetO,
1028 int encodeWhiteSpace,
1029 const char *errors)
1031 PyObject *v;
1032 /* It might be possible to tighten this worst case */
1033 unsigned int cbAllocated = 5 * size;
1034 int inShift = 0;
1035 int i = 0;
1036 unsigned int bitsleft = 0;
1037 unsigned long charsleft = 0;
1038 char * out;
1039 char * start;
1041 if (size == 0)
1042 return PyString_FromStringAndSize(NULL, 0);
1044 v = PyString_FromStringAndSize(NULL, cbAllocated);
1045 if (v == NULL)
1046 return NULL;
1048 start = out = PyString_AS_STRING(v);
1049 for (;i < size; ++i) {
1050 Py_UNICODE ch = s[i];
1052 if (!inShift) {
1053 if (ch == '+') {
1054 *out++ = '+';
1055 *out++ = '-';
1056 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1057 charsleft = ch;
1058 bitsleft = 16;
1059 *out++ = '+';
1060 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1061 inShift = bitsleft > 0;
1062 } else {
1063 *out++ = (char) ch;
1065 } else {
1066 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1067 *out++ = B64(charsleft << (6-bitsleft));
1068 charsleft = 0;
1069 bitsleft = 0;
1070 /* Characters not in the BASE64 set implicitly unshift the sequence
1071 so no '-' is required, except if the character is itself a '-' */
1072 if (B64CHAR(ch) || ch == '-') {
1073 *out++ = '-';
1075 inShift = 0;
1076 *out++ = (char) ch;
1077 } else {
1078 bitsleft += 16;
1079 charsleft = (charsleft << 16) | ch;
1080 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1082 /* If the next character is special then we dont' need to terminate
1083 the shift sequence. If the next character is not a BASE64 character
1084 or '-' then the shift sequence will be terminated implicitly and we
1085 don't have to insert a '-'. */
1087 if (bitsleft == 0) {
1088 if (i + 1 < size) {
1089 Py_UNICODE ch2 = s[i+1];
1091 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1093 } else if (B64CHAR(ch2) || ch2 == '-') {
1094 *out++ = '-';
1095 inShift = 0;
1096 } else {
1097 inShift = 0;
1101 else {
1102 *out++ = '-';
1103 inShift = 0;
1109 if (bitsleft) {
1110 *out++= B64(charsleft << (6-bitsleft) );
1111 *out++ = '-';
1114 _PyString_Resize(&v, out - start);
1115 return v;
/* The UTF-7 helper macros are private to the UTF-7 codec above; drop
   them so they cannot leak into the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1125 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence
   it starts.  Zero means illegal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: length 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: length 2 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: length 3 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: 4, 0xF8 - 0xFB: 5, 0xFC - 0xFD: 6, 0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1149 PyObject *PyUnicode_DecodeUTF8(const char *s,
1150 int size,
1151 const char *errors)
1153 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
/* Decode `size` bytes of UTF-8 starting at `s` into a new Unicode object.

   errors   - name of the error handling scheme, passed on unchanged to
              unicode_decode_call_errorhandler().
   consumed - if non-NULL, a multi-byte sequence that is cut off by the end
              of the input is not an error: decoding stops in front of it
              and *consumed receives the number of bytes processed.

   Returns the new object, or NULL after releasing the partially built
   result when the error handler reports failure or the final resize
   fails. */
1156 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1157 int size,
1158 const char *errors,
1159 int *consumed)
1161 const char *starts = s;
1162 int n;
1163 int startinpos;
1164 int endinpos;
1165 int outpos;
1166 const char *e;
1167 PyUnicodeObject *unicode;
1168 Py_UNICODE *p;
1169 const char *errmsg = "";
1170 PyObject *errorHandler = NULL;
1171 PyObject *exc = NULL;
1173 /* Note: size will always be longer than the resulting Unicode
1174 character count */
1175 unicode = _PyUnicode_New(size);
1176 if (!unicode)
1177 return NULL;
1178 if (size == 0) {
1179 if (consumed)
1180 *consumed = 0;
1181 return (PyObject *)unicode;
1184 /* Unpack UTF-8 encoded data */
1185 p = unicode->str;
1186 e = s + size;
1188 while (s < e) {
1189 Py_UCS4 ch = (unsigned char)*s;
/* Fast path: byte values below 0x80 are stored directly. */
1191 if (ch < 0x80) {
1192 *p++ = (Py_UNICODE)ch;
1193 s++;
1194 continue;
/* Expected sequence length for this lead byte, taken from the
   utf8_code_length table. */
1197 n = utf8_code_length[ch];
/* The sequence would extend past the end of the input: with a `consumed`
   pointer the loop simply stops at this byte, otherwise it is reported
   as an error. */
1199 if (s + n > e) {
1200 if (consumed)
1201 break;
1202 else {
1203 errmsg = "unexpected end of data";
1204 startinpos = s-starts;
1205 endinpos = size;
1206 goto utf8Error;
1210 switch (n) {
1212 case 0:
1213 errmsg = "unexpected code byte";
1214 startinpos = s-starts;
1215 endinpos = startinpos+1;
1216 goto utf8Error;
1218 case 1:
1219 errmsg = "internal error";
1220 startinpos = s-starts;
1221 endinpos = startinpos+1;
1222 goto utf8Error;
1224 case 2:
1225 if ((s[1] & 0xc0) != 0x80) {
1226 errmsg = "invalid data";
1227 startinpos = s-starts;
1228 endinpos = startinpos+2;
1229 goto utf8Error;
1231 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1232 if (ch < 0x80) {
1233 startinpos = s-starts;
1234 endinpos = startinpos+2;
1235 errmsg = "illegal encoding";
1236 goto utf8Error;
1238 else
1239 *p++ = (Py_UNICODE)ch;
1240 break;
1242 case 3:
1243 if ((s[1] & 0xc0) != 0x80 ||
1244 (s[2] & 0xc0) != 0x80) {
1245 errmsg = "invalid data";
1246 startinpos = s-starts;
1247 endinpos = startinpos+3;
1248 goto utf8Error;
1250 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1251 if (ch < 0x0800) {
1252 /* Note: UTF-8 encodings of surrogates are considered
1253 legal UTF-8 sequences;
1255 XXX For wide builds (UCS-4) we should probably try
1256 to recombine the surrogates into a single code
1257 unit.
1259 errmsg = "illegal encoding";
1260 startinpos = s-starts;
1261 endinpos = startinpos+3;
1262 goto utf8Error;
1264 else
1265 *p++ = (Py_UNICODE)ch;
1266 break;
1268 case 4:
1269 if ((s[1] & 0xc0) != 0x80 ||
1270 (s[2] & 0xc0) != 0x80 ||
1271 (s[3] & 0xc0) != 0x80) {
1272 errmsg = "invalid data";
1273 startinpos = s-starts;
1274 endinpos = startinpos+4;
1275 goto utf8Error;
1277 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1278 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1279 /* validate and convert to UTF-16 */
1280 if ((ch < 0x10000) /* minimum value allowed for 4
1281 byte encoding */
1282 || (ch > 0x10ffff)) /* maximum value allowed for
1283 UTF-16 */
1285 errmsg = "illegal encoding";
1286 startinpos = s-starts;
1287 endinpos = startinpos+4;
1288 goto utf8Error;
1290 #ifdef Py_UNICODE_WIDE
1291 *p++ = (Py_UNICODE)ch;
1292 #else
1293 /* compute and append the two surrogates: */
1295 /* translate from 10000..10FFFF to 0..FFFF */
1296 ch -= 0x10000;
1298 /* high surrogate = top 10 bits added to D800 */
1299 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1301 /* low surrogate = bottom 10 bits added to DC00 */
1302 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1303 #endif
1304 break;
1306 default:
1307 /* Other sizes are only needed for UCS-4 */
1308 errmsg = "unsupported Unicode code range";
1309 startinpos = s-starts;
1310 endinpos = startinpos+n;
1311 goto utf8Error;
1313 s += n;
1314 continue;
/* Common error exit of the loop body: hand the offending input span and
   the current output position to the error handler helper; a non-zero
   result aborts decoding.  NOTE(review): the helper receives &s and &p,
   so it presumably repositions both - confirm against its definition. */
1316 utf8Error:
1317 outpos = p-PyUnicode_AS_UNICODE(unicode);
1318 if (unicode_decode_call_errorhandler(
1319 errors, &errorHandler,
1320 "utf8", errmsg,
1321 starts, size, &startinpos, &endinpos, &exc, &s,
1322 (PyObject **)&unicode, &outpos, &p))
1323 goto onError;
/* Report how many input bytes were processed. */
1325 if (consumed)
1326 *consumed = s-starts;
1328 /* Adjust length */
1329 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1330 goto onError;
1332 Py_XDECREF(errorHandler);
1333 Py_XDECREF(exc);
1334 return (PyObject *)unicode;
1336 onError:
1337 Py_XDECREF(errorHandler);
1338 Py_XDECREF(exc);
1339 Py_DECREF(unicode);
1340 return NULL;
1343 /* Allocation strategy: if the string is short, convert into a stack buffer
1344 and allocate exactly as much space needed at the end. Else allocate the
1345 maximum possible needed (4 result bytes per Unicode character), and return
1346 the excess memory at the end.
1348 PyObject *
1349 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1350 int size,
1351 const char *errors)
1353 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1355 int i; /* index into s of next input byte */
1356 PyObject *v; /* result string object */
1357 char *p; /* next free byte in output buffer */
1358 int nallocated; /* number of result bytes allocated */
1359 int nneeded; /* number of result bytes needed */
1360 char stackbuf[MAX_SHORT_UNICHARS * 4];
1362 assert(s != NULL);
1363 assert(size >= 0);
1365 if (size <= MAX_SHORT_UNICHARS) {
1366 /* Write into the stack buffer; nallocated can't overflow.
1367 * At the end, we'll allocate exactly as much heap space as it
1368 * turns out we need.
1370 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1371 v = NULL; /* will allocate after we're done */
1372 p = stackbuf;
1374 else {
1375 /* Overallocate on the heap, and give the excess back at the end. */
1376 nallocated = size * 4;
1377 if (nallocated / 4 != size) /* overflow! */
1378 return PyErr_NoMemory();
1379 v = PyString_FromStringAndSize(NULL, nallocated);
1380 if (v == NULL)
1381 return NULL;
1382 p = PyString_AS_STRING(v);
1385 for (i = 0; i < size;) {
1386 Py_UCS4 ch = s[i++];
1388 if (ch < 0x80)
1389 /* Encode ASCII */
1390 *p++ = (char) ch;
1392 else if (ch < 0x0800) {
1393 /* Encode Latin-1 */
1394 *p++ = (char)(0xc0 | (ch >> 6));
1395 *p++ = (char)(0x80 | (ch & 0x3f));
1397 else {
1398 /* Encode UCS2 Unicode ordinals */
1399 if (ch < 0x10000) {
1400 /* Special case: check for high surrogate */
1401 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1402 Py_UCS4 ch2 = s[i];
1403 /* Check for low surrogate and combine the two to
1404 form a UCS4 value */
1405 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1406 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1407 i++;
1408 goto encodeUCS4;
1410 /* Fall through: handles isolated high surrogates */
1412 *p++ = (char)(0xe0 | (ch >> 12));
1413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 continue;
1417 encodeUCS4:
1418 /* Encode UCS4 Unicode ordinals */
1419 *p++ = (char)(0xf0 | (ch >> 18));
1420 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1421 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1422 *p++ = (char)(0x80 | (ch & 0x3f));
1426 if (v == NULL) {
1427 /* This was stack allocated. */
1428 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1429 assert(nneeded <= nallocated);
1430 v = PyString_FromStringAndSize(stackbuf, nneeded);
1432 else {
1433 /* Cut back to size actually needed. */
1434 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1435 assert(nneeded <= nallocated);
1436 _PyString_Resize(&v, nneeded);
1438 return v;
1440 #undef MAX_SHORT_UNICHARS
1443 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1445 if (!PyUnicode_Check(unicode)) {
1446 PyErr_BadArgument();
1447 return NULL;
1449 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1450 PyUnicode_GET_SIZE(unicode),
1451 NULL);
1454 /* --- UTF-16 Codec ------------------------------------------------------- */
1456 PyObject *
1457 PyUnicode_DecodeUTF16(const char *s,
1458 int size,
1459 const char *errors,
1460 int *byteorder)
1462 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1465 PyObject *
1466 PyUnicode_DecodeUTF16Stateful(const char *s,
1467 int size,
1468 const char *errors,
1469 int *byteorder,
1470 int *consumed)
1472 const char *starts = s;
1473 int startinpos;
1474 int endinpos;
1475 int outpos;
1476 PyUnicodeObject *unicode;
1477 Py_UNICODE *p;
1478 const unsigned char *q, *e;
1479 int bo = 0; /* assume native ordering by default */
1480 const char *errmsg = "";
1481 /* Offsets from q for retrieving byte pairs in the right order. */
1482 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483 int ihi = 1, ilo = 0;
1484 #else
1485 int ihi = 0, ilo = 1;
1486 #endif
1487 PyObject *errorHandler = NULL;
1488 PyObject *exc = NULL;
1490 /* Note: size will always be longer than the resulting Unicode
1491 character count */
1492 unicode = _PyUnicode_New(size);
1493 if (!unicode)
1494 return NULL;
1495 if (size == 0)
1496 return (PyObject *)unicode;
1498 /* Unpack UTF-16 encoded data */
1499 p = unicode->str;
1500 q = (unsigned char *)s;
1501 e = q + size;
1503 if (byteorder)
1504 bo = *byteorder;
1506 /* Check for BOM marks (U+FEFF) in the input and adjust current
1507 byte order setting accordingly. In native mode, the leading BOM
1508 mark is skipped, in all other modes, it is copied to the output
1509 stream as-is (giving a ZWNBSP character). */
1510 if (bo == 0) {
1511 if (size >= 2) {
1512 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1513 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1514 if (bom == 0xFEFF) {
1515 q += 2;
1516 bo = -1;
1518 else if (bom == 0xFFFE) {
1519 q += 2;
1520 bo = 1;
1522 #else
1523 if (bom == 0xFEFF) {
1524 q += 2;
1525 bo = 1;
1527 else if (bom == 0xFFFE) {
1528 q += 2;
1529 bo = -1;
1531 #endif
1535 if (bo == -1) {
1536 /* force LE */
1537 ihi = 1;
1538 ilo = 0;
1540 else if (bo == 1) {
1541 /* force BE */
1542 ihi = 0;
1543 ilo = 1;
1546 while (q < e) {
1547 Py_UNICODE ch;
1548 /* remaining bytes at the end? (size should be even) */
1549 if (e-q<2) {
1550 if (consumed)
1551 break;
1552 errmsg = "truncated data";
1553 startinpos = ((const char *)q)-starts;
1554 endinpos = ((const char *)e)-starts;
1555 goto utf16Error;
1556 /* The remaining input chars are ignored if the callback
1557 chooses to skip the input */
1559 ch = (q[ihi] << 8) | q[ilo];
1561 q += 2;
1563 if (ch < 0xD800 || ch > 0xDFFF) {
1564 *p++ = ch;
1565 continue;
1568 /* UTF-16 code pair: */
1569 if (q >= e) {
1570 errmsg = "unexpected end of data";
1571 startinpos = (((const char *)q)-2)-starts;
1572 endinpos = ((const char *)e)-starts;
1573 goto utf16Error;
1575 if (0xD800 <= ch && ch <= 0xDBFF) {
1576 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1577 q += 2;
1578 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1579 #ifndef Py_UNICODE_WIDE
1580 *p++ = ch;
1581 *p++ = ch2;
1582 #else
1583 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1584 #endif
1585 continue;
1587 else {
1588 errmsg = "illegal UTF-16 surrogate";
1589 startinpos = (((const char *)q)-4)-starts;
1590 endinpos = startinpos+2;
1591 goto utf16Error;
1595 errmsg = "illegal encoding";
1596 startinpos = (((const char *)q)-2)-starts;
1597 endinpos = startinpos+2;
1598 /* Fall through to report the error */
1600 utf16Error:
1601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 if (unicode_decode_call_errorhandler(
1603 errors, &errorHandler,
1604 "utf16", errmsg,
1605 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1606 (PyObject **)&unicode, &outpos, &p))
1607 goto onError;
1610 if (byteorder)
1611 *byteorder = bo;
1613 if (consumed)
1614 *consumed = (const char *)q-starts;
1616 /* Adjust length */
1617 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1618 goto onError;
1620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
1622 return (PyObject *)unicode;
1624 onError:
1625 Py_DECREF(unicode);
1626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
1628 return NULL;
1631 PyObject *
1632 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1633 int size,
1634 const char *errors,
1635 int byteorder)
1637 PyObject *v;
1638 unsigned char *p;
1639 #ifdef Py_UNICODE_WIDE
1640 int i, pairs;
1641 #else
1642 const int pairs = 0;
1643 #endif
1644 /* Offsets from p for storing byte pairs in the right order. */
1645 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646 int ihi = 1, ilo = 0;
1647 #else
1648 int ihi = 0, ilo = 1;
1649 #endif
1651 #define STORECHAR(CH) \
1652 do { \
1653 p[ihi] = ((CH) >> 8) & 0xff; \
1654 p[ilo] = (CH) & 0xff; \
1655 p += 2; \
1656 } while(0)
1658 #ifdef Py_UNICODE_WIDE
1659 for (i = pairs = 0; i < size; i++)
1660 if (s[i] >= 0x10000)
1661 pairs++;
1662 #endif
1663 v = PyString_FromStringAndSize(NULL,
1664 2 * (size + pairs + (byteorder == 0)));
1665 if (v == NULL)
1666 return NULL;
1668 p = (unsigned char *)PyString_AS_STRING(v);
1669 if (byteorder == 0)
1670 STORECHAR(0xFEFF);
1671 if (size == 0)
1672 return v;
1674 if (byteorder == -1) {
1675 /* force LE */
1676 ihi = 1;
1677 ilo = 0;
1679 else if (byteorder == 1) {
1680 /* force BE */
1681 ihi = 0;
1682 ilo = 1;
1685 while (size-- > 0) {
1686 Py_UNICODE ch = *s++;
1687 Py_UNICODE ch2 = 0;
1688 #ifdef Py_UNICODE_WIDE
1689 if (ch >= 0x10000) {
1690 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1691 ch = 0xD800 | ((ch-0x10000) >> 10);
1693 #endif
1694 STORECHAR(ch);
1695 if (ch2)
1696 STORECHAR(ch2);
1698 return v;
1699 #undef STORECHAR
1702 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1704 if (!PyUnicode_Check(unicode)) {
1705 PyErr_BadArgument();
1706 return NULL;
1708 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1709 PyUnicode_GET_SIZE(unicode),
1710 NULL,
1714 /* --- Unicode Escape Codec ----------------------------------------------- */
1716 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1718 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1719 int size,
1720 const char *errors)
1722 const char *starts = s;
1723 int startinpos;
1724 int endinpos;
1725 int outpos;
1726 int i;
1727 PyUnicodeObject *v;
1728 Py_UNICODE *p;
1729 const char *end;
1730 char* message;
1731 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1732 PyObject *errorHandler = NULL;
1733 PyObject *exc = NULL;
1735 /* Escaped strings will always be longer than the resulting
1736 Unicode string, so we start with size here and then reduce the
1737 length after conversion to the true value.
1738 (but if the error callback returns a long replacement string
1739 we'll have to allocate more space) */
1740 v = _PyUnicode_New(size);
1741 if (v == NULL)
1742 goto onError;
1743 if (size == 0)
1744 return (PyObject *)v;
1746 p = PyUnicode_AS_UNICODE(v);
1747 end = s + size;
1749 while (s < end) {
1750 unsigned char c;
1751 Py_UNICODE x;
1752 int digits;
1754 /* Non-escape characters are interpreted as Unicode ordinals */
1755 if (*s != '\\') {
1756 *p++ = (unsigned char) *s++;
1757 continue;
1760 startinpos = s-starts;
1761 /* \ - Escapes */
1762 s++;
1763 switch (*s++) {
1765 /* \x escapes */
1766 case '\n': break;
1767 case '\\': *p++ = '\\'; break;
1768 case '\'': *p++ = '\''; break;
1769 case '\"': *p++ = '\"'; break;
1770 case 'b': *p++ = '\b'; break;
1771 case 'f': *p++ = '\014'; break; /* FF */
1772 case 't': *p++ = '\t'; break;
1773 case 'n': *p++ = '\n'; break;
1774 case 'r': *p++ = '\r'; break;
1775 case 'v': *p++ = '\013'; break; /* VT */
1776 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1778 /* \OOO (octal) escapes */
1779 case '0': case '1': case '2': case '3':
1780 case '4': case '5': case '6': case '7':
1781 x = s[-1] - '0';
1782 if ('0' <= *s && *s <= '7') {
1783 x = (x<<3) + *s++ - '0';
1784 if ('0' <= *s && *s <= '7')
1785 x = (x<<3) + *s++ - '0';
1787 *p++ = x;
1788 break;
1790 /* hex escapes */
1791 /* \xXX */
1792 case 'x':
1793 digits = 2;
1794 message = "truncated \\xXX escape";
1795 goto hexescape;
1797 /* \uXXXX */
1798 case 'u':
1799 digits = 4;
1800 message = "truncated \\uXXXX escape";
1801 goto hexescape;
1803 /* \UXXXXXXXX */
1804 case 'U':
1805 digits = 8;
1806 message = "truncated \\UXXXXXXXX escape";
1807 hexescape:
1808 chr = 0;
1809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (s+digits>end) {
1811 endinpos = size;
1812 if (unicode_decode_call_errorhandler(
1813 errors, &errorHandler,
1814 "unicodeescape", "end of string in escape sequence",
1815 starts, size, &startinpos, &endinpos, &exc, &s,
1816 (PyObject **)&v, &outpos, &p))
1817 goto onError;
1818 goto nextByte;
1820 for (i = 0; i < digits; ++i) {
1821 c = (unsigned char) s[i];
1822 if (!isxdigit(c)) {
1823 endinpos = (s+i+1)-starts;
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "unicodeescape", message,
1827 starts, size, &startinpos, &endinpos, &exc, &s,
1828 (PyObject **)&v, &outpos, &p))
1829 goto onError;
1830 goto nextByte;
1832 chr = (chr<<4) & ~0xF;
1833 if (c >= '0' && c <= '9')
1834 chr += c - '0';
1835 else if (c >= 'a' && c <= 'f')
1836 chr += 10 + c - 'a';
1837 else
1838 chr += 10 + c - 'A';
1840 s += i;
1841 if (chr == 0xffffffff && PyErr_Occurred())
1842 /* _decoding_error will have already written into the
1843 target buffer. */
1844 break;
1845 store:
1846 /* when we get here, chr is a 32-bit unicode character */
1847 if (chr <= 0xffff)
1848 /* UCS-2 character */
1849 *p++ = (Py_UNICODE) chr;
1850 else if (chr <= 0x10ffff) {
1851 /* UCS-4 character. Either store directly, or as
1852 surrogate pair. */
1853 #ifdef Py_UNICODE_WIDE
1854 *p++ = chr;
1855 #else
1856 chr -= 0x10000L;
1857 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1858 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1859 #endif
1860 } else {
1861 endinpos = s-starts;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "unicodeescape", "illegal Unicode character",
1866 starts, size, &startinpos, &endinpos, &exc, &s,
1867 (PyObject **)&v, &outpos, &p))
1868 goto onError;
1870 break;
1872 /* \N{name} */
1873 case 'N':
1874 message = "malformed \\N character escape";
1875 if (ucnhash_CAPI == NULL) {
1876 /* load the unicode data module */
1877 PyObject *m, *v;
1878 m = PyImport_ImportModule("unicodedata");
1879 if (m == NULL)
1880 goto ucnhashError;
1881 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1882 Py_DECREF(m);
1883 if (v == NULL)
1884 goto ucnhashError;
1885 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1886 Py_DECREF(v);
1887 if (ucnhash_CAPI == NULL)
1888 goto ucnhashError;
1890 if (*s == '{') {
1891 const char *start = s+1;
1892 /* look for the closing brace */
1893 while (*s != '}' && s < end)
1894 s++;
1895 if (s > start && s < end && *s == '}') {
1896 /* found a name. look it up in the unicode database */
1897 message = "unknown Unicode character name";
1898 s++;
1899 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1900 goto store;
1903 endinpos = s-starts;
1904 outpos = p-PyUnicode_AS_UNICODE(v);
1905 if (unicode_decode_call_errorhandler(
1906 errors, &errorHandler,
1907 "unicodeescape", message,
1908 starts, size, &startinpos, &endinpos, &exc, &s,
1909 (PyObject **)&v, &outpos, &p))
1910 goto onError;
1911 break;
1913 default:
1914 if (s > end) {
1915 message = "\\ at end of string";
1916 s--;
1917 endinpos = s-starts;
1918 outpos = p-PyUnicode_AS_UNICODE(v);
1919 if (unicode_decode_call_errorhandler(
1920 errors, &errorHandler,
1921 "unicodeescape", message,
1922 starts, size, &startinpos, &endinpos, &exc, &s,
1923 (PyObject **)&v, &outpos, &p))
1924 goto onError;
1926 else {
1927 *p++ = '\\';
1928 *p++ = (unsigned char)s[-1];
1930 break;
1932 nextByte:
1935 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
1936 goto onError;
1937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
1939 return (PyObject *)v;
1941 ucnhashError:
1942 PyErr_SetString(
1943 PyExc_UnicodeError,
1944 "\\N escapes not supported (can't load unicodedata module)"
1946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
1948 return NULL;
1950 onError:
1951 Py_XDECREF(v);
1952 Py_XDECREF(errorHandler);
1953 Py_XDECREF(exc);
1954 return NULL;
1957 /* Return a Unicode-Escape string version of the Unicode object.
1959 If quotes is true, the string is enclosed in u"" or u'' quotes as
1960 appropriate.
1964 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1965 int size,
1966 Py_UNICODE ch);
1968 static
1969 PyObject *unicodeescape_string(const Py_UNICODE *s,
1970 int size,
1971 int quotes)
1973 PyObject *repr;
1974 char *p;
1976 static const char *hexdigit = "0123456789abcdef";
1978 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1979 if (repr == NULL)
1980 return NULL;
1982 p = PyString_AS_STRING(repr);
1984 if (quotes) {
1985 *p++ = 'u';
1986 *p++ = (findchar(s, size, '\'') &&
1987 !findchar(s, size, '"')) ? '"' : '\'';
1989 while (size-- > 0) {
1990 Py_UNICODE ch = *s++;
1992 /* Escape quotes */
1993 if (quotes &&
1994 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1995 *p++ = '\\';
1996 *p++ = (char) ch;
1997 continue;
2000 #ifdef Py_UNICODE_WIDE
2001 /* Map 21-bit characters to '\U00xxxxxx' */
2002 else if (ch >= 0x10000) {
2003 int offset = p - PyString_AS_STRING(repr);
2005 /* Resize the string if necessary */
2006 if (offset + 12 > PyString_GET_SIZE(repr)) {
2007 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
2008 return NULL;
2009 p = PyString_AS_STRING(repr) + offset;
2012 *p++ = '\\';
2013 *p++ = 'U';
2014 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2015 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2016 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2021 *p++ = hexdigit[ch & 0x0000000F];
2022 continue;
2024 #endif
2025 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2026 else if (ch >= 0xD800 && ch < 0xDC00) {
2027 Py_UNICODE ch2;
2028 Py_UCS4 ucs;
2030 ch2 = *s++;
2031 size--;
2032 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2033 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2034 *p++ = '\\';
2035 *p++ = 'U';
2036 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2037 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2038 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2043 *p++ = hexdigit[ucs & 0x0000000F];
2044 continue;
2046 /* Fall through: isolated surrogates are copied as-is */
2047 s--;
2048 size++;
2051 /* Map 16-bit characters to '\uxxxx' */
2052 if (ch >= 256) {
2053 *p++ = '\\';
2054 *p++ = 'u';
2055 *p++ = hexdigit[(ch >> 12) & 0x000F];
2056 *p++ = hexdigit[(ch >> 8) & 0x000F];
2057 *p++ = hexdigit[(ch >> 4) & 0x000F];
2058 *p++ = hexdigit[ch & 0x000F];
2061 /* Map special whitespace to '\t', \n', '\r' */
2062 else if (ch == '\t') {
2063 *p++ = '\\';
2064 *p++ = 't';
2066 else if (ch == '\n') {
2067 *p++ = '\\';
2068 *p++ = 'n';
2070 else if (ch == '\r') {
2071 *p++ = '\\';
2072 *p++ = 'r';
2075 /* Map non-printable US ASCII to '\xhh' */
2076 else if (ch < ' ' || ch >= 0x7F) {
2077 *p++ = '\\';
2078 *p++ = 'x';
2079 *p++ = hexdigit[(ch >> 4) & 0x000F];
2080 *p++ = hexdigit[ch & 0x000F];
2083 /* Copy everything else as-is */
2084 else
2085 *p++ = (char) ch;
2087 if (quotes)
2088 *p++ = PyString_AS_STRING(repr)[1];
2090 *p = '\0';
2091 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2092 return repr;
2095 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2096 int size)
2098 return unicodeescape_string(s, size, 0);
2101 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2103 if (!PyUnicode_Check(unicode)) {
2104 PyErr_BadArgument();
2105 return NULL;
2107 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2108 PyUnicode_GET_SIZE(unicode));
2111 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2113 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2114 int size,
2115 const char *errors)
2117 const char *starts = s;
2118 int startinpos;
2119 int endinpos;
2120 int outpos;
2121 PyUnicodeObject *v;
2122 Py_UNICODE *p;
2123 const char *end;
2124 const char *bs;
2125 PyObject *errorHandler = NULL;
2126 PyObject *exc = NULL;
2128 /* Escaped strings will always be longer than the resulting
2129 Unicode string, so we start with size here and then reduce the
2130 length after conversion to the true value. (But decoding error
2131 handler might have to resize the string) */
2132 v = _PyUnicode_New(size);
2133 if (v == NULL)
2134 goto onError;
2135 if (size == 0)
2136 return (PyObject *)v;
2137 p = PyUnicode_AS_UNICODE(v);
2138 end = s + size;
2139 while (s < end) {
2140 unsigned char c;
2141 Py_UCS4 x;
2142 int i;
2143 int count;
2145 /* Non-escape characters are interpreted as Unicode ordinals */
2146 if (*s != '\\') {
2147 *p++ = (unsigned char)*s++;
2148 continue;
2150 startinpos = s-starts;
2152 /* \u-escapes are only interpreted iff the number of leading
2153 backslashes if odd */
2154 bs = s;
2155 for (;s < end;) {
2156 if (*s != '\\')
2157 break;
2158 *p++ = (unsigned char)*s++;
2160 if (((s - bs) & 1) == 0 ||
2161 s >= end ||
2162 (*s != 'u' && *s != 'U')) {
2163 continue;
2165 p--;
2166 count = *s=='u' ? 4 : 8;
2167 s++;
2169 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2170 outpos = p-PyUnicode_AS_UNICODE(v);
2171 for (x = 0, i = 0; i < count; ++i, ++s) {
2172 c = (unsigned char)*s;
2173 if (!isxdigit(c)) {
2174 endinpos = s-starts;
2175 if (unicode_decode_call_errorhandler(
2176 errors, &errorHandler,
2177 "rawunicodeescape", "truncated \\uXXXX",
2178 starts, size, &startinpos, &endinpos, &exc, &s,
2179 (PyObject **)&v, &outpos, &p))
2180 goto onError;
2181 goto nextByte;
2183 x = (x<<4) & ~0xF;
2184 if (c >= '0' && c <= '9')
2185 x += c - '0';
2186 else if (c >= 'a' && c <= 'f')
2187 x += 10 + c - 'a';
2188 else
2189 x += 10 + c - 'A';
2191 #ifndef Py_UNICODE_WIDE
2192 if (x > 0x10000) {
2193 if (unicode_decode_call_errorhandler(
2194 errors, &errorHandler,
2195 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2196 starts, size, &startinpos, &endinpos, &exc, &s,
2197 (PyObject **)&v, &outpos, &p))
2198 goto onError;
2200 #endif
2201 *p++ = x;
2202 nextByte:
2205 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2206 goto onError;
2207 Py_XDECREF(errorHandler);
2208 Py_XDECREF(exc);
2209 return (PyObject *)v;
2211 onError:
2212 Py_XDECREF(v);
2213 Py_XDECREF(errorHandler);
2214 Py_XDECREF(exc);
2215 return NULL;
2218 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2219 int size)
2221 PyObject *repr;
2222 char *p;
2223 char *q;
2225 static const char *hexdigit = "0123456789abcdef";
2227 #ifdef Py_UNICODE_WIDE
2228 repr = PyString_FromStringAndSize(NULL, 10 * size);
2229 #else
2230 repr = PyString_FromStringAndSize(NULL, 6 * size);
2231 #endif
2232 if (repr == NULL)
2233 return NULL;
2234 if (size == 0)
2235 return repr;
2237 p = q = PyString_AS_STRING(repr);
2238 while (size-- > 0) {
2239 Py_UNICODE ch = *s++;
2240 #ifdef Py_UNICODE_WIDE
2241 /* Map 32-bit characters to '\Uxxxxxxxx' */
2242 if (ch >= 0x10000) {
2243 *p++ = '\\';
2244 *p++ = 'U';
2245 *p++ = hexdigit[(ch >> 28) & 0xf];
2246 *p++ = hexdigit[(ch >> 24) & 0xf];
2247 *p++ = hexdigit[(ch >> 20) & 0xf];
2248 *p++ = hexdigit[(ch >> 16) & 0xf];
2249 *p++ = hexdigit[(ch >> 12) & 0xf];
2250 *p++ = hexdigit[(ch >> 8) & 0xf];
2251 *p++ = hexdigit[(ch >> 4) & 0xf];
2252 *p++ = hexdigit[ch & 15];
2254 else
2255 #endif
2256 /* Map 16-bit characters to '\uxxxx' */
2257 if (ch >= 256) {
2258 *p++ = '\\';
2259 *p++ = 'u';
2260 *p++ = hexdigit[(ch >> 12) & 0xf];
2261 *p++ = hexdigit[(ch >> 8) & 0xf];
2262 *p++ = hexdigit[(ch >> 4) & 0xf];
2263 *p++ = hexdigit[ch & 15];
2265 /* Copy everything else as-is */
2266 else
2267 *p++ = (char) ch;
2269 *p = '\0';
2270 _PyString_Resize(&repr, p - q);
2271 return repr;
2274 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2280 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode));
2284 /* --- Unicode Internal Codec ------------------------------------------- */
2286 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2287 int size,
2288 const char *errors)
2290 const char *starts = s;
2291 int startinpos;
2292 int endinpos;
2293 int outpos;
2294 Py_UNICODE unimax;
2295 PyUnicodeObject *v;
2296 Py_UNICODE *p;
2297 const char *end;
2298 const char *reason;
2299 PyObject *errorHandler = NULL;
2300 PyObject *exc = NULL;
2302 unimax = PyUnicode_GetMax();
2303 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2304 if (v == NULL)
2305 goto onError;
2306 if (PyUnicode_GetSize((PyObject *)v) == 0)
2307 return (PyObject *)v;
2308 p = PyUnicode_AS_UNICODE(v);
2309 end = s + size;
2311 while (s < end) {
2312 *p = *(Py_UNICODE *)s;
2313 /* We have to sanity check the raw data, otherwise doom looms for
2314 some malformed UCS-4 data. */
2315 if (
2316 #ifdef Py_UNICODE_WIDE
2317 *p > unimax || *p < 0 ||
2318 #endif
2319 end-s < Py_UNICODE_SIZE
2322 startinpos = s - starts;
2323 if (end-s < Py_UNICODE_SIZE) {
2324 endinpos = end-starts;
2325 reason = "truncated input";
2327 else {
2328 endinpos = s - starts + Py_UNICODE_SIZE;
2329 reason = "illegal code point (> 0x10FFFF)";
2331 outpos = p - PyUnicode_AS_UNICODE(v);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "unicode_internal", reason,
2335 starts, size, &startinpos, &endinpos, &exc, &s,
2336 (PyObject **)&v, &outpos, &p)) {
2337 goto onError;
2340 else {
2341 p++;
2342 s += Py_UNICODE_SIZE;
2346 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2347 goto onError;
2348 Py_XDECREF(errorHandler);
2349 Py_XDECREF(exc);
2350 return (PyObject *)v;
2352 onError:
2353 Py_XDECREF(v);
2354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
2356 return NULL;
2359 /* --- Latin-1 Codec ------------------------------------------------------ */
2361 PyObject *PyUnicode_DecodeLatin1(const char *s,
2362 int size,
2363 const char *errors)
2365 PyUnicodeObject *v;
2366 Py_UNICODE *p;
2368 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2369 if (size == 1) {
2370 Py_UNICODE r = *(unsigned char*)s;
2371 return PyUnicode_FromUnicode(&r, 1);
2374 v = _PyUnicode_New(size);
2375 if (v == NULL)
2376 goto onError;
2377 if (size == 0)
2378 return (PyObject *)v;
2379 p = PyUnicode_AS_UNICODE(v);
2380 while (size-- > 0)
2381 *p++ = (unsigned char)*s++;
2382 return (PyObject *)v;
2384 onError:
2385 Py_XDECREF(v);
2386 return NULL;
2389 /* create or adjust a UnicodeEncodeError */
2390 static void make_encode_exception(PyObject **exceptionObject,
2391 const char *encoding,
2392 const Py_UNICODE *unicode, int size,
2393 int startpos, int endpos,
2394 const char *reason)
2396 if (*exceptionObject == NULL) {
2397 *exceptionObject = PyUnicodeEncodeError_Create(
2398 encoding, unicode, size, startpos, endpos, reason);
2400 else {
2401 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2402 goto onError;
2403 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2404 goto onError;
2405 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2406 goto onError;
2407 return;
2408 onError:
2409 Py_DECREF(*exceptionObject);
2410 *exceptionObject = NULL;
2414 /* raises a UnicodeEncodeError */
2415 static void raise_encode_exception(PyObject **exceptionObject,
2416 const char *encoding,
2417 const Py_UNICODE *unicode, int size,
2418 int startpos, int endpos,
2419 const char *reason)
2421 make_encode_exception(exceptionObject,
2422 encoding, unicode, size, startpos, endpos, reason);
2423 if (*exceptionObject != NULL)
2424 PyCodec_StrictErrors(*exceptionObject);
2427 /* error handling callback helper:
2428 build arguments, call the callback and check the arguments,
2429 put the result into newpos and return the replacement string, which
2430 has to be freed by the caller */
2431 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2432 PyObject **errorHandler,
2433 const char *encoding, const char *reason,
2434 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2435 int startpos, int endpos,
2436 int *newpos)
2438 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2440 PyObject *restuple;
2441 PyObject *resunicode;
2443 if (*errorHandler == NULL) {
2444 *errorHandler = PyCodec_LookupError(errors);
2445 if (*errorHandler == NULL)
2446 return NULL;
2449 make_encode_exception(exceptionObject,
2450 encoding, unicode, size, startpos, endpos, reason);
2451 if (*exceptionObject == NULL)
2452 return NULL;
2454 restuple = PyObject_CallFunctionObjArgs(
2455 *errorHandler, *exceptionObject, NULL);
2456 if (restuple == NULL)
2457 return NULL;
2458 if (!PyTuple_Check(restuple)) {
2459 PyErr_Format(PyExc_TypeError, &argparse[4]);
2460 Py_DECREF(restuple);
2461 return NULL;
2463 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2464 &resunicode, newpos)) {
2465 Py_DECREF(restuple);
2466 return NULL;
2468 if (*newpos<0)
2469 *newpos = size+*newpos;
2470 if (*newpos<0 || *newpos>size) {
2471 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2472 Py_DECREF(restuple);
2473 return NULL;
2475 Py_INCREF(resunicode);
2476 Py_DECREF(restuple);
2477 return resunicode;
2480 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2481 int size,
2482 const char *errors,
2483 int limit)
2485 /* output object */
2486 PyObject *res;
2487 /* pointers to the beginning and end+1 of input */
2488 const Py_UNICODE *startp = p;
2489 const Py_UNICODE *endp = p + size;
2490 /* pointer to the beginning of the unencodable characters */
2491 /* const Py_UNICODE *badp = NULL; */
2492 /* pointer into the output */
2493 char *str;
2494 /* current output position */
2495 int respos = 0;
2496 int ressize;
2497 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2498 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2499 PyObject *errorHandler = NULL;
2500 PyObject *exc = NULL;
2501 /* the following variable is used for caching string comparisons
2502 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2503 int known_errorHandler = -1;
2505 /* allocate enough for a simple encoding without
2506 replacements, if we need more, we'll resize */
2507 res = PyString_FromStringAndSize(NULL, size);
2508 if (res == NULL)
2509 goto onError;
2510 if (size == 0)
2511 return res;
2512 str = PyString_AS_STRING(res);
2513 ressize = size;
2515 while (p<endp) {
2516 Py_UNICODE c = *p;
2518 /* can we encode this? */
2519 if (c<limit) {
2520 /* no overflow check, because we know that the space is enough */
2521 *str++ = (char)c;
2522 ++p;
2524 else {
2525 int unicodepos = p-startp;
2526 int requiredsize;
2527 PyObject *repunicode;
2528 int repsize;
2529 int newpos;
2530 int respos;
2531 Py_UNICODE *uni2;
2532 /* startpos for collecting unencodable chars */
2533 const Py_UNICODE *collstart = p;
2534 const Py_UNICODE *collend = p;
2535 /* find all unecodable characters */
2536 while ((collend < endp) && ((*collend)>=limit))
2537 ++collend;
2538 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2539 if (known_errorHandler==-1) {
2540 if ((errors==NULL) || (!strcmp(errors, "strict")))
2541 known_errorHandler = 1;
2542 else if (!strcmp(errors, "replace"))
2543 known_errorHandler = 2;
2544 else if (!strcmp(errors, "ignore"))
2545 known_errorHandler = 3;
2546 else if (!strcmp(errors, "xmlcharrefreplace"))
2547 known_errorHandler = 4;
2548 else
2549 known_errorHandler = 0;
2551 switch (known_errorHandler) {
2552 case 1: /* strict */
2553 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2554 goto onError;
2555 case 2: /* replace */
2556 while (collstart++<collend)
2557 *str++ = '?'; /* fall through */
2558 case 3: /* ignore */
2559 p = collend;
2560 break;
2561 case 4: /* xmlcharrefreplace */
2562 respos = str-PyString_AS_STRING(res);
2563 /* determine replacement size (temporarily (mis)uses p) */
2564 for (p = collstart, repsize = 0; p < collend; ++p) {
2565 if (*p<10)
2566 repsize += 2+1+1;
2567 else if (*p<100)
2568 repsize += 2+2+1;
2569 else if (*p<1000)
2570 repsize += 2+3+1;
2571 else if (*p<10000)
2572 repsize += 2+4+1;
2573 #ifndef Py_UNICODE_WIDE
2574 else
2575 repsize += 2+5+1;
2576 #else
2577 else if (*p<100000)
2578 repsize += 2+5+1;
2579 else if (*p<1000000)
2580 repsize += 2+6+1;
2581 else
2582 repsize += 2+7+1;
2583 #endif
2585 requiredsize = respos+repsize+(endp-collend);
2586 if (requiredsize > ressize) {
2587 if (requiredsize<2*ressize)
2588 requiredsize = 2*ressize;
2589 if (_PyString_Resize(&res, requiredsize))
2590 goto onError;
2591 str = PyString_AS_STRING(res) + respos;
2592 ressize = requiredsize;
2594 /* generate replacement (temporarily (mis)uses p) */
2595 for (p = collstart; p < collend; ++p) {
2596 str += sprintf(str, "&#%d;", (int)*p);
2598 p = collend;
2599 break;
2600 default:
2601 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2602 encoding, reason, startp, size, &exc,
2603 collstart-startp, collend-startp, &newpos);
2604 if (repunicode == NULL)
2605 goto onError;
2606 /* need more space? (at least enough for what we
2607 have+the replacement+the rest of the string, so
2608 we won't have to check space for encodable characters) */
2609 respos = str-PyString_AS_STRING(res);
2610 repsize = PyUnicode_GET_SIZE(repunicode);
2611 requiredsize = respos+repsize+(endp-collend);
2612 if (requiredsize > ressize) {
2613 if (requiredsize<2*ressize)
2614 requiredsize = 2*ressize;
2615 if (_PyString_Resize(&res, requiredsize)) {
2616 Py_DECREF(repunicode);
2617 goto onError;
2619 str = PyString_AS_STRING(res) + respos;
2620 ressize = requiredsize;
2622 /* check if there is anything unencodable in the replacement
2623 and copy it to the output */
2624 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2625 c = *uni2;
2626 if (c >= limit) {
2627 raise_encode_exception(&exc, encoding, startp, size,
2628 unicodepos, unicodepos+1, reason);
2629 Py_DECREF(repunicode);
2630 goto onError;
2632 *str = (char)c;
2634 p = startp + newpos;
2635 Py_DECREF(repunicode);
2639 /* Resize if we allocated to much */
2640 respos = str-PyString_AS_STRING(res);
2641 if (respos<ressize)
2642 /* If this falls res will be NULL */
2643 _PyString_Resize(&res, respos);
2644 Py_XDECREF(errorHandler);
2645 Py_XDECREF(exc);
2646 return res;
2648 onError:
2649 Py_XDECREF(res);
2650 Py_XDECREF(errorHandler);
2651 Py_XDECREF(exc);
2652 return NULL;
2655 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2656 int size,
2657 const char *errors)
2659 return unicode_encode_ucs1(p, size, errors, 256);
2662 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2664 if (!PyUnicode_Check(unicode)) {
2665 PyErr_BadArgument();
2666 return NULL;
2668 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2669 PyUnicode_GET_SIZE(unicode),
2670 NULL);
2673 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2675 PyObject *PyUnicode_DecodeASCII(const char *s,
2676 int size,
2677 const char *errors)
2679 const char *starts = s;
2680 PyUnicodeObject *v;
2681 Py_UNICODE *p;
2682 int startinpos;
2683 int endinpos;
2684 int outpos;
2685 const char *e;
2686 PyObject *errorHandler = NULL;
2687 PyObject *exc = NULL;
2689 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2690 if (size == 1 && *(unsigned char*)s < 128) {
2691 Py_UNICODE r = *(unsigned char*)s;
2692 return PyUnicode_FromUnicode(&r, 1);
2695 v = _PyUnicode_New(size);
2696 if (v == NULL)
2697 goto onError;
2698 if (size == 0)
2699 return (PyObject *)v;
2700 p = PyUnicode_AS_UNICODE(v);
2701 e = s + size;
2702 while (s < e) {
2703 register unsigned char c = (unsigned char)*s;
2704 if (c < 128) {
2705 *p++ = c;
2706 ++s;
2708 else {
2709 startinpos = s-starts;
2710 endinpos = startinpos + 1;
2711 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2712 if (unicode_decode_call_errorhandler(
2713 errors, &errorHandler,
2714 "ascii", "ordinal not in range(128)",
2715 starts, size, &startinpos, &endinpos, &exc, &s,
2716 (PyObject **)&v, &outpos, &p))
2717 goto onError;
2720 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2721 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2722 goto onError;
2723 Py_XDECREF(errorHandler);
2724 Py_XDECREF(exc);
2725 return (PyObject *)v;
2727 onError:
2728 Py_XDECREF(v);
2729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
2731 return NULL;
2734 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2735 int size,
2736 const char *errors)
2738 return unicode_encode_ucs1(p, size, errors, 128);
2741 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2743 if (!PyUnicode_Check(unicode)) {
2744 PyErr_BadArgument();
2745 return NULL;
2747 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2748 PyUnicode_GET_SIZE(unicode),
2749 NULL);
2752 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2754 /* --- MBCS codecs for Windows -------------------------------------------- */
2756 PyObject *PyUnicode_DecodeMBCS(const char *s,
2757 int size,
2758 const char *errors)
2760 PyUnicodeObject *v;
2761 Py_UNICODE *p;
2763 /* First get the size of the result */
2764 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2765 if (size > 0 && usize==0)
2766 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2768 v = _PyUnicode_New(usize);
2769 if (v == NULL)
2770 return NULL;
2771 if (usize == 0)
2772 return (PyObject *)v;
2773 p = PyUnicode_AS_UNICODE(v);
2774 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2775 Py_DECREF(v);
2776 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2779 return (PyObject *)v;
2782 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2783 int size,
2784 const char *errors)
2786 PyObject *repr;
2787 char *s;
2788 DWORD mbcssize;
2790 /* If there are no characters, bail now! */
2791 if (size==0)
2792 return PyString_FromString("");
2794 /* First get the size of the result */
2795 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2796 if (mbcssize==0)
2797 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2799 repr = PyString_FromStringAndSize(NULL, mbcssize);
2800 if (repr == NULL)
2801 return NULL;
2802 if (mbcssize == 0)
2803 return repr;
2805 /* Do the conversion */
2806 s = PyString_AS_STRING(repr);
2807 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2808 Py_DECREF(repr);
2809 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2811 return repr;
2814 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2816 if (!PyUnicode_Check(unicode)) {
2817 PyErr_BadArgument();
2818 return NULL;
2820 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2825 #endif /* MS_WINDOWS */
2827 /* --- Character Mapping Codec -------------------------------------------- */
2829 PyObject *PyUnicode_DecodeCharmap(const char *s,
2830 int size,
2831 PyObject *mapping,
2832 const char *errors)
2834 const char *starts = s;
2835 int startinpos;
2836 int endinpos;
2837 int outpos;
2838 const char *e;
2839 PyUnicodeObject *v;
2840 Py_UNICODE *p;
2841 int extrachars = 0;
2842 PyObject *errorHandler = NULL;
2843 PyObject *exc = NULL;
2844 Py_UNICODE *mapstring = NULL;
2845 int maplen = 0;
2847 /* Default to Latin-1 */
2848 if (mapping == NULL)
2849 return PyUnicode_DecodeLatin1(s, size, errors);
2851 v = _PyUnicode_New(size);
2852 if (v == NULL)
2853 goto onError;
2854 if (size == 0)
2855 return (PyObject *)v;
2856 p = PyUnicode_AS_UNICODE(v);
2857 e = s + size;
2858 if (PyUnicode_CheckExact(mapping)) {
2859 mapstring = PyUnicode_AS_UNICODE(mapping);
2860 maplen = PyUnicode_GET_SIZE(mapping);
2861 while (s < e) {
2862 unsigned char ch = *s;
2863 Py_UNICODE x = 0xfffe; /* illegal value */
2865 if (ch < maplen)
2866 x = mapstring[ch];
2868 if (x == 0xfffe) {
2869 /* undefined mapping */
2870 outpos = p-PyUnicode_AS_UNICODE(v);
2871 startinpos = s-starts;
2872 endinpos = startinpos+1;
2873 if (unicode_decode_call_errorhandler(
2874 errors, &errorHandler,
2875 "charmap", "character maps to <undefined>",
2876 starts, size, &startinpos, &endinpos, &exc, &s,
2877 (PyObject **)&v, &outpos, &p)) {
2878 goto onError;
2880 continue;
2882 *p++ = x;
2883 ++s;
2886 else {
2887 while (s < e) {
2888 unsigned char ch = *s;
2889 PyObject *w, *x;
2891 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2892 w = PyInt_FromLong((long)ch);
2893 if (w == NULL)
2894 goto onError;
2895 x = PyObject_GetItem(mapping, w);
2896 Py_DECREF(w);
2897 if (x == NULL) {
2898 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2899 /* No mapping found means: mapping is undefined. */
2900 PyErr_Clear();
2901 x = Py_None;
2902 Py_INCREF(x);
2903 } else
2904 goto onError;
2907 /* Apply mapping */
2908 if (PyInt_Check(x)) {
2909 long value = PyInt_AS_LONG(x);
2910 if (value < 0 || value > 65535) {
2911 PyErr_SetString(PyExc_TypeError,
2912 "character mapping must be in range(65536)");
2913 Py_DECREF(x);
2914 goto onError;
2916 *p++ = (Py_UNICODE)value;
2918 else if (x == Py_None) {
2919 /* undefined mapping */
2920 outpos = p-PyUnicode_AS_UNICODE(v);
2921 startinpos = s-starts;
2922 endinpos = startinpos+1;
2923 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler,
2925 "charmap", "character maps to <undefined>",
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 (PyObject **)&v, &outpos, &p)) {
2928 Py_DECREF(x);
2929 goto onError;
2931 continue;
2933 else if (PyUnicode_Check(x)) {
2934 int targetsize = PyUnicode_GET_SIZE(x);
2936 if (targetsize == 1)
2937 /* 1-1 mapping */
2938 *p++ = *PyUnicode_AS_UNICODE(x);
2940 else if (targetsize > 1) {
2941 /* 1-n mapping */
2942 if (targetsize > extrachars) {
2943 /* resize first */
2944 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2945 int needed = (targetsize - extrachars) + \
2946 (targetsize << 2);
2947 extrachars += needed;
2948 if (_PyUnicode_Resize(&v,
2949 PyUnicode_GET_SIZE(v) + needed) < 0) {
2950 Py_DECREF(x);
2951 goto onError;
2953 p = PyUnicode_AS_UNICODE(v) + oldpos;
2955 Py_UNICODE_COPY(p,
2956 PyUnicode_AS_UNICODE(x),
2957 targetsize);
2958 p += targetsize;
2959 extrachars -= targetsize;
2961 /* 1-0 mapping: skip the character */
2963 else {
2964 /* wrong return value */
2965 PyErr_SetString(PyExc_TypeError,
2966 "character mapping must return integer, None or unicode");
2967 Py_DECREF(x);
2968 goto onError;
2970 Py_DECREF(x);
2971 ++s;
2974 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2975 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2976 goto onError;
2977 Py_XDECREF(errorHandler);
2978 Py_XDECREF(exc);
2979 return (PyObject *)v;
2981 onError:
2982 Py_XDECREF(errorHandler);
2983 Py_XDECREF(exc);
2984 Py_XDECREF(v);
2985 return NULL;
2988 /* Lookup the character ch in the mapping. If the character
2989 can't be found, Py_None is returned (or NULL, if another
2990 error occurred). */
2991 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2993 PyObject *w = PyInt_FromLong((long)c);
2994 PyObject *x;
2996 if (w == NULL)
2997 return NULL;
2998 x = PyObject_GetItem(mapping, w);
2999 Py_DECREF(w);
3000 if (x == NULL) {
3001 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3002 /* No mapping found means: mapping is undefined. */
3003 PyErr_Clear();
3004 x = Py_None;
3005 Py_INCREF(x);
3006 return x;
3007 } else
3008 return NULL;
3010 else if (x == Py_None)
3011 return x;
3012 else if (PyInt_Check(x)) {
3013 long value = PyInt_AS_LONG(x);
3014 if (value < 0 || value > 255) {
3015 PyErr_SetString(PyExc_TypeError,
3016 "character mapping must be in range(256)");
3017 Py_DECREF(x);
3018 return NULL;
3020 return x;
3022 else if (PyString_Check(x))
3023 return x;
3024 else {
3025 /* wrong return value */
3026 PyErr_SetString(PyExc_TypeError,
3027 "character mapping must return integer, None or str");
3028 Py_DECREF(x);
3029 return NULL;
3033 /* lookup the character, put the result in the output string and adjust
3034 various state variables. Reallocate the output string if not enough
3035 space is available. Return a new reference to the object that
3036 was put in the output buffer, or Py_None, if the mapping was undefined
3037 (in which case no character was written) or NULL, if a
3038 reallocation error ocurred. The called must decref the result */
3039 static
3040 PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
3041 PyObject **outobj, int *outpos)
3043 PyObject *rep = charmapencode_lookup(c, mapping);
3045 if (rep==NULL)
3046 return NULL;
3047 else if (rep==Py_None)
3048 return rep;
3049 else {
3050 char *outstart = PyString_AS_STRING(*outobj);
3051 int outsize = PyString_GET_SIZE(*outobj);
3052 if (PyInt_Check(rep)) {
3053 int requiredsize = *outpos+1;
3054 if (outsize<requiredsize) {
3055 /* exponentially overallocate to minimize reallocations */
3056 if (requiredsize < 2*outsize)
3057 requiredsize = 2*outsize;
3058 if (_PyString_Resize(outobj, requiredsize)) {
3059 Py_DECREF(rep);
3060 return NULL;
3062 outstart = PyString_AS_STRING(*outobj);
3064 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3066 else {
3067 const char *repchars = PyString_AS_STRING(rep);
3068 int repsize = PyString_GET_SIZE(rep);
3069 int requiredsize = *outpos+repsize;
3070 if (outsize<requiredsize) {
3071 /* exponentially overallocate to minimize reallocations */
3072 if (requiredsize < 2*outsize)
3073 requiredsize = 2*outsize;
3074 if (_PyString_Resize(outobj, requiredsize)) {
3075 Py_DECREF(rep);
3076 return NULL;
3078 outstart = PyString_AS_STRING(*outobj);
3080 memcpy(outstart + *outpos, repchars, repsize);
3081 *outpos += repsize;
3084 return rep;
3087 /* handle an error in PyUnicode_EncodeCharmap
3088 Return 0 on success, -1 on error */
3089 static
3090 int charmap_encoding_error(
3091 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
3092 PyObject **exceptionObject,
3093 int *known_errorHandler, PyObject **errorHandler, const char *errors,
3094 PyObject **res, int *respos)
3096 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3097 int repsize;
3098 int newpos;
3099 Py_UNICODE *uni2;
3100 /* startpos for collecting unencodable chars */
3101 int collstartpos = *inpos;
3102 int collendpos = *inpos+1;
3103 int collpos;
3104 char *encoding = "charmap";
3105 char *reason = "character maps to <undefined>";
3107 PyObject *x;
3108 /* find all unencodable characters */
3109 while (collendpos < size) {
3110 x = charmapencode_lookup(p[collendpos], mapping);
3111 if (x==NULL)
3112 return -1;
3113 else if (x!=Py_None) {
3114 Py_DECREF(x);
3115 break;
3117 Py_DECREF(x);
3118 ++collendpos;
3120 /* cache callback name lookup
3121 * (if not done yet, i.e. it's the first error) */
3122 if (*known_errorHandler==-1) {
3123 if ((errors==NULL) || (!strcmp(errors, "strict")))
3124 *known_errorHandler = 1;
3125 else if (!strcmp(errors, "replace"))
3126 *known_errorHandler = 2;
3127 else if (!strcmp(errors, "ignore"))
3128 *known_errorHandler = 3;
3129 else if (!strcmp(errors, "xmlcharrefreplace"))
3130 *known_errorHandler = 4;
3131 else
3132 *known_errorHandler = 0;
3134 switch (*known_errorHandler) {
3135 case 1: /* strict */
3136 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3137 return -1;
3138 case 2: /* replace */
3139 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3140 x = charmapencode_output('?', mapping, res, respos);
3141 if (x==NULL) {
3142 return -1;
3144 else if (x==Py_None) {
3145 Py_DECREF(x);
3146 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3147 return -1;
3149 Py_DECREF(x);
3151 /* fall through */
3152 case 3: /* ignore */
3153 *inpos = collendpos;
3154 break;
3155 case 4: /* xmlcharrefreplace */
3156 /* generate replacement (temporarily (mis)uses p) */
3157 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3158 char buffer[2+29+1+1];
3159 char *cp;
3160 sprintf(buffer, "&#%d;", (int)p[collpos]);
3161 for (cp = buffer; *cp; ++cp) {
3162 x = charmapencode_output(*cp, mapping, res, respos);
3163 if (x==NULL)
3164 return -1;
3165 else if (x==Py_None) {
3166 Py_DECREF(x);
3167 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3168 return -1;
3170 Py_DECREF(x);
3173 *inpos = collendpos;
3174 break;
3175 default:
3176 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3177 encoding, reason, p, size, exceptionObject,
3178 collstartpos, collendpos, &newpos);
3179 if (repunicode == NULL)
3180 return -1;
3181 /* generate replacement */
3182 repsize = PyUnicode_GET_SIZE(repunicode);
3183 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3184 x = charmapencode_output(*uni2, mapping, res, respos);
3185 if (x==NULL) {
3186 Py_DECREF(repunicode);
3187 return -1;
3189 else if (x==Py_None) {
3190 Py_DECREF(repunicode);
3191 Py_DECREF(x);
3192 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3193 return -1;
3195 Py_DECREF(x);
3197 *inpos = newpos;
3198 Py_DECREF(repunicode);
3200 return 0;
3203 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3204 int size,
3205 PyObject *mapping,
3206 const char *errors)
3208 /* output object */
3209 PyObject *res = NULL;
3210 /* current input position */
3211 int inpos = 0;
3212 /* current output position */
3213 int respos = 0;
3214 PyObject *errorHandler = NULL;
3215 PyObject *exc = NULL;
3216 /* the following variable is used for caching string comparisons
3217 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3218 * 3=ignore, 4=xmlcharrefreplace */
3219 int known_errorHandler = -1;
3221 /* Default to Latin-1 */
3222 if (mapping == NULL)
3223 return PyUnicode_EncodeLatin1(p, size, errors);
3225 /* allocate enough for a simple encoding without
3226 replacements, if we need more, we'll resize */
3227 res = PyString_FromStringAndSize(NULL, size);
3228 if (res == NULL)
3229 goto onError;
3230 if (size == 0)
3231 return res;
3233 while (inpos<size) {
3234 /* try to encode it */
3235 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3236 if (x==NULL) /* error */
3237 goto onError;
3238 if (x==Py_None) { /* unencodable character */
3239 if (charmap_encoding_error(p, size, &inpos, mapping,
3240 &exc,
3241 &known_errorHandler, &errorHandler, errors,
3242 &res, &respos)) {
3243 Py_DECREF(x);
3244 goto onError;
3247 else
3248 /* done with this character => adjust input position */
3249 ++inpos;
3250 Py_DECREF(x);
3253 /* Resize if we allocated to much */
3254 if (respos<PyString_GET_SIZE(res)) {
3255 if (_PyString_Resize(&res, respos))
3256 goto onError;
3258 Py_XDECREF(exc);
3259 Py_XDECREF(errorHandler);
3260 return res;
3262 onError:
3263 Py_XDECREF(res);
3264 Py_XDECREF(exc);
3265 Py_XDECREF(errorHandler);
3266 return NULL;
3269 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3270 PyObject *mapping)
3272 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3273 PyErr_BadArgument();
3274 return NULL;
3276 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3277 PyUnicode_GET_SIZE(unicode),
3278 mapping,
3279 NULL);
3282 /* create or adjust a UnicodeTranslateError */
3283 static void make_translate_exception(PyObject **exceptionObject,
3284 const Py_UNICODE *unicode, int size,
3285 int startpos, int endpos,
3286 const char *reason)
3288 if (*exceptionObject == NULL) {
3289 *exceptionObject = PyUnicodeTranslateError_Create(
3290 unicode, size, startpos, endpos, reason);
3292 else {
3293 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3294 goto onError;
3295 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3296 goto onError;
3297 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3298 goto onError;
3299 return;
3300 onError:
3301 Py_DECREF(*exceptionObject);
3302 *exceptionObject = NULL;
3306 /* raises a UnicodeTranslateError */
3307 static void raise_translate_exception(PyObject **exceptionObject,
3308 const Py_UNICODE *unicode, int size,
3309 int startpos, int endpos,
3310 const char *reason)
3312 make_translate_exception(exceptionObject,
3313 unicode, size, startpos, endpos, reason);
3314 if (*exceptionObject != NULL)
3315 PyCodec_StrictErrors(*exceptionObject);
3318 /* error handling callback helper:
3319 build arguments, call the callback and check the arguments,
3320 put the result into newpos and return the replacement string, which
3321 has to be freed by the caller */
3322 static PyObject *unicode_translate_call_errorhandler(const char *errors,
3323 PyObject **errorHandler,
3324 const char *reason,
3325 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3326 int startpos, int endpos,
3327 int *newpos)
3329 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3331 PyObject *restuple;
3332 PyObject *resunicode;
3334 if (*errorHandler == NULL) {
3335 *errorHandler = PyCodec_LookupError(errors);
3336 if (*errorHandler == NULL)
3337 return NULL;
3340 make_translate_exception(exceptionObject,
3341 unicode, size, startpos, endpos, reason);
3342 if (*exceptionObject == NULL)
3343 return NULL;
3345 restuple = PyObject_CallFunctionObjArgs(
3346 *errorHandler, *exceptionObject, NULL);
3347 if (restuple == NULL)
3348 return NULL;
3349 if (!PyTuple_Check(restuple)) {
3350 PyErr_Format(PyExc_TypeError, &argparse[4]);
3351 Py_DECREF(restuple);
3352 return NULL;
3354 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3355 &resunicode, newpos)) {
3356 Py_DECREF(restuple);
3357 return NULL;
3359 if (*newpos<0)
3360 *newpos = size+*newpos;
3361 if (*newpos<0 || *newpos>size) {
3362 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3363 Py_DECREF(restuple);
3364 return NULL;
3366 Py_INCREF(resunicode);
3367 Py_DECREF(restuple);
3368 return resunicode;
3371 /* Lookup the character ch in the mapping and put the result in result,
3372 which must be decrefed by the caller.
3373 Return 0 on success, -1 on error */
3374 static
3375 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3377 PyObject *w = PyInt_FromLong((long)c);
3378 PyObject *x;
3380 if (w == NULL)
3381 return -1;
3382 x = PyObject_GetItem(mapping, w);
3383 Py_DECREF(w);
3384 if (x == NULL) {
3385 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3386 /* No mapping found means: use 1:1 mapping. */
3387 PyErr_Clear();
3388 *result = NULL;
3389 return 0;
3390 } else
3391 return -1;
3393 else if (x == Py_None) {
3394 *result = x;
3395 return 0;
3397 else if (PyInt_Check(x)) {
3398 long value = PyInt_AS_LONG(x);
3399 long max = PyUnicode_GetMax();
3400 if (value < 0 || value > max) {
3401 PyErr_Format(PyExc_TypeError,
3402 "character mapping must be in range(0x%lx)", max+1);
3403 Py_DECREF(x);
3404 return -1;
3406 *result = x;
3407 return 0;
3409 else if (PyUnicode_Check(x)) {
3410 *result = x;
3411 return 0;
3413 else {
3414 /* wrong return value */
3415 PyErr_SetString(PyExc_TypeError,
3416 "character mapping must return integer, None or unicode");
3417 Py_DECREF(x);
3418 return -1;
3421 /* ensure that *outobj is at least requiredsize characters long,
3422 if not reallocate and adjust various state variables.
3423 Return 0 on success, -1 on error */
3424 static
3425 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3426 int requiredsize)
3428 int oldsize = PyUnicode_GET_SIZE(*outobj);
3429 if (requiredsize > oldsize) {
3430 /* remember old output position */
3431 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3432 /* exponentially overallocate to minimize reallocations */
3433 if (requiredsize < 2 * oldsize)
3434 requiredsize = 2 * oldsize;
3435 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3436 return -1;
3437 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3439 return 0;
3441 /* lookup the character, put the result in the output string and adjust
3442 various state variables. Return a new reference to the object that
3443 was put in the output buffer in *result, or Py_None, if the mapping was
3444 undefined (in which case no character was written).
3445 The called must decref result.
3446 Return 0 on success, -1 on error. */
3447 static
3448 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3449 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3450 PyObject **res)
3452 if (charmaptranslate_lookup(*curinp, mapping, res))
3453 return -1;
3454 if (*res==NULL) {
3455 /* not found => default to 1:1 mapping */
3456 *(*outp)++ = *curinp;
3458 else if (*res==Py_None)
3460 else if (PyInt_Check(*res)) {
3461 /* no overflow check, because we know that the space is enough */
3462 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3464 else if (PyUnicode_Check(*res)) {
3465 int repsize = PyUnicode_GET_SIZE(*res);
3466 if (repsize==1) {
3467 /* no overflow check, because we know that the space is enough */
3468 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3470 else if (repsize!=0) {
3471 /* more than one character */
3472 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3473 (insize - (curinp-startinp)) +
3474 repsize - 1;
3475 if (charmaptranslate_makespace(outobj, outp, requiredsize))
3476 return -1;
3477 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3478 *outp += repsize;
3481 else
3482 return -1;
3483 return 0;
3486 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3487 int size,
3488 PyObject *mapping,
3489 const char *errors)
3491 /* output object */
3492 PyObject *res = NULL;
3493 /* pointers to the beginning and end+1 of input */
3494 const Py_UNICODE *startp = p;
3495 const Py_UNICODE *endp = p + size;
3496 /* pointer into the output */
3497 Py_UNICODE *str;
3498 /* current output position */
3499 int respos = 0;
3500 char *reason = "character maps to <undefined>";
3501 PyObject *errorHandler = NULL;
3502 PyObject *exc = NULL;
3503 /* the following variable is used for caching string comparisons
3504 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3505 * 3=ignore, 4=xmlcharrefreplace */
3506 int known_errorHandler = -1;
3508 if (mapping == NULL) {
3509 PyErr_BadArgument();
3510 return NULL;
3513 /* allocate enough for a simple 1:1 translation without
3514 replacements, if we need more, we'll resize */
3515 res = PyUnicode_FromUnicode(NULL, size);
3516 if (res == NULL)
3517 goto onError;
3518 if (size == 0)
3519 return res;
3520 str = PyUnicode_AS_UNICODE(res);
3522 while (p<endp) {
3523 /* try to encode it */
3524 PyObject *x = NULL;
3525 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3526 Py_XDECREF(x);
3527 goto onError;
3529 Py_XDECREF(x);
3530 if (x!=Py_None) /* it worked => adjust input pointer */
3531 ++p;
3532 else { /* untranslatable character */
3533 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3534 int repsize;
3535 int newpos;
3536 Py_UNICODE *uni2;
3537 /* startpos for collecting untranslatable chars */
3538 const Py_UNICODE *collstart = p;
3539 const Py_UNICODE *collend = p+1;
3540 const Py_UNICODE *coll;
3542 /* find all untranslatable characters */
3543 while (collend < endp) {
3544 if (charmaptranslate_lookup(*collend, mapping, &x))
3545 goto onError;
3546 Py_XDECREF(x);
3547 if (x!=Py_None)
3548 break;
3549 ++collend;
3551 /* cache callback name lookup
3552 * (if not done yet, i.e. it's the first error) */
3553 if (known_errorHandler==-1) {
3554 if ((errors==NULL) || (!strcmp(errors, "strict")))
3555 known_errorHandler = 1;
3556 else if (!strcmp(errors, "replace"))
3557 known_errorHandler = 2;
3558 else if (!strcmp(errors, "ignore"))
3559 known_errorHandler = 3;
3560 else if (!strcmp(errors, "xmlcharrefreplace"))
3561 known_errorHandler = 4;
3562 else
3563 known_errorHandler = 0;
3565 switch (known_errorHandler) {
3566 case 1: /* strict */
3567 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3568 goto onError;
3569 case 2: /* replace */
3570 /* No need to check for space, this is a 1:1 replacement */
3571 for (coll = collstart; coll<collend; ++coll)
3572 *str++ = '?';
3573 /* fall through */
3574 case 3: /* ignore */
3575 p = collend;
3576 break;
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p) {
3580 char buffer[2+29+1+1];
3581 char *cp;
3582 sprintf(buffer, "&#%d;", (int)*p);
3583 if (charmaptranslate_makespace(&res, &str,
3584 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3585 goto onError;
3586 for (cp = buffer; *cp; ++cp)
3587 *str++ = *cp;
3589 p = collend;
3590 break;
3591 default:
3592 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3593 reason, startp, size, &exc,
3594 collstart-startp, collend-startp, &newpos);
3595 if (repunicode == NULL)
3596 goto onError;
3597 /* generate replacement */
3598 repsize = PyUnicode_GET_SIZE(repunicode);
3599 if (charmaptranslate_makespace(&res, &str,
3600 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3601 Py_DECREF(repunicode);
3602 goto onError;
3604 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3605 *str++ = *uni2;
3606 p = startp + newpos;
3607 Py_DECREF(repunicode);
3611 /* Resize if we allocated to much */
3612 respos = str-PyUnicode_AS_UNICODE(res);
3613 if (respos<PyUnicode_GET_SIZE(res)) {
3614 if (_PyUnicode_Resize(&res, respos) < 0)
3615 goto onError;
3617 Py_XDECREF(exc);
3618 Py_XDECREF(errorHandler);
3619 return res;
3621 onError:
3622 Py_XDECREF(res);
3623 Py_XDECREF(exc);
3624 Py_XDECREF(errorHandler);
3625 return NULL;
3628 PyObject *PyUnicode_Translate(PyObject *str,
3629 PyObject *mapping,
3630 const char *errors)
3632 PyObject *result;
3634 str = PyUnicode_FromObject(str);
3635 if (str == NULL)
3636 goto onError;
3637 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3638 PyUnicode_GET_SIZE(str),
3639 mapping,
3640 errors);
3641 Py_DECREF(str);
3642 return result;
3644 onError:
3645 Py_XDECREF(str);
3646 return NULL;
3649 /* --- Decimal Encoder ---------------------------------------------------- */
3651 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3652 int length,
3653 char *output,
3654 const char *errors)
3656 Py_UNICODE *p, *end;
3657 PyObject *errorHandler = NULL;
3658 PyObject *exc = NULL;
3659 const char *encoding = "decimal";
3660 const char *reason = "invalid decimal Unicode string";
3661 /* the following variable is used for caching string comparisons
3662 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3663 int known_errorHandler = -1;
3665 if (output == NULL) {
3666 PyErr_BadArgument();
3667 return -1;
3670 p = s;
3671 end = s + length;
3672 while (p < end) {
3673 register Py_UNICODE ch = *p;
3674 int decimal;
3675 PyObject *repunicode;
3676 int repsize;
3677 int newpos;
3678 Py_UNICODE *uni2;
3679 Py_UNICODE *collstart;
3680 Py_UNICODE *collend;
3682 if (Py_UNICODE_ISSPACE(ch)) {
3683 *output++ = ' ';
3684 ++p;
3685 continue;
3687 decimal = Py_UNICODE_TODECIMAL(ch);
3688 if (decimal >= 0) {
3689 *output++ = '0' + decimal;
3690 ++p;
3691 continue;
3693 if (0 < ch && ch < 256) {
3694 *output++ = (char)ch;
3695 ++p;
3696 continue;
3698 /* All other characters are considered unencodable */
3699 collstart = p;
3700 collend = p+1;
3701 while (collend < end) {
3702 if ((0 < *collend && *collend < 256) ||
3703 !Py_UNICODE_ISSPACE(*collend) ||
3704 Py_UNICODE_TODECIMAL(*collend))
3705 break;
3707 /* cache callback name lookup
3708 * (if not done yet, i.e. it's the first error) */
3709 if (known_errorHandler==-1) {
3710 if ((errors==NULL) || (!strcmp(errors, "strict")))
3711 known_errorHandler = 1;
3712 else if (!strcmp(errors, "replace"))
3713 known_errorHandler = 2;
3714 else if (!strcmp(errors, "ignore"))
3715 known_errorHandler = 3;
3716 else if (!strcmp(errors, "xmlcharrefreplace"))
3717 known_errorHandler = 4;
3718 else
3719 known_errorHandler = 0;
3721 switch (known_errorHandler) {
3722 case 1: /* strict */
3723 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3724 goto onError;
3725 case 2: /* replace */
3726 for (p = collstart; p < collend; ++p)
3727 *output++ = '?';
3728 /* fall through */
3729 case 3: /* ignore */
3730 p = collend;
3731 break;
3732 case 4: /* xmlcharrefreplace */
3733 /* generate replacement (temporarily (mis)uses p) */
3734 for (p = collstart; p < collend; ++p)
3735 output += sprintf(output, "&#%d;", (int)*p);
3736 p = collend;
3737 break;
3738 default:
3739 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3740 encoding, reason, s, length, &exc,
3741 collstart-s, collend-s, &newpos);
3742 if (repunicode == NULL)
3743 goto onError;
3744 /* generate replacement */
3745 repsize = PyUnicode_GET_SIZE(repunicode);
3746 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3747 Py_UNICODE ch = *uni2;
3748 if (Py_UNICODE_ISSPACE(ch))
3749 *output++ = ' ';
3750 else {
3751 decimal = Py_UNICODE_TODECIMAL(ch);
3752 if (decimal >= 0)
3753 *output++ = '0' + decimal;
3754 else if (0 < ch && ch < 256)
3755 *output++ = (char)ch;
3756 else {
3757 Py_DECREF(repunicode);
3758 raise_encode_exception(&exc, encoding,
3759 s, length, collstart-s, collend-s, reason);
3760 goto onError;
3764 p = s + newpos;
3765 Py_DECREF(repunicode);
3768 /* 0-terminate the output string */
3769 *output++ = '\0';
3770 Py_XDECREF(exc);
3771 Py_XDECREF(errorHandler);
3772 return 0;
3774 onError:
3775 Py_XDECREF(exc);
3776 Py_XDECREF(errorHandler);
3777 return -1;
3780 /* --- Helpers ------------------------------------------------------------ */
3782 static
3783 int count(PyUnicodeObject *self,
3784 int start,
3785 int end,
3786 PyUnicodeObject *substring)
3788 int count = 0;
3790 if (start < 0)
3791 start += self->length;
3792 if (start < 0)
3793 start = 0;
3794 if (end > self->length)
3795 end = self->length;
3796 if (end < 0)
3797 end += self->length;
3798 if (end < 0)
3799 end = 0;
3801 if (substring->length == 0)
3802 return (end - start + 1);
3804 end -= substring->length;
3806 while (start <= end)
3807 if (Py_UNICODE_MATCH(self, start, substring)) {
3808 count++;
3809 start += substring->length;
3810 } else
3811 start++;
3813 return count;
3816 int PyUnicode_Count(PyObject *str,
3817 PyObject *substr,
3818 int start,
3819 int end)
3821 int result;
3823 str = PyUnicode_FromObject(str);
3824 if (str == NULL)
3825 return -1;
3826 substr = PyUnicode_FromObject(substr);
3827 if (substr == NULL) {
3828 Py_DECREF(str);
3829 return -1;
3832 result = count((PyUnicodeObject *)str,
3833 start, end,
3834 (PyUnicodeObject *)substr);
3836 Py_DECREF(str);
3837 Py_DECREF(substr);
3838 return result;
3841 static
3842 int findstring(PyUnicodeObject *self,
3843 PyUnicodeObject *substring,
3844 int start,
3845 int end,
3846 int direction)
3848 if (start < 0)
3849 start += self->length;
3850 if (start < 0)
3851 start = 0;
3853 if (end > self->length)
3854 end = self->length;
3855 if (end < 0)
3856 end += self->length;
3857 if (end < 0)
3858 end = 0;
3860 if (substring->length == 0)
3861 return (direction > 0) ? start : end;
3863 end -= substring->length;
3865 if (direction < 0) {
3866 for (; end >= start; end--)
3867 if (Py_UNICODE_MATCH(self, end, substring))
3868 return end;
3869 } else {
3870 for (; start <= end; start++)
3871 if (Py_UNICODE_MATCH(self, start, substring))
3872 return start;
3875 return -1;
3878 int PyUnicode_Find(PyObject *str,
3879 PyObject *substr,
3880 int start,
3881 int end,
3882 int direction)
3884 int result;
3886 str = PyUnicode_FromObject(str);
3887 if (str == NULL)
3888 return -2;
3889 substr = PyUnicode_FromObject(substr);
3890 if (substr == NULL) {
3891 Py_DECREF(str);
3892 return -2;
3895 result = findstring((PyUnicodeObject *)str,
3896 (PyUnicodeObject *)substr,
3897 start, end, direction);
3898 Py_DECREF(str);
3899 Py_DECREF(substr);
3900 return result;
3903 static
3904 int tailmatch(PyUnicodeObject *self,
3905 PyUnicodeObject *substring,
3906 int start,
3907 int end,
3908 int direction)
3910 if (start < 0)
3911 start += self->length;
3912 if (start < 0)
3913 start = 0;
3915 if (substring->length == 0)
3916 return 1;
3918 if (end > self->length)
3919 end = self->length;
3920 if (end < 0)
3921 end += self->length;
3922 if (end < 0)
3923 end = 0;
3925 end -= substring->length;
3926 if (end < start)
3927 return 0;
3929 if (direction > 0) {
3930 if (Py_UNICODE_MATCH(self, end, substring))
3931 return 1;
3932 } else {
3933 if (Py_UNICODE_MATCH(self, start, substring))
3934 return 1;
3937 return 0;
3940 int PyUnicode_Tailmatch(PyObject *str,
3941 PyObject *substr,
3942 int start,
3943 int end,
3944 int direction)
3946 int result;
3948 str = PyUnicode_FromObject(str);
3949 if (str == NULL)
3950 return -1;
3951 substr = PyUnicode_FromObject(substr);
3952 if (substr == NULL) {
3953 Py_DECREF(substr);
3954 return -1;
3957 result = tailmatch((PyUnicodeObject *)str,
3958 (PyUnicodeObject *)substr,
3959 start, end, direction);
3960 Py_DECREF(str);
3961 Py_DECREF(substr);
3962 return result;
3965 static
3966 const Py_UNICODE *findchar(const Py_UNICODE *s,
3967 int size,
3968 Py_UNICODE ch)
3970 /* like wcschr, but doesn't stop at NULL characters */
3972 while (size-- > 0) {
3973 if (*s == ch)
3974 return s;
3975 s++;
3978 return NULL;
3981 /* Apply fixfct filter to the Unicode object self and return a
3982 reference to the modified object */
3984 static
3985 PyObject *fixup(PyUnicodeObject *self,
3986 int (*fixfct)(PyUnicodeObject *s))
3989 PyUnicodeObject *u;
3991 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3992 if (u == NULL)
3993 return NULL;
3995 Py_UNICODE_COPY(u->str, self->str, self->length);
3997 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3998 /* fixfct should return TRUE if it modified the buffer. If
3999 FALSE, return a reference to the original buffer instead
4000 (to save space, not time) */
4001 Py_INCREF(self);
4002 Py_DECREF(u);
4003 return (PyObject*) self;
4005 return (PyObject*) u;
4008 static
4009 int fixupper(PyUnicodeObject *self)
4011 int len = self->length;
4012 Py_UNICODE *s = self->str;
4013 int status = 0;
4015 while (len-- > 0) {
4016 register Py_UNICODE ch;
4018 ch = Py_UNICODE_TOUPPER(*s);
4019 if (ch != *s) {
4020 status = 1;
4021 *s = ch;
4023 s++;
4026 return status;
4029 static
4030 int fixlower(PyUnicodeObject *self)
4032 int len = self->length;
4033 Py_UNICODE *s = self->str;
4034 int status = 0;
4036 while (len-- > 0) {
4037 register Py_UNICODE ch;
4039 ch = Py_UNICODE_TOLOWER(*s);
4040 if (ch != *s) {
4041 status = 1;
4042 *s = ch;
4044 s++;
4047 return status;
4050 static
4051 int fixswapcase(PyUnicodeObject *self)
4053 int len = self->length;
4054 Py_UNICODE *s = self->str;
4055 int status = 0;
4057 while (len-- > 0) {
4058 if (Py_UNICODE_ISUPPER(*s)) {
4059 *s = Py_UNICODE_TOLOWER(*s);
4060 status = 1;
4061 } else if (Py_UNICODE_ISLOWER(*s)) {
4062 *s = Py_UNICODE_TOUPPER(*s);
4063 status = 1;
4065 s++;
4068 return status;
4071 static
4072 int fixcapitalize(PyUnicodeObject *self)
4074 int len = self->length;
4075 Py_UNICODE *s = self->str;
4076 int status = 0;
4078 if (len == 0)
4079 return 0;
4080 if (Py_UNICODE_ISLOWER(*s)) {
4081 *s = Py_UNICODE_TOUPPER(*s);
4082 status = 1;
4084 s++;
4085 while (--len > 0) {
4086 if (Py_UNICODE_ISUPPER(*s)) {
4087 *s = Py_UNICODE_TOLOWER(*s);
4088 status = 1;
4090 s++;
4092 return status;
4095 static
4096 int fixtitle(PyUnicodeObject *self)
4098 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4099 register Py_UNICODE *e;
4100 int previous_is_cased;
4102 /* Shortcut for single character strings */
4103 if (PyUnicode_GET_SIZE(self) == 1) {
4104 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4105 if (*p != ch) {
4106 *p = ch;
4107 return 1;
4109 else
4110 return 0;
4113 e = p + PyUnicode_GET_SIZE(self);
4114 previous_is_cased = 0;
4115 for (; p < e; p++) {
4116 register const Py_UNICODE ch = *p;
4118 if (previous_is_cased)
4119 *p = Py_UNICODE_TOLOWER(ch);
4120 else
4121 *p = Py_UNICODE_TOTITLE(ch);
4123 if (Py_UNICODE_ISLOWER(ch) ||
4124 Py_UNICODE_ISUPPER(ch) ||
4125 Py_UNICODE_ISTITLE(ch))
4126 previous_is_cased = 1;
4127 else
4128 previous_is_cased = 0;
4130 return 1;
4133 PyObject *
4134 PyUnicode_Join(PyObject *separator, PyObject *seq)
4136 PyObject *internal_separator = NULL;
4137 const Py_UNICODE blank = ' ';
4138 const Py_UNICODE *sep = &blank;
4139 size_t seplen = 1;
4140 PyUnicodeObject *res = NULL; /* the result */
4141 size_t res_alloc = 100; /* # allocated bytes for string in res */
4142 size_t res_used; /* # used bytes */
4143 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4144 PyObject *fseq; /* PySequence_Fast(seq) */
4145 int seqlen; /* len(fseq) -- number of items in sequence */
4146 PyObject *item;
4147 int i;
4149 fseq = PySequence_Fast(seq, "");
4150 if (fseq == NULL) {
4151 return NULL;
4154 /* Grrrr. A codec may be invoked to convert str objects to
4155 * Unicode, and so it's possible to call back into Python code
4156 * during PyUnicode_FromObject(), and so it's possible for a sick
4157 * codec to change the size of fseq (if seq is a list). Therefore
4158 * we have to keep refetching the size -- can't assume seqlen
4159 * is invariant.
4161 seqlen = PySequence_Fast_GET_SIZE(fseq);
4162 /* If empty sequence, return u"". */
4163 if (seqlen == 0) {
4164 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4165 goto Done;
4167 /* If singleton sequence with an exact Unicode, return that. */
4168 if (seqlen == 1) {
4169 item = PySequence_Fast_GET_ITEM(fseq, 0);
4170 if (PyUnicode_CheckExact(item)) {
4171 Py_INCREF(item);
4172 res = (PyUnicodeObject *)item;
4173 goto Done;
4177 /* At least two items to join, or one that isn't exact Unicode. */
4178 if (seqlen > 1) {
4179 /* Set up sep and seplen -- they're needed. */
4180 if (separator == NULL) {
4181 sep = &blank;
4182 seplen = 1;
4184 else {
4185 internal_separator = PyUnicode_FromObject(separator);
4186 if (internal_separator == NULL)
4187 goto onError;
4188 sep = PyUnicode_AS_UNICODE(internal_separator);
4189 seplen = PyUnicode_GET_SIZE(internal_separator);
4190 /* In case PyUnicode_FromObject() mutated seq. */
4191 seqlen = PySequence_Fast_GET_SIZE(fseq);
4195 /* Get space. */
4196 res = _PyUnicode_New((int)res_alloc);
4197 if (res == NULL)
4198 goto onError;
4199 res_p = PyUnicode_AS_UNICODE(res);
4200 res_used = 0;
4202 for (i = 0; i < seqlen; ++i) {
4203 size_t itemlen;
4204 size_t new_res_used;
4206 item = PySequence_Fast_GET_ITEM(fseq, i);
4207 /* Convert item to Unicode. */
4208 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4209 PyErr_Format(PyExc_TypeError,
4210 "sequence item %i: expected string or Unicode,"
4211 " %.80s found",
4212 i, item->ob_type->tp_name);
4213 goto onError;
4215 item = PyUnicode_FromObject(item);
4216 if (item == NULL)
4217 goto onError;
4218 /* We own a reference to item from here on. */
4220 /* In case PyUnicode_FromObject() mutated seq. */
4221 seqlen = PySequence_Fast_GET_SIZE(fseq);
4223 /* Make sure we have enough space for the separator and the item. */
4224 itemlen = PyUnicode_GET_SIZE(item);
4225 new_res_used = res_used + itemlen;
4226 if (new_res_used < res_used || new_res_used > INT_MAX)
4227 goto Overflow;
4228 if (i < seqlen - 1) {
4229 new_res_used += seplen;
4230 if (new_res_used < res_used || new_res_used > INT_MAX)
4231 goto Overflow;
4233 if (new_res_used > res_alloc) {
4234 /* double allocated size until it's big enough */
4235 do {
4236 size_t oldsize = res_alloc;
4237 res_alloc += res_alloc;
4238 if (res_alloc < oldsize || res_alloc > INT_MAX)
4239 goto Overflow;
4240 } while (new_res_used > res_alloc);
4241 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
4242 Py_DECREF(item);
4243 goto onError;
4245 res_p = PyUnicode_AS_UNICODE(res) + res_used;
4248 /* Copy item, and maybe the separator. */
4249 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4250 res_p += itemlen;
4251 if (i < seqlen - 1) {
4252 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4253 res_p += seplen;
4255 Py_DECREF(item);
4256 res_used = new_res_used;
4259 /* Shrink res to match the used area; this probably can't fail,
4260 * but it's cheap to check.
4262 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
4263 goto onError;
4265 Done:
4266 Py_XDECREF(internal_separator);
4267 Py_DECREF(fseq);
4268 return (PyObject *)res;
4270 Overflow:
4271 PyErr_SetString(PyExc_OverflowError,
4272 "join() is too long for a Python string");
4273 Py_DECREF(item);
4274 /* fall through */
4276 onError:
4277 Py_XDECREF(internal_separator);
4278 Py_DECREF(fseq);
4279 Py_XDECREF(res);
4280 return NULL;
4283 static
4284 PyUnicodeObject *pad(PyUnicodeObject *self,
4285 int left,
4286 int right,
4287 Py_UNICODE fill)
4289 PyUnicodeObject *u;
4291 if (left < 0)
4292 left = 0;
4293 if (right < 0)
4294 right = 0;
4296 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4297 Py_INCREF(self);
4298 return self;
4301 u = _PyUnicode_New(left + self->length + right);
4302 if (u) {
4303 if (left)
4304 Py_UNICODE_FILL(u->str, fill, left);
4305 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4306 if (right)
4307 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4310 return u;
/* Helpers for the split/rsplit/splitlines routines.  Both build a new
   Unicode object from data[left:right] and add it to the result list;
   SPLIT_APPEND adds at the end, SPLIT_INSERT at the front.

   The expansion relies on three names in the calling function:
     str      a PyObject * scratch variable,
     list     the PyObject * result list,
     onError  a label to jump to when object creation or insertion fails.

   Each macro is wrapped in do { ... } while (0) so that it behaves as a
   single statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)

#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4335 static
4336 PyObject *split_whitespace(PyUnicodeObject *self,
4337 PyObject *list,
4338 int maxcount)
4340 register int i;
4341 register int j;
4342 int len = self->length;
4343 PyObject *str;
4345 for (i = j = 0; i < len; ) {
4346 /* find a token */
4347 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4348 i++;
4349 j = i;
4350 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4351 i++;
4352 if (j < i) {
4353 if (maxcount-- <= 0)
4354 break;
4355 SPLIT_APPEND(self->str, j, i);
4356 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4357 i++;
4358 j = i;
4361 if (j < len) {
4362 SPLIT_APPEND(self->str, j, len);
4364 return list;
4366 onError:
4367 Py_DECREF(list);
4368 return NULL;
4371 PyObject *PyUnicode_Splitlines(PyObject *string,
4372 int keepends)
4374 register int i;
4375 register int j;
4376 int len;
4377 PyObject *list;
4378 PyObject *str;
4379 Py_UNICODE *data;
4381 string = PyUnicode_FromObject(string);
4382 if (string == NULL)
4383 return NULL;
4384 data = PyUnicode_AS_UNICODE(string);
4385 len = PyUnicode_GET_SIZE(string);
4387 list = PyList_New(0);
4388 if (!list)
4389 goto onError;
4391 for (i = j = 0; i < len; ) {
4392 int eol;
4394 /* Find a line and append it */
4395 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4396 i++;
4398 /* Skip the line break reading CRLF as one line break */
4399 eol = i;
4400 if (i < len) {
4401 if (data[i] == '\r' && i + 1 < len &&
4402 data[i+1] == '\n')
4403 i += 2;
4404 else
4405 i++;
4406 if (keepends)
4407 eol = i;
4409 SPLIT_APPEND(data, j, eol);
4410 j = i;
4412 if (j < len) {
4413 SPLIT_APPEND(data, j, len);
4416 Py_DECREF(string);
4417 return list;
4419 onError:
4420 Py_DECREF(list);
4421 Py_DECREF(string);
4422 return NULL;
4425 static
4426 PyObject *split_char(PyUnicodeObject *self,
4427 PyObject *list,
4428 Py_UNICODE ch,
4429 int maxcount)
4431 register int i;
4432 register int j;
4433 int len = self->length;
4434 PyObject *str;
4436 for (i = j = 0; i < len; ) {
4437 if (self->str[i] == ch) {
4438 if (maxcount-- <= 0)
4439 break;
4440 SPLIT_APPEND(self->str, j, i);
4441 i = j = i + 1;
4442 } else
4443 i++;
4445 if (j <= len) {
4446 SPLIT_APPEND(self->str, j, len);
4448 return list;
4450 onError:
4451 Py_DECREF(list);
4452 return NULL;
4455 static
4456 PyObject *split_substring(PyUnicodeObject *self,
4457 PyObject *list,
4458 PyUnicodeObject *substring,
4459 int maxcount)
4461 register int i;
4462 register int j;
4463 int len = self->length;
4464 int sublen = substring->length;
4465 PyObject *str;
4467 for (i = j = 0; i <= len - sublen; ) {
4468 if (Py_UNICODE_MATCH(self, i, substring)) {
4469 if (maxcount-- <= 0)
4470 break;
4471 SPLIT_APPEND(self->str, j, i);
4472 i = j = i + sublen;
4473 } else
4474 i++;
4476 if (j <= len) {
4477 SPLIT_APPEND(self->str, j, len);
4479 return list;
4481 onError:
4482 Py_DECREF(list);
4483 return NULL;
4486 static
4487 PyObject *rsplit_whitespace(PyUnicodeObject *self,
4488 PyObject *list,
4489 int maxcount)
4491 register int i;
4492 register int j;
4493 int len = self->length;
4494 PyObject *str;
4496 for (i = j = len - 1; i >= 0; ) {
4497 /* find a token */
4498 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4499 i--;
4500 j = i;
4501 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4502 i--;
4503 if (j > i) {
4504 if (maxcount-- <= 0)
4505 break;
4506 SPLIT_INSERT(self->str, i + 1, j + 1);
4507 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4508 i--;
4509 j = i;
4512 if (j >= 0) {
4513 SPLIT_INSERT(self->str, 0, j + 1);
4515 return list;
4517 onError:
4518 Py_DECREF(list);
4519 return NULL;
4522 static
4523 PyObject *rsplit_char(PyUnicodeObject *self,
4524 PyObject *list,
4525 Py_UNICODE ch,
4526 int maxcount)
4528 register int i;
4529 register int j;
4530 int len = self->length;
4531 PyObject *str;
4533 for (i = j = len - 1; i >= 0; ) {
4534 if (self->str[i] == ch) {
4535 if (maxcount-- <= 0)
4536 break;
4537 SPLIT_INSERT(self->str, i + 1, j + 1);
4538 j = i = i - 1;
4539 } else
4540 i--;
4542 if (j >= -1) {
4543 SPLIT_INSERT(self->str, 0, j + 1);
4545 return list;
4547 onError:
4548 Py_DECREF(list);
4549 return NULL;
4552 static
4553 PyObject *rsplit_substring(PyUnicodeObject *self,
4554 PyObject *list,
4555 PyUnicodeObject *substring,
4556 int maxcount)
4558 register int i;
4559 register int j;
4560 int len = self->length;
4561 int sublen = substring->length;
4562 PyObject *str;
4564 for (i = len - sublen, j = len; i >= 0; ) {
4565 if (Py_UNICODE_MATCH(self, i, substring)) {
4566 if (maxcount-- <= 0)
4567 break;
4568 SPLIT_INSERT(self->str, i + sublen, j);
4569 j = i;
4570 i -= sublen;
4571 } else
4572 i--;
4574 if (j >= 0) {
4575 SPLIT_INSERT(self->str, 0, j);
4577 return list;
4579 onError:
4580 Py_DECREF(list);
4581 return NULL;
4584 #undef SPLIT_APPEND
4585 #undef SPLIT_INSERT
4587 static
4588 PyObject *split(PyUnicodeObject *self,
4589 PyUnicodeObject *substring,
4590 int maxcount)
4592 PyObject *list;
4594 if (maxcount < 0)
4595 maxcount = INT_MAX;
4597 list = PyList_New(0);
4598 if (!list)
4599 return NULL;
4601 if (substring == NULL)
4602 return split_whitespace(self,list,maxcount);
4604 else if (substring->length == 1)
4605 return split_char(self,list,substring->str[0],maxcount);
4607 else if (substring->length == 0) {
4608 Py_DECREF(list);
4609 PyErr_SetString(PyExc_ValueError, "empty separator");
4610 return NULL;
4612 else
4613 return split_substring(self,list,substring,maxcount);
4616 static
4617 PyObject *rsplit(PyUnicodeObject *self,
4618 PyUnicodeObject *substring,
4619 int maxcount)
4621 PyObject *list;
4623 if (maxcount < 0)
4624 maxcount = INT_MAX;
4626 list = PyList_New(0);
4627 if (!list)
4628 return NULL;
4630 if (substring == NULL)
4631 return rsplit_whitespace(self,list,maxcount);
4633 else if (substring->length == 1)
4634 return rsplit_char(self,list,substring->str[0],maxcount);
4636 else if (substring->length == 0) {
4637 Py_DECREF(list);
4638 PyErr_SetString(PyExc_ValueError, "empty separator");
4639 return NULL;
4641 else
4642 return rsplit_substring(self,list,substring,maxcount);
4645 static
4646 PyObject *replace(PyUnicodeObject *self,
4647 PyUnicodeObject *str1,
4648 PyUnicodeObject *str2,
4649 int maxcount)
4651 PyUnicodeObject *u;
4653 if (maxcount < 0)
4654 maxcount = INT_MAX;
4656 if (str1->length == 1 && str2->length == 1) {
4657 int i;
4659 /* replace characters */
4660 if (!findchar(self->str, self->length, str1->str[0]) &&
4661 PyUnicode_CheckExact(self)) {
4662 /* nothing to replace, return original string */
4663 Py_INCREF(self);
4664 u = self;
4665 } else {
4666 Py_UNICODE u1 = str1->str[0];
4667 Py_UNICODE u2 = str2->str[0];
4669 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4670 NULL,
4671 self->length
4673 if (u != NULL) {
4674 Py_UNICODE_COPY(u->str, self->str,
4675 self->length);
4676 for (i = 0; i < u->length; i++)
4677 if (u->str[i] == u1) {
4678 if (--maxcount < 0)
4679 break;
4680 u->str[i] = u2;
4685 } else {
4686 int n, i;
4687 Py_UNICODE *p;
4689 /* replace strings */
4690 n = count(self, 0, self->length, str1);
4691 if (n > maxcount)
4692 n = maxcount;
4693 if (n == 0) {
4694 /* nothing to replace, return original string */
4695 if (PyUnicode_CheckExact(self)) {
4696 Py_INCREF(self);
4697 u = self;
4699 else {
4700 u = (PyUnicodeObject *)
4701 PyUnicode_FromUnicode(self->str, self->length);
4703 } else {
4704 u = _PyUnicode_New(
4705 self->length + n * (str2->length - str1->length));
4706 if (u) {
4707 i = 0;
4708 p = u->str;
4709 if (str1->length > 0) {
4710 while (i <= self->length - str1->length)
4711 if (Py_UNICODE_MATCH(self, i, str1)) {
4712 /* replace string segment */
4713 Py_UNICODE_COPY(p, str2->str, str2->length);
4714 p += str2->length;
4715 i += str1->length;
4716 if (--n <= 0) {
4717 /* copy remaining part */
4718 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4719 break;
4721 } else
4722 *p++ = self->str[i++];
4723 } else {
4724 while (n > 0) {
4725 Py_UNICODE_COPY(p, str2->str, str2->length);
4726 p += str2->length;
4727 if (--n <= 0)
4728 break;
4729 *p++ = self->str[i++];
4731 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4737 return (PyObject *) u;
4740 /* --- Unicode Object Methods --------------------------------------------- */
4742 PyDoc_STRVAR(title__doc__,
4743 "S.title() -> unicode\n\
4745 Return a titlecased version of S, i.e. words start with title case\n\
4746 characters, all remaining cased characters have lower case.");
4748 static PyObject*
4749 unicode_title(PyUnicodeObject *self)
4751 return fixup(self, fixtitle);
4754 PyDoc_STRVAR(capitalize__doc__,
4755 "S.capitalize() -> unicode\n\
4757 Return a capitalized version of S, i.e. make the first character\n\
4758 have upper case.");
4760 static PyObject*
4761 unicode_capitalize(PyUnicodeObject *self)
4763 return fixup(self, fixcapitalize);
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Currently compiled out.  Splits self on whitespace, capitalizes each
   word and joins the words again with the default (blank) separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    int i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* swap the capitalized word in for the original one */
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

 onError:
    /* item is NULL here if capitalizing or joining failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
4804 /* Argument converter. Coerces to a single unicode character */
4806 static int
4807 convert_uc(PyObject *obj, void *addr)
4809 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4810 PyObject *uniobj;
4811 Py_UNICODE *unistr;
4813 uniobj = PyUnicode_FromObject(obj);
4814 if (uniobj == NULL) {
4815 PyErr_SetString(PyExc_TypeError,
4816 "The fill character cannot be converted to Unicode");
4817 return 0;
4819 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4820 PyErr_SetString(PyExc_TypeError,
4821 "The fill character must be exactly one character long");
4822 Py_DECREF(uniobj);
4823 return 0;
4825 unistr = PyUnicode_AS_UNICODE(uniobj);
4826 *fillcharloc = unistr[0];
4827 Py_DECREF(uniobj);
4828 return 1;
4831 PyDoc_STRVAR(center__doc__,
4832 "S.center(width[, fillchar]) -> unicode\n\
4834 Return S centered in a Unicode string of length width. Padding is\n\
4835 done using the specified fill character (default is a space)");
4837 static PyObject *
4838 unicode_center(PyUnicodeObject *self, PyObject *args)
4840 int marg, left;
4841 int width;
4842 Py_UNICODE fillchar = ' ';
4844 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
4845 return NULL;
4847 if (self->length >= width && PyUnicode_CheckExact(self)) {
4848 Py_INCREF(self);
4849 return (PyObject*) self;
4852 marg = width - self->length;
4853 left = marg / 2 + (marg & width & 1);
4855 return (PyObject*) pad(self, left, marg - left, fillchar);
4858 #if 0
4860 /* This code should go into some future Unicode collation support
4861 module. The basic comparison should compare ordinals on a naive
4862 basis (this is what Java does and thus JPython too). */
4864 /* speedy UTF-16 code point order comparison */
4865 /* gleaned from: */
4866 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4868 static short utf16Fixup[32] =
4870 0, 0, 0, 0, 0, 0, 0, 0,
4871 0, 0, 0, 0, 0, 0, 0, 0,
4872 0, 0, 0, 0, 0, 0, 0, 0,
4873 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4876 static int
4877 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4879 int len1, len2;
4881 Py_UNICODE *s1 = str1->str;
4882 Py_UNICODE *s2 = str2->str;
4884 len1 = str1->length;
4885 len2 = str2->length;
4887 while (len1 > 0 && len2 > 0) {
4888 Py_UNICODE c1, c2;
4890 c1 = *s1++;
4891 c2 = *s2++;
4893 if (c1 > (1<<11) * 26)
4894 c1 += utf16Fixup[c1>>11];
4895 if (c2 > (1<<11) * 26)
4896 c2 += utf16Fixup[c2>>11];
4897 /* now c1 and c2 are in UTF-32-compatible order */
4899 if (c1 != c2)
4900 return (c1 < c2) ? -1 : 1;
4902 len1--; len2--;
4905 return (len1 < len2) ? -1 : (len1 != len2);
4908 #else
4910 static int
4911 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4913 register int len1, len2;
4915 Py_UNICODE *s1 = str1->str;
4916 Py_UNICODE *s2 = str2->str;
4918 len1 = str1->length;
4919 len2 = str2->length;
4921 while (len1 > 0 && len2 > 0) {
4922 Py_UNICODE c1, c2;
4924 c1 = *s1++;
4925 c2 = *s2++;
4927 if (c1 != c2)
4928 return (c1 < c2) ? -1 : 1;
4930 len1--; len2--;
4933 return (len1 < len2) ? -1 : (len1 != len2);
4936 #endif
4938 int PyUnicode_Compare(PyObject *left,
4939 PyObject *right)
4941 PyUnicodeObject *u = NULL, *v = NULL;
4942 int result;
4944 /* Coerce the two arguments */
4945 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4946 if (u == NULL)
4947 goto onError;
4948 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4949 if (v == NULL)
4950 goto onError;
4952 /* Shortcut for empty or interned objects */
4953 if (v == u) {
4954 Py_DECREF(u);
4955 Py_DECREF(v);
4956 return 0;
4959 result = unicode_compare(u, v);
4961 Py_DECREF(u);
4962 Py_DECREF(v);
4963 return result;
4965 onError:
4966 Py_XDECREF(u);
4967 Py_XDECREF(v);
4968 return -1;
4971 int PyUnicode_Contains(PyObject *container,
4972 PyObject *element)
4974 PyUnicodeObject *u = NULL, *v = NULL;
4975 int result, size;
4976 register const Py_UNICODE *lhs, *end, *rhs;
4978 /* Coerce the two arguments */
4979 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4980 if (v == NULL) {
4981 PyErr_SetString(PyExc_TypeError,
4982 "'in <string>' requires string as left operand");
4983 goto onError;
4985 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4986 if (u == NULL)
4987 goto onError;
4989 size = PyUnicode_GET_SIZE(v);
4990 rhs = PyUnicode_AS_UNICODE(v);
4991 lhs = PyUnicode_AS_UNICODE(u);
4993 result = 0;
4994 if (size == 1) {
4995 end = lhs + PyUnicode_GET_SIZE(u);
4996 while (lhs < end) {
4997 if (*lhs++ == *rhs) {
4998 result = 1;
4999 break;
5003 else {
5004 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5005 while (lhs <= end) {
5006 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
5007 result = 1;
5008 break;
5013 Py_DECREF(u);
5014 Py_DECREF(v);
5015 return result;
5017 onError:
5018 Py_XDECREF(u);
5019 Py_XDECREF(v);
5020 return -1;
5023 /* Concat to string or Unicode object giving a new Unicode object. */
5025 PyObject *PyUnicode_Concat(PyObject *left,
5026 PyObject *right)
5028 PyUnicodeObject *u = NULL, *v = NULL, *w;
5030 /* Coerce the two arguments */
5031 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5032 if (u == NULL)
5033 goto onError;
5034 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5035 if (v == NULL)
5036 goto onError;
5038 /* Shortcuts */
5039 if (v == unicode_empty) {
5040 Py_DECREF(v);
5041 return (PyObject *)u;
5043 if (u == unicode_empty) {
5044 Py_DECREF(u);
5045 return (PyObject *)v;
5048 /* Concat the two Unicode strings */
5049 w = _PyUnicode_New(u->length + v->length);
5050 if (w == NULL)
5051 goto onError;
5052 Py_UNICODE_COPY(w->str, u->str, u->length);
5053 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5055 Py_DECREF(u);
5056 Py_DECREF(v);
5057 return (PyObject *)w;
5059 onError:
5060 Py_XDECREF(u);
5061 Py_XDECREF(v);
5062 return NULL;
5065 PyDoc_STRVAR(count__doc__,
5066 "S.count(sub[, start[, end]]) -> int\n\
5068 Return the number of occurrences of substring sub in Unicode string\n\
5069 S[start:end]. Optional arguments start and end are\n\
5070 interpreted as in slice notation.");
5072 static PyObject *
5073 unicode_count(PyUnicodeObject *self, PyObject *args)
5075 PyUnicodeObject *substring;
5076 int start = 0;
5077 int end = INT_MAX;
5078 PyObject *result;
5080 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5081 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5082 return NULL;
5084 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5085 (PyObject *)substring);
5086 if (substring == NULL)
5087 return NULL;
5089 if (start < 0)
5090 start += self->length;
5091 if (start < 0)
5092 start = 0;
5093 if (end > self->length)
5094 end = self->length;
5095 if (end < 0)
5096 end += self->length;
5097 if (end < 0)
5098 end = 0;
5100 result = PyInt_FromLong((long) count(self, start, end, substring));
5102 Py_DECREF(substring);
5103 return result;
5106 PyDoc_STRVAR(encode__doc__,
5107 "S.encode([encoding[,errors]]) -> string or unicode\n\
5109 Encodes S using the codec registered for encoding. encoding defaults\n\
5110 to the default encoding. errors may be given to set a different error\n\
5111 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5112 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5113 'xmlcharrefreplace' as well as any other name registered with\n\
5114 codecs.register_error that can handle UnicodeEncodeErrors.");
5116 static PyObject *
5117 unicode_encode(PyUnicodeObject *self, PyObject *args)
5119 char *encoding = NULL;
5120 char *errors = NULL;
5121 PyObject *v;
5123 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5124 return NULL;
5125 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5126 if (v == NULL)
5127 goto onError;
5128 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5129 PyErr_Format(PyExc_TypeError,
5130 "encoder did not return a string/unicode object "
5131 "(type=%.400s)",
5132 v->ob_type->tp_name);
5133 Py_DECREF(v);
5134 return NULL;
5136 return v;
5138 onError:
5139 return NULL;
5142 PyDoc_STRVAR(decode__doc__,
5143 "S.decode([encoding[,errors]]) -> string or unicode\n\
5145 Decodes S using the codec registered for encoding. encoding defaults\n\
5146 to the default encoding. errors may be given to set a different error\n\
5147 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5148 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5149 as well as any other name registerd with codecs.register_error that is\n\
5150 able to handle UnicodeDecodeErrors.");
5152 static PyObject *
5153 unicode_decode(PyUnicodeObject *self, PyObject *args)
5155 char *encoding = NULL;
5156 char *errors = NULL;
5157 PyObject *v;
5159 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5160 return NULL;
5161 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5162 if (v == NULL)
5163 goto onError;
5164 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5165 PyErr_Format(PyExc_TypeError,
5166 "decoder did not return a string/unicode object "
5167 "(type=%.400s)",
5168 v->ob_type->tp_name);
5169 Py_DECREF(v);
5170 return NULL;
5172 return v;
5174 onError:
5175 return NULL;
5178 PyDoc_STRVAR(expandtabs__doc__,
5179 "S.expandtabs([tabsize]) -> unicode\n\
5181 Return a copy of S where all tab characters are expanded using spaces.\n\
5182 If tabsize is not given, a tab size of 8 characters is assumed.");
5184 static PyObject*
5185 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5187 Py_UNICODE *e;
5188 Py_UNICODE *p;
5189 Py_UNICODE *q;
5190 int i, j;
5191 PyUnicodeObject *u;
5192 int tabsize = 8;
5194 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5195 return NULL;
5197 /* First pass: determine size of output string */
5198 i = j = 0;
5199 e = self->str + self->length;
5200 for (p = self->str; p < e; p++)
5201 if (*p == '\t') {
5202 if (tabsize > 0)
5203 j += tabsize - (j % tabsize);
5205 else {
5206 j++;
5207 if (*p == '\n' || *p == '\r') {
5208 i += j;
5209 j = 0;
5213 /* Second pass: create output string and fill it */
5214 u = _PyUnicode_New(i + j);
5215 if (!u)
5216 return NULL;
5218 j = 0;
5219 q = u->str;
5221 for (p = self->str; p < e; p++)
5222 if (*p == '\t') {
5223 if (tabsize > 0) {
5224 i = tabsize - (j % tabsize);
5225 j += i;
5226 while (i--)
5227 *q++ = ' ';
5230 else {
5231 j++;
5232 *q++ = *p;
5233 if (*p == '\n' || *p == '\r')
5234 j = 0;
5237 return (PyObject*) u;
5240 PyDoc_STRVAR(find__doc__,
5241 "S.find(sub [,start [,end]]) -> int\n\
5243 Return the lowest index in S where substring sub is found,\n\
5244 such that sub is contained within s[start,end]. Optional\n\
5245 arguments start and end are interpreted as in slice notation.\n\
5247 Return -1 on failure.");
5249 static PyObject *
5250 unicode_find(PyUnicodeObject *self, PyObject *args)
5252 PyUnicodeObject *substring;
5253 int start = 0;
5254 int end = INT_MAX;
5255 PyObject *result;
5257 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5258 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5259 return NULL;
5260 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5261 (PyObject *)substring);
5262 if (substring == NULL)
5263 return NULL;
5265 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5267 Py_DECREF(substring);
5268 return result;
5271 static PyObject *
5272 unicode_getitem(PyUnicodeObject *self, int index)
5274 if (index < 0 || index >= self->length) {
5275 PyErr_SetString(PyExc_IndexError, "string index out of range");
5276 return NULL;
5279 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5282 static long
5283 unicode_hash(PyUnicodeObject *self)
5285 /* Since Unicode objects compare equal to their ASCII string
5286 counterparts, they should use the individual character values
5287 as basis for their hash value. This is needed to assure that
5288 strings and Unicode objects behave in the same way as
5289 dictionary keys. */
5291 register int len;
5292 register Py_UNICODE *p;
5293 register long x;
5295 if (self->hash != -1)
5296 return self->hash;
5297 len = PyUnicode_GET_SIZE(self);
5298 p = PyUnicode_AS_UNICODE(self);
5299 x = *p << 7;
5300 while (--len >= 0)
5301 x = (1000003*x) ^ *p++;
5302 x ^= PyUnicode_GET_SIZE(self);
5303 if (x == -1)
5304 x = -2;
5305 self->hash = x;
5306 return x;
5309 PyDoc_STRVAR(index__doc__,
5310 "S.index(sub [,start [,end]]) -> int\n\
5312 Like S.find() but raise ValueError when the substring is not found.");
5314 static PyObject *
5315 unicode_index(PyUnicodeObject *self, PyObject *args)
5317 int result;
5318 PyUnicodeObject *substring;
5319 int start = 0;
5320 int end = INT_MAX;
5322 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5323 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5324 return NULL;
5326 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5327 (PyObject *)substring);
5328 if (substring == NULL)
5329 return NULL;
5331 result = findstring(self, substring, start, end, 1);
5333 Py_DECREF(substring);
5334 if (result < 0) {
5335 PyErr_SetString(PyExc_ValueError, "substring not found");
5336 return NULL;
5338 return PyInt_FromLong(result);
5341 PyDoc_STRVAR(islower__doc__,
5342 "S.islower() -> bool\n\
5344 Return True if all cased characters in S are lowercase and there is\n\
5345 at least one cased character in S, False otherwise.");
5347 static PyObject*
5348 unicode_islower(PyUnicodeObject *self)
5350 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5351 register const Py_UNICODE *e;
5352 int cased;
5354 /* Shortcut for single character strings */
5355 if (PyUnicode_GET_SIZE(self) == 1)
5356 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5358 /* Special case for empty strings */
5359 if (PyString_GET_SIZE(self) == 0)
5360 return PyBool_FromLong(0);
5362 e = p + PyUnicode_GET_SIZE(self);
5363 cased = 0;
5364 for (; p < e; p++) {
5365 register const Py_UNICODE ch = *p;
5367 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5368 return PyBool_FromLong(0);
5369 else if (!cased && Py_UNICODE_ISLOWER(ch))
5370 cased = 1;
5372 return PyBool_FromLong(cased);
5375 PyDoc_STRVAR(isupper__doc__,
5376 "S.isupper() -> bool\n\
5378 Return True if all cased characters in S are uppercase and there is\n\
5379 at least one cased character in S, False otherwise.");
5381 static PyObject*
5382 unicode_isupper(PyUnicodeObject *self)
5384 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5385 register const Py_UNICODE *e;
5386 int cased;
5388 /* Shortcut for single character strings */
5389 if (PyUnicode_GET_SIZE(self) == 1)
5390 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5392 /* Special case for empty strings */
5393 if (PyString_GET_SIZE(self) == 0)
5394 return PyBool_FromLong(0);
5396 e = p + PyUnicode_GET_SIZE(self);
5397 cased = 0;
5398 for (; p < e; p++) {
5399 register const Py_UNICODE ch = *p;
5401 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5402 return PyBool_FromLong(0);
5403 else if (!cased && Py_UNICODE_ISUPPER(ch))
5404 cased = 1;
5406 return PyBool_FromLong(cased);
5409 PyDoc_STRVAR(istitle__doc__,
5410 "S.istitle() -> bool\n\
5412 Return True if S is a titlecased string and there is at least one\n\
5413 character in S, i.e. upper- and titlecase characters may only\n\
5414 follow uncased characters and lowercase characters only cased ones.\n\
5415 Return False otherwise.");
5417 static PyObject*
5418 unicode_istitle(PyUnicodeObject *self)
5420 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5421 register const Py_UNICODE *e;
5422 int cased, previous_is_cased;
5424 /* Shortcut for single character strings */
5425 if (PyUnicode_GET_SIZE(self) == 1)
5426 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5427 (Py_UNICODE_ISUPPER(*p) != 0));
5429 /* Special case for empty strings */
5430 if (PyString_GET_SIZE(self) == 0)
5431 return PyBool_FromLong(0);
5433 e = p + PyUnicode_GET_SIZE(self);
5434 cased = 0;
5435 previous_is_cased = 0;
5436 for (; p < e; p++) {
5437 register const Py_UNICODE ch = *p;
5439 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5440 if (previous_is_cased)
5441 return PyBool_FromLong(0);
5442 previous_is_cased = 1;
5443 cased = 1;
5445 else if (Py_UNICODE_ISLOWER(ch)) {
5446 if (!previous_is_cased)
5447 return PyBool_FromLong(0);
5448 previous_is_cased = 1;
5449 cased = 1;
5451 else
5452 previous_is_cased = 0;
5454 return PyBool_FromLong(cased);
5457 PyDoc_STRVAR(isspace__doc__,
5458 "S.isspace() -> bool\n\
5460 Return True if all characters in S are whitespace\n\
5461 and there is at least one character in S, False otherwise.");
5463 static PyObject*
5464 unicode_isspace(PyUnicodeObject *self)
5466 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5467 register const Py_UNICODE *e;
5469 /* Shortcut for single character strings */
5470 if (PyUnicode_GET_SIZE(self) == 1 &&
5471 Py_UNICODE_ISSPACE(*p))
5472 return PyBool_FromLong(1);
5474 /* Special case for empty strings */
5475 if (PyString_GET_SIZE(self) == 0)
5476 return PyBool_FromLong(0);
5478 e = p + PyUnicode_GET_SIZE(self);
5479 for (; p < e; p++) {
5480 if (!Py_UNICODE_ISSPACE(*p))
5481 return PyBool_FromLong(0);
5483 return PyBool_FromLong(1);
5486 PyDoc_STRVAR(isalpha__doc__,
5487 "S.isalpha() -> bool\n\
5489 Return True if all characters in S are alphabetic\n\
5490 and there is at least one character in S, False otherwise.");
5492 static PyObject*
5493 unicode_isalpha(PyUnicodeObject *self)
5495 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5496 register const Py_UNICODE *e;
5498 /* Shortcut for single character strings */
5499 if (PyUnicode_GET_SIZE(self) == 1 &&
5500 Py_UNICODE_ISALPHA(*p))
5501 return PyBool_FromLong(1);
5503 /* Special case for empty strings */
5504 if (PyString_GET_SIZE(self) == 0)
5505 return PyBool_FromLong(0);
5507 e = p + PyUnicode_GET_SIZE(self);
5508 for (; p < e; p++) {
5509 if (!Py_UNICODE_ISALPHA(*p))
5510 return PyBool_FromLong(0);
5512 return PyBool_FromLong(1);
5515 PyDoc_STRVAR(isalnum__doc__,
5516 "S.isalnum() -> bool\n\
5518 Return True if all characters in S are alphanumeric\n\
5519 and there is at least one character in S, False otherwise.");
5521 static PyObject*
5522 unicode_isalnum(PyUnicodeObject *self)
5524 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5525 register const Py_UNICODE *e;
5527 /* Shortcut for single character strings */
5528 if (PyUnicode_GET_SIZE(self) == 1 &&
5529 Py_UNICODE_ISALNUM(*p))
5530 return PyBool_FromLong(1);
5532 /* Special case for empty strings */
5533 if (PyString_GET_SIZE(self) == 0)
5534 return PyBool_FromLong(0);
5536 e = p + PyUnicode_GET_SIZE(self);
5537 for (; p < e; p++) {
5538 if (!Py_UNICODE_ISALNUM(*p))
5539 return PyBool_FromLong(0);
5541 return PyBool_FromLong(1);
5544 PyDoc_STRVAR(isdecimal__doc__,
5545 "S.isdecimal() -> bool\n\
5547 Return True if there are only decimal characters in S,\n\
5548 False otherwise.");
5550 static PyObject*
5551 unicode_isdecimal(PyUnicodeObject *self)
5553 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5554 register const Py_UNICODE *e;
5556 /* Shortcut for single character strings */
5557 if (PyUnicode_GET_SIZE(self) == 1 &&
5558 Py_UNICODE_ISDECIMAL(*p))
5559 return PyBool_FromLong(1);
5561 /* Special case for empty strings */
5562 if (PyString_GET_SIZE(self) == 0)
5563 return PyBool_FromLong(0);
5565 e = p + PyUnicode_GET_SIZE(self);
5566 for (; p < e; p++) {
5567 if (!Py_UNICODE_ISDECIMAL(*p))
5568 return PyBool_FromLong(0);
5570 return PyBool_FromLong(1);
5573 PyDoc_STRVAR(isdigit__doc__,
5574 "S.isdigit() -> bool\n\
5576 Return True if all characters in S are digits\n\
5577 and there is at least one character in S, False otherwise.");
5579 static PyObject*
5580 unicode_isdigit(PyUnicodeObject *self)
5582 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5583 register const Py_UNICODE *e;
5585 /* Shortcut for single character strings */
5586 if (PyUnicode_GET_SIZE(self) == 1 &&
5587 Py_UNICODE_ISDIGIT(*p))
5588 return PyBool_FromLong(1);
5590 /* Special case for empty strings */
5591 if (PyString_GET_SIZE(self) == 0)
5592 return PyBool_FromLong(0);
5594 e = p + PyUnicode_GET_SIZE(self);
5595 for (; p < e; p++) {
5596 if (!Py_UNICODE_ISDIGIT(*p))
5597 return PyBool_FromLong(0);
5599 return PyBool_FromLong(1);
5602 PyDoc_STRVAR(isnumeric__doc__,
5603 "S.isnumeric() -> bool\n\
5605 Return True if there are only numeric characters in S,\n\
5606 False otherwise.");
5608 static PyObject*
5609 unicode_isnumeric(PyUnicodeObject *self)
5611 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5612 register const Py_UNICODE *e;
5614 /* Shortcut for single character strings */
5615 if (PyUnicode_GET_SIZE(self) == 1 &&
5616 Py_UNICODE_ISNUMERIC(*p))
5617 return PyBool_FromLong(1);
5619 /* Special case for empty strings */
5620 if (PyString_GET_SIZE(self) == 0)
5621 return PyBool_FromLong(0);
5623 e = p + PyUnicode_GET_SIZE(self);
5624 for (; p < e; p++) {
5625 if (!Py_UNICODE_ISNUMERIC(*p))
5626 return PyBool_FromLong(0);
5628 return PyBool_FromLong(1);
5631 PyDoc_STRVAR(join__doc__,
5632 "S.join(sequence) -> unicode\n\
5634 Return a string which is the concatenation of the strings in the\n\
5635 sequence. The separator between elements is S.");
5637 static PyObject*
5638 unicode_join(PyObject *self, PyObject *data)
5640 return PyUnicode_Join(self, data);
5643 static int
5644 unicode_length(PyUnicodeObject *self)
5646 return self->length;
5649 PyDoc_STRVAR(ljust__doc__,
5650 "S.ljust(width[, fillchar]) -> int\n\
5652 Return S left justified in a Unicode string of length width. Padding is\n\
5653 done using the specified fill character (default is a space).");
5655 static PyObject *
5656 unicode_ljust(PyUnicodeObject *self, PyObject *args)
5658 int width;
5659 Py_UNICODE fillchar = ' ';
5661 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
5662 return NULL;
5664 if (self->length >= width && PyUnicode_CheckExact(self)) {
5665 Py_INCREF(self);
5666 return (PyObject*) self;
5669 return (PyObject*) pad(self, 0, width - self->length, fillchar);
5672 PyDoc_STRVAR(lower__doc__,
5673 "S.lower() -> unicode\n\
5675 Return a copy of the string S converted to lowercase.");
5677 static PyObject*
5678 unicode_lower(PyUnicodeObject *self)
5680 return fixup(self, fixlower);
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: skips the leading "|O:" of its format. */
#define STRIPNAME(i) (stripformat[i]+3)
5692 static const Py_UNICODE *
5693 unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5695 size_t i;
5696 for (i = 0; i < n; ++i)
5697 if (s[i] == c)
5698 return s+i;
5699 return NULL;
5702 /* externally visible for str.strip(unicode) */
5703 PyObject *
5704 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5706 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5707 int len = PyUnicode_GET_SIZE(self);
5708 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5709 int seplen = PyUnicode_GET_SIZE(sepobj);
5710 int i, j;
5712 i = 0;
5713 if (striptype != RIGHTSTRIP) {
5714 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5715 i++;
5719 j = len;
5720 if (striptype != LEFTSTRIP) {
5721 do {
5722 j--;
5723 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5724 j++;
5727 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5728 Py_INCREF(self);
5729 return (PyObject*)self;
5731 else
5732 return PyUnicode_FromUnicode(s+i, j-i);
5736 static PyObject *
5737 do_strip(PyUnicodeObject *self, int striptype)
5739 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5740 int len = PyUnicode_GET_SIZE(self), i, j;
5742 i = 0;
5743 if (striptype != RIGHTSTRIP) {
5744 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5745 i++;
5749 j = len;
5750 if (striptype != LEFTSTRIP) {
5751 do {
5752 j--;
5753 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5754 j++;
5757 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5758 Py_INCREF(self);
5759 return (PyObject*)self;
5761 else
5762 return PyUnicode_FromUnicode(s+i, j-i);
5766 static PyObject *
5767 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5769 PyObject *sep = NULL;
5771 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5772 return NULL;
5774 if (sep != NULL && sep != Py_None) {
5775 if (PyUnicode_Check(sep))
5776 return _PyUnicode_XStrip(self, striptype, sep);
5777 else if (PyString_Check(sep)) {
5778 PyObject *res;
5779 sep = PyUnicode_FromObject(sep);
5780 if (sep==NULL)
5781 return NULL;
5782 res = _PyUnicode_XStrip(self, striptype, sep);
5783 Py_DECREF(sep);
5784 return res;
5786 else {
5787 PyErr_Format(PyExc_TypeError,
5788 "%s arg must be None, unicode or str",
5789 STRIPNAME(striptype));
5790 return NULL;
5794 return do_strip(self, striptype);
5798 PyDoc_STRVAR(strip__doc__,
5799 "S.strip([chars]) -> unicode\n\
5801 Return a copy of the string S with leading and trailing\n\
5802 whitespace removed.\n\
5803 If chars is given and not None, remove characters in chars instead.\n\
5804 If chars is a str, it will be converted to unicode before stripping");
5806 static PyObject *
5807 unicode_strip(PyUnicodeObject *self, PyObject *args)
5809 if (PyTuple_GET_SIZE(args) == 0)
5810 return do_strip(self, BOTHSTRIP); /* Common case */
5811 else
5812 return do_argstrip(self, BOTHSTRIP, args);
5816 PyDoc_STRVAR(lstrip__doc__,
5817 "S.lstrip([chars]) -> unicode\n\
5819 Return a copy of the string S with leading whitespace removed.\n\
5820 If chars is given and not None, remove characters in chars instead.\n\
5821 If chars is a str, it will be converted to unicode before stripping");
5823 static PyObject *
5824 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5826 if (PyTuple_GET_SIZE(args) == 0)
5827 return do_strip(self, LEFTSTRIP); /* Common case */
5828 else
5829 return do_argstrip(self, LEFTSTRIP, args);
5833 PyDoc_STRVAR(rstrip__doc__,
5834 "S.rstrip([chars]) -> unicode\n\
5836 Return a copy of the string S with trailing whitespace removed.\n\
5837 If chars is given and not None, remove characters in chars instead.\n\
5838 If chars is a str, it will be converted to unicode before stripping");
5840 static PyObject *
5841 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5843 if (PyTuple_GET_SIZE(args) == 0)
5844 return do_strip(self, RIGHTSTRIP); /* Common case */
5845 else
5846 return do_argstrip(self, RIGHTSTRIP, args);
5850 static PyObject*
5851 unicode_repeat(PyUnicodeObject *str, int len)
5853 PyUnicodeObject *u;
5854 Py_UNICODE *p;
5855 int nchars;
5856 size_t nbytes;
5858 if (len < 0)
5859 len = 0;
5861 if (len == 1 && PyUnicode_CheckExact(str)) {
5862 /* no repeat, return original string */
5863 Py_INCREF(str);
5864 return (PyObject*) str;
5867 /* ensure # of chars needed doesn't overflow int and # of bytes
5868 * needed doesn't overflow size_t
5870 nchars = len * str->length;
5871 if (len && nchars / len != str->length) {
5872 PyErr_SetString(PyExc_OverflowError,
5873 "repeated string is too long");
5874 return NULL;
5876 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5877 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5878 PyErr_SetString(PyExc_OverflowError,
5879 "repeated string is too long");
5880 return NULL;
5882 u = _PyUnicode_New(nchars);
5883 if (!u)
5884 return NULL;
5886 p = u->str;
5888 while (len-- > 0) {
5889 Py_UNICODE_COPY(p, str->str, str->length);
5890 p += str->length;
5893 return (PyObject*) u;
5896 PyObject *PyUnicode_Replace(PyObject *obj,
5897 PyObject *subobj,
5898 PyObject *replobj,
5899 int maxcount)
5901 PyObject *self;
5902 PyObject *str1;
5903 PyObject *str2;
5904 PyObject *result;
5906 self = PyUnicode_FromObject(obj);
5907 if (self == NULL)
5908 return NULL;
5909 str1 = PyUnicode_FromObject(subobj);
5910 if (str1 == NULL) {
5911 Py_DECREF(self);
5912 return NULL;
5914 str2 = PyUnicode_FromObject(replobj);
5915 if (str2 == NULL) {
5916 Py_DECREF(self);
5917 Py_DECREF(str1);
5918 return NULL;
5920 result = replace((PyUnicodeObject *)self,
5921 (PyUnicodeObject *)str1,
5922 (PyUnicodeObject *)str2,
5923 maxcount);
5924 Py_DECREF(self);
5925 Py_DECREF(str1);
5926 Py_DECREF(str2);
5927 return result;
5930 PyDoc_STRVAR(replace__doc__,
5931 "S.replace (old, new[, maxsplit]) -> unicode\n\
5933 Return a copy of S with all occurrences of substring\n\
5934 old replaced by new. If the optional argument maxsplit is\n\
5935 given, only the first maxsplit occurrences are replaced.");
5937 static PyObject*
5938 unicode_replace(PyUnicodeObject *self, PyObject *args)
5940 PyUnicodeObject *str1;
5941 PyUnicodeObject *str2;
5942 int maxcount = -1;
5943 PyObject *result;
5945 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5946 return NULL;
5947 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5948 if (str1 == NULL)
5949 return NULL;
5950 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5951 if (str2 == NULL) {
5952 Py_DECREF(str1);
5953 return NULL;
5956 result = replace(self, str1, str2, maxcount);
5958 Py_DECREF(str1);
5959 Py_DECREF(str2);
5960 return result;
/* Passes the object's Py_UNICODE buffer and its size to
   unicodeescape_string() and returns that result.
   NOTE(review): in this copy the call is cut off after the size
   argument -- the remaining argument(s) and the closing lines are
   missing and must be restored from the upstream file. */
5963 static
5964 PyObject *unicode_repr(PyObject *unicode)
5966 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5967 PyUnicode_GET_SIZE(unicode),
5971 PyDoc_STRVAR(rfind__doc__,
5972 "S.rfind(sub [,start [,end]]) -> int\n\
5974 Return the highest index in S where substring sub is found,\n\
5975 such that sub is contained within s[start,end]. Optional\n\
5976 arguments start and end are interpreted as in slice notation.\n\
5978 Return -1 on failure.");
5980 static PyObject *
5981 unicode_rfind(PyUnicodeObject *self, PyObject *args)
5983 PyUnicodeObject *substring;
5984 int start = 0;
5985 int end = INT_MAX;
5986 PyObject *result;
5988 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5989 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5990 return NULL;
5991 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5992 (PyObject *)substring);
5993 if (substring == NULL)
5994 return NULL;
5996 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5998 Py_DECREF(substring);
5999 return result;
6002 PyDoc_STRVAR(rindex__doc__,
6003 "S.rindex(sub [,start [,end]]) -> int\n\
6005 Like S.rfind() but raise ValueError when the substring is not found.");
6007 static PyObject *
6008 unicode_rindex(PyUnicodeObject *self, PyObject *args)
6010 int result;
6011 PyUnicodeObject *substring;
6012 int start = 0;
6013 int end = INT_MAX;
6015 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6016 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6017 return NULL;
6018 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6019 (PyObject *)substring);
6020 if (substring == NULL)
6021 return NULL;
6023 result = findstring(self, substring, start, end, -1);
6025 Py_DECREF(substring);
6026 if (result < 0) {
6027 PyErr_SetString(PyExc_ValueError, "substring not found");
6028 return NULL;
6030 return PyInt_FromLong(result);
6033 PyDoc_STRVAR(rjust__doc__,
6034 "S.rjust(width[, fillchar]) -> unicode\n\
6036 Return S right justified in a Unicode string of length width. Padding is\n\
6037 done using the specified fill character (default is a space).");
6039 static PyObject *
6040 unicode_rjust(PyUnicodeObject *self, PyObject *args)
6042 int width;
6043 Py_UNICODE fillchar = ' ';
6045 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
6046 return NULL;
6048 if (self->length >= width && PyUnicode_CheckExact(self)) {
6049 Py_INCREF(self);
6050 return (PyObject*) self;
6053 return (PyObject*) pad(self, width - self->length, 0, fillchar);
6056 static PyObject*
6057 unicode_slice(PyUnicodeObject *self, int start, int end)
6059 /* standard clamping */
6060 if (start < 0)
6061 start = 0;
6062 if (end < 0)
6063 end = 0;
6064 if (end > self->length)
6065 end = self->length;
6066 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6067 /* full slice, return original string */
6068 Py_INCREF(self);
6069 return (PyObject*) self;
6071 if (start > end)
6072 start = end;
6073 /* copy slice */
6074 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6075 end - start);
6078 PyObject *PyUnicode_Split(PyObject *s,
6079 PyObject *sep,
6080 int maxsplit)
6082 PyObject *result;
6084 s = PyUnicode_FromObject(s);
6085 if (s == NULL)
6086 return NULL;
6087 if (sep != NULL) {
6088 sep = PyUnicode_FromObject(sep);
6089 if (sep == NULL) {
6090 Py_DECREF(s);
6091 return NULL;
6095 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6097 Py_DECREF(s);
6098 Py_XDECREF(sep);
6099 return result;
6102 PyDoc_STRVAR(split__doc__,
6103 "S.split([sep [,maxsplit]]) -> list of strings\n\
6105 Return a list of the words in S, using sep as the\n\
6106 delimiter string. If maxsplit is given, at most maxsplit\n\
6107 splits are done. If sep is not specified or is None,\n\
6108 any whitespace string is a separator.");
6110 static PyObject*
6111 unicode_split(PyUnicodeObject *self, PyObject *args)
6113 PyObject *substring = Py_None;
6114 int maxcount = -1;
6116 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6117 return NULL;
6119 if (substring == Py_None)
6120 return split(self, NULL, maxcount);
6121 else if (PyUnicode_Check(substring))
6122 return split(self, (PyUnicodeObject *)substring, maxcount);
6123 else
6124 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6127 PyObject *PyUnicode_RSplit(PyObject *s,
6128 PyObject *sep,
6129 int maxsplit)
6131 PyObject *result;
6133 s = PyUnicode_FromObject(s);
6134 if (s == NULL)
6135 return NULL;
6136 if (sep != NULL) {
6137 sep = PyUnicode_FromObject(sep);
6138 if (sep == NULL) {
6139 Py_DECREF(s);
6140 return NULL;
6144 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6146 Py_DECREF(s);
6147 Py_XDECREF(sep);
6148 return result;
6151 PyDoc_STRVAR(rsplit__doc__,
6152 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6154 Return a list of the words in S, using sep as the\n\
6155 delimiter string, starting at the end of the string and\n\
6156 working to the front. If maxsplit is given, at most maxsplit\n\
6157 splits are done. If sep is not specified, any whitespace string\n\
6158 is a separator.");
6160 static PyObject*
6161 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6163 PyObject *substring = Py_None;
6164 int maxcount = -1;
6166 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6167 return NULL;
6169 if (substring == Py_None)
6170 return rsplit(self, NULL, maxcount);
6171 else if (PyUnicode_Check(substring))
6172 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6173 else
6174 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6177 PyDoc_STRVAR(splitlines__doc__,
6178 "S.splitlines([keepends]]) -> list of strings\n\
6180 Return a list of the lines in S, breaking at line boundaries.\n\
6181 Line breaks are not included in the resulting list unless keepends\n\
6182 is given and true.");
6184 static PyObject*
6185 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6187 int keepends = 0;
6189 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6190 return NULL;
6192 return PyUnicode_Splitlines((PyObject *)self, keepends);
6195 static
6196 PyObject *unicode_str(PyUnicodeObject *self)
6198 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6201 PyDoc_STRVAR(swapcase__doc__,
6202 "S.swapcase() -> unicode\n\
6204 Return a copy of S with uppercase characters converted to lowercase\n\
6205 and vice versa.");
6207 static PyObject*
6208 unicode_swapcase(PyUnicodeObject *self)
6210 return fixup(self, fixswapcase);
6213 PyDoc_STRVAR(translate__doc__,
6214 "S.translate(table) -> unicode\n\
6216 Return a copy of the string S, where all characters have been mapped\n\
6217 through the given translation table, which must be a mapping of\n\
6218 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6219 Unmapped characters are left untouched. Characters mapped to None\n\
6220 are deleted.");
6222 static PyObject*
6223 unicode_translate(PyUnicodeObject *self, PyObject *table)
6225 return PyUnicode_TranslateCharmap(self->str,
6226 self->length,
6227 table,
6228 "ignore");
6231 PyDoc_STRVAR(upper__doc__,
6232 "S.upper() -> unicode\n\
6234 Return a copy of S converted to uppercase.");
6236 static PyObject*
6237 unicode_upper(PyUnicodeObject *self)
6239 return fixup(self, fixupper);
6242 PyDoc_STRVAR(zfill__doc__,
6243 "S.zfill(width) -> unicode\n\
6245 Pad a numeric string x with zeros on the left, to fill a field\n\
6246 of the specified width. The string x is never truncated.");
6248 static PyObject *
6249 unicode_zfill(PyUnicodeObject *self, PyObject *args)
6251 int fill;
6252 PyUnicodeObject *u;
6254 int width;
6255 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6256 return NULL;
6258 if (self->length >= width) {
6259 if (PyUnicode_CheckExact(self)) {
6260 Py_INCREF(self);
6261 return (PyObject*) self;
6263 else
6264 return PyUnicode_FromUnicode(
6265 PyUnicode_AS_UNICODE(self),
6266 PyUnicode_GET_SIZE(self)
6270 fill = width - self->length;
6272 u = pad(self, fill, 0, '0');
6274 if (u == NULL)
6275 return NULL;
6277 if (u->str[fill] == '+' || u->str[fill] == '-') {
6278 /* move sign to beginning of string */
6279 u->str[0] = u->str[fill];
6280 u->str[fill] = '0';
6283 return (PyObject*) u;
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
6294 PyDoc_STRVAR(startswith__doc__,
6295 "S.startswith(prefix[, start[, end]]) -> bool\n\
6297 Return True if S starts with the specified prefix, False otherwise.\n\
6298 With optional start, test S beginning at that position.\n\
6299 With optional end, stop comparing S at that position.");
6301 static PyObject *
6302 unicode_startswith(PyUnicodeObject *self,
6303 PyObject *args)
6305 PyUnicodeObject *substring;
6306 int start = 0;
6307 int end = INT_MAX;
6308 PyObject *result;
6310 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6311 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6312 return NULL;
6313 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6314 (PyObject *)substring);
6315 if (substring == NULL)
6316 return NULL;
6318 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
6320 Py_DECREF(substring);
6321 return result;
6325 PyDoc_STRVAR(endswith__doc__,
6326 "S.endswith(suffix[, start[, end]]) -> bool\n\
6328 Return True if S ends with the specified suffix, False otherwise.\n\
6329 With optional start, test S beginning at that position.\n\
6330 With optional end, stop comparing S at that position.");
6332 static PyObject *
6333 unicode_endswith(PyUnicodeObject *self,
6334 PyObject *args)
6336 PyUnicodeObject *substring;
6337 int start = 0;
6338 int end = INT_MAX;
6339 PyObject *result;
6341 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6342 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6343 return NULL;
6344 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6345 (PyObject *)substring);
6346 if (substring == NULL)
6347 return NULL;
6349 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
6351 Py_DECREF(substring);
6352 return result;
6357 static PyObject *
6358 unicode_getnewargs(PyUnicodeObject *v)
6360 return Py_BuildValue("(u#)", v->str, v->length);
6364 static PyMethodDef unicode_methods[] = {
6366 /* Order is according to common usage: often used methods should
6367 appear first, since lookup is done sequentially. */
6369 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6370 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6371 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
6372 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
6373 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6374 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6375 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6376 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6377 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6378 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6379 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6380 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6381 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6382 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
6383 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
6384 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
6385 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6386 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6387 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6388 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
6389 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
6390 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
6391 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
6392 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6393 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6394 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6395 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6396 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6397 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6398 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6399 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6400 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6401 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6402 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6403 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6404 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6405 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
6406 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
6407 #if 0
6408 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
6409 #endif
6411 #if 0
6412 /* This one is just used for debugging the implementation. */
6413 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
6414 #endif
6416 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
6417 {NULL, NULL}
6420 static PyObject *
6421 unicode_mod(PyObject *v, PyObject *w)
6423 if (!PyUnicode_Check(v)) {
6424 Py_INCREF(Py_NotImplemented);
6425 return Py_NotImplemented;
6427 return PyUnicode_Format(v, w);
6430 static PyNumberMethods unicode_as_number = {
6431 0, /*nb_add*/
6432 0, /*nb_subtract*/
6433 0, /*nb_multiply*/
6434 0, /*nb_divide*/
6435 unicode_mod, /*nb_remainder*/
6438 static PySequenceMethods unicode_as_sequence = {
6439 (inquiry) unicode_length, /* sq_length */
6440 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6441 (intargfunc) unicode_repeat, /* sq_repeat */
6442 (intargfunc) unicode_getitem, /* sq_item */
6443 (intintargfunc) unicode_slice, /* sq_slice */
6444 0, /* sq_ass_item */
6445 0, /* sq_ass_slice */
6446 (objobjproc)PyUnicode_Contains, /*sq_contains*/
6449 static PyObject*
6450 unicode_subscript(PyUnicodeObject* self, PyObject* item)
6452 if (PyInt_Check(item)) {
6453 long i = PyInt_AS_LONG(item);
6454 if (i < 0)
6455 i += PyString_GET_SIZE(self);
6456 return unicode_getitem(self, i);
6457 } else if (PyLong_Check(item)) {
6458 long i = PyLong_AsLong(item);
6459 if (i == -1 && PyErr_Occurred())
6460 return NULL;
6461 if (i < 0)
6462 i += PyString_GET_SIZE(self);
6463 return unicode_getitem(self, i);
6464 } else if (PySlice_Check(item)) {
6465 int start, stop, step, slicelength, cur, i;
6466 Py_UNICODE* source_buf;
6467 Py_UNICODE* result_buf;
6468 PyObject* result;
6470 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6471 &start, &stop, &step, &slicelength) < 0) {
6472 return NULL;
6475 if (slicelength <= 0) {
6476 return PyUnicode_FromUnicode(NULL, 0);
6477 } else {
6478 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6479 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6481 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6482 result_buf[i] = source_buf[cur];
6485 result = PyUnicode_FromUnicode(result_buf, slicelength);
6486 PyMem_FREE(result_buf);
6487 return result;
6489 } else {
6490 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6491 return NULL;
6495 static PyMappingMethods unicode_as_mapping = {
6496 (inquiry)unicode_length, /* mp_length */
6497 (binaryfunc)unicode_subscript, /* mp_subscript */
6498 (objobjargproc)0, /* mp_ass_subscript */
6501 static int
6502 unicode_buffer_getreadbuf(PyUnicodeObject *self,
6503 int index,
6504 const void **ptr)
6506 if (index != 0) {
6507 PyErr_SetString(PyExc_SystemError,
6508 "accessing non-existent unicode segment");
6509 return -1;
6511 *ptr = (void *) self->str;
6512 return PyUnicode_GET_DATA_SIZE(self);
6515 static int
6516 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6517 const void **ptr)
6519 PyErr_SetString(PyExc_TypeError,
6520 "cannot use unicode as modifiable buffer");
6521 return -1;
6524 static int
6525 unicode_buffer_getsegcount(PyUnicodeObject *self,
6526 int *lenp)
6528 if (lenp)
6529 *lenp = PyUnicode_GET_DATA_SIZE(self);
6530 return 1;
6533 static int
6534 unicode_buffer_getcharbuf(PyUnicodeObject *self,
6535 int index,
6536 const void **ptr)
6538 PyObject *str;
6540 if (index != 0) {
6541 PyErr_SetString(PyExc_SystemError,
6542 "accessing non-existent unicode segment");
6543 return -1;
6545 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
6546 if (str == NULL)
6547 return -1;
6548 *ptr = (void *) PyString_AS_STRING(str);
6549 return PyString_GET_SIZE(str);
6552 /* Helpers for PyUnicode_Format() */
6554 static PyObject *
6555 getnextarg(PyObject *args, int arglen, int *p_argidx)
6557 int argidx = *p_argidx;
6558 if (argidx < arglen) {
6559 (*p_argidx)++;
6560 if (arglen < 0)
6561 return args;
6562 else
6563 return PyTuple_GetItem(args, argidx);
6565 PyErr_SetString(PyExc_TypeError,
6566 "not enough arguments for format string");
6567 return NULL;
/* Bit flags recording which flag characters of a conversion specifier
   were seen by PyUnicode_Format(). */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
6576 static
6577 int usprintf(register Py_UNICODE *buffer, char *format, ...)
6579 register int i;
6580 int len;
6581 va_list va;
6582 char *charbuffer;
6583 va_start(va, format);
6585 /* First, format the string as char array, then expand to Py_UNICODE
6586 array. */
6587 charbuffer = (char *)buffer;
6588 len = vsprintf(charbuffer, format, va);
6589 for (i = len - 1; i >= 0; i--)
6590 buffer[i] = (Py_UNICODE) charbuffer[i];
6592 va_end(va);
6593 return len;
6596 /* XXX To save some code duplication, formatfloat/long/int could have been
6597 shared with stringobject.c, converting from 8-bit to Unicode after the
6598 formatting is done. */
6600 static int
6601 formatfloat(Py_UNICODE *buf,
6602 size_t buflen,
6603 int flags,
6604 int prec,
6605 int type,
6606 PyObject *v)
6608 /* fmt = '%#.' + `prec` + `type`
6609 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6610 char fmt[20];
6611 double x;
6613 x = PyFloat_AsDouble(v);
6614 if (x == -1.0 && PyErr_Occurred())
6615 return -1;
6616 if (prec < 0)
6617 prec = 6;
6618 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6619 type = 'g';
6620 /* Worst case length calc to ensure no buffer overrun:
6622 'g' formats:
6623 fmt = %#.<prec>g
6624 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6625 for any double rep.)
6626 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6628 'f' formats:
6629 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6630 len = 1 + 50 + 1 + prec = 52 + prec
6632 If prec=0 the effective precision is 1 (the leading digit is
6633 always given), therefore increase the length by one.
6636 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6637 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
6638 PyErr_SetString(PyExc_OverflowError,
6639 "formatted float is too long (precision too large?)");
6640 return -1;
6642 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6643 (flags&F_ALT) ? "#" : "",
6644 prec, type);
6645 return usprintf(buf, fmt, x);
6648 static PyObject*
6649 formatlong(PyObject *val, int flags, int prec, int type)
6651 char *buf;
6652 int i, len;
6653 PyObject *str; /* temporary string object. */
6654 PyUnicodeObject *result;
6656 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6657 if (!str)
6658 return NULL;
6659 result = _PyUnicode_New(len);
6660 for (i = 0; i < len; i++)
6661 result->str[i] = buf[i];
6662 result->str[len] = 0;
6663 Py_DECREF(str);
6664 return (PyObject*)result;
6667 static int
6668 formatint(Py_UNICODE *buf,
6669 size_t buflen,
6670 int flags,
6671 int prec,
6672 int type,
6673 PyObject *v)
6675 /* fmt = '%#.' + `prec` + 'l' + `type`
6676 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6677 * + 1 + 1
6678 * = 24
6680 char fmt[64]; /* plenty big enough! */
6681 char *sign;
6682 long x;
6684 x = PyInt_AsLong(v);
6685 if (x == -1 && PyErr_Occurred())
6686 return -1;
6687 if (x < 0 && type == 'u') {
6688 type = 'd';
6690 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6691 sign = "-";
6692 else
6693 sign = "";
6694 if (prec < 0)
6695 prec = 1;
6697 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6698 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
6700 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
6701 PyErr_SetString(PyExc_OverflowError,
6702 "formatted integer is too long (precision too large?)");
6703 return -1;
6706 if ((flags & F_ALT) &&
6707 (type == 'x' || type == 'X')) {
6708 /* When converting under %#x or %#X, there are a number
6709 * of issues that cause pain:
6710 * - when 0 is being converted, the C standard leaves off
6711 * the '0x' or '0X', which is inconsistent with other
6712 * %#x/%#X conversions and inconsistent with Python's
6713 * hex() function
6714 * - there are platforms that violate the standard and
6715 * convert 0 with the '0x' or '0X'
6716 * (Metrowerks, Compaq Tru64)
6717 * - there are platforms that give '0x' when converting
6718 * under %#X, but convert 0 in accordance with the
6719 * standard (OS/2 EMX)
6721 * We can achieve the desired consistency by inserting our
6722 * own '0x' or '0X' prefix, and substituting %x/%X in place
6723 * of %#x/%#X.
6725 * Note that this is the same approach as used in
6726 * formatint() in stringobject.c
6728 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6729 sign, type, prec, type);
6731 else {
6732 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6733 sign, (flags&F_ALT) ? "#" : "",
6734 prec, type);
6736 if (sign[0])
6737 return usprintf(buf, fmt, -x);
6738 else
6739 return usprintf(buf, fmt, x);
6742 static int
6743 formatchar(Py_UNICODE *buf,
6744 size_t buflen,
6745 PyObject *v)
6747 /* presume that the buffer is at least 2 characters long */
6748 if (PyUnicode_Check(v)) {
6749 if (PyUnicode_GET_SIZE(v) != 1)
6750 goto onError;
6751 buf[0] = PyUnicode_AS_UNICODE(v)[0];
6754 else if (PyString_Check(v)) {
6755 if (PyString_GET_SIZE(v) != 1)
6756 goto onError;
6757 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6760 else {
6761 /* Integer input truncated to a character */
6762 long x;
6763 x = PyInt_AsLong(v);
6764 if (x == -1 && PyErr_Occurred())
6765 goto onError;
6766 #ifdef Py_UNICODE_WIDE
6767 if (x < 0 || x > 0x10ffff) {
6768 PyErr_SetString(PyExc_OverflowError,
6769 "%c arg not in range(0x110000) "
6770 "(wide Python build)");
6771 return -1;
6773 #else
6774 if (x < 0 || x > 0xffff) {
6775 PyErr_SetString(PyExc_OverflowError,
6776 "%c arg not in range(0x10000) "
6777 "(narrow Python build)");
6778 return -1;
6780 #endif
6781 buf[0] = (Py_UNICODE) x;
6783 buf[1] = '\0';
6784 return 1;
6786 onError:
6787 PyErr_SetString(PyExc_TypeError,
6788 "%c requires int or char");
6789 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch buffer
   into which floats, ints and chars are formatted.  XXX The value is a
   magic number.  Every formatting routine checks bounds so that the
   buffer cannot overflow; allocating a buffer sized for each individual
   conversion might be a better solution, but the fixed buffer is
   sufficient for now.
*/
#define FORMATBUFLEN (size_t)120
6802 PyObject *PyUnicode_Format(PyObject *format,
6803 PyObject *args)
6805 Py_UNICODE *fmt, *res;
6806 int fmtcnt, rescnt, reslen, arglen, argidx;
6807 int args_owned = 0;
6808 PyUnicodeObject *result = NULL;
6809 PyObject *dict = NULL;
6810 PyObject *uformat;
6812 if (format == NULL || args == NULL) {
6813 PyErr_BadInternalCall();
6814 return NULL;
6816 uformat = PyUnicode_FromObject(format);
6817 if (uformat == NULL)
6818 return NULL;
6819 fmt = PyUnicode_AS_UNICODE(uformat);
6820 fmtcnt = PyUnicode_GET_SIZE(uformat);
6822 reslen = rescnt = fmtcnt + 100;
6823 result = _PyUnicode_New(reslen);
6824 if (result == NULL)
6825 goto onError;
6826 res = PyUnicode_AS_UNICODE(result);
6828 if (PyTuple_Check(args)) {
6829 arglen = PyTuple_Size(args);
6830 argidx = 0;
6832 else {
6833 arglen = -1;
6834 argidx = -2;
6836 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6837 !PyObject_TypeCheck(args, &PyBaseString_Type))
6838 dict = args;
6840 while (--fmtcnt >= 0) {
6841 if (*fmt != '%') {
6842 if (--rescnt < 0) {
6843 rescnt = fmtcnt + 100;
6844 reslen += rescnt;
6845 if (_PyUnicode_Resize(&result, reslen) < 0)
6846 return NULL;
6847 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6848 --rescnt;
6850 *res++ = *fmt++;
6852 else {
6853 /* Got a format specifier */
6854 int flags = 0;
6855 int width = -1;
6856 int prec = -1;
6857 Py_UNICODE c = '\0';
6858 Py_UNICODE fill;
6859 PyObject *v = NULL;
6860 PyObject *temp = NULL;
6861 Py_UNICODE *pbuf;
6862 Py_UNICODE sign;
6863 int len;
6864 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6866 fmt++;
6867 if (*fmt == '(') {
6868 Py_UNICODE *keystart;
6869 int keylen;
6870 PyObject *key;
6871 int pcount = 1;
6873 if (dict == NULL) {
6874 PyErr_SetString(PyExc_TypeError,
6875 "format requires a mapping");
6876 goto onError;
6878 ++fmt;
6879 --fmtcnt;
6880 keystart = fmt;
6881 /* Skip over balanced parentheses */
6882 while (pcount > 0 && --fmtcnt >= 0) {
6883 if (*fmt == ')')
6884 --pcount;
6885 else if (*fmt == '(')
6886 ++pcount;
6887 fmt++;
6889 keylen = fmt - keystart - 1;
6890 if (fmtcnt < 0 || pcount > 0) {
6891 PyErr_SetString(PyExc_ValueError,
6892 "incomplete format key");
6893 goto onError;
6895 #if 0
6896 /* keys are converted to strings using UTF-8 and
6897 then looked up since Python uses strings to hold
6898 variables names etc. in its namespaces and we
6899 wouldn't want to break common idioms. */
6900 key = PyUnicode_EncodeUTF8(keystart,
6901 keylen,
6902 NULL);
6903 #else
6904 key = PyUnicode_FromUnicode(keystart, keylen);
6905 #endif
6906 if (key == NULL)
6907 goto onError;
6908 if (args_owned) {
6909 Py_DECREF(args);
6910 args_owned = 0;
6912 args = PyObject_GetItem(dict, key);
6913 Py_DECREF(key);
6914 if (args == NULL) {
6915 goto onError;
6917 args_owned = 1;
6918 arglen = -1;
6919 argidx = -2;
6921 while (--fmtcnt >= 0) {
6922 switch (c = *fmt++) {
6923 case '-': flags |= F_LJUST; continue;
6924 case '+': flags |= F_SIGN; continue;
6925 case ' ': flags |= F_BLANK; continue;
6926 case '#': flags |= F_ALT; continue;
6927 case '0': flags |= F_ZERO; continue;
6929 break;
6931 if (c == '*') {
6932 v = getnextarg(args, arglen, &argidx);
6933 if (v == NULL)
6934 goto onError;
6935 if (!PyInt_Check(v)) {
6936 PyErr_SetString(PyExc_TypeError,
6937 "* wants int");
6938 goto onError;
6940 width = PyInt_AsLong(v);
6941 if (width < 0) {
6942 flags |= F_LJUST;
6943 width = -width;
6945 if (--fmtcnt >= 0)
6946 c = *fmt++;
6948 else if (c >= '0' && c <= '9') {
6949 width = c - '0';
6950 while (--fmtcnt >= 0) {
6951 c = *fmt++;
6952 if (c < '0' || c > '9')
6953 break;
6954 if ((width*10) / 10 != width) {
6955 PyErr_SetString(PyExc_ValueError,
6956 "width too big");
6957 goto onError;
6959 width = width*10 + (c - '0');
6962 if (c == '.') {
6963 prec = 0;
6964 if (--fmtcnt >= 0)
6965 c = *fmt++;
6966 if (c == '*') {
6967 v = getnextarg(args, arglen, &argidx);
6968 if (v == NULL)
6969 goto onError;
6970 if (!PyInt_Check(v)) {
6971 PyErr_SetString(PyExc_TypeError,
6972 "* wants int");
6973 goto onError;
6975 prec = PyInt_AsLong(v);
6976 if (prec < 0)
6977 prec = 0;
6978 if (--fmtcnt >= 0)
6979 c = *fmt++;
6981 else if (c >= '0' && c <= '9') {
6982 prec = c - '0';
6983 while (--fmtcnt >= 0) {
6984 c = Py_CHARMASK(*fmt++);
6985 if (c < '0' || c > '9')
6986 break;
6987 if ((prec*10) / 10 != prec) {
6988 PyErr_SetString(PyExc_ValueError,
6989 "prec too big");
6990 goto onError;
6992 prec = prec*10 + (c - '0');
6995 } /* prec */
6996 if (fmtcnt >= 0) {
6997 if (c == 'h' || c == 'l' || c == 'L') {
6998 if (--fmtcnt >= 0)
6999 c = *fmt++;
7002 if (fmtcnt < 0) {
7003 PyErr_SetString(PyExc_ValueError,
7004 "incomplete format");
7005 goto onError;
7007 if (c != '%') {
7008 v = getnextarg(args, arglen, &argidx);
7009 if (v == NULL)
7010 goto onError;
7012 sign = 0;
7013 fill = ' ';
7014 switch (c) {
7016 case '%':
7017 pbuf = formatbuf;
7018 /* presume that buffer length is at least 1 */
7019 pbuf[0] = '%';
7020 len = 1;
7021 break;
7023 case 's':
7024 case 'r':
7025 if (PyUnicode_Check(v) && c == 's') {
7026 temp = v;
7027 Py_INCREF(temp);
7029 else {
7030 PyObject *unicode;
7031 if (c == 's')
7032 temp = PyObject_Unicode(v);
7033 else
7034 temp = PyObject_Repr(v);
7035 if (temp == NULL)
7036 goto onError;
7037 if (PyUnicode_Check(temp))
7038 /* nothing to do */;
7039 else if (PyString_Check(temp)) {
7040 /* convert to string to Unicode */
7041 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7042 PyString_GET_SIZE(temp),
7043 NULL,
7044 "strict");
7045 Py_DECREF(temp);
7046 temp = unicode;
7047 if (temp == NULL)
7048 goto onError;
7050 else {
7051 Py_DECREF(temp);
7052 PyErr_SetString(PyExc_TypeError,
7053 "%s argument has non-string str()");
7054 goto onError;
7057 pbuf = PyUnicode_AS_UNICODE(temp);
7058 len = PyUnicode_GET_SIZE(temp);
7059 if (prec >= 0 && len > prec)
7060 len = prec;
7061 break;
7063 case 'i':
7064 case 'd':
7065 case 'u':
7066 case 'o':
7067 case 'x':
7068 case 'X':
7069 if (c == 'i')
7070 c = 'd';
7071 if (PyLong_Check(v)) {
7072 temp = formatlong(v, flags, prec, c);
7073 if (!temp)
7074 goto onError;
7075 pbuf = PyUnicode_AS_UNICODE(temp);
7076 len = PyUnicode_GET_SIZE(temp);
7077 sign = 1;
7079 else {
7080 pbuf = formatbuf;
7081 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7082 flags, prec, c, v);
7083 if (len < 0)
7084 goto onError;
7085 sign = 1;
7087 if (flags & F_ZERO)
7088 fill = '0';
7089 break;
7091 case 'e':
7092 case 'E':
7093 case 'f':
7094 case 'F':
7095 case 'g':
7096 case 'G':
7097 if (c == 'F')
7098 c = 'f';
7099 pbuf = formatbuf;
7100 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7101 flags, prec, c, v);
7102 if (len < 0)
7103 goto onError;
7104 sign = 1;
7105 if (flags & F_ZERO)
7106 fill = '0';
7107 break;
7109 case 'c':
7110 pbuf = formatbuf;
7111 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7112 if (len < 0)
7113 goto onError;
7114 break;
7116 default:
7117 PyErr_Format(PyExc_ValueError,
7118 "unsupported format character '%c' (0x%x) "
7119 "at index %i",
7120 (31<=c && c<=126) ? (char)c : '?',
7121 (int)c,
7122 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
7123 goto onError;
7125 if (sign) {
7126 if (*pbuf == '-' || *pbuf == '+') {
7127 sign = *pbuf++;
7128 len--;
7130 else if (flags & F_SIGN)
7131 sign = '+';
7132 else if (flags & F_BLANK)
7133 sign = ' ';
7134 else
7135 sign = 0;
7137 if (width < len)
7138 width = len;
7139 if (rescnt - (sign != 0) < width) {
7140 reslen -= rescnt;
7141 rescnt = width + fmtcnt + 100;
7142 reslen += rescnt;
7143 if (reslen < 0) {
7144 Py_DECREF(result);
7145 return PyErr_NoMemory();
7147 if (_PyUnicode_Resize(&result, reslen) < 0)
7148 return NULL;
7149 res = PyUnicode_AS_UNICODE(result)
7150 + reslen - rescnt;
7152 if (sign) {
7153 if (fill != ' ')
7154 *res++ = sign;
7155 rescnt--;
7156 if (width > len)
7157 width--;
7159 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7160 assert(pbuf[0] == '0');
7161 assert(pbuf[1] == c);
7162 if (fill != ' ') {
7163 *res++ = *pbuf++;
7164 *res++ = *pbuf++;
7166 rescnt -= 2;
7167 width -= 2;
7168 if (width < 0)
7169 width = 0;
7170 len -= 2;
7172 if (width > len && !(flags & F_LJUST)) {
7173 do {
7174 --rescnt;
7175 *res++ = fill;
7176 } while (--width > len);
7178 if (fill == ' ') {
7179 if (sign)
7180 *res++ = sign;
7181 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7182 assert(pbuf[0] == '0');
7183 assert(pbuf[1] == c);
7184 *res++ = *pbuf++;
7185 *res++ = *pbuf++;
7188 Py_UNICODE_COPY(res, pbuf, len);
7189 res += len;
7190 rescnt -= len;
7191 while (--width >= len) {
7192 --rescnt;
7193 *res++ = ' ';
7195 if (dict && (argidx < arglen) && c != '%') {
7196 PyErr_SetString(PyExc_TypeError,
7197 "not all arguments converted during string formatting");
7198 goto onError;
7200 Py_XDECREF(temp);
7201 } /* '%' */
7202 } /* until end */
7203 if (argidx < arglen && !dict) {
7204 PyErr_SetString(PyExc_TypeError,
7205 "not all arguments converted during string formatting");
7206 goto onError;
7209 if (args_owned) {
7210 Py_DECREF(args);
7212 Py_DECREF(uformat);
7213 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7214 goto onError;
7215 return (PyObject *)result;
7217 onError:
7218 Py_XDECREF(result);
7219 Py_DECREF(uformat);
7220 if (args_owned) {
7221 Py_DECREF(args);
7223 return NULL;
7226 static PyBufferProcs unicode_as_buffer = {
7227 (getreadbufferproc) unicode_buffer_getreadbuf,
7228 (getwritebufferproc) unicode_buffer_getwritebuf,
7229 (getsegcountproc) unicode_buffer_getsegcount,
7230 (getcharbufferproc) unicode_buffer_getcharbuf,
7233 static PyObject *
7234 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7236 static PyObject *
7237 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7239 PyObject *x = NULL;
7240 static char *kwlist[] = {"string", "encoding", "errors", 0};
7241 char *encoding = NULL;
7242 char *errors = NULL;
7244 if (type != &PyUnicode_Type)
7245 return unicode_subtype_new(type, args, kwds);
7246 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7247 kwlist, &x, &encoding, &errors))
7248 return NULL;
7249 if (x == NULL)
7250 return (PyObject *)_PyUnicode_New(0);
7251 if (encoding == NULL && errors == NULL)
7252 return PyObject_Unicode(x);
7253 else
7254 return PyUnicode_FromEncodedObject(x, encoding, errors);
7257 static PyObject *
7258 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7260 PyUnicodeObject *tmp, *pnew;
7261 int n;
7263 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7264 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7265 if (tmp == NULL)
7266 return NULL;
7267 assert(PyUnicode_Check(tmp));
7268 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7269 if (pnew == NULL) {
7270 Py_DECREF(tmp);
7271 return NULL;
7273 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7274 if (pnew->str == NULL) {
7275 _Py_ForgetReference((PyObject *)pnew);
7276 PyObject_Del(pnew);
7277 Py_DECREF(tmp);
7278 return PyErr_NoMemory();
7280 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7281 pnew->length = n;
7282 pnew->hash = tmp->hash;
7283 Py_DECREF(tmp);
7284 return (PyObject *)pnew;
7287 PyDoc_STRVAR(unicode_doc,
7288 "unicode(string [, encoding[, errors]]) -> object\n\
7290 Create a new Unicode object from the given encoded string.\n\
7291 encoding defaults to the current default string encoding.\n\
7292 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7294 PyTypeObject PyUnicode_Type = {
7295 PyObject_HEAD_INIT(&PyType_Type)
7296 0, /* ob_size */
7297 "unicode", /* tp_name */
7298 sizeof(PyUnicodeObject), /* tp_size */
7299 0, /* tp_itemsize */
7300 /* Slots */
7301 (destructor)unicode_dealloc, /* tp_dealloc */
7302 0, /* tp_print */
7303 0, /* tp_getattr */
7304 0, /* tp_setattr */
7305 (cmpfunc) unicode_compare, /* tp_compare */
7306 (reprfunc) unicode_repr, /* tp_repr */
7307 &unicode_as_number, /* tp_as_number */
7308 &unicode_as_sequence, /* tp_as_sequence */
7309 &unicode_as_mapping, /* tp_as_mapping */
7310 (hashfunc) unicode_hash, /* tp_hash*/
7311 0, /* tp_call*/
7312 (reprfunc) unicode_str, /* tp_str */
7313 PyObject_GenericGetAttr, /* tp_getattro */
7314 0, /* tp_setattro */
7315 &unicode_as_buffer, /* tp_as_buffer */
7316 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7317 Py_TPFLAGS_BASETYPE, /* tp_flags */
7318 unicode_doc, /* tp_doc */
7319 0, /* tp_traverse */
7320 0, /* tp_clear */
7321 0, /* tp_richcompare */
7322 0, /* tp_weaklistoffset */
7323 0, /* tp_iter */
7324 0, /* tp_iternext */
7325 unicode_methods, /* tp_methods */
7326 0, /* tp_members */
7327 0, /* tp_getset */
7328 &PyBaseString_Type, /* tp_base */
7329 0, /* tp_dict */
7330 0, /* tp_descr_get */
7331 0, /* tp_descr_set */
7332 0, /* tp_dictoffset */
7333 0, /* tp_init */
7334 0, /* tp_alloc */
7335 unicode_new, /* tp_new */
7336 PyObject_Del, /* tp_free */
7339 /* Initialize the Unicode implementation */
7341 void _PyUnicode_Init(void)
7343 int i;
7345 /* Init the implementation */
7346 unicode_freelist = NULL;
7347 unicode_freelist_size = 0;
7348 unicode_empty = _PyUnicode_New(0);
7349 strcpy(unicode_default_encoding, "ascii");
7350 for (i = 0; i < 256; i++)
7351 unicode_latin1[i] = NULL;
7352 if (PyType_Ready(&PyUnicode_Type) < 0)
7353 Py_FatalError("Can't initialize 'unicode'");
7356 /* Finalize the Unicode implementation */
7358 void
7359 _PyUnicode_Fini(void)
7361 PyUnicodeObject *u;
7362 int i;
7364 Py_XDECREF(unicode_empty);
7365 unicode_empty = NULL;
7367 for (i = 0; i < 256; i++) {
7368 if (unicode_latin1[i]) {
7369 Py_DECREF(unicode_latin1[i]);
7370 unicode_latin1[i] = NULL;
7374 for (u = unicode_freelist; u != NULL;) {
7375 PyUnicodeObject *v = u;
7376 u = *(PyUnicodeObject **)u;
7377 if (v->str)
7378 PyMem_DEL(v->str);
7379 Py_XDECREF(v->defenc);
7380 PyObject_Del(v);
7382 unicode_freelist = NULL;
7383 unicode_freelist_size = 0;
7387 Local variables:
7388 c-basic-offset: 4
7389 indent-tabs-mode: nil
7390 End: