Patch #1635058 by Mark Roberts: ensure that htonl and friends never accept or
[python.git] / Objects / unicodeobject.c
blob290e8dfb4759cd6f8b22f4969045ee1eaf3b1e36
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define MAX_UNICODE_FREELIST_SIZE 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *unicode_freelist;
97 static int unicode_freelist_size;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
115 Py_UNICODE
116 PyUnicode_GetMax(void)
118 #ifdef Py_UNICODE_WIDE
119 return 0x10FFFF;
120 #else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124 #endif
127 /* --- Bloom Filters ----------------------------------------------------- */
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
133 /* the linebreak mask is set up by Unicode_Init below */
/* Type of a bloom bitmask.  The bit index is always (ch & 0x1F), so
   only bits 0..31 are ever set or tested. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of ch is set in
   mask.  The shifted constant is unsigned long so that selecting bit 31
   never shifts a 1 into the sign bit of a plain int (undefined
   behaviour); mask is parenthesized so expression arguments expand
   safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-check against bloom_linebreak, followed by the exact
   Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
144 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
146 /* calculate simple bloom-style bitmask for a given unicode string */
148 long mask;
149 Py_ssize_t i;
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
155 return mask;
158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
160 Py_ssize_t i;
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
166 return 0;
/* True when chr passes the bloom pre-check against mask and is really
   present in the first setlen characters of set.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
172 /* --- Unicode Object ----------------------------------------------------- */
174 static
175 int unicode_resize(register PyUnicodeObject *unicode,
176 Py_ssize_t length)
178 void *oldstr;
180 /* Shortcut if there's nothing much to do. */
181 if (unicode->length == length)
182 goto reset;
184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
191 unicode_latin1[unicode->str[0]] == unicode)) {
192 PyErr_SetString(PyExc_SystemError,
193 "can't resize shared unicode objects");
194 return -1;
197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
200 it contains). */
202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
205 unicode->str = (Py_UNICODE *)oldstr;
206 PyErr_NoMemory();
207 return -1;
209 unicode->str[length] = 0;
210 unicode->length = length;
212 reset:
213 /* Reset the object caches */
214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
218 unicode->hash = -1;
220 return 0;
223 /* We allocate one more byte to make sure the string is
224 Ux0000 terminated -- XXX is this needed ?
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
231 static
232 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
234 register PyUnicodeObject *unicode;
236 /* Optimization for empty strings */
237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
245 unicode_freelist = *(PyUnicodeObject **)unicode;
246 unicode_freelist_size--;
247 if (unicode->str) {
248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
250 if ((unicode->length < length) &&
251 unicode_resize(unicode, length) < 0) {
252 PyMem_DEL(unicode->str);
253 goto onError;
256 else {
257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
259 PyObject_INIT(unicode, &PyUnicode_Type);
261 else {
262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
268 if (!unicode->str) {
269 PyErr_NoMemory();
270 goto onError;
272 /* Initialize the first element to guard against cases where
273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
279 unicode->str[0] = 0;
280 unicode->str[length] = 0;
281 unicode->length = length;
282 unicode->hash = -1;
283 unicode->defenc = NULL;
284 return unicode;
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
288 PyObject_Del(unicode);
289 return NULL;
292 static
293 void unicode_dealloc(register PyUnicodeObject *unicode)
295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
299 PyMem_DEL(unicode->str);
300 unicode->str = NULL;
301 unicode->length = 0;
303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
307 /* Add to free list */
308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
312 else {
313 PyMem_DEL(unicode->str);
314 Py_XDECREF(unicode->defenc);
315 unicode->ob_type->tp_free((PyObject *)unicode);
319 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
321 register PyUnicodeObject *v;
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
328 v = (PyUnicodeObject *)*unicode;
329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
330 PyErr_BadInternalCall();
331 return -1;
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
337 if (v->length != length &&
338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
344 Py_DECREF(*unicode);
345 *unicode = (PyObject *)w;
346 return 0;
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
354 /* Internal API for use in unicodeobject.c only ! */
355 #define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
358 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
359 Py_ssize_t size)
361 PyUnicodeObject *unicode;
363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
379 if (!unicode)
380 return NULL;
381 unicode->str[0] = *u;
382 unicode_latin1[*u] = unicode;
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
395 Py_UNICODE_COPY(unicode->str, u, size);
397 return (PyObject *)unicode;
400 #ifdef HAVE_WCHAR_H
402 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
403 Py_ssize_t size)
405 PyUnicodeObject *unicode;
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
416 /* Copy the wchar_t data into the new object */
417 #ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
419 #else
421 register Py_UNICODE *u;
422 register Py_ssize_t i;
423 u = PyUnicode_AS_UNICODE(unicode);
424 for (i = size; i > 0; i--)
425 *u++ = *w++;
427 #endif
429 return (PyObject *)unicode;
432 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
441 /* If possible, try to copy the 0-termination as well */
442 if (size > PyUnicode_GET_SIZE(unicode))
443 size = PyUnicode_GET_SIZE(unicode) + 1;
445 #ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447 #else
449 register Py_UNICODE *u;
450 register Py_ssize_t i;
451 u = PyUnicode_AS_UNICODE(unicode);
452 for (i = size; i > 0; i--)
453 *w++ = *u++;
455 #endif
457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
460 return size;
463 #endif
465 PyObject *PyUnicode_FromOrdinal(int ordinal)
467 Py_UNICODE s[1];
469 #ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
476 #else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
483 #endif
485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
489 PyObject *PyUnicode_FromObject(register PyObject *obj)
491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
506 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
510 const char *s = NULL;
511 Py_ssize_t len;
512 PyObject *v;
514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
519 #if 0
520 /* For b/w compatibility we also accept Unicode objects provided
521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
533 return NULL;
535 return PyObject_Unicode(obj);
537 #else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
543 #endif
545 /* Coerce object */
546 if (PyString_Check(obj)) {
547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
554 PyErr_Format(PyExc_TypeError,
555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
557 obj->ob_type->tp_name);
558 goto onError;
561 /* Convert to Unicode */
562 if (len == 0) {
563 Py_INCREF(unicode_empty);
564 v = (PyObject *)unicode_empty;
566 else
567 v = PyUnicode_Decode(s, len, encoding, errors);
569 return v;
571 onError:
572 return NULL;
575 PyObject *PyUnicode_Decode(const char *s,
576 Py_ssize_t size,
577 const char *encoding,
578 const char *errors)
580 PyObject *buffer = NULL, *unicode;
582 if (encoding == NULL)
583 encoding = PyUnicode_GetDefaultEncoding();
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
587 return PyUnicode_DecodeUTF8(s, size, errors);
588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593 #endif
594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
606 "decoder did not return an unicode object (type=%.400s)",
607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
611 Py_DECREF(buffer);
612 return unicode;
614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
619 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
623 PyObject *v;
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
639 onError:
640 return NULL;
643 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
644 Py_ssize_t size,
645 const char *encoding,
646 const char *errors)
648 PyObject *v, *unicode;
650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
658 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
662 PyObject *v;
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
678 onError:
679 return NULL;
682 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
686 PyObject *v;
688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
693 if (encoding == NULL)
694 encoding = PyUnicode_GetDefaultEncoding();
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
699 return PyUnicode_AsUTF8String(unicode);
700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705 #endif
706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
716 "encoder did not return a string object (type=%.400s)",
717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
721 return v;
723 onError:
724 return NULL;
727 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
740 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
746 return PyUnicode_AS_UNICODE(unicode);
748 onError:
749 return NULL;
752 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
758 return PyUnicode_GET_SIZE(unicode);
760 onError:
761 return -1;
764 const char *PyUnicode_GetDefaultEncoding(void)
766 return unicode_default_encoding;
769 int PyUnicode_SetDefaultEncoding(const char *encoding)
771 PyObject *v;
773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
780 encoding,
781 sizeof(unicode_default_encoding));
782 return 0;
784 onError:
785 return -1;
788 /* error handling callback helper:
789 build arguments, call the callback and check the arguments,
790 if no exception occurred, copy the replacement to the output
791 and adjust various state variables.
792 return 0 on success, -1 on error
795 static
796 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
808 Py_UNICODE *repptr;
809 Py_ssize_t repsize;
810 int res = -1;
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
846 goto onError;
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
859 if (PyUnicode_Resize(output, requiredsize) < 0)
860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
871 onError:
872 Py_XDECREF(restuple);
873 return res;
876 /* --- UTF-7 Codec -------------------------------------------------------- */
878 /* see RFC2152 for details */
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a character of the base64 alphabet.  Written as
   explicit range tests instead of isalnum(): the argument may be a
   Py_UNICODE well outside the unsigned char range, for which isalnum()
   is undefined, and isalnum() is locale dependent.  The argument is
   evaluated more than once, so pass a plain variable. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Map a base64 character back to its six-bit value. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least six bits are
   pending in ch; bits is reduced accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least 16 bits are
   pending in ch.  Relies on the expanding function providing an errmsg
   variable and a utf7Error label.

   When the surrogate flag is already set, an error has been generated
   for the first half, so the next unit is dropped without checking
   it.  A unit in 0xDC00..0xDFFF sets the flag and jumps to utf7Error,
   because a surrogate pair can't be represented in a 16-bit
   character. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
942 PyObject *PyUnicode_DecodeUTF7(const char *s,
943 Py_ssize_t size,
944 const char *errors)
946 const char *starts = s;
947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
967 p = unicode->str;
968 e = s + size;
970 while (s < e) {
971 Py_UNICODE ch;
972 restart:
973 ch = *s;
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
986 errmsg = "partial character in shift sequence";
987 goto utf7Error;
989 /* According to RFC2152 the remaining bits should be zero. We
990 choose to signal an error/insert a replacement character
991 here so indicate the potential of a misencoded character. */
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
996 goto utf7Error;
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
1001 *p++ = '-';
1002 inShift = 1;
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
1006 goto utf7Error;
1007 } else {
1008 *p++ = ch;
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1017 else if ( ch == '+' ) {
1018 startinpos = s-starts;
1019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1025 inShift = 1;
1026 bitsleft = 0;
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
1032 goto utf7Error;
1034 else {
1035 *p++ = ch;
1036 s++;
1038 continue;
1039 utf7Error:
1040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
1050 if (inShift) {
1051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
1058 goto onError;
1059 if (s < e)
1060 goto restart;
1063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1064 goto onError;
1066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
1068 return (PyObject *)unicode;
1070 onError:
1071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
1073 Py_DECREF(unicode);
1074 return NULL;
1078 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1079 Py_ssize_t size,
1080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
1086 Py_ssize_t cbAllocated = 5 * size;
1087 int inShift = 0;
1088 Py_ssize_t i = 0;
1089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1105 if (!inShift) {
1106 if (ch == '+') {
1107 *out++ = '+';
1108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
1113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1114 inShift = bitsleft > 0;
1115 } else {
1116 *out++ = (char) ch;
1118 } else {
1119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1135 /* If the next character is special then we dont' need to terminate
1136 the shift sequence. If the next character is not a BASE64 character
1137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1167 _PyString_Resize(&v, out - start);
1168 return v;
1171 #undef SPECIAL
1172 #undef B64
1173 #undef B64CHAR
1174 #undef UB64
1175 #undef ENCODE
1176 #undef DECODE
1178 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Sequence length for each possible UTF-8 lead byte.  The table is only
   ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1202 PyObject *PyUnicode_DecodeUTF8(const char *s,
1203 Py_ssize_t size,
1204 const char *errors)
1206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1209 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1210 Py_ssize_t size,
1211 const char *errors,
1212 Py_ssize_t *consumed)
1214 const char *starts = s;
1215 int n;
1216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
1219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
1222 const char *errmsg = "";
1223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
1231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
1234 return (PyObject *)unicode;
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1241 while (s < e) {
1242 Py_UCS4 ch = (unsigned char)*s;
1244 if (ch < 0x80) {
1245 *p++ = (Py_UNICODE)ch;
1246 s++;
1247 continue;
1250 n = utf8_code_length[ch];
1252 if (s + n > e) {
1253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1263 switch (n) {
1265 case 0:
1266 errmsg = "unexpected code byte";
1267 startinpos = s-starts;
1268 endinpos = startinpos+1;
1269 goto utf8Error;
1271 case 1:
1272 errmsg = "internal error";
1273 startinpos = s-starts;
1274 endinpos = startinpos+1;
1275 goto utf8Error;
1277 case 2:
1278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
1280 startinpos = s-starts;
1281 endinpos = startinpos+2;
1282 goto utf8Error;
1284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1285 if (ch < 0x80) {
1286 startinpos = s-starts;
1287 endinpos = startinpos+2;
1288 errmsg = "illegal encoding";
1289 goto utf8Error;
1291 else
1292 *p++ = (Py_UNICODE)ch;
1293 break;
1295 case 3:
1296 if ((s[1] & 0xc0) != 0x80 ||
1297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
1299 startinpos = s-starts;
1300 endinpos = startinpos+3;
1301 goto utf8Error;
1303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
1306 legal UTF-8 sequences;
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1312 errmsg = "illegal encoding";
1313 startinpos = s-starts;
1314 endinpos = startinpos+3;
1315 goto utf8Error;
1317 else
1318 *p++ = (Py_UNICODE)ch;
1319 break;
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
1324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
1326 startinpos = s-starts;
1327 endinpos = startinpos+4;
1328 goto utf8Error;
1330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
1333 if ((ch < 0x10000) /* minimum value allowed for 4
1334 byte encoding */
1335 || (ch > 0x10ffff)) /* maximum value allowed for
1336 UTF-16 */
1338 errmsg = "illegal encoding";
1339 startinpos = s-starts;
1340 endinpos = startinpos+4;
1341 goto utf8Error;
1343 #ifdef Py_UNICODE_WIDE
1344 *p++ = (Py_UNICODE)ch;
1345 #else
1346 /* compute and append the two surrogates: */
1348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
1351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1354 /* low surrogate = bottom 10 bits added to DC00 */
1355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1356 #endif
1357 break;
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
1361 errmsg = "unsupported Unicode code range";
1362 startinpos = s-starts;
1363 endinpos = startinpos+n;
1364 goto utf8Error;
1366 s += n;
1367 continue;
1369 utf8Error:
1370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
1378 if (consumed)
1379 *consumed = s-starts;
1381 /* Adjust length */
1382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1383 goto onError;
1385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
1387 return (PyObject *)unicode;
1389 onError:
1390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
1392 Py_DECREF(unicode);
1393 return NULL;
1396 /* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
1401 PyObject *
1402 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1403 Py_ssize_t size,
1404 const char *errors)
1406 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1408 Py_ssize_t i; /* index into s of next input byte */
1409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
1411 Py_ssize_t nallocated; /* number of result bytes allocated */
1412 Py_ssize_t nneeded; /* number of result bytes needed */
1413 char stackbuf[MAX_SHORT_UNICHARS * 4];
1415 assert(s != NULL);
1416 assert(size >= 0);
1418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1438 for (i = 0; i < size;) {
1439 Py_UCS4 ch = s[i++];
1441 if (ch < 0x80)
1442 /* Encode ASCII */
1443 *p++ = (char) ch;
1445 else if (ch < 0x0800) {
1446 /* Encode Latin-1 */
1447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
1450 else {
1451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1460 i++;
1461 goto encodeUCS4;
1463 /* Fall through: handles isolated high surrogates */
1465 *p++ = (char)(0xe0 | (ch >> 12));
1466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1470 encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1479 if (v == NULL) {
1480 /* This was stack allocated. */
1481 nneeded = p - stackbuf;
1482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1485 else {
1486 /* Cut back to size actually needed. */
1487 nneeded = p - PyString_AS_STRING(v);
1488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1491 return v;
1493 #undef MAX_SHORT_UNICHARS
1496 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
1507 /* --- UTF-16 Codec ------------------------------------------------------- */
1509 PyObject *
1510 PyUnicode_DecodeUTF16(const char *s,
1511 Py_ssize_t size,
1512 const char *errors,
1513 int *byteorder)
1515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1518 PyObject *
1519 PyUnicode_DecodeUTF16Stateful(const char *s,
1520 Py_ssize_t size,
1521 const char *errors,
1522 int *byteorder,
1523 Py_ssize_t *consumed)
1525 const char *starts = s;
1526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
1529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
1531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
1533 const char *errmsg = "";
1534 /* Offsets from q for retrieving byte pairs in the right order. */
1535 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537 #else
1538 int ihi = 0, ilo = 1;
1539 #endif
1540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
1553 q = (unsigned char *)s;
1554 e = q + size;
1556 if (byteorder)
1557 bo = *byteorder;
1559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
1564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1566 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1575 #else
1576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1584 #endif
1588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1599 while (q < e) {
1600 Py_UNICODE ch;
1601 /* remaining bytes at the end? (size should be even) */
1602 if (e-q<2) {
1603 if (consumed)
1604 break;
1605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1612 ch = (q[ihi] << 8) | q[ilo];
1614 q += 2;
1616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1621 /* UTF-16 code pair: */
1622 if (q >= e) {
1623 errmsg = "unexpected end of data";
1624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
1626 goto utf16Error;
1628 if (0xD800 <= ch && ch <= 0xDBFF) {
1629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
1631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1632 #ifndef Py_UNICODE_WIDE
1633 *p++ = ch;
1634 *p++ = ch2;
1635 #else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1637 #endif
1638 continue;
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
1642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
1644 goto utf16Error;
1648 errmsg = "illegal encoding";
1649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
1651 /* Fall through to report the error */
1653 utf16Error:
1654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
1660 goto onError;
1663 if (byteorder)
1664 *byteorder = bo;
1666 if (consumed)
1667 *consumed = (const char *)q-starts;
1669 /* Adjust length */
1670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1671 goto onError;
1673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
1675 return (PyObject *)unicode;
1677 onError:
1678 Py_DECREF(unicode);
1679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
1681 return NULL;
1684 PyObject *
1685 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1686 Py_ssize_t size,
1687 const char *errors,
1688 int byteorder)
1690 PyObject *v;
1691 unsigned char *p;
1692 #ifdef Py_UNICODE_WIDE
1693 int i, pairs;
1694 #else
1695 const int pairs = 0;
1696 #endif
1697 /* Offsets from p for storing byte pairs in the right order. */
1698 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700 #else
1701 int ihi = 0, ilo = 1;
1702 #endif
1704 #define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
1711 #ifdef Py_UNICODE_WIDE
1712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
1715 #endif
1716 v = PyString_FromStringAndSize(NULL,
1717 2 * (size + pairs + (byteorder == 0)));
1718 if (v == NULL)
1719 return NULL;
1721 p = (unsigned char *)PyString_AS_STRING(v);
1722 if (byteorder == 0)
1723 STORECHAR(0xFEFF);
1724 if (size == 0)
1725 return v;
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
1741 #ifdef Py_UNICODE_WIDE
1742 if (ch >= 0x10000) {
1743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
1746 #endif
1747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
1751 return v;
1752 #undef STORECHAR
1755 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1767 /* --- Unicode Escape Codec ----------------------------------------------- */
1769 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1771 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1772 Py_ssize_t size,
1773 const char *errors)
1775 const char *starts = s;
1776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
1779 int i;
1780 PyUnicodeObject *v;
1781 Py_UNICODE *p;
1782 const char *end;
1783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
1788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
1790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
1793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
1799 p = PyUnicode_AS_UNICODE(v);
1800 end = s + size;
1802 while (s < end) {
1803 unsigned char c;
1804 Py_UNICODE x;
1805 int digits;
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
1809 *p++ = (unsigned char) *s++;
1810 continue;
1813 startinpos = s-starts;
1814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
1834 x = s[-1] - '0';
1835 if ('0' <= *s && *s <= '7') {
1836 x = (x<<3) + *s++ - '0';
1837 if ('0' <= *s && *s <= '7')
1838 x = (x<<3) + *s++ - '0';
1840 *p++ = x;
1841 break;
1843 /* hex escapes */
1844 /* \xXX */
1845 case 'x':
1846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
1850 /* \uXXXX */
1851 case 'u':
1852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
1856 /* \UXXXXXXXX */
1857 case 'U':
1858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1873 for (i = 0; i < digits; ++i) {
1874 c = (unsigned char) s[i];
1875 if (!isxdigit(c)) {
1876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
1882 goto onError;
1883 goto nextByte;
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1893 s += i;
1894 if (chr == 0xffffffff && PyErr_Occurred())
1895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
1898 store:
1899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
1904 /* UCS-4 character. Either store directly, or as
1905 surrogate pair. */
1906 #ifdef Py_UNICODE_WIDE
1907 *p++ = chr;
1908 #else
1909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1912 #endif
1913 } else {
1914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
1921 goto onError;
1923 break;
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
1930 PyObject *m, *api;
1931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
1934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
1935 Py_DECREF(m);
1936 if (api == NULL)
1937 goto ucnhashError;
1938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
1939 Py_DECREF(api);
1940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
1952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
1953 goto store;
1956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
1963 goto onError;
1964 break;
1966 default:
1967 if (s > end) {
1968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
1977 goto onError;
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1983 break;
1985 nextByte:
1988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
1989 goto onError;
1990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
1992 return (PyObject *)v;
1994 ucnhashError:
1995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1999 Py_XDECREF(v);
2000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
2002 return NULL;
2004 onError:
2005 Py_XDECREF(v);
2006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
2008 return NULL;
2011 /* Return a Unicode-Escape string version of the Unicode object.
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate. */
2018 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019 Py_ssize_t size,
2020 Py_UNICODE ch)
2022 /* like wcschr, but doesn't stop at NULL characters */
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2030 return NULL;
2033 static
2034 PyObject *unicodeescape_string(const Py_UNICODE *s,
2035 Py_ssize_t size,
2036 int quotes)
2038 PyObject *repr;
2039 char *p;
2041 static const char *hexdigit = "0123456789abcdef";
2043 /* XXX(nnorwitz): rather than over-allocating, it would be
2044 better to choose a different scheme. Perhaps scan the
2045 first N-chars of the string and allocate based on that size.
2047 /* Initial allocation is based on the longest-possible unichr
2048 escape.
2050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2051 unichr, so in this case it's the longest unichr escape. In
2052 narrow (UTF-16) builds this is five chars per source unichr
2053 since there are two unichrs in the surrogate pair, so in narrow
2054 (UTF-16) builds it's not the longest unichr escape.
2056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2057 so in the narrow (UTF-16) build case it's the longest unichr
2058 escape.
2061 repr = PyString_FromStringAndSize(NULL,
2063 #ifdef Py_UNICODE_WIDE
2064 + 10*size
2065 #else
2066 + 6*size
2067 #endif
2068 + 1);
2069 if (repr == NULL)
2070 return NULL;
2072 p = PyString_AS_STRING(repr);
2074 if (quotes) {
2075 *p++ = 'u';
2076 *p++ = (findchar(s, size, '\'') &&
2077 !findchar(s, size, '"')) ? '"' : '\'';
2079 while (size-- > 0) {
2080 Py_UNICODE ch = *s++;
2082 /* Escape quotes and backslashes */
2083 if ((quotes &&
2084 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2085 *p++ = '\\';
2086 *p++ = (char) ch;
2087 continue;
2090 #ifdef Py_UNICODE_WIDE
2091 /* Map 21-bit characters to '\U00xxxxxx' */
2092 else if (ch >= 0x10000) {
2093 *p++ = '\\';
2094 *p++ = 'U';
2095 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2098 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2099 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2100 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2101 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2102 *p++ = hexdigit[ch & 0x0000000F];
2103 continue;
2105 #else
2106 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2107 else if (ch >= 0xD800 && ch < 0xDC00) {
2108 Py_UNICODE ch2;
2109 Py_UCS4 ucs;
2111 ch2 = *s++;
2112 size--;
2113 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2114 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2115 *p++ = '\\';
2116 *p++ = 'U';
2117 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2120 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2121 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2122 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2123 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2124 *p++ = hexdigit[ucs & 0x0000000F];
2125 continue;
2127 /* Fall through: isolated surrogates are copied as-is */
2128 s--;
2129 size++;
2131 #endif
2133 /* Map 16-bit characters to '\uxxxx' */
2134 if (ch >= 256) {
2135 *p++ = '\\';
2136 *p++ = 'u';
2137 *p++ = hexdigit[(ch >> 12) & 0x000F];
2138 *p++ = hexdigit[(ch >> 8) & 0x000F];
2139 *p++ = hexdigit[(ch >> 4) & 0x000F];
2140 *p++ = hexdigit[ch & 0x000F];
2143 /* Map special whitespace to '\t', \n', '\r' */
2144 else if (ch == '\t') {
2145 *p++ = '\\';
2146 *p++ = 't';
2148 else if (ch == '\n') {
2149 *p++ = '\\';
2150 *p++ = 'n';
2152 else if (ch == '\r') {
2153 *p++ = '\\';
2154 *p++ = 'r';
2157 /* Map non-printable US ASCII to '\xhh' */
2158 else if (ch < ' ' || ch >= 0x7F) {
2159 *p++ = '\\';
2160 *p++ = 'x';
2161 *p++ = hexdigit[(ch >> 4) & 0x000F];
2162 *p++ = hexdigit[ch & 0x000F];
2165 /* Copy everything else as-is */
2166 else
2167 *p++ = (char) ch;
2169 if (quotes)
2170 *p++ = PyString_AS_STRING(repr)[1];
2172 *p = '\0';
2173 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2174 return repr;
2177 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2178 Py_ssize_t size)
2180 return unicodeescape_string(s, size, 0);
2183 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2185 if (!PyUnicode_Check(unicode)) {
2186 PyErr_BadArgument();
2187 return NULL;
2189 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2190 PyUnicode_GET_SIZE(unicode));
2193 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2195 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2196 Py_ssize_t size,
2197 const char *errors)
2199 const char *starts = s;
2200 Py_ssize_t startinpos;
2201 Py_ssize_t endinpos;
2202 Py_ssize_t outpos;
2203 PyUnicodeObject *v;
2204 Py_UNICODE *p;
2205 const char *end;
2206 const char *bs;
2207 PyObject *errorHandler = NULL;
2208 PyObject *exc = NULL;
2210 /* Escaped strings will always be longer than the resulting
2211 Unicode string, so we start with size here and then reduce the
2212 length after conversion to the true value. (But decoding error
2213 handler might have to resize the string) */
2214 v = _PyUnicode_New(size);
2215 if (v == NULL)
2216 goto onError;
2217 if (size == 0)
2218 return (PyObject *)v;
2219 p = PyUnicode_AS_UNICODE(v);
2220 end = s + size;
2221 while (s < end) {
2222 unsigned char c;
2223 Py_UCS4 x;
2224 int i;
2225 int count;
2227 /* Non-escape characters are interpreted as Unicode ordinals */
2228 if (*s != '\\') {
2229 *p++ = (unsigned char)*s++;
2230 continue;
2232 startinpos = s-starts;
2234 /* \u-escapes are only interpreted iff the number of leading
2235 backslashes if odd */
2236 bs = s;
2237 for (;s < end;) {
2238 if (*s != '\\')
2239 break;
2240 *p++ = (unsigned char)*s++;
2242 if (((s - bs) & 1) == 0 ||
2243 s >= end ||
2244 (*s != 'u' && *s != 'U')) {
2245 continue;
2247 p--;
2248 count = *s=='u' ? 4 : 8;
2249 s++;
2251 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2252 outpos = p-PyUnicode_AS_UNICODE(v);
2253 for (x = 0, i = 0; i < count; ++i, ++s) {
2254 c = (unsigned char)*s;
2255 if (!isxdigit(c)) {
2256 endinpos = s-starts;
2257 if (unicode_decode_call_errorhandler(
2258 errors, &errorHandler,
2259 "rawunicodeescape", "truncated \\uXXXX",
2260 starts, size, &startinpos, &endinpos, &exc, &s,
2261 (PyObject **)&v, &outpos, &p))
2262 goto onError;
2263 goto nextByte;
2265 x = (x<<4) & ~0xF;
2266 if (c >= '0' && c <= '9')
2267 x += c - '0';
2268 else if (c >= 'a' && c <= 'f')
2269 x += 10 + c - 'a';
2270 else
2271 x += 10 + c - 'A';
2273 #ifndef Py_UNICODE_WIDE
2274 if (x > 0x10000) {
2275 if (unicode_decode_call_errorhandler(
2276 errors, &errorHandler,
2277 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2278 starts, size, &startinpos, &endinpos, &exc, &s,
2279 (PyObject **)&v, &outpos, &p))
2280 goto onError;
2282 #endif
2283 *p++ = x;
2284 nextByte:
2287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2288 goto onError;
2289 Py_XDECREF(errorHandler);
2290 Py_XDECREF(exc);
2291 return (PyObject *)v;
2293 onError:
2294 Py_XDECREF(v);
2295 Py_XDECREF(errorHandler);
2296 Py_XDECREF(exc);
2297 return NULL;
2300 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2301 Py_ssize_t size)
2303 PyObject *repr;
2304 char *p;
2305 char *q;
2307 static const char *hexdigit = "0123456789abcdef";
2309 #ifdef Py_UNICODE_WIDE
2310 repr = PyString_FromStringAndSize(NULL, 10 * size);
2311 #else
2312 repr = PyString_FromStringAndSize(NULL, 6 * size);
2313 #endif
2314 if (repr == NULL)
2315 return NULL;
2316 if (size == 0)
2317 return repr;
2319 p = q = PyString_AS_STRING(repr);
2320 while (size-- > 0) {
2321 Py_UNICODE ch = *s++;
2322 #ifdef Py_UNICODE_WIDE
2323 /* Map 32-bit characters to '\Uxxxxxxxx' */
2324 if (ch >= 0x10000) {
2325 *p++ = '\\';
2326 *p++ = 'U';
2327 *p++ = hexdigit[(ch >> 28) & 0xf];
2328 *p++ = hexdigit[(ch >> 24) & 0xf];
2329 *p++ = hexdigit[(ch >> 20) & 0xf];
2330 *p++ = hexdigit[(ch >> 16) & 0xf];
2331 *p++ = hexdigit[(ch >> 12) & 0xf];
2332 *p++ = hexdigit[(ch >> 8) & 0xf];
2333 *p++ = hexdigit[(ch >> 4) & 0xf];
2334 *p++ = hexdigit[ch & 15];
2336 else
2337 #endif
2338 /* Map 16-bit characters to '\uxxxx' */
2339 if (ch >= 256) {
2340 *p++ = '\\';
2341 *p++ = 'u';
2342 *p++ = hexdigit[(ch >> 12) & 0xf];
2343 *p++ = hexdigit[(ch >> 8) & 0xf];
2344 *p++ = hexdigit[(ch >> 4) & 0xf];
2345 *p++ = hexdigit[ch & 15];
2347 /* Copy everything else as-is */
2348 else
2349 *p++ = (char) ch;
2351 *p = '\0';
2352 _PyString_Resize(&repr, p - q);
2353 return repr;
2356 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2358 if (!PyUnicode_Check(unicode)) {
2359 PyErr_BadArgument();
2360 return NULL;
2362 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2363 PyUnicode_GET_SIZE(unicode));
2366 /* --- Unicode Internal Codec ------------------------------------------- */
2368 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2369 Py_ssize_t size,
2370 const char *errors)
2372 const char *starts = s;
2373 Py_ssize_t startinpos;
2374 Py_ssize_t endinpos;
2375 Py_ssize_t outpos;
2376 PyUnicodeObject *v;
2377 Py_UNICODE *p;
2378 const char *end;
2379 const char *reason;
2380 PyObject *errorHandler = NULL;
2381 PyObject *exc = NULL;
2383 #ifdef Py_UNICODE_WIDE
2384 Py_UNICODE unimax = PyUnicode_GetMax();
2385 #endif
2387 /* XXX overflow detection missing */
2388 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2389 if (v == NULL)
2390 goto onError;
2391 if (PyUnicode_GetSize((PyObject *)v) == 0)
2392 return (PyObject *)v;
2393 p = PyUnicode_AS_UNICODE(v);
2394 end = s + size;
2396 while (s < end) {
2397 memcpy(p, s, sizeof(Py_UNICODE));
2398 /* We have to sanity check the raw data, otherwise doom looms for
2399 some malformed UCS-4 data. */
2400 if (
2401 #ifdef Py_UNICODE_WIDE
2402 *p > unimax || *p < 0 ||
2403 #endif
2404 end-s < Py_UNICODE_SIZE
2407 startinpos = s - starts;
2408 if (end-s < Py_UNICODE_SIZE) {
2409 endinpos = end-starts;
2410 reason = "truncated input";
2412 else {
2413 endinpos = s - starts + Py_UNICODE_SIZE;
2414 reason = "illegal code point (> 0x10FFFF)";
2416 outpos = p - PyUnicode_AS_UNICODE(v);
2417 if (unicode_decode_call_errorhandler(
2418 errors, &errorHandler,
2419 "unicode_internal", reason,
2420 starts, size, &startinpos, &endinpos, &exc, &s,
2421 (PyObject **)&v, &outpos, &p)) {
2422 goto onError;
2425 else {
2426 p++;
2427 s += Py_UNICODE_SIZE;
2431 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2432 goto onError;
2433 Py_XDECREF(errorHandler);
2434 Py_XDECREF(exc);
2435 return (PyObject *)v;
2437 onError:
2438 Py_XDECREF(v);
2439 Py_XDECREF(errorHandler);
2440 Py_XDECREF(exc);
2441 return NULL;
2444 /* --- Latin-1 Codec ------------------------------------------------------ */
2446 PyObject *PyUnicode_DecodeLatin1(const char *s,
2447 Py_ssize_t size,
2448 const char *errors)
2450 PyUnicodeObject *v;
2451 Py_UNICODE *p;
2453 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2454 if (size == 1) {
2455 Py_UNICODE r = *(unsigned char*)s;
2456 return PyUnicode_FromUnicode(&r, 1);
2459 v = _PyUnicode_New(size);
2460 if (v == NULL)
2461 goto onError;
2462 if (size == 0)
2463 return (PyObject *)v;
2464 p = PyUnicode_AS_UNICODE(v);
2465 while (size-- > 0)
2466 *p++ = (unsigned char)*s++;
2467 return (PyObject *)v;
2469 onError:
2470 Py_XDECREF(v);
2471 return NULL;
2474 /* create or adjust a UnicodeEncodeError */
2475 static void make_encode_exception(PyObject **exceptionObject,
2476 const char *encoding,
2477 const Py_UNICODE *unicode, Py_ssize_t size,
2478 Py_ssize_t startpos, Py_ssize_t endpos,
2479 const char *reason)
2481 if (*exceptionObject == NULL) {
2482 *exceptionObject = PyUnicodeEncodeError_Create(
2483 encoding, unicode, size, startpos, endpos, reason);
2485 else {
2486 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2487 goto onError;
2488 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2489 goto onError;
2490 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2491 goto onError;
2492 return;
2493 onError:
2494 Py_DECREF(*exceptionObject);
2495 *exceptionObject = NULL;
2499 /* raises a UnicodeEncodeError */
2500 static void raise_encode_exception(PyObject **exceptionObject,
2501 const char *encoding,
2502 const Py_UNICODE *unicode, Py_ssize_t size,
2503 Py_ssize_t startpos, Py_ssize_t endpos,
2504 const char *reason)
2506 make_encode_exception(exceptionObject,
2507 encoding, unicode, size, startpos, endpos, reason);
2508 if (*exceptionObject != NULL)
2509 PyCodec_StrictErrors(*exceptionObject);
2512 /* error handling callback helper:
2513 build arguments, call the callback and check the arguments,
2514 put the result into newpos and return the replacement string, which
2515 has to be freed by the caller */
2516 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2517 PyObject **errorHandler,
2518 const char *encoding, const char *reason,
2519 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2520 Py_ssize_t startpos, Py_ssize_t endpos,
2521 Py_ssize_t *newpos)
2523 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2525 PyObject *restuple;
2526 PyObject *resunicode;
2528 if (*errorHandler == NULL) {
2529 *errorHandler = PyCodec_LookupError(errors);
2530 if (*errorHandler == NULL)
2531 return NULL;
2534 make_encode_exception(exceptionObject,
2535 encoding, unicode, size, startpos, endpos, reason);
2536 if (*exceptionObject == NULL)
2537 return NULL;
2539 restuple = PyObject_CallFunctionObjArgs(
2540 *errorHandler, *exceptionObject, NULL);
2541 if (restuple == NULL)
2542 return NULL;
2543 if (!PyTuple_Check(restuple)) {
2544 PyErr_Format(PyExc_TypeError, &argparse[4]);
2545 Py_DECREF(restuple);
2546 return NULL;
2548 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2549 &resunicode, newpos)) {
2550 Py_DECREF(restuple);
2551 return NULL;
2553 if (*newpos<0)
2554 *newpos = size+*newpos;
2555 if (*newpos<0 || *newpos>size) {
2556 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2557 Py_DECREF(restuple);
2558 return NULL;
2560 Py_INCREF(resunicode);
2561 Py_DECREF(restuple);
2562 return resunicode;
2565 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2566 Py_ssize_t size,
2567 const char *errors,
2568 int limit)
2570 /* output object */
2571 PyObject *res;
2572 /* pointers to the beginning and end+1 of input */
2573 const Py_UNICODE *startp = p;
2574 const Py_UNICODE *endp = p + size;
2575 /* pointer to the beginning of the unencodable characters */
2576 /* const Py_UNICODE *badp = NULL; */
2577 /* pointer into the output */
2578 char *str;
2579 /* current output position */
2580 Py_ssize_t respos = 0;
2581 Py_ssize_t ressize;
2582 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2583 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2584 PyObject *errorHandler = NULL;
2585 PyObject *exc = NULL;
2586 /* the following variable is used for caching string comparisons
2587 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2588 int known_errorHandler = -1;
2590 /* allocate enough for a simple encoding without
2591 replacements, if we need more, we'll resize */
2592 res = PyString_FromStringAndSize(NULL, size);
2593 if (res == NULL)
2594 goto onError;
2595 if (size == 0)
2596 return res;
2597 str = PyString_AS_STRING(res);
2598 ressize = size;
2600 while (p<endp) {
2601 Py_UNICODE c = *p;
2603 /* can we encode this? */
2604 if (c<limit) {
2605 /* no overflow check, because we know that the space is enough */
2606 *str++ = (char)c;
2607 ++p;
2609 else {
2610 Py_ssize_t unicodepos = p-startp;
2611 Py_ssize_t requiredsize;
2612 PyObject *repunicode;
2613 Py_ssize_t repsize;
2614 Py_ssize_t newpos;
2615 Py_ssize_t respos;
2616 Py_UNICODE *uni2;
2617 /* startpos for collecting unencodable chars */
2618 const Py_UNICODE *collstart = p;
2619 const Py_UNICODE *collend = p;
2620 /* find all unecodable characters */
2621 while ((collend < endp) && ((*collend)>=limit))
2622 ++collend;
2623 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2624 if (known_errorHandler==-1) {
2625 if ((errors==NULL) || (!strcmp(errors, "strict")))
2626 known_errorHandler = 1;
2627 else if (!strcmp(errors, "replace"))
2628 known_errorHandler = 2;
2629 else if (!strcmp(errors, "ignore"))
2630 known_errorHandler = 3;
2631 else if (!strcmp(errors, "xmlcharrefreplace"))
2632 known_errorHandler = 4;
2633 else
2634 known_errorHandler = 0;
2636 switch (known_errorHandler) {
2637 case 1: /* strict */
2638 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2639 goto onError;
2640 case 2: /* replace */
2641 while (collstart++<collend)
2642 *str++ = '?'; /* fall through */
2643 case 3: /* ignore */
2644 p = collend;
2645 break;
2646 case 4: /* xmlcharrefreplace */
2647 respos = str-PyString_AS_STRING(res);
2648 /* determine replacement size (temporarily (mis)uses p) */
2649 for (p = collstart, repsize = 0; p < collend; ++p) {
2650 if (*p<10)
2651 repsize += 2+1+1;
2652 else if (*p<100)
2653 repsize += 2+2+1;
2654 else if (*p<1000)
2655 repsize += 2+3+1;
2656 else if (*p<10000)
2657 repsize += 2+4+1;
2658 #ifndef Py_UNICODE_WIDE
2659 else
2660 repsize += 2+5+1;
2661 #else
2662 else if (*p<100000)
2663 repsize += 2+5+1;
2664 else if (*p<1000000)
2665 repsize += 2+6+1;
2666 else
2667 repsize += 2+7+1;
2668 #endif
2670 requiredsize = respos+repsize+(endp-collend);
2671 if (requiredsize > ressize) {
2672 if (requiredsize<2*ressize)
2673 requiredsize = 2*ressize;
2674 if (_PyString_Resize(&res, requiredsize))
2675 goto onError;
2676 str = PyString_AS_STRING(res) + respos;
2677 ressize = requiredsize;
2679 /* generate replacement (temporarily (mis)uses p) */
2680 for (p = collstart; p < collend; ++p) {
2681 str += sprintf(str, "&#%d;", (int)*p);
2683 p = collend;
2684 break;
2685 default:
2686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2687 encoding, reason, startp, size, &exc,
2688 collstart-startp, collend-startp, &newpos);
2689 if (repunicode == NULL)
2690 goto onError;
2691 /* need more space? (at least enough for what we
2692 have+the replacement+the rest of the string, so
2693 we won't have to check space for encodable characters) */
2694 respos = str-PyString_AS_STRING(res);
2695 repsize = PyUnicode_GET_SIZE(repunicode);
2696 requiredsize = respos+repsize+(endp-collend);
2697 if (requiredsize > ressize) {
2698 if (requiredsize<2*ressize)
2699 requiredsize = 2*ressize;
2700 if (_PyString_Resize(&res, requiredsize)) {
2701 Py_DECREF(repunicode);
2702 goto onError;
2704 str = PyString_AS_STRING(res) + respos;
2705 ressize = requiredsize;
2707 /* check if there is anything unencodable in the replacement
2708 and copy it to the output */
2709 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2710 c = *uni2;
2711 if (c >= limit) {
2712 raise_encode_exception(&exc, encoding, startp, size,
2713 unicodepos, unicodepos+1, reason);
2714 Py_DECREF(repunicode);
2715 goto onError;
2717 *str = (char)c;
2719 p = startp + newpos;
2720 Py_DECREF(repunicode);
2724 /* Resize if we allocated to much */
2725 respos = str-PyString_AS_STRING(res);
2726 if (respos<ressize)
2727 /* If this falls res will be NULL */
2728 _PyString_Resize(&res, respos);
2729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
2731 return res;
2733 onError:
2734 Py_XDECREF(res);
2735 Py_XDECREF(errorHandler);
2736 Py_XDECREF(exc);
2737 return NULL;
2740 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2741 Py_ssize_t size,
2742 const char *errors)
2744 return unicode_encode_ucs1(p, size, errors, 256);
2747 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2749 if (!PyUnicode_Check(unicode)) {
2750 PyErr_BadArgument();
2751 return NULL;
2753 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2754 PyUnicode_GET_SIZE(unicode),
2755 NULL);
2758 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2760 PyObject *PyUnicode_DecodeASCII(const char *s,
2761 Py_ssize_t size,
2762 const char *errors)
2764 const char *starts = s;
2765 PyUnicodeObject *v;
2766 Py_UNICODE *p;
2767 Py_ssize_t startinpos;
2768 Py_ssize_t endinpos;
2769 Py_ssize_t outpos;
2770 const char *e;
2771 PyObject *errorHandler = NULL;
2772 PyObject *exc = NULL;
2774 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2775 if (size == 1 && *(unsigned char*)s < 128) {
2776 Py_UNICODE r = *(unsigned char*)s;
2777 return PyUnicode_FromUnicode(&r, 1);
2780 v = _PyUnicode_New(size);
2781 if (v == NULL)
2782 goto onError;
2783 if (size == 0)
2784 return (PyObject *)v;
2785 p = PyUnicode_AS_UNICODE(v);
2786 e = s + size;
2787 while (s < e) {
2788 register unsigned char c = (unsigned char)*s;
2789 if (c < 128) {
2790 *p++ = c;
2791 ++s;
2793 else {
2794 startinpos = s-starts;
2795 endinpos = startinpos + 1;
2796 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2797 if (unicode_decode_call_errorhandler(
2798 errors, &errorHandler,
2799 "ascii", "ordinal not in range(128)",
2800 starts, size, &startinpos, &endinpos, &exc, &s,
2801 (PyObject **)&v, &outpos, &p))
2802 goto onError;
2805 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2806 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2807 goto onError;
2808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
2810 return (PyObject *)v;
2812 onError:
2813 Py_XDECREF(v);
2814 Py_XDECREF(errorHandler);
2815 Py_XDECREF(exc);
2816 return NULL;
2819 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2820 Py_ssize_t size,
2821 const char *errors)
2823 return unicode_encode_ucs1(p, size, errors, 128);
2826 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2828 if (!PyUnicode_Check(unicode)) {
2829 PyErr_BadArgument();
2830 return NULL;
2832 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2833 PyUnicode_GET_SIZE(unicode),
2834 NULL);
2837 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2839 /* --- MBCS codecs for Windows -------------------------------------------- */
2841 #if SIZEOF_INT < SIZEOF_SSIZE_T
2842 #define NEED_RETRY
2843 #endif
2845 /* XXX This code is limited to "true" double-byte encodings, as
2846 a) it assumes an incomplete character consists of a single byte, and
2847 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2848 encodings, see IsDBCSLeadByteEx documentation. */
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); in addition, looking at the
   character before it (CharPrev), one of these must hold: there is no
   previous character, the previous character does not start with a lead
   byte, or the previous character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
2862 * Decode MBCS string into unicode object. If 'final' is set, converts
2863 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2865 static int decode_mbcs(PyUnicodeObject **v,
2866 const char *s, /* MBCS string */
2867 int size, /* sizeof MBCS string */
2868 int final)
2870 Py_UNICODE *p;
2871 Py_ssize_t n = 0;
2872 int usize = 0;
2874 assert(size >= 0);
2876 /* Skip trailing lead-byte unless 'final' is set */
2877 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2878 --size;
2880 /* First get the size of the result */
2881 if (size > 0) {
2882 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2883 if (usize == 0) {
2884 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2885 return -1;
2889 if (*v == NULL) {
2890 /* Create unicode object */
2891 *v = _PyUnicode_New(usize);
2892 if (*v == NULL)
2893 return -1;
2895 else {
2896 /* Extend unicode object */
2897 n = PyUnicode_GET_SIZE(*v);
2898 if (_PyUnicode_Resize(v, n + usize) < 0)
2899 return -1;
2902 /* Do the conversion */
2903 if (size > 0) {
2904 p = PyUnicode_AS_UNICODE(*v) + n;
2905 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2906 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2907 return -1;
2911 return size;
2914 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2915 Py_ssize_t size,
2916 const char *errors,
2917 Py_ssize_t *consumed)
2919 PyUnicodeObject *v = NULL;
2920 int done;
2922 if (consumed)
2923 *consumed = 0;
2925 #ifdef NEED_RETRY
2926 retry:
2927 if (size > INT_MAX)
2928 done = decode_mbcs(&v, s, INT_MAX, 0);
2929 else
2930 #endif
2931 done = decode_mbcs(&v, s, (int)size, !consumed);
2933 if (done < 0) {
2934 Py_XDECREF(v);
2935 return NULL;
2938 if (consumed)
2939 *consumed += done;
2941 #ifdef NEED_RETRY
2942 if (size > INT_MAX) {
2943 s += done;
2944 size -= done;
2945 goto retry;
2947 #endif
2949 return (PyObject *)v;
2952 PyObject *PyUnicode_DecodeMBCS(const char *s,
2953 Py_ssize_t size,
2954 const char *errors)
2956 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2960 * Convert unicode into string object (MBCS).
2961 * Returns 0 if succeed, -1 otherwise.
2963 static int encode_mbcs(PyObject **repr,
2964 const Py_UNICODE *p, /* unicode */
2965 int size) /* size of unicode */
2967 int mbcssize = 0;
2968 Py_ssize_t n = 0;
2970 assert(size >= 0);
2972 /* First get the size of the result */
2973 if (size > 0) {
2974 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2975 if (mbcssize == 0) {
2976 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2977 return -1;
2981 if (*repr == NULL) {
2982 /* Create string object */
2983 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2984 if (*repr == NULL)
2985 return -1;
2987 else {
2988 /* Extend string object */
2989 n = PyString_Size(*repr);
2990 if (_PyString_Resize(repr, n + mbcssize) < 0)
2991 return -1;
2994 /* Do the conversion */
2995 if (size > 0) {
2996 char *s = PyString_AS_STRING(*repr) + n;
2997 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2998 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2999 return -1;
3003 return 0;
3006 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3007 Py_ssize_t size,
3008 const char *errors)
3010 PyObject *repr = NULL;
3011 int ret;
3013 #ifdef NEED_RETRY
3014 retry:
3015 if (size > INT_MAX)
3016 ret = encode_mbcs(&repr, p, INT_MAX);
3017 else
3018 #endif
3019 ret = encode_mbcs(&repr, p, (int)size);
3021 if (ret < 0) {
3022 Py_XDECREF(repr);
3023 return NULL;
3026 #ifdef NEED_RETRY
3027 if (size > INT_MAX) {
3028 p += INT_MAX;
3029 size -= INT_MAX;
3030 goto retry;
3032 #endif
3034 return repr;
3037 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_BadArgument();
3041 return NULL;
3043 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3044 PyUnicode_GET_SIZE(unicode),
3045 NULL);
3048 #undef NEED_RETRY
3050 #endif /* MS_WINDOWS */
3052 /* --- Character Mapping Codec -------------------------------------------- */
3054 PyObject *PyUnicode_DecodeCharmap(const char *s,
3055 Py_ssize_t size,
3056 PyObject *mapping,
3057 const char *errors)
3059 const char *starts = s;
3060 Py_ssize_t startinpos;
3061 Py_ssize_t endinpos;
3062 Py_ssize_t outpos;
3063 const char *e;
3064 PyUnicodeObject *v;
3065 Py_UNICODE *p;
3066 Py_ssize_t extrachars = 0;
3067 PyObject *errorHandler = NULL;
3068 PyObject *exc = NULL;
3069 Py_UNICODE *mapstring = NULL;
3070 Py_ssize_t maplen = 0;
3072 /* Default to Latin-1 */
3073 if (mapping == NULL)
3074 return PyUnicode_DecodeLatin1(s, size, errors);
3076 v = _PyUnicode_New(size);
3077 if (v == NULL)
3078 goto onError;
3079 if (size == 0)
3080 return (PyObject *)v;
3081 p = PyUnicode_AS_UNICODE(v);
3082 e = s + size;
3083 if (PyUnicode_CheckExact(mapping)) {
3084 mapstring = PyUnicode_AS_UNICODE(mapping);
3085 maplen = PyUnicode_GET_SIZE(mapping);
3086 while (s < e) {
3087 unsigned char ch = *s;
3088 Py_UNICODE x = 0xfffe; /* illegal value */
3090 if (ch < maplen)
3091 x = mapstring[ch];
3093 if (x == 0xfffe) {
3094 /* undefined mapping */
3095 outpos = p-PyUnicode_AS_UNICODE(v);
3096 startinpos = s-starts;
3097 endinpos = startinpos+1;
3098 if (unicode_decode_call_errorhandler(
3099 errors, &errorHandler,
3100 "charmap", "character maps to <undefined>",
3101 starts, size, &startinpos, &endinpos, &exc, &s,
3102 (PyObject **)&v, &outpos, &p)) {
3103 goto onError;
3105 continue;
3107 *p++ = x;
3108 ++s;
3111 else {
3112 while (s < e) {
3113 unsigned char ch = *s;
3114 PyObject *w, *x;
3116 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3117 w = PyInt_FromLong((long)ch);
3118 if (w == NULL)
3119 goto onError;
3120 x = PyObject_GetItem(mapping, w);
3121 Py_DECREF(w);
3122 if (x == NULL) {
3123 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3124 /* No mapping found means: mapping is undefined. */
3125 PyErr_Clear();
3126 x = Py_None;
3127 Py_INCREF(x);
3128 } else
3129 goto onError;
3132 /* Apply mapping */
3133 if (PyInt_Check(x)) {
3134 long value = PyInt_AS_LONG(x);
3135 if (value < 0 || value > 65535) {
3136 PyErr_SetString(PyExc_TypeError,
3137 "character mapping must be in range(65536)");
3138 Py_DECREF(x);
3139 goto onError;
3141 *p++ = (Py_UNICODE)value;
3143 else if (x == Py_None) {
3144 /* undefined mapping */
3145 outpos = p-PyUnicode_AS_UNICODE(v);
3146 startinpos = s-starts;
3147 endinpos = startinpos+1;
3148 if (unicode_decode_call_errorhandler(
3149 errors, &errorHandler,
3150 "charmap", "character maps to <undefined>",
3151 starts, size, &startinpos, &endinpos, &exc, &s,
3152 (PyObject **)&v, &outpos, &p)) {
3153 Py_DECREF(x);
3154 goto onError;
3156 Py_DECREF(x);
3157 continue;
3159 else if (PyUnicode_Check(x)) {
3160 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3162 if (targetsize == 1)
3163 /* 1-1 mapping */
3164 *p++ = *PyUnicode_AS_UNICODE(x);
3166 else if (targetsize > 1) {
3167 /* 1-n mapping */
3168 if (targetsize > extrachars) {
3169 /* resize first */
3170 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3171 Py_ssize_t needed = (targetsize - extrachars) + \
3172 (targetsize << 2);
3173 extrachars += needed;
3174 /* XXX overflow detection missing */
3175 if (_PyUnicode_Resize(&v,
3176 PyUnicode_GET_SIZE(v) + needed) < 0) {
3177 Py_DECREF(x);
3178 goto onError;
3180 p = PyUnicode_AS_UNICODE(v) + oldpos;
3182 Py_UNICODE_COPY(p,
3183 PyUnicode_AS_UNICODE(x),
3184 targetsize);
3185 p += targetsize;
3186 extrachars -= targetsize;
3188 /* 1-0 mapping: skip the character */
3190 else {
3191 /* wrong return value */
3192 PyErr_SetString(PyExc_TypeError,
3193 "character mapping must return integer, None or unicode");
3194 Py_DECREF(x);
3195 goto onError;
3197 Py_DECREF(x);
3198 ++s;
3201 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3202 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3203 goto onError;
3204 Py_XDECREF(errorHandler);
3205 Py_XDECREF(exc);
3206 return (PyObject *)v;
3208 onError:
3209 Py_XDECREF(errorHandler);
3210 Py_XDECREF(exc);
3211 Py_XDECREF(v);
3212 return NULL;
3215 /* Charmap encoding: the lookup table */
3217 struct encoding_map{
3218 PyObject_HEAD
3219 unsigned char level1[32];
3220 int count2, count3;
3221 unsigned char level23[1];
3224 static PyObject*
3225 encoding_map_size(PyObject *obj, PyObject* args)
3227 struct encoding_map *map = (struct encoding_map*)obj;
3228 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3229 128*map->count3);
3232 static PyMethodDef encoding_map_methods[] = {
3233 {"size", encoding_map_size, METH_NOARGS,
3234 PyDoc_STR("Return the size (in bytes) of this object") },
3235 { 0 }
3238 static void
3239 encoding_map_dealloc(PyObject* o)
3241 PyObject_FREE(o);
3244 static PyTypeObject EncodingMapType = {
3245 PyObject_HEAD_INIT(NULL)
3246 0, /*ob_size*/
3247 "EncodingMap", /*tp_name*/
3248 sizeof(struct encoding_map), /*tp_basicsize*/
3249 0, /*tp_itemsize*/
3250 /* methods */
3251 encoding_map_dealloc, /*tp_dealloc*/
3252 0, /*tp_print*/
3253 0, /*tp_getattr*/
3254 0, /*tp_setattr*/
3255 0, /*tp_compare*/
3256 0, /*tp_repr*/
3257 0, /*tp_as_number*/
3258 0, /*tp_as_sequence*/
3259 0, /*tp_as_mapping*/
3260 0, /*tp_hash*/
3261 0, /*tp_call*/
3262 0, /*tp_str*/
3263 0, /*tp_getattro*/
3264 0, /*tp_setattro*/
3265 0, /*tp_as_buffer*/
3266 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3267 0, /*tp_doc*/
3268 0, /*tp_traverse*/
3269 0, /*tp_clear*/
3270 0, /*tp_richcompare*/
3271 0, /*tp_weaklistoffset*/
3272 0, /*tp_iter*/
3273 0, /*tp_iternext*/
3274 encoding_map_methods, /*tp_methods*/
3275 0, /*tp_members*/
3276 0, /*tp_getset*/
3277 0, /*tp_base*/
3278 0, /*tp_dict*/
3279 0, /*tp_descr_get*/
3280 0, /*tp_descr_set*/
3281 0, /*tp_dictoffset*/
3282 0, /*tp_init*/
3283 0, /*tp_alloc*/
3284 0, /*tp_new*/
3285 0, /*tp_free*/
3286 0, /*tp_is_gc*/
3289 PyObject*
3290 PyUnicode_BuildEncodingMap(PyObject* string)
3292 Py_UNICODE *decode;
3293 PyObject *result;
3294 struct encoding_map *mresult;
3295 int i;
3296 int need_dict = 0;
3297 unsigned char level1[32];
3298 unsigned char level2[512];
3299 unsigned char *mlevel1, *mlevel2, *mlevel3;
3300 int count2 = 0, count3 = 0;
3302 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3303 PyErr_BadArgument();
3304 return NULL;
3306 decode = PyUnicode_AS_UNICODE(string);
3307 memset(level1, 0xFF, sizeof level1);
3308 memset(level2, 0xFF, sizeof level2);
3310 /* If there isn't a one-to-one mapping of NULL to \0,
3311 or if there are non-BMP characters, we need to use
3312 a mapping dictionary. */
3313 if (decode[0] != 0)
3314 need_dict = 1;
3315 for (i = 1; i < 256; i++) {
3316 int l1, l2;
3317 if (decode[i] == 0
3318 #ifdef Py_UNICODE_WIDE
3319 || decode[i] > 0xFFFF
3320 #endif
3322 need_dict = 1;
3323 break;
3325 if (decode[i] == 0xFFFE)
3326 /* unmapped character */
3327 continue;
3328 l1 = decode[i] >> 11;
3329 l2 = decode[i] >> 7;
3330 if (level1[l1] == 0xFF)
3331 level1[l1] = count2++;
3332 if (level2[l2] == 0xFF)
3333 level2[l2] = count3++;
3336 if (count2 >= 0xFF || count3 >= 0xFF)
3337 need_dict = 1;
3339 if (need_dict) {
3340 PyObject *result = PyDict_New();
3341 PyObject *key, *value;
3342 if (!result)
3343 return NULL;
3344 for (i = 0; i < 256; i++) {
3345 key = value = NULL;
3346 key = PyInt_FromLong(decode[i]);
3347 value = PyInt_FromLong(i);
3348 if (!key || !value)
3349 goto failed1;
3350 if (PyDict_SetItem(result, key, value) == -1)
3351 goto failed1;
3352 Py_DECREF(key);
3353 Py_DECREF(value);
3355 return result;
3356 failed1:
3357 Py_XDECREF(key);
3358 Py_XDECREF(value);
3359 Py_DECREF(result);
3360 return NULL;
3363 /* Create a three-level trie */
3364 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3365 16*count2 + 128*count3 - 1);
3366 if (!result)
3367 return PyErr_NoMemory();
3368 PyObject_Init(result, &EncodingMapType);
3369 mresult = (struct encoding_map*)result;
3370 mresult->count2 = count2;
3371 mresult->count3 = count3;
3372 mlevel1 = mresult->level1;
3373 mlevel2 = mresult->level23;
3374 mlevel3 = mresult->level23 + 16*count2;
3375 memcpy(mlevel1, level1, 32);
3376 memset(mlevel2, 0xFF, 16*count2);
3377 memset(mlevel3, 0, 128*count3);
3378 count3 = 0;
3379 for (i = 1; i < 256; i++) {
3380 int o1, o2, o3, i2, i3;
3381 if (decode[i] == 0xFFFE)
3382 /* unmapped character */
3383 continue;
3384 o1 = decode[i]>>11;
3385 o2 = (decode[i]>>7) & 0xF;
3386 i2 = 16*mlevel1[o1] + o2;
3387 if (mlevel2[i2] == 0xFF)
3388 mlevel2[i2] = count3++;
3389 o3 = decode[i] & 0x7F;
3390 i3 = 128*mlevel2[i2] + o3;
3391 mlevel3[i3] = i;
3393 return result;
3396 static int
3397 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3399 struct encoding_map *map = (struct encoding_map*)mapping;
3400 int l1 = c>>11;
3401 int l2 = (c>>7) & 0xF;
3402 int l3 = c & 0x7F;
3403 int i;
3405 #ifdef Py_UNICODE_WIDE
3406 if (c > 0xFFFF) {
3407 return -1;
3409 #endif
3410 if (c == 0)
3411 return 0;
3412 /* level 1*/
3413 i = map->level1[l1];
3414 if (i == 0xFF) {
3415 return -1;
3417 /* level 2*/
3418 i = map->level23[16*i+l2];
3419 if (i == 0xFF) {
3420 return -1;
3422 /* level 3 */
3423 i = map->level23[16*map->count2 + 128*i + l3];
3424 if (i == 0) {
3425 return -1;
3427 return i;
3430 /* Lookup the character ch in the mapping. If the character
3431 can't be found, Py_None is returned (or NULL, if another
3432 error occurred). */
3433 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3435 PyObject *w = PyInt_FromLong((long)c);
3436 PyObject *x;
3438 if (w == NULL)
3439 return NULL;
3440 x = PyObject_GetItem(mapping, w);
3441 Py_DECREF(w);
3442 if (x == NULL) {
3443 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3444 /* No mapping found means: mapping is undefined. */
3445 PyErr_Clear();
3446 x = Py_None;
3447 Py_INCREF(x);
3448 return x;
3449 } else
3450 return NULL;
3452 else if (x == Py_None)
3453 return x;
3454 else if (PyInt_Check(x)) {
3455 long value = PyInt_AS_LONG(x);
3456 if (value < 0 || value > 255) {
3457 PyErr_SetString(PyExc_TypeError,
3458 "character mapping must be in range(256)");
3459 Py_DECREF(x);
3460 return NULL;
3462 return x;
3464 else if (PyString_Check(x))
3465 return x;
3466 else {
3467 /* wrong return value */
3468 PyErr_SetString(PyExc_TypeError,
3469 "character mapping must return integer, None or str");
3470 Py_DECREF(x);
3471 return NULL;
3475 static int
3476 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3478 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3479 /* exponentially overallocate to minimize reallocations */
3480 if (requiredsize < 2*outsize)
3481 requiredsize = 2*outsize;
3482 if (_PyString_Resize(outobj, requiredsize)) {
3483 return 0;
3485 return 1;
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
3491 /* lookup the character, put the result in the output string and adjust
3492 various state variables. Reallocate the output string if not enough
3493 space is available. Return a new reference to the object that
3494 was put in the output buffer, or Py_None, if the mapping was undefined
3495 (in which case no character was written) or NULL, if a
3496 reallocation error occurred. The caller must decref the result */
3497 static
3498 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3499 PyObject **outobj, Py_ssize_t *outpos)
3501 PyObject *rep;
3502 char *outstart;
3503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3505 if (mapping->ob_type == &EncodingMapType) {
3506 int res = encoding_map_lookup(c, mapping);
3507 Py_ssize_t requiredsize = *outpos+1;
3508 if (res == -1)
3509 return enc_FAILED;
3510 if (outsize<requiredsize)
3511 if (!charmapencode_resize(outobj, outpos, requiredsize))
3512 return enc_EXCEPTION;
3513 outstart = PyString_AS_STRING(*outobj);
3514 outstart[(*outpos)++] = (char)res;
3515 return enc_SUCCESS;
3518 rep = charmapencode_lookup(c, mapping);
3519 if (rep==NULL)
3520 return enc_EXCEPTION;
3521 else if (rep==Py_None) {
3522 Py_DECREF(rep);
3523 return enc_FAILED;
3524 } else {
3525 if (PyInt_Check(rep)) {
3526 Py_ssize_t requiredsize = *outpos+1;
3527 if (outsize<requiredsize)
3528 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3529 Py_DECREF(rep);
3530 return enc_EXCEPTION;
3532 outstart = PyString_AS_STRING(*outobj);
3533 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3535 else {
3536 const char *repchars = PyString_AS_STRING(rep);
3537 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3538 Py_ssize_t requiredsize = *outpos+repsize;
3539 if (outsize<requiredsize)
3540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3541 Py_DECREF(rep);
3542 return enc_EXCEPTION;
3544 outstart = PyString_AS_STRING(*outobj);
3545 memcpy(outstart + *outpos, repchars, repsize);
3546 *outpos += repsize;
3549 Py_DECREF(rep);
3550 return enc_SUCCESS;
3553 /* handle an error in PyUnicode_EncodeCharmap
3554 Return 0 on success, -1 on error */
3555 static
3556 int charmap_encoding_error(
3557 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3558 PyObject **exceptionObject,
3559 int *known_errorHandler, PyObject **errorHandler, const char *errors,
3560 PyObject **res, Py_ssize_t *respos)
3562 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3563 Py_ssize_t repsize;
3564 Py_ssize_t newpos;
3565 Py_UNICODE *uni2;
3566 /* startpos for collecting unencodable chars */
3567 Py_ssize_t collstartpos = *inpos;
3568 Py_ssize_t collendpos = *inpos+1;
3569 Py_ssize_t collpos;
3570 char *encoding = "charmap";
3571 char *reason = "character maps to <undefined>";
3572 charmapencode_result x;
3574 /* find all unencodable characters */
3575 while (collendpos < size) {
3576 PyObject *rep;
3577 if (mapping->ob_type == &EncodingMapType) {
3578 int res = encoding_map_lookup(p[collendpos], mapping);
3579 if (res != -1)
3580 break;
3581 ++collendpos;
3582 continue;
3585 rep = charmapencode_lookup(p[collendpos], mapping);
3586 if (rep==NULL)
3587 return -1;
3588 else if (rep!=Py_None) {
3589 Py_DECREF(rep);
3590 break;
3592 Py_DECREF(rep);
3593 ++collendpos;
3595 /* cache callback name lookup
3596 * (if not done yet, i.e. it's the first error) */
3597 if (*known_errorHandler==-1) {
3598 if ((errors==NULL) || (!strcmp(errors, "strict")))
3599 *known_errorHandler = 1;
3600 else if (!strcmp(errors, "replace"))
3601 *known_errorHandler = 2;
3602 else if (!strcmp(errors, "ignore"))
3603 *known_errorHandler = 3;
3604 else if (!strcmp(errors, "xmlcharrefreplace"))
3605 *known_errorHandler = 4;
3606 else
3607 *known_errorHandler = 0;
3609 switch (*known_errorHandler) {
3610 case 1: /* strict */
3611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3612 return -1;
3613 case 2: /* replace */
3614 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3615 x = charmapencode_output('?', mapping, res, respos);
3616 if (x==enc_EXCEPTION) {
3617 return -1;
3619 else if (x==enc_FAILED) {
3620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621 return -1;
3624 /* fall through */
3625 case 3: /* ignore */
3626 *inpos = collendpos;
3627 break;
3628 case 4: /* xmlcharrefreplace */
3629 /* generate replacement (temporarily (mis)uses p) */
3630 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3631 char buffer[2+29+1+1];
3632 char *cp;
3633 sprintf(buffer, "&#%d;", (int)p[collpos]);
3634 for (cp = buffer; *cp; ++cp) {
3635 x = charmapencode_output(*cp, mapping, res, respos);
3636 if (x==enc_EXCEPTION)
3637 return -1;
3638 else if (x==enc_FAILED) {
3639 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3640 return -1;
3644 *inpos = collendpos;
3645 break;
3646 default:
3647 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3648 encoding, reason, p, size, exceptionObject,
3649 collstartpos, collendpos, &newpos);
3650 if (repunicode == NULL)
3651 return -1;
3652 /* generate replacement */
3653 repsize = PyUnicode_GET_SIZE(repunicode);
3654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3655 x = charmapencode_output(*uni2, mapping, res, respos);
3656 if (x==enc_EXCEPTION) {
3657 return -1;
3659 else if (x==enc_FAILED) {
3660 Py_DECREF(repunicode);
3661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3662 return -1;
3665 *inpos = newpos;
3666 Py_DECREF(repunicode);
3668 return 0;
3671 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3672 Py_ssize_t size,
3673 PyObject *mapping,
3674 const char *errors)
3676 /* output object */
3677 PyObject *res = NULL;
3678 /* current input position */
3679 Py_ssize_t inpos = 0;
3680 /* current output position */
3681 Py_ssize_t respos = 0;
3682 PyObject *errorHandler = NULL;
3683 PyObject *exc = NULL;
3684 /* the following variable is used for caching string comparisons
3685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3686 * 3=ignore, 4=xmlcharrefreplace */
3687 int known_errorHandler = -1;
3689 /* Default to Latin-1 */
3690 if (mapping == NULL)
3691 return PyUnicode_EncodeLatin1(p, size, errors);
3693 /* allocate enough for a simple encoding without
3694 replacements, if we need more, we'll resize */
3695 res = PyString_FromStringAndSize(NULL, size);
3696 if (res == NULL)
3697 goto onError;
3698 if (size == 0)
3699 return res;
3701 while (inpos<size) {
3702 /* try to encode it */
3703 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3704 if (x==enc_EXCEPTION) /* error */
3705 goto onError;
3706 if (x==enc_FAILED) { /* unencodable character */
3707 if (charmap_encoding_error(p, size, &inpos, mapping,
3708 &exc,
3709 &known_errorHandler, &errorHandler, errors,
3710 &res, &respos)) {
3711 goto onError;
3714 else
3715 /* done with this character => adjust input position */
3716 ++inpos;
3719 /* Resize if we allocated to much */
3720 if (respos<PyString_GET_SIZE(res)) {
3721 if (_PyString_Resize(&res, respos))
3722 goto onError;
3724 Py_XDECREF(exc);
3725 Py_XDECREF(errorHandler);
3726 return res;
3728 onError:
3729 Py_XDECREF(res);
3730 Py_XDECREF(exc);
3731 Py_XDECREF(errorHandler);
3732 return NULL;
3735 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3736 PyObject *mapping)
3738 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3739 PyErr_BadArgument();
3740 return NULL;
3742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3743 PyUnicode_GET_SIZE(unicode),
3744 mapping,
3745 NULL);
3748 /* create or adjust a UnicodeTranslateError */
3749 static void make_translate_exception(PyObject **exceptionObject,
3750 const Py_UNICODE *unicode, Py_ssize_t size,
3751 Py_ssize_t startpos, Py_ssize_t endpos,
3752 const char *reason)
3754 if (*exceptionObject == NULL) {
3755 *exceptionObject = PyUnicodeTranslateError_Create(
3756 unicode, size, startpos, endpos, reason);
3758 else {
3759 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3760 goto onError;
3761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3762 goto onError;
3763 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3764 goto onError;
3765 return;
3766 onError:
3767 Py_DECREF(*exceptionObject);
3768 *exceptionObject = NULL;
3772 /* raises a UnicodeTranslateError */
3773 static void raise_translate_exception(PyObject **exceptionObject,
3774 const Py_UNICODE *unicode, Py_ssize_t size,
3775 Py_ssize_t startpos, Py_ssize_t endpos,
3776 const char *reason)
3778 make_translate_exception(exceptionObject,
3779 unicode, size, startpos, endpos, reason);
3780 if (*exceptionObject != NULL)
3781 PyCodec_StrictErrors(*exceptionObject);
3784 /* error handling callback helper:
3785 build arguments, call the callback and check the arguments,
3786 put the result into newpos and return the replacement string, which
3787 has to be freed by the caller */
3788 static PyObject *unicode_translate_call_errorhandler(const char *errors,
3789 PyObject **errorHandler,
3790 const char *reason,
3791 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3792 Py_ssize_t startpos, Py_ssize_t endpos,
3793 Py_ssize_t *newpos)
3795 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
3797 Py_ssize_t i_newpos;
3798 PyObject *restuple;
3799 PyObject *resunicode;
3801 if (*errorHandler == NULL) {
3802 *errorHandler = PyCodec_LookupError(errors);
3803 if (*errorHandler == NULL)
3804 return NULL;
3807 make_translate_exception(exceptionObject,
3808 unicode, size, startpos, endpos, reason);
3809 if (*exceptionObject == NULL)
3810 return NULL;
3812 restuple = PyObject_CallFunctionObjArgs(
3813 *errorHandler, *exceptionObject, NULL);
3814 if (restuple == NULL)
3815 return NULL;
3816 if (!PyTuple_Check(restuple)) {
3817 PyErr_Format(PyExc_TypeError, &argparse[4]);
3818 Py_DECREF(restuple);
3819 return NULL;
3821 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3822 &resunicode, &i_newpos)) {
3823 Py_DECREF(restuple);
3824 return NULL;
3826 if (i_newpos<0)
3827 *newpos = size+i_newpos;
3828 else
3829 *newpos = i_newpos;
3830 if (*newpos<0 || *newpos>size) {
3831 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3832 Py_DECREF(restuple);
3833 return NULL;
3835 Py_INCREF(resunicode);
3836 Py_DECREF(restuple);
3837 return resunicode;
3840 /* Lookup the character ch in the mapping and put the result in result,
3841 which must be decrefed by the caller.
3842 Return 0 on success, -1 on error */
3843 static
3844 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3846 PyObject *w = PyInt_FromLong((long)c);
3847 PyObject *x;
3849 if (w == NULL)
3850 return -1;
3851 x = PyObject_GetItem(mapping, w);
3852 Py_DECREF(w);
3853 if (x == NULL) {
3854 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3855 /* No mapping found means: use 1:1 mapping. */
3856 PyErr_Clear();
3857 *result = NULL;
3858 return 0;
3859 } else
3860 return -1;
3862 else if (x == Py_None) {
3863 *result = x;
3864 return 0;
3866 else if (PyInt_Check(x)) {
3867 long value = PyInt_AS_LONG(x);
3868 long max = PyUnicode_GetMax();
3869 if (value < 0 || value > max) {
3870 PyErr_Format(PyExc_TypeError,
3871 "character mapping must be in range(0x%lx)", max+1);
3872 Py_DECREF(x);
3873 return -1;
3875 *result = x;
3876 return 0;
3878 else if (PyUnicode_Check(x)) {
3879 *result = x;
3880 return 0;
3882 else {
3883 /* wrong return value */
3884 PyErr_SetString(PyExc_TypeError,
3885 "character mapping must return integer, None or unicode");
3886 Py_DECREF(x);
3887 return -1;
3890 /* ensure that *outobj is at least requiredsize characters long,
3891 if not reallocate and adjust various state variables.
3892 Return 0 on success, -1 on error */
3893 static
3894 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3895 Py_ssize_t requiredsize)
3897 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
3898 if (requiredsize > oldsize) {
3899 /* remember old output position */
3900 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3901 /* exponentially overallocate to minimize reallocations */
3902 if (requiredsize < 2 * oldsize)
3903 requiredsize = 2 * oldsize;
3904 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3905 return -1;
3906 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3908 return 0;
3910 /* lookup the character, put the result in the output string and adjust
3911 various state variables. Return a new reference to the object that
3912 was put in the output buffer in *result, or Py_None, if the mapping was
3913 undefined (in which case no character was written).
3914 The called must decref result.
3915 Return 0 on success, -1 on error. */
3916 static
3917 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3918 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3919 PyObject **res)
3921 if (charmaptranslate_lookup(*curinp, mapping, res))
3922 return -1;
3923 if (*res==NULL) {
3924 /* not found => default to 1:1 mapping */
3925 *(*outp)++ = *curinp;
3927 else if (*res==Py_None)
3929 else if (PyInt_Check(*res)) {
3930 /* no overflow check, because we know that the space is enough */
3931 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3933 else if (PyUnicode_Check(*res)) {
3934 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
3935 if (repsize==1) {
3936 /* no overflow check, because we know that the space is enough */
3937 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3939 else if (repsize!=0) {
3940 /* more than one character */
3941 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3942 (insize - (curinp-startinp)) +
3943 repsize - 1;
3944 if (charmaptranslate_makespace(outobj, outp, requiredsize))
3945 return -1;
3946 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3947 *outp += repsize;
3950 else
3951 return -1;
3952 return 0;
3955 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3956 Py_ssize_t size,
3957 PyObject *mapping,
3958 const char *errors)
3960 /* output object */
3961 PyObject *res = NULL;
3962 /* pointers to the beginning and end+1 of input */
3963 const Py_UNICODE *startp = p;
3964 const Py_UNICODE *endp = p + size;
3965 /* pointer into the output */
3966 Py_UNICODE *str;
3967 /* current output position */
3968 Py_ssize_t respos = 0;
3969 char *reason = "character maps to <undefined>";
3970 PyObject *errorHandler = NULL;
3971 PyObject *exc = NULL;
3972 /* the following variable is used for caching string comparisons
3973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3974 * 3=ignore, 4=xmlcharrefreplace */
3975 int known_errorHandler = -1;
3977 if (mapping == NULL) {
3978 PyErr_BadArgument();
3979 return NULL;
3982 /* allocate enough for a simple 1:1 translation without
3983 replacements, if we need more, we'll resize */
3984 res = PyUnicode_FromUnicode(NULL, size);
3985 if (res == NULL)
3986 goto onError;
3987 if (size == 0)
3988 return res;
3989 str = PyUnicode_AS_UNICODE(res);
3991 while (p<endp) {
3992 /* try to encode it */
3993 PyObject *x = NULL;
3994 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3995 Py_XDECREF(x);
3996 goto onError;
3998 Py_XDECREF(x);
3999 if (x!=Py_None) /* it worked => adjust input pointer */
4000 ++p;
4001 else { /* untranslatable character */
4002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4003 Py_ssize_t repsize;
4004 Py_ssize_t newpos;
4005 Py_UNICODE *uni2;
4006 /* startpos for collecting untranslatable chars */
4007 const Py_UNICODE *collstart = p;
4008 const Py_UNICODE *collend = p+1;
4009 const Py_UNICODE *coll;
4011 /* find all untranslatable characters */
4012 while (collend < endp) {
4013 if (charmaptranslate_lookup(*collend, mapping, &x))
4014 goto onError;
4015 Py_XDECREF(x);
4016 if (x!=Py_None)
4017 break;
4018 ++collend;
4020 /* cache callback name lookup
4021 * (if not done yet, i.e. it's the first error) */
4022 if (known_errorHandler==-1) {
4023 if ((errors==NULL) || (!strcmp(errors, "strict")))
4024 known_errorHandler = 1;
4025 else if (!strcmp(errors, "replace"))
4026 known_errorHandler = 2;
4027 else if (!strcmp(errors, "ignore"))
4028 known_errorHandler = 3;
4029 else if (!strcmp(errors, "xmlcharrefreplace"))
4030 known_errorHandler = 4;
4031 else
4032 known_errorHandler = 0;
4034 switch (known_errorHandler) {
4035 case 1: /* strict */
4036 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4037 goto onError;
4038 case 2: /* replace */
4039 /* No need to check for space, this is a 1:1 replacement */
4040 for (coll = collstart; coll<collend; ++coll)
4041 *str++ = '?';
4042 /* fall through */
4043 case 3: /* ignore */
4044 p = collend;
4045 break;
4046 case 4: /* xmlcharrefreplace */
4047 /* generate replacement (temporarily (mis)uses p) */
4048 for (p = collstart; p < collend; ++p) {
4049 char buffer[2+29+1+1];
4050 char *cp;
4051 sprintf(buffer, "&#%d;", (int)*p);
4052 if (charmaptranslate_makespace(&res, &str,
4053 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4054 goto onError;
4055 for (cp = buffer; *cp; ++cp)
4056 *str++ = *cp;
4058 p = collend;
4059 break;
4060 default:
4061 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4062 reason, startp, size, &exc,
4063 collstart-startp, collend-startp, &newpos);
4064 if (repunicode == NULL)
4065 goto onError;
4066 /* generate replacement */
4067 repsize = PyUnicode_GET_SIZE(repunicode);
4068 if (charmaptranslate_makespace(&res, &str,
4069 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4070 Py_DECREF(repunicode);
4071 goto onError;
4073 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4074 *str++ = *uni2;
4075 p = startp + newpos;
4076 Py_DECREF(repunicode);
4080 /* Resize if we allocated to much */
4081 respos = str-PyUnicode_AS_UNICODE(res);
4082 if (respos<PyUnicode_GET_SIZE(res)) {
4083 if (_PyUnicode_Resize(&res, respos) < 0)
4084 goto onError;
4086 Py_XDECREF(exc);
4087 Py_XDECREF(errorHandler);
4088 return res;
4090 onError:
4091 Py_XDECREF(res);
4092 Py_XDECREF(exc);
4093 Py_XDECREF(errorHandler);
4094 return NULL;
4097 PyObject *PyUnicode_Translate(PyObject *str,
4098 PyObject *mapping,
4099 const char *errors)
4101 PyObject *result;
4103 str = PyUnicode_FromObject(str);
4104 if (str == NULL)
4105 goto onError;
4106 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4107 PyUnicode_GET_SIZE(str),
4108 mapping,
4109 errors);
4110 Py_DECREF(str);
4111 return result;
4113 onError:
4114 Py_XDECREF(str);
4115 return NULL;
4118 /* --- Decimal Encoder ---------------------------------------------------- */
4120 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4121 Py_ssize_t length,
4122 char *output,
4123 const char *errors)
4125 Py_UNICODE *p, *end;
4126 PyObject *errorHandler = NULL;
4127 PyObject *exc = NULL;
4128 const char *encoding = "decimal";
4129 const char *reason = "invalid decimal Unicode string";
4130 /* the following variable is used for caching string comparisons
4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132 int known_errorHandler = -1;
4134 if (output == NULL) {
4135 PyErr_BadArgument();
4136 return -1;
4139 p = s;
4140 end = s + length;
4141 while (p < end) {
4142 register Py_UNICODE ch = *p;
4143 int decimal;
4144 PyObject *repunicode;
4145 Py_ssize_t repsize;
4146 Py_ssize_t newpos;
4147 Py_UNICODE *uni2;
4148 Py_UNICODE *collstart;
4149 Py_UNICODE *collend;
4151 if (Py_UNICODE_ISSPACE(ch)) {
4152 *output++ = ' ';
4153 ++p;
4154 continue;
4156 decimal = Py_UNICODE_TODECIMAL(ch);
4157 if (decimal >= 0) {
4158 *output++ = '0' + decimal;
4159 ++p;
4160 continue;
4162 if (0 < ch && ch < 256) {
4163 *output++ = (char)ch;
4164 ++p;
4165 continue;
4167 /* All other characters are considered unencodable */
4168 collstart = p;
4169 collend = p+1;
4170 while (collend < end) {
4171 if ((0 < *collend && *collend < 256) ||
4172 !Py_UNICODE_ISSPACE(*collend) ||
4173 Py_UNICODE_TODECIMAL(*collend))
4174 break;
4176 /* cache callback name lookup
4177 * (if not done yet, i.e. it's the first error) */
4178 if (known_errorHandler==-1) {
4179 if ((errors==NULL) || (!strcmp(errors, "strict")))
4180 known_errorHandler = 1;
4181 else if (!strcmp(errors, "replace"))
4182 known_errorHandler = 2;
4183 else if (!strcmp(errors, "ignore"))
4184 known_errorHandler = 3;
4185 else if (!strcmp(errors, "xmlcharrefreplace"))
4186 known_errorHandler = 4;
4187 else
4188 known_errorHandler = 0;
4190 switch (known_errorHandler) {
4191 case 1: /* strict */
4192 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4193 goto onError;
4194 case 2: /* replace */
4195 for (p = collstart; p < collend; ++p)
4196 *output++ = '?';
4197 /* fall through */
4198 case 3: /* ignore */
4199 p = collend;
4200 break;
4201 case 4: /* xmlcharrefreplace */
4202 /* generate replacement (temporarily (mis)uses p) */
4203 for (p = collstart; p < collend; ++p)
4204 output += sprintf(output, "&#%d;", (int)*p);
4205 p = collend;
4206 break;
4207 default:
4208 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4209 encoding, reason, s, length, &exc,
4210 collstart-s, collend-s, &newpos);
4211 if (repunicode == NULL)
4212 goto onError;
4213 /* generate replacement */
4214 repsize = PyUnicode_GET_SIZE(repunicode);
4215 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4216 Py_UNICODE ch = *uni2;
4217 if (Py_UNICODE_ISSPACE(ch))
4218 *output++ = ' ';
4219 else {
4220 decimal = Py_UNICODE_TODECIMAL(ch);
4221 if (decimal >= 0)
4222 *output++ = '0' + decimal;
4223 else if (0 < ch && ch < 256)
4224 *output++ = (char)ch;
4225 else {
4226 Py_DECREF(repunicode);
4227 raise_encode_exception(&exc, encoding,
4228 s, length, collstart-s, collend-s, reason);
4229 goto onError;
4233 p = s + newpos;
4234 Py_DECREF(repunicode);
4237 /* 0-terminate the output string */
4238 *output++ = '\0';
4239 Py_XDECREF(exc);
4240 Py_XDECREF(errorHandler);
4241 return 0;
4243 onError:
4244 Py_XDECREF(exc);
4245 Py_XDECREF(errorHandler);
4246 return -1;
4249 /* --- Helpers ------------------------------------------------------------ */
4251 #define STRINGLIB_CHAR Py_UNICODE
4253 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4254 #define STRINGLIB_NEW PyUnicode_FromUnicode
4255 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4257 Py_LOCAL_INLINE(int)
4258 STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4260 if (str[0] != other[0])
4261 return 1;
4262 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4265 #define STRINGLIB_EMPTY unicode_empty
4267 #include "stringlib/fastsearch.h"
4269 #include "stringlib/count.h"
4270 #include "stringlib/find.h"
4271 #include "stringlib/partition.h"
/* Helper macro to fix up start/end slice values against (obj)->length.

   It operates on variables named `start` and `end` in the calling scope
   and rewrites them in place: a negative value is taken relative to the
   length and then clamped at 0, and `end` is capped at the length.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside unbraced if/else; invoke it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4286 Py_ssize_t PyUnicode_Count(PyObject *str,
4287 PyObject *substr,
4288 Py_ssize_t start,
4289 Py_ssize_t end)
4291 Py_ssize_t result;
4292 PyUnicodeObject* str_obj;
4293 PyUnicodeObject* sub_obj;
4295 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4296 if (!str_obj)
4297 return -1;
4298 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4299 if (!sub_obj) {
4300 Py_DECREF(str_obj);
4301 return -1;
4304 FIX_START_END(str_obj);
4306 result = stringlib_count(
4307 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4310 Py_DECREF(sub_obj);
4311 Py_DECREF(str_obj);
4313 return result;
4316 Py_ssize_t PyUnicode_Find(PyObject *str,
4317 PyObject *sub,
4318 Py_ssize_t start,
4319 Py_ssize_t end,
4320 int direction)
4322 Py_ssize_t result;
4324 str = PyUnicode_FromObject(str);
4325 if (!str)
4326 return -2;
4327 sub = PyUnicode_FromObject(sub);
4328 if (!sub) {
4329 Py_DECREF(str);
4330 return -2;
4333 if (direction > 0)
4334 result = stringlib_find_slice(
4335 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337 start, end
4339 else
4340 result = stringlib_rfind_slice(
4341 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4342 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4343 start, end
4346 Py_DECREF(str);
4347 Py_DECREF(sub);
4349 return result;
4352 static
4353 int tailmatch(PyUnicodeObject *self,
4354 PyUnicodeObject *substring,
4355 Py_ssize_t start,
4356 Py_ssize_t end,
4357 int direction)
4359 if (substring->length == 0)
4360 return 1;
4362 FIX_START_END(self);
4364 end -= substring->length;
4365 if (end < start)
4366 return 0;
4368 if (direction > 0) {
4369 if (Py_UNICODE_MATCH(self, end, substring))
4370 return 1;
4371 } else {
4372 if (Py_UNICODE_MATCH(self, start, substring))
4373 return 1;
4376 return 0;
4379 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4380 PyObject *substr,
4381 Py_ssize_t start,
4382 Py_ssize_t end,
4383 int direction)
4385 Py_ssize_t result;
4387 str = PyUnicode_FromObject(str);
4388 if (str == NULL)
4389 return -1;
4390 substr = PyUnicode_FromObject(substr);
4391 if (substr == NULL) {
4392 Py_DECREF(str);
4393 return -1;
4396 result = tailmatch((PyUnicodeObject *)str,
4397 (PyUnicodeObject *)substr,
4398 start, end, direction);
4399 Py_DECREF(str);
4400 Py_DECREF(substr);
4401 return result;
4404 /* Apply fixfct filter to the Unicode object self and return a
4405 reference to the modified object */
4407 static
4408 PyObject *fixup(PyUnicodeObject *self,
4409 int (*fixfct)(PyUnicodeObject *s))
4412 PyUnicodeObject *u;
4414 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4415 if (u == NULL)
4416 return NULL;
4418 Py_UNICODE_COPY(u->str, self->str, self->length);
4420 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4421 /* fixfct should return TRUE if it modified the buffer. If
4422 FALSE, return a reference to the original buffer instead
4423 (to save space, not time) */
4424 Py_INCREF(self);
4425 Py_DECREF(u);
4426 return (PyObject*) self;
4428 return (PyObject*) u;
4431 static
4432 int fixupper(PyUnicodeObject *self)
4434 Py_ssize_t len = self->length;
4435 Py_UNICODE *s = self->str;
4436 int status = 0;
4438 while (len-- > 0) {
4439 register Py_UNICODE ch;
4441 ch = Py_UNICODE_TOUPPER(*s);
4442 if (ch != *s) {
4443 status = 1;
4444 *s = ch;
4446 s++;
4449 return status;
4452 static
4453 int fixlower(PyUnicodeObject *self)
4455 Py_ssize_t len = self->length;
4456 Py_UNICODE *s = self->str;
4457 int status = 0;
4459 while (len-- > 0) {
4460 register Py_UNICODE ch;
4462 ch = Py_UNICODE_TOLOWER(*s);
4463 if (ch != *s) {
4464 status = 1;
4465 *s = ch;
4467 s++;
4470 return status;
4473 static
4474 int fixswapcase(PyUnicodeObject *self)
4476 Py_ssize_t len = self->length;
4477 Py_UNICODE *s = self->str;
4478 int status = 0;
4480 while (len-- > 0) {
4481 if (Py_UNICODE_ISUPPER(*s)) {
4482 *s = Py_UNICODE_TOLOWER(*s);
4483 status = 1;
4484 } else if (Py_UNICODE_ISLOWER(*s)) {
4485 *s = Py_UNICODE_TOUPPER(*s);
4486 status = 1;
4488 s++;
4491 return status;
4494 static
4495 int fixcapitalize(PyUnicodeObject *self)
4497 Py_ssize_t len = self->length;
4498 Py_UNICODE *s = self->str;
4499 int status = 0;
4501 if (len == 0)
4502 return 0;
4503 if (Py_UNICODE_ISLOWER(*s)) {
4504 *s = Py_UNICODE_TOUPPER(*s);
4505 status = 1;
4507 s++;
4508 while (--len > 0) {
4509 if (Py_UNICODE_ISUPPER(*s)) {
4510 *s = Py_UNICODE_TOLOWER(*s);
4511 status = 1;
4513 s++;
4515 return status;
4518 static
4519 int fixtitle(PyUnicodeObject *self)
4521 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4522 register Py_UNICODE *e;
4523 int previous_is_cased;
4525 /* Shortcut for single character strings */
4526 if (PyUnicode_GET_SIZE(self) == 1) {
4527 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4528 if (*p != ch) {
4529 *p = ch;
4530 return 1;
4532 else
4533 return 0;
4536 e = p + PyUnicode_GET_SIZE(self);
4537 previous_is_cased = 0;
4538 for (; p < e; p++) {
4539 register const Py_UNICODE ch = *p;
4541 if (previous_is_cased)
4542 *p = Py_UNICODE_TOLOWER(ch);
4543 else
4544 *p = Py_UNICODE_TOTITLE(ch);
4546 if (Py_UNICODE_ISLOWER(ch) ||
4547 Py_UNICODE_ISUPPER(ch) ||
4548 Py_UNICODE_ISTITLE(ch))
4549 previous_is_cased = 1;
4550 else
4551 previous_is_cased = 0;
4553 return 1;
4556 PyObject *
4557 PyUnicode_Join(PyObject *separator, PyObject *seq)
4559 PyObject *internal_separator = NULL;
4560 const Py_UNICODE blank = ' ';
4561 const Py_UNICODE *sep = &blank;
4562 Py_ssize_t seplen = 1;
4563 PyUnicodeObject *res = NULL; /* the result */
4564 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4565 Py_ssize_t res_used; /* # used bytes */
4566 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4567 PyObject *fseq; /* PySequence_Fast(seq) */
4568 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
4569 PyObject *item;
4570 Py_ssize_t i;
4572 fseq = PySequence_Fast(seq, "");
4573 if (fseq == NULL) {
4574 return NULL;
4577 /* Grrrr. A codec may be invoked to convert str objects to
4578 * Unicode, and so it's possible to call back into Python code
4579 * during PyUnicode_FromObject(), and so it's possible for a sick
4580 * codec to change the size of fseq (if seq is a list). Therefore
4581 * we have to keep refetching the size -- can't assume seqlen
4582 * is invariant.
4584 seqlen = PySequence_Fast_GET_SIZE(fseq);
4585 /* If empty sequence, return u"". */
4586 if (seqlen == 0) {
4587 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4588 goto Done;
4590 /* If singleton sequence with an exact Unicode, return that. */
4591 if (seqlen == 1) {
4592 item = PySequence_Fast_GET_ITEM(fseq, 0);
4593 if (PyUnicode_CheckExact(item)) {
4594 Py_INCREF(item);
4595 res = (PyUnicodeObject *)item;
4596 goto Done;
4600 /* At least two items to join, or one that isn't exact Unicode. */
4601 if (seqlen > 1) {
4602 /* Set up sep and seplen -- they're needed. */
4603 if (separator == NULL) {
4604 sep = &blank;
4605 seplen = 1;
4607 else {
4608 internal_separator = PyUnicode_FromObject(separator);
4609 if (internal_separator == NULL)
4610 goto onError;
4611 sep = PyUnicode_AS_UNICODE(internal_separator);
4612 seplen = PyUnicode_GET_SIZE(internal_separator);
4613 /* In case PyUnicode_FromObject() mutated seq. */
4614 seqlen = PySequence_Fast_GET_SIZE(fseq);
4618 /* Get space. */
4619 res = _PyUnicode_New(res_alloc);
4620 if (res == NULL)
4621 goto onError;
4622 res_p = PyUnicode_AS_UNICODE(res);
4623 res_used = 0;
4625 for (i = 0; i < seqlen; ++i) {
4626 Py_ssize_t itemlen;
4627 Py_ssize_t new_res_used;
4629 item = PySequence_Fast_GET_ITEM(fseq, i);
4630 /* Convert item to Unicode. */
4631 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4632 PyErr_Format(PyExc_TypeError,
4633 "sequence item %zd: expected string or Unicode,"
4634 " %.80s found",
4635 i, item->ob_type->tp_name);
4636 goto onError;
4638 item = PyUnicode_FromObject(item);
4639 if (item == NULL)
4640 goto onError;
4641 /* We own a reference to item from here on. */
4643 /* In case PyUnicode_FromObject() mutated seq. */
4644 seqlen = PySequence_Fast_GET_SIZE(fseq);
4646 /* Make sure we have enough space for the separator and the item. */
4647 itemlen = PyUnicode_GET_SIZE(item);
4648 new_res_used = res_used + itemlen;
4649 if (new_res_used < 0)
4650 goto Overflow;
4651 if (i < seqlen - 1) {
4652 new_res_used += seplen;
4653 if (new_res_used < 0)
4654 goto Overflow;
4656 if (new_res_used > res_alloc) {
4657 /* double allocated size until it's big enough */
4658 do {
4659 res_alloc += res_alloc;
4660 if (res_alloc <= 0)
4661 goto Overflow;
4662 } while (new_res_used > res_alloc);
4663 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4664 Py_DECREF(item);
4665 goto onError;
4667 res_p = PyUnicode_AS_UNICODE(res) + res_used;
4670 /* Copy item, and maybe the separator. */
4671 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4672 res_p += itemlen;
4673 if (i < seqlen - 1) {
4674 Py_UNICODE_COPY(res_p, sep, seplen);
4675 res_p += seplen;
4677 Py_DECREF(item);
4678 res_used = new_res_used;
4681 /* Shrink res to match the used area; this probably can't fail,
4682 * but it's cheap to check.
4684 if (_PyUnicode_Resize(&res, res_used) < 0)
4685 goto onError;
4687 Done:
4688 Py_XDECREF(internal_separator);
4689 Py_DECREF(fseq);
4690 return (PyObject *)res;
4692 Overflow:
4693 PyErr_SetString(PyExc_OverflowError,
4694 "join() result is too long for a Python string");
4695 Py_DECREF(item);
4696 /* fall through */
4698 onError:
4699 Py_XDECREF(internal_separator);
4700 Py_DECREF(fseq);
4701 Py_XDECREF(res);
4702 return NULL;
4705 static
4706 PyUnicodeObject *pad(PyUnicodeObject *self,
4707 Py_ssize_t left,
4708 Py_ssize_t right,
4709 Py_UNICODE fill)
4711 PyUnicodeObject *u;
4713 if (left < 0)
4714 left = 0;
4715 if (right < 0)
4716 right = 0;
4718 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4719 Py_INCREF(self);
4720 return self;
4723 u = _PyUnicode_New(left + self->length + right);
4724 if (u) {
4725 if (left)
4726 Py_UNICODE_FILL(u->str, fill, left);
4727 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4728 if (right)
4729 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4732 return u;
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on names from the calling function: a PyObject* variable `str`
   used as scratch, the target `list`, and an `onError` label that is
   jumped to when creating the slice or appending it fails.  The
   temporary reference in `str` is released in either case.

   Wrapped in do { } while (0) so the macro expands to one statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
4746 static
4747 PyObject *split_whitespace(PyUnicodeObject *self,
4748 PyObject *list,
4749 Py_ssize_t maxcount)
4751 register Py_ssize_t i;
4752 register Py_ssize_t j;
4753 Py_ssize_t len = self->length;
4754 PyObject *str;
4756 for (i = j = 0; i < len; ) {
4757 /* find a token */
4758 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4759 i++;
4760 j = i;
4761 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4762 i++;
4763 if (j < i) {
4764 if (maxcount-- <= 0)
4765 break;
4766 SPLIT_APPEND(self->str, j, i);
4767 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4768 i++;
4769 j = i;
4772 if (j < len) {
4773 SPLIT_APPEND(self->str, j, len);
4775 return list;
4777 onError:
4778 Py_DECREF(list);
4779 return NULL;
4782 PyObject *PyUnicode_Splitlines(PyObject *string,
4783 int keepends)
4785 register Py_ssize_t i;
4786 register Py_ssize_t j;
4787 Py_ssize_t len;
4788 PyObject *list;
4789 PyObject *str;
4790 Py_UNICODE *data;
4792 string = PyUnicode_FromObject(string);
4793 if (string == NULL)
4794 return NULL;
4795 data = PyUnicode_AS_UNICODE(string);
4796 len = PyUnicode_GET_SIZE(string);
4798 list = PyList_New(0);
4799 if (!list)
4800 goto onError;
4802 for (i = j = 0; i < len; ) {
4803 Py_ssize_t eol;
4805 /* Find a line and append it */
4806 while (i < len && !BLOOM_LINEBREAK(data[i]))
4807 i++;
4809 /* Skip the line break reading CRLF as one line break */
4810 eol = i;
4811 if (i < len) {
4812 if (data[i] == '\r' && i + 1 < len &&
4813 data[i+1] == '\n')
4814 i += 2;
4815 else
4816 i++;
4817 if (keepends)
4818 eol = i;
4820 SPLIT_APPEND(data, j, eol);
4821 j = i;
4823 if (j < len) {
4824 SPLIT_APPEND(data, j, len);
4827 Py_DECREF(string);
4828 return list;
4830 onError:
4831 Py_XDECREF(list);
4832 Py_DECREF(string);
4833 return NULL;
4836 static
4837 PyObject *split_char(PyUnicodeObject *self,
4838 PyObject *list,
4839 Py_UNICODE ch,
4840 Py_ssize_t maxcount)
4842 register Py_ssize_t i;
4843 register Py_ssize_t j;
4844 Py_ssize_t len = self->length;
4845 PyObject *str;
4847 for (i = j = 0; i < len; ) {
4848 if (self->str[i] == ch) {
4849 if (maxcount-- <= 0)
4850 break;
4851 SPLIT_APPEND(self->str, j, i);
4852 i = j = i + 1;
4853 } else
4854 i++;
4856 if (j <= len) {
4857 SPLIT_APPEND(self->str, j, len);
4859 return list;
4861 onError:
4862 Py_DECREF(list);
4863 return NULL;
4866 static
4867 PyObject *split_substring(PyUnicodeObject *self,
4868 PyObject *list,
4869 PyUnicodeObject *substring,
4870 Py_ssize_t maxcount)
4872 register Py_ssize_t i;
4873 register Py_ssize_t j;
4874 Py_ssize_t len = self->length;
4875 Py_ssize_t sublen = substring->length;
4876 PyObject *str;
4878 for (i = j = 0; i <= len - sublen; ) {
4879 if (Py_UNICODE_MATCH(self, i, substring)) {
4880 if (maxcount-- <= 0)
4881 break;
4882 SPLIT_APPEND(self->str, j, i);
4883 i = j = i + sublen;
4884 } else
4885 i++;
4887 if (j <= len) {
4888 SPLIT_APPEND(self->str, j, len);
4890 return list;
4892 onError:
4893 Py_DECREF(list);
4894 return NULL;
4897 static
4898 PyObject *rsplit_whitespace(PyUnicodeObject *self,
4899 PyObject *list,
4900 Py_ssize_t maxcount)
4902 register Py_ssize_t i;
4903 register Py_ssize_t j;
4904 Py_ssize_t len = self->length;
4905 PyObject *str;
4907 for (i = j = len - 1; i >= 0; ) {
4908 /* find a token */
4909 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4910 i--;
4911 j = i;
4912 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4913 i--;
4914 if (j > i) {
4915 if (maxcount-- <= 0)
4916 break;
4917 SPLIT_APPEND(self->str, i + 1, j + 1);
4918 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4919 i--;
4920 j = i;
4923 if (j >= 0) {
4924 SPLIT_APPEND(self->str, 0, j + 1);
4926 if (PyList_Reverse(list) < 0)
4927 goto onError;
4928 return list;
4930 onError:
4931 Py_DECREF(list);
4932 return NULL;
4935 static
4936 PyObject *rsplit_char(PyUnicodeObject *self,
4937 PyObject *list,
4938 Py_UNICODE ch,
4939 Py_ssize_t maxcount)
4941 register Py_ssize_t i;
4942 register Py_ssize_t j;
4943 Py_ssize_t len = self->length;
4944 PyObject *str;
4946 for (i = j = len - 1; i >= 0; ) {
4947 if (self->str[i] == ch) {
4948 if (maxcount-- <= 0)
4949 break;
4950 SPLIT_APPEND(self->str, i + 1, j + 1);
4951 j = i = i - 1;
4952 } else
4953 i--;
4955 if (j >= -1) {
4956 SPLIT_APPEND(self->str, 0, j + 1);
4958 if (PyList_Reverse(list) < 0)
4959 goto onError;
4960 return list;
4962 onError:
4963 Py_DECREF(list);
4964 return NULL;
4967 static
4968 PyObject *rsplit_substring(PyUnicodeObject *self,
4969 PyObject *list,
4970 PyUnicodeObject *substring,
4971 Py_ssize_t maxcount)
4973 register Py_ssize_t i;
4974 register Py_ssize_t j;
4975 Py_ssize_t len = self->length;
4976 Py_ssize_t sublen = substring->length;
4977 PyObject *str;
4979 for (i = len - sublen, j = len; i >= 0; ) {
4980 if (Py_UNICODE_MATCH(self, i, substring)) {
4981 if (maxcount-- <= 0)
4982 break;
4983 SPLIT_APPEND(self->str, i + sublen, j);
4984 j = i;
4985 i -= sublen;
4986 } else
4987 i--;
4989 if (j >= 0) {
4990 SPLIT_APPEND(self->str, 0, j);
4992 if (PyList_Reverse(list) < 0)
4993 goto onError;
4994 return list;
4996 onError:
4997 Py_DECREF(list);
4998 return NULL;
5001 #undef SPLIT_APPEND
5003 static
5004 PyObject *split(PyUnicodeObject *self,
5005 PyUnicodeObject *substring,
5006 Py_ssize_t maxcount)
5008 PyObject *list;
5010 if (maxcount < 0)
5011 maxcount = PY_SSIZE_T_MAX;
5013 list = PyList_New(0);
5014 if (!list)
5015 return NULL;
5017 if (substring == NULL)
5018 return split_whitespace(self,list,maxcount);
5020 else if (substring->length == 1)
5021 return split_char(self,list,substring->str[0],maxcount);
5023 else if (substring->length == 0) {
5024 Py_DECREF(list);
5025 PyErr_SetString(PyExc_ValueError, "empty separator");
5026 return NULL;
5028 else
5029 return split_substring(self,list,substring,maxcount);
5032 static
5033 PyObject *rsplit(PyUnicodeObject *self,
5034 PyUnicodeObject *substring,
5035 Py_ssize_t maxcount)
5037 PyObject *list;
5039 if (maxcount < 0)
5040 maxcount = PY_SSIZE_T_MAX;
5042 list = PyList_New(0);
5043 if (!list)
5044 return NULL;
5046 if (substring == NULL)
5047 return rsplit_whitespace(self,list,maxcount);
5049 else if (substring->length == 1)
5050 return rsplit_char(self,list,substring->str[0],maxcount);
5052 else if (substring->length == 0) {
5053 Py_DECREF(list);
5054 PyErr_SetString(PyExc_ValueError, "empty separator");
5055 return NULL;
5057 else
5058 return rsplit_substring(self,list,substring,maxcount);
5061 static
5062 PyObject *replace(PyUnicodeObject *self,
5063 PyUnicodeObject *str1,
5064 PyUnicodeObject *str2,
5065 Py_ssize_t maxcount)
5067 PyUnicodeObject *u;
5069 if (maxcount < 0)
5070 maxcount = PY_SSIZE_T_MAX;
5072 if (str1->length == str2->length) {
5073 /* same length */
5074 Py_ssize_t i;
5075 if (str1->length == 1) {
5076 /* replace characters */
5077 Py_UNICODE u1, u2;
5078 if (!findchar(self->str, self->length, str1->str[0]))
5079 goto nothing;
5080 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5081 if (!u)
5082 return NULL;
5083 Py_UNICODE_COPY(u->str, self->str, self->length);
5084 u1 = str1->str[0];
5085 u2 = str2->str[0];
5086 for (i = 0; i < u->length; i++)
5087 if (u->str[i] == u1) {
5088 if (--maxcount < 0)
5089 break;
5090 u->str[i] = u2;
5092 } else {
5093 i = fastsearch(
5094 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5096 if (i < 0)
5097 goto nothing;
5098 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5099 if (!u)
5100 return NULL;
5101 Py_UNICODE_COPY(u->str, self->str, self->length);
5102 while (i <= self->length - str1->length)
5103 if (Py_UNICODE_MATCH(self, i, str1)) {
5104 if (--maxcount < 0)
5105 break;
5106 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5107 i += str1->length;
5108 } else
5109 i++;
5111 } else {
5113 Py_ssize_t n, i, j, e;
5114 Py_ssize_t product, new_size, delta;
5115 Py_UNICODE *p;
5117 /* replace strings */
5118 n = stringlib_count(self->str, self->length, str1->str, str1->length);
5119 if (n > maxcount)
5120 n = maxcount;
5121 if (n == 0)
5122 goto nothing;
5123 /* new_size = self->length + n * (str2->length - str1->length)); */
5124 delta = (str2->length - str1->length);
5125 if (delta == 0) {
5126 new_size = self->length;
5127 } else {
5128 product = n * (str2->length - str1->length);
5129 if ((product / (str2->length - str1->length)) != n) {
5130 PyErr_SetString(PyExc_OverflowError,
5131 "replace string is too long");
5132 return NULL;
5134 new_size = self->length + product;
5135 if (new_size < 0) {
5136 PyErr_SetString(PyExc_OverflowError,
5137 "replace string is too long");
5138 return NULL;
5141 u = _PyUnicode_New(new_size);
5142 if (!u)
5143 return NULL;
5144 i = 0;
5145 p = u->str;
5146 e = self->length - str1->length;
5147 if (str1->length > 0) {
5148 while (n-- > 0) {
5149 /* look for next match */
5150 j = i;
5151 while (j <= e) {
5152 if (Py_UNICODE_MATCH(self, j, str1))
5153 break;
5154 j++;
5156 if (j > i) {
5157 if (j > e)
5158 break;
5159 /* copy unchanged part [i:j] */
5160 Py_UNICODE_COPY(p, self->str+i, j-i);
5161 p += j - i;
5163 /* copy substitution string */
5164 if (str2->length > 0) {
5165 Py_UNICODE_COPY(p, str2->str, str2->length);
5166 p += str2->length;
5168 i = j + str1->length;
5170 if (i < self->length)
5171 /* copy tail [i:] */
5172 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5173 } else {
5174 /* interleave */
5175 while (n > 0) {
5176 Py_UNICODE_COPY(p, str2->str, str2->length);
5177 p += str2->length;
5178 if (--n <= 0)
5179 break;
5180 *p++ = self->str[i++];
5182 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5185 return (PyObject *) u;
5187 nothing:
5188 /* nothing to replace; return original string (when possible) */
5189 if (PyUnicode_CheckExact(self)) {
5190 Py_INCREF(self);
5191 return (PyObject *) self;
5193 return PyUnicode_FromUnicode(self->str, self->length);
5196 /* --- Unicode Object Methods --------------------------------------------- */
5198 PyDoc_STRVAR(title__doc__,
5199 "S.title() -> unicode\n\
5201 Return a titlecased version of S, i.e. words start with title case\n\
5202 characters, all remaining cased characters have lower case.");
5204 static PyObject*
5205 unicode_title(PyUnicodeObject *self)
5207 return fixup(self, fixtitle);
5210 PyDoc_STRVAR(capitalize__doc__,
5211 "S.capitalize() -> unicode\n\
5213 Return a capitalized version of S, i.e. make the first character\n\
5214 have upper case.");
5216 static PyObject*
5217 unicode_capitalize(PyUnicodeObject *self)
5219 return fixup(self, fixcapitalize);
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word, then join the words again.  Returns NULL if
   any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

 onError:
    /* reached on success too; item is NULL only when a step failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
5260 /* Argument converter. Coerces to a single unicode character */
5262 static int
5263 convert_uc(PyObject *obj, void *addr)
5265 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5266 PyObject *uniobj;
5267 Py_UNICODE *unistr;
5269 uniobj = PyUnicode_FromObject(obj);
5270 if (uniobj == NULL) {
5271 PyErr_SetString(PyExc_TypeError,
5272 "The fill character cannot be converted to Unicode");
5273 return 0;
5275 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5276 PyErr_SetString(PyExc_TypeError,
5277 "The fill character must be exactly one character long");
5278 Py_DECREF(uniobj);
5279 return 0;
5281 unistr = PyUnicode_AS_UNICODE(uniobj);
5282 *fillcharloc = unistr[0];
5283 Py_DECREF(uniobj);
5284 return 1;
5287 PyDoc_STRVAR(center__doc__,
5288 "S.center(width[, fillchar]) -> unicode\n\
5290 Return S centered in a Unicode string of length width. Padding is\n\
5291 done using the specified fill character (default is a space)");
5293 static PyObject *
5294 unicode_center(PyUnicodeObject *self, PyObject *args)
5296 Py_ssize_t marg, left;
5297 Py_ssize_t width;
5298 Py_UNICODE fillchar = ' ';
5300 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5301 return NULL;
5303 if (self->length >= width && PyUnicode_CheckExact(self)) {
5304 Py_INCREF(self);
5305 return (PyObject*) self;
5308 marg = width - self->length;
5309 left = marg / 2 + (marg & width & 1);
5311 return (PyObject*) pad(self, left, marg - left, fillchar);
5314 #if 0
5316 /* This code should go into some future Unicode collation support
5317 module. The basic comparison should compare ordinals on a naive
5318 basis (this is what Java does and thus JPython too). */
5320 /* speedy UTF-16 code point order comparison */
5321 /* gleaned from: */
5322 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5324 static short utf16Fixup[32] =
5326 0, 0, 0, 0, 0, 0, 0, 0,
5327 0, 0, 0, 0, 0, 0, 0, 0,
5328 0, 0, 0, 0, 0, 0, 0, 0,
5329 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5332 static int
5333 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5335 Py_ssize_t len1, len2;
5337 Py_UNICODE *s1 = str1->str;
5338 Py_UNICODE *s2 = str2->str;
5340 len1 = str1->length;
5341 len2 = str2->length;
5343 while (len1 > 0 && len2 > 0) {
5344 Py_UNICODE c1, c2;
5346 c1 = *s1++;
5347 c2 = *s2++;
5349 if (c1 > (1<<11) * 26)
5350 c1 += utf16Fixup[c1>>11];
5351 if (c2 > (1<<11) * 26)
5352 c2 += utf16Fixup[c2>>11];
5353 /* now c1 and c2 are in UTF-32-compatible order */
5355 if (c1 != c2)
5356 return (c1 < c2) ? -1 : 1;
5358 len1--; len2--;
5361 return (len1 < len2) ? -1 : (len1 != len2);
5364 #else
5366 static int
5367 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5369 register Py_ssize_t len1, len2;
5371 Py_UNICODE *s1 = str1->str;
5372 Py_UNICODE *s2 = str2->str;
5374 len1 = str1->length;
5375 len2 = str2->length;
5377 while (len1 > 0 && len2 > 0) {
5378 Py_UNICODE c1, c2;
5380 c1 = *s1++;
5381 c2 = *s2++;
5383 if (c1 != c2)
5384 return (c1 < c2) ? -1 : 1;
5386 len1--; len2--;
5389 return (len1 < len2) ? -1 : (len1 != len2);
5392 #endif
5394 int PyUnicode_Compare(PyObject *left,
5395 PyObject *right)
5397 PyUnicodeObject *u = NULL, *v = NULL;
5398 int result;
5400 /* Coerce the two arguments */
5401 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5402 if (u == NULL)
5403 goto onError;
5404 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5405 if (v == NULL)
5406 goto onError;
5408 /* Shortcut for empty or interned objects */
5409 if (v == u) {
5410 Py_DECREF(u);
5411 Py_DECREF(v);
5412 return 0;
5415 result = unicode_compare(u, v);
5417 Py_DECREF(u);
5418 Py_DECREF(v);
5419 return result;
5421 onError:
5422 Py_XDECREF(u);
5423 Py_XDECREF(v);
5424 return -1;
5427 PyObject *PyUnicode_RichCompare(PyObject *left,
5428 PyObject *right,
5429 int op)
5431 int result;
5433 result = PyUnicode_Compare(left, right);
5434 if (result == -1 && PyErr_Occurred())
5435 goto onError;
5437 /* Convert the return value to a Boolean */
5438 switch (op) {
5439 case Py_EQ:
5440 result = (result == 0);
5441 break;
5442 case Py_NE:
5443 result = (result != 0);
5444 break;
5445 case Py_LE:
5446 result = (result <= 0);
5447 break;
5448 case Py_GE:
5449 result = (result >= 0);
5450 break;
5451 case Py_LT:
5452 result = (result == -1);
5453 break;
5454 case Py_GT:
5455 result = (result == 1);
5456 break;
5458 return PyBool_FromLong(result);
5460 onError:
5462 /* Standard case
5464 Type errors mean that PyUnicode_FromObject() could not convert
5465 one of the arguments (usually the right hand side) to Unicode,
5466 ie. we can't handle the comparison request. However, it is
5467 possible that the other object knows a comparison method, which
5468 is why we return Py_NotImplemented to give the other object a
5469 chance.
5472 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5473 PyErr_Clear();
5474 Py_INCREF(Py_NotImplemented);
5475 return Py_NotImplemented;
5477 if (op != Py_EQ && op != Py_NE)
5478 return NULL;
5480 /* Equality comparison.
5482 This is a special case: we silence any PyExc_UnicodeDecodeError
5483 and instead turn it into a PyErr_UnicodeWarning.
5486 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5487 return NULL;
5488 PyErr_Clear();
5489 if (PyErr_Warn(PyExc_UnicodeWarning,
5490 (op == Py_EQ) ?
5491 "Unicode equal comparison "
5492 "failed to convert both arguments to Unicode - "
5493 "interpreting them as being unequal" :
5494 "Unicode unequal comparison "
5495 "failed to convert both arguments to Unicode - "
5496 "interpreting them as being unequal"
5497 ) < 0)
5498 return NULL;
5499 result = (op == Py_NE);
5500 return PyBool_FromLong(result);
5503 int PyUnicode_Contains(PyObject *container,
5504 PyObject *element)
5506 PyObject *str, *sub;
5507 int result;
5509 /* Coerce the two arguments */
5510 sub = PyUnicode_FromObject(element);
5511 if (!sub) {
5512 PyErr_SetString(PyExc_TypeError,
5513 "'in <string>' requires string as left operand");
5514 return -1;
5517 str = PyUnicode_FromObject(container);
5518 if (!str) {
5519 Py_DECREF(sub);
5520 return -1;
5523 result = stringlib_contains_obj(str, sub);
5525 Py_DECREF(str);
5526 Py_DECREF(sub);
5528 return result;
5531 /* Concat to string or Unicode object giving a new Unicode object. */
5533 PyObject *PyUnicode_Concat(PyObject *left,
5534 PyObject *right)
5536 PyUnicodeObject *u = NULL, *v = NULL, *w;
5538 /* Coerce the two arguments */
5539 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5540 if (u == NULL)
5541 goto onError;
5542 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5543 if (v == NULL)
5544 goto onError;
5546 /* Shortcuts */
5547 if (v == unicode_empty) {
5548 Py_DECREF(v);
5549 return (PyObject *)u;
5551 if (u == unicode_empty) {
5552 Py_DECREF(u);
5553 return (PyObject *)v;
5556 /* Concat the two Unicode strings */
5557 w = _PyUnicode_New(u->length + v->length);
5558 if (w == NULL)
5559 goto onError;
5560 Py_UNICODE_COPY(w->str, u->str, u->length);
5561 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5563 Py_DECREF(u);
5564 Py_DECREF(v);
5565 return (PyObject *)w;
5567 onError:
5568 Py_XDECREF(u);
5569 Py_XDECREF(v);
5570 return NULL;
5573 PyDoc_STRVAR(count__doc__,
5574 "S.count(sub[, start[, end]]) -> int\n\
5576 Return the number of non-overlapping occurrences of substring sub in\n\
5577 Unicode string S[start:end]. Optional arguments start and end are\n\
5578 interpreted as in slice notation.");
5580 static PyObject *
5581 unicode_count(PyUnicodeObject *self, PyObject *args)
5583 PyUnicodeObject *substring;
5584 Py_ssize_t start = 0;
5585 Py_ssize_t end = PY_SSIZE_T_MAX;
5586 PyObject *result;
5588 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5589 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5590 return NULL;
5592 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5593 (PyObject *)substring);
5594 if (substring == NULL)
5595 return NULL;
5597 FIX_START_END(self);
5599 result = PyInt_FromSsize_t(
5600 stringlib_count(self->str + start, end - start,
5601 substring->str, substring->length)
5604 Py_DECREF(substring);
5606 return result;
5609 PyDoc_STRVAR(encode__doc__,
5610 "S.encode([encoding[,errors]]) -> string or unicode\n\
5612 Encodes S using the codec registered for encoding. encoding defaults\n\
5613 to the default encoding. errors may be given to set a different error\n\
5614 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5615 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5616 'xmlcharrefreplace' as well as any other name registered with\n\
5617 codecs.register_error that can handle UnicodeEncodeErrors.");
5619 static PyObject *
5620 unicode_encode(PyUnicodeObject *self, PyObject *args)
5622 char *encoding = NULL;
5623 char *errors = NULL;
5624 PyObject *v;
5626 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5627 return NULL;
5628 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5629 if (v == NULL)
5630 goto onError;
5631 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5632 PyErr_Format(PyExc_TypeError,
5633 "encoder did not return a string/unicode object "
5634 "(type=%.400s)",
5635 v->ob_type->tp_name);
5636 Py_DECREF(v);
5637 return NULL;
5639 return v;
5641 onError:
5642 return NULL;
5645 PyDoc_STRVAR(decode__doc__,
5646 "S.decode([encoding[,errors]]) -> string or unicode\n\
5648 Decodes S using the codec registered for encoding. encoding defaults\n\
5649 to the default encoding. errors may be given to set a different error\n\
5650 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5651 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5652 as well as any other name registerd with codecs.register_error that is\n\
5653 able to handle UnicodeDecodeErrors.");
5655 static PyObject *
5656 unicode_decode(PyUnicodeObject *self, PyObject *args)
5658 char *encoding = NULL;
5659 char *errors = NULL;
5660 PyObject *v;
5662 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5663 return NULL;
5664 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5665 if (v == NULL)
5666 goto onError;
5667 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5668 PyErr_Format(PyExc_TypeError,
5669 "decoder did not return a string/unicode object "
5670 "(type=%.400s)",
5671 v->ob_type->tp_name);
5672 Py_DECREF(v);
5673 return NULL;
5675 return v;
5677 onError:
5678 return NULL;
5681 PyDoc_STRVAR(expandtabs__doc__,
5682 "S.expandtabs([tabsize]) -> unicode\n\
5684 Return a copy of S where all tab characters are expanded using spaces.\n\
5685 If tabsize is not given, a tab size of 8 characters is assumed.");
5687 static PyObject*
5688 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5690 Py_UNICODE *e;
5691 Py_UNICODE *p;
5692 Py_UNICODE *q;
5693 Py_ssize_t i, j;
5694 PyUnicodeObject *u;
5695 int tabsize = 8;
5697 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5698 return NULL;
5700 /* First pass: determine size of output string */
5701 i = j = 0;
5702 e = self->str + self->length;
5703 for (p = self->str; p < e; p++)
5704 if (*p == '\t') {
5705 if (tabsize > 0)
5706 j += tabsize - (j % tabsize);
5708 else {
5709 j++;
5710 if (*p == '\n' || *p == '\r') {
5711 i += j;
5712 j = 0;
5716 /* Second pass: create output string and fill it */
5717 u = _PyUnicode_New(i + j);
5718 if (!u)
5719 return NULL;
5721 j = 0;
5722 q = u->str;
5724 for (p = self->str; p < e; p++)
5725 if (*p == '\t') {
5726 if (tabsize > 0) {
5727 i = tabsize - (j % tabsize);
5728 j += i;
5729 while (i--)
5730 *q++ = ' ';
5733 else {
5734 j++;
5735 *q++ = *p;
5736 if (*p == '\n' || *p == '\r')
5737 j = 0;
5740 return (PyObject*) u;
5743 PyDoc_STRVAR(find__doc__,
5744 "S.find(sub [,start [,end]]) -> int\n\
5746 Return the lowest index in S where substring sub is found,\n\
5747 such that sub is contained within s[start,end]. Optional\n\
5748 arguments start and end are interpreted as in slice notation.\n\
5750 Return -1 on failure.");
5752 static PyObject *
5753 unicode_find(PyUnicodeObject *self, PyObject *args)
5755 PyObject *substring;
5756 Py_ssize_t start = 0;
5757 Py_ssize_t end = PY_SSIZE_T_MAX;
5758 Py_ssize_t result;
5760 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5761 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5762 return NULL;
5763 substring = PyUnicode_FromObject(substring);
5764 if (!substring)
5765 return NULL;
5767 result = stringlib_find_slice(
5768 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5769 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5770 start, end
5773 Py_DECREF(substring);
5775 return PyInt_FromSsize_t(result);
5778 static PyObject *
5779 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
5781 if (index < 0 || index >= self->length) {
5782 PyErr_SetString(PyExc_IndexError, "string index out of range");
5783 return NULL;
5786 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5789 static long
5790 unicode_hash(PyUnicodeObject *self)
5792 /* Since Unicode objects compare equal to their ASCII string
5793 counterparts, they should use the individual character values
5794 as basis for their hash value. This is needed to assure that
5795 strings and Unicode objects behave in the same way as
5796 dictionary keys. */
5798 register Py_ssize_t len;
5799 register Py_UNICODE *p;
5800 register long x;
5802 if (self->hash != -1)
5803 return self->hash;
5804 len = PyUnicode_GET_SIZE(self);
5805 p = PyUnicode_AS_UNICODE(self);
5806 x = *p << 7;
5807 while (--len >= 0)
5808 x = (1000003*x) ^ *p++;
5809 x ^= PyUnicode_GET_SIZE(self);
5810 if (x == -1)
5811 x = -2;
5812 self->hash = x;
5813 return x;
5816 PyDoc_STRVAR(index__doc__,
5817 "S.index(sub [,start [,end]]) -> int\n\
5819 Like S.find() but raise ValueError when the substring is not found.");
5821 static PyObject *
5822 unicode_index(PyUnicodeObject *self, PyObject *args)
5824 Py_ssize_t result;
5825 PyObject *substring;
5826 Py_ssize_t start = 0;
5827 Py_ssize_t end = PY_SSIZE_T_MAX;
5829 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5830 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5831 return NULL;
5832 substring = PyUnicode_FromObject(substring);
5833 if (!substring)
5834 return NULL;
5836 result = stringlib_find_slice(
5837 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5838 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5839 start, end
5842 Py_DECREF(substring);
5844 if (result < 0) {
5845 PyErr_SetString(PyExc_ValueError, "substring not found");
5846 return NULL;
5849 return PyInt_FromSsize_t(result);
5852 PyDoc_STRVAR(islower__doc__,
5853 "S.islower() -> bool\n\
5855 Return True if all cased characters in S are lowercase and there is\n\
5856 at least one cased character in S, False otherwise.");
5858 static PyObject*
5859 unicode_islower(PyUnicodeObject *self)
5861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5862 register const Py_UNICODE *e;
5863 int cased;
5865 /* Shortcut for single character strings */
5866 if (PyUnicode_GET_SIZE(self) == 1)
5867 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5869 /* Special case for empty strings */
5870 if (PyUnicode_GET_SIZE(self) == 0)
5871 return PyBool_FromLong(0);
5873 e = p + PyUnicode_GET_SIZE(self);
5874 cased = 0;
5875 for (; p < e; p++) {
5876 register const Py_UNICODE ch = *p;
5878 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5879 return PyBool_FromLong(0);
5880 else if (!cased && Py_UNICODE_ISLOWER(ch))
5881 cased = 1;
5883 return PyBool_FromLong(cased);
5886 PyDoc_STRVAR(isupper__doc__,
5887 "S.isupper() -> bool\n\
5889 Return True if all cased characters in S are uppercase and there is\n\
5890 at least one cased character in S, False otherwise.");
5892 static PyObject*
5893 unicode_isupper(PyUnicodeObject *self)
5895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5896 register const Py_UNICODE *e;
5897 int cased;
5899 /* Shortcut for single character strings */
5900 if (PyUnicode_GET_SIZE(self) == 1)
5901 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5903 /* Special case for empty strings */
5904 if (PyUnicode_GET_SIZE(self) == 0)
5905 return PyBool_FromLong(0);
5907 e = p + PyUnicode_GET_SIZE(self);
5908 cased = 0;
5909 for (; p < e; p++) {
5910 register const Py_UNICODE ch = *p;
5912 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5913 return PyBool_FromLong(0);
5914 else if (!cased && Py_UNICODE_ISUPPER(ch))
5915 cased = 1;
5917 return PyBool_FromLong(cased);
5920 PyDoc_STRVAR(istitle__doc__,
5921 "S.istitle() -> bool\n\
5923 Return True if S is a titlecased string and there is at least one\n\
5924 character in S, i.e. upper- and titlecase characters may only\n\
5925 follow uncased characters and lowercase characters only cased ones.\n\
5926 Return False otherwise.");
5928 static PyObject*
5929 unicode_istitle(PyUnicodeObject *self)
5931 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5932 register const Py_UNICODE *e;
5933 int cased, previous_is_cased;
5935 /* Shortcut for single character strings */
5936 if (PyUnicode_GET_SIZE(self) == 1)
5937 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5938 (Py_UNICODE_ISUPPER(*p) != 0));
5940 /* Special case for empty strings */
5941 if (PyUnicode_GET_SIZE(self) == 0)
5942 return PyBool_FromLong(0);
5944 e = p + PyUnicode_GET_SIZE(self);
5945 cased = 0;
5946 previous_is_cased = 0;
5947 for (; p < e; p++) {
5948 register const Py_UNICODE ch = *p;
5950 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5951 if (previous_is_cased)
5952 return PyBool_FromLong(0);
5953 previous_is_cased = 1;
5954 cased = 1;
5956 else if (Py_UNICODE_ISLOWER(ch)) {
5957 if (!previous_is_cased)
5958 return PyBool_FromLong(0);
5959 previous_is_cased = 1;
5960 cased = 1;
5962 else
5963 previous_is_cased = 0;
5965 return PyBool_FromLong(cased);
5968 PyDoc_STRVAR(isspace__doc__,
5969 "S.isspace() -> bool\n\
5971 Return True if all characters in S are whitespace\n\
5972 and there is at least one character in S, False otherwise.");
5974 static PyObject*
5975 unicode_isspace(PyUnicodeObject *self)
5977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5978 register const Py_UNICODE *e;
5980 /* Shortcut for single character strings */
5981 if (PyUnicode_GET_SIZE(self) == 1 &&
5982 Py_UNICODE_ISSPACE(*p))
5983 return PyBool_FromLong(1);
5985 /* Special case for empty strings */
5986 if (PyUnicode_GET_SIZE(self) == 0)
5987 return PyBool_FromLong(0);
5989 e = p + PyUnicode_GET_SIZE(self);
5990 for (; p < e; p++) {
5991 if (!Py_UNICODE_ISSPACE(*p))
5992 return PyBool_FromLong(0);
5994 return PyBool_FromLong(1);
5997 PyDoc_STRVAR(isalpha__doc__,
5998 "S.isalpha() -> bool\n\
6000 Return True if all characters in S are alphabetic\n\
6001 and there is at least one character in S, False otherwise.");
6003 static PyObject*
6004 unicode_isalpha(PyUnicodeObject *self)
6006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6007 register const Py_UNICODE *e;
6009 /* Shortcut for single character strings */
6010 if (PyUnicode_GET_SIZE(self) == 1 &&
6011 Py_UNICODE_ISALPHA(*p))
6012 return PyBool_FromLong(1);
6014 /* Special case for empty strings */
6015 if (PyUnicode_GET_SIZE(self) == 0)
6016 return PyBool_FromLong(0);
6018 e = p + PyUnicode_GET_SIZE(self);
6019 for (; p < e; p++) {
6020 if (!Py_UNICODE_ISALPHA(*p))
6021 return PyBool_FromLong(0);
6023 return PyBool_FromLong(1);
6026 PyDoc_STRVAR(isalnum__doc__,
6027 "S.isalnum() -> bool\n\
6029 Return True if all characters in S are alphanumeric\n\
6030 and there is at least one character in S, False otherwise.");
6032 static PyObject*
6033 unicode_isalnum(PyUnicodeObject *self)
6035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6036 register const Py_UNICODE *e;
6038 /* Shortcut for single character strings */
6039 if (PyUnicode_GET_SIZE(self) == 1 &&
6040 Py_UNICODE_ISALNUM(*p))
6041 return PyBool_FromLong(1);
6043 /* Special case for empty strings */
6044 if (PyUnicode_GET_SIZE(self) == 0)
6045 return PyBool_FromLong(0);
6047 e = p + PyUnicode_GET_SIZE(self);
6048 for (; p < e; p++) {
6049 if (!Py_UNICODE_ISALNUM(*p))
6050 return PyBool_FromLong(0);
6052 return PyBool_FromLong(1);
6055 PyDoc_STRVAR(isdecimal__doc__,
6056 "S.isdecimal() -> bool\n\
6058 Return True if there are only decimal characters in S,\n\
6059 False otherwise.");
6061 static PyObject*
6062 unicode_isdecimal(PyUnicodeObject *self)
6064 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6065 register const Py_UNICODE *e;
6067 /* Shortcut for single character strings */
6068 if (PyUnicode_GET_SIZE(self) == 1 &&
6069 Py_UNICODE_ISDECIMAL(*p))
6070 return PyBool_FromLong(1);
6072 /* Special case for empty strings */
6073 if (PyUnicode_GET_SIZE(self) == 0)
6074 return PyBool_FromLong(0);
6076 e = p + PyUnicode_GET_SIZE(self);
6077 for (; p < e; p++) {
6078 if (!Py_UNICODE_ISDECIMAL(*p))
6079 return PyBool_FromLong(0);
6081 return PyBool_FromLong(1);
6084 PyDoc_STRVAR(isdigit__doc__,
6085 "S.isdigit() -> bool\n\
6087 Return True if all characters in S are digits\n\
6088 and there is at least one character in S, False otherwise.");
6090 static PyObject*
6091 unicode_isdigit(PyUnicodeObject *self)
6093 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6094 register const Py_UNICODE *e;
6096 /* Shortcut for single character strings */
6097 if (PyUnicode_GET_SIZE(self) == 1 &&
6098 Py_UNICODE_ISDIGIT(*p))
6099 return PyBool_FromLong(1);
6101 /* Special case for empty strings */
6102 if (PyUnicode_GET_SIZE(self) == 0)
6103 return PyBool_FromLong(0);
6105 e = p + PyUnicode_GET_SIZE(self);
6106 for (; p < e; p++) {
6107 if (!Py_UNICODE_ISDIGIT(*p))
6108 return PyBool_FromLong(0);
6110 return PyBool_FromLong(1);
6113 PyDoc_STRVAR(isnumeric__doc__,
6114 "S.isnumeric() -> bool\n\
6116 Return True if there are only numeric characters in S,\n\
6117 False otherwise.");
6119 static PyObject*
6120 unicode_isnumeric(PyUnicodeObject *self)
6122 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6123 register const Py_UNICODE *e;
6125 /* Shortcut for single character strings */
6126 if (PyUnicode_GET_SIZE(self) == 1 &&
6127 Py_UNICODE_ISNUMERIC(*p))
6128 return PyBool_FromLong(1);
6130 /* Special case for empty strings */
6131 if (PyUnicode_GET_SIZE(self) == 0)
6132 return PyBool_FromLong(0);
6134 e = p + PyUnicode_GET_SIZE(self);
6135 for (; p < e; p++) {
6136 if (!Py_UNICODE_ISNUMERIC(*p))
6137 return PyBool_FromLong(0);
6139 return PyBool_FromLong(1);
6142 PyDoc_STRVAR(join__doc__,
6143 "S.join(sequence) -> unicode\n\
6145 Return a string which is the concatenation of the strings in the\n\
6146 sequence. The separator between elements is S.");
6148 static PyObject*
6149 unicode_join(PyObject *self, PyObject *data)
6151 return PyUnicode_Join(self, data);
6154 static Py_ssize_t
6155 unicode_length(PyUnicodeObject *self)
6157 return self->length;
6160 PyDoc_STRVAR(ljust__doc__,
6161 "S.ljust(width[, fillchar]) -> int\n\
6163 Return S left justified in a Unicode string of length width. Padding is\n\
6164 done using the specified fill character (default is a space).");
6166 static PyObject *
6167 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6169 Py_ssize_t width;
6170 Py_UNICODE fillchar = ' ';
6172 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6173 return NULL;
6175 if (self->length >= width && PyUnicode_CheckExact(self)) {
6176 Py_INCREF(self);
6177 return (PyObject*) self;
6180 return (PyObject*) pad(self, 0, width - self->length, fillchar);
6183 PyDoc_STRVAR(lower__doc__,
6184 "S.lower() -> unicode\n\
6186 Return a copy of the string S converted to lowercase.");
6188 static PyObject*
6189 unicode_lower(PyUnicodeObject *self)
6191 return fixup(self, fixlower);
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
6203 /* externally visible for str.strip(unicode) */
6204 PyObject *
6205 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6207 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6208 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6209 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6210 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6211 Py_ssize_t i, j;
6213 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6215 i = 0;
6216 if (striptype != RIGHTSTRIP) {
6217 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6218 i++;
6222 j = len;
6223 if (striptype != LEFTSTRIP) {
6224 do {
6225 j--;
6226 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6227 j++;
6230 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6231 Py_INCREF(self);
6232 return (PyObject*)self;
6234 else
6235 return PyUnicode_FromUnicode(s+i, j-i);
6239 static PyObject *
6240 do_strip(PyUnicodeObject *self, int striptype)
6242 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6243 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6245 i = 0;
6246 if (striptype != RIGHTSTRIP) {
6247 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6248 i++;
6252 j = len;
6253 if (striptype != LEFTSTRIP) {
6254 do {
6255 j--;
6256 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6257 j++;
6260 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6261 Py_INCREF(self);
6262 return (PyObject*)self;
6264 else
6265 return PyUnicode_FromUnicode(s+i, j-i);
6269 static PyObject *
6270 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6272 PyObject *sep = NULL;
6274 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6275 return NULL;
6277 if (sep != NULL && sep != Py_None) {
6278 if (PyUnicode_Check(sep))
6279 return _PyUnicode_XStrip(self, striptype, sep);
6280 else if (PyString_Check(sep)) {
6281 PyObject *res;
6282 sep = PyUnicode_FromObject(sep);
6283 if (sep==NULL)
6284 return NULL;
6285 res = _PyUnicode_XStrip(self, striptype, sep);
6286 Py_DECREF(sep);
6287 return res;
6289 else {
6290 PyErr_Format(PyExc_TypeError,
6291 "%s arg must be None, unicode or str",
6292 STRIPNAME(striptype));
6293 return NULL;
6297 return do_strip(self, striptype);
6301 PyDoc_STRVAR(strip__doc__,
6302 "S.strip([chars]) -> unicode\n\
6304 Return a copy of the string S with leading and trailing\n\
6305 whitespace removed.\n\
6306 If chars is given and not None, remove characters in chars instead.\n\
6307 If chars is a str, it will be converted to unicode before stripping");
6309 static PyObject *
6310 unicode_strip(PyUnicodeObject *self, PyObject *args)
6312 if (PyTuple_GET_SIZE(args) == 0)
6313 return do_strip(self, BOTHSTRIP); /* Common case */
6314 else
6315 return do_argstrip(self, BOTHSTRIP, args);
6319 PyDoc_STRVAR(lstrip__doc__,
6320 "S.lstrip([chars]) -> unicode\n\
6322 Return a copy of the string S with leading whitespace removed.\n\
6323 If chars is given and not None, remove characters in chars instead.\n\
6324 If chars is a str, it will be converted to unicode before stripping");
6326 static PyObject *
6327 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6329 if (PyTuple_GET_SIZE(args) == 0)
6330 return do_strip(self, LEFTSTRIP); /* Common case */
6331 else
6332 return do_argstrip(self, LEFTSTRIP, args);
6336 PyDoc_STRVAR(rstrip__doc__,
6337 "S.rstrip([chars]) -> unicode\n\
6339 Return a copy of the string S with trailing whitespace removed.\n\
6340 If chars is given and not None, remove characters in chars instead.\n\
6341 If chars is a str, it will be converted to unicode before stripping");
6343 static PyObject *
6344 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6346 if (PyTuple_GET_SIZE(args) == 0)
6347 return do_strip(self, RIGHTSTRIP); /* Common case */
6348 else
6349 return do_argstrip(self, RIGHTSTRIP, args);
6353 static PyObject*
6354 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6356 PyUnicodeObject *u;
6357 Py_UNICODE *p;
6358 Py_ssize_t nchars;
6359 size_t nbytes;
6361 if (len < 0)
6362 len = 0;
6364 if (len == 1 && PyUnicode_CheckExact(str)) {
6365 /* no repeat, return original string */
6366 Py_INCREF(str);
6367 return (PyObject*) str;
6370 /* ensure # of chars needed doesn't overflow int and # of bytes
6371 * needed doesn't overflow size_t
6373 nchars = len * str->length;
6374 if (len && nchars / len != str->length) {
6375 PyErr_SetString(PyExc_OverflowError,
6376 "repeated string is too long");
6377 return NULL;
6379 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6380 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6381 PyErr_SetString(PyExc_OverflowError,
6382 "repeated string is too long");
6383 return NULL;
6385 u = _PyUnicode_New(nchars);
6386 if (!u)
6387 return NULL;
6389 p = u->str;
6391 if (str->length == 1 && len > 0) {
6392 Py_UNICODE_FILL(p, str->str[0], len);
6393 } else {
6394 Py_ssize_t done = 0; /* number of characters copied this far */
6395 if (done < nchars) {
6396 Py_UNICODE_COPY(p, str->str, str->length);
6397 done = str->length;
6399 while (done < nchars) {
6400 int n = (done <= nchars-done) ? done : nchars-done;
6401 Py_UNICODE_COPY(p+done, p, n);
6402 done += n;
6406 return (PyObject*) u;
6409 PyObject *PyUnicode_Replace(PyObject *obj,
6410 PyObject *subobj,
6411 PyObject *replobj,
6412 Py_ssize_t maxcount)
6414 PyObject *self;
6415 PyObject *str1;
6416 PyObject *str2;
6417 PyObject *result;
6419 self = PyUnicode_FromObject(obj);
6420 if (self == NULL)
6421 return NULL;
6422 str1 = PyUnicode_FromObject(subobj);
6423 if (str1 == NULL) {
6424 Py_DECREF(self);
6425 return NULL;
6427 str2 = PyUnicode_FromObject(replobj);
6428 if (str2 == NULL) {
6429 Py_DECREF(self);
6430 Py_DECREF(str1);
6431 return NULL;
6433 result = replace((PyUnicodeObject *)self,
6434 (PyUnicodeObject *)str1,
6435 (PyUnicodeObject *)str2,
6436 maxcount);
6437 Py_DECREF(self);
6438 Py_DECREF(str1);
6439 Py_DECREF(str2);
6440 return result;
6443 PyDoc_STRVAR(replace__doc__,
6444 "S.replace (old, new[, maxsplit]) -> unicode\n\
6446 Return a copy of S with all occurrences of substring\n\
6447 old replaced by new. If the optional argument maxsplit is\n\
6448 given, only the first maxsplit occurrences are replaced.");
6450 static PyObject*
6451 unicode_replace(PyUnicodeObject *self, PyObject *args)
6453 PyUnicodeObject *str1;
6454 PyUnicodeObject *str2;
6455 Py_ssize_t maxcount = -1;
6456 PyObject *result;
6458 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6459 return NULL;
6460 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6461 if (str1 == NULL)
6462 return NULL;
6463 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6464 if (str2 == NULL) {
6465 Py_DECREF(str1);
6466 return NULL;
6469 result = replace(self, str1, str2, maxcount);
6471 Py_DECREF(str1);
6472 Py_DECREF(str2);
6473 return result;
6476 static
6477 PyObject *unicode_repr(PyObject *unicode)
6479 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6480 PyUnicode_GET_SIZE(unicode),
6484 PyDoc_STRVAR(rfind__doc__,
6485 "S.rfind(sub [,start [,end]]) -> int\n\
6487 Return the highest index in S where substring sub is found,\n\
6488 such that sub is contained within s[start,end]. Optional\n\
6489 arguments start and end are interpreted as in slice notation.\n\
6491 Return -1 on failure.");
6493 static PyObject *
6494 unicode_rfind(PyUnicodeObject *self, PyObject *args)
6496 PyObject *substring;
6497 Py_ssize_t start = 0;
6498 Py_ssize_t end = PY_SSIZE_T_MAX;
6499 Py_ssize_t result;
6501 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6502 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6503 return NULL;
6504 substring = PyUnicode_FromObject(substring);
6505 if (!substring)
6506 return NULL;
6508 result = stringlib_rfind_slice(
6509 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6510 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6511 start, end
6514 Py_DECREF(substring);
6516 return PyInt_FromSsize_t(result);
6519 PyDoc_STRVAR(rindex__doc__,
6520 "S.rindex(sub [,start [,end]]) -> int\n\
6522 Like S.rfind() but raise ValueError when the substring is not found.");
6524 static PyObject *
6525 unicode_rindex(PyUnicodeObject *self, PyObject *args)
6527 PyObject *substring;
6528 Py_ssize_t start = 0;
6529 Py_ssize_t end = PY_SSIZE_T_MAX;
6530 Py_ssize_t result;
6532 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6533 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6534 return NULL;
6535 substring = PyUnicode_FromObject(substring);
6536 if (!substring)
6537 return NULL;
6539 result = stringlib_rfind_slice(
6540 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6541 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6542 start, end
6545 Py_DECREF(substring);
6547 if (result < 0) {
6548 PyErr_SetString(PyExc_ValueError, "substring not found");
6549 return NULL;
6551 return PyInt_FromSsize_t(result);
6554 PyDoc_STRVAR(rjust__doc__,
6555 "S.rjust(width[, fillchar]) -> unicode\n\
6557 Return S right justified in a Unicode string of length width. Padding is\n\
6558 done using the specified fill character (default is a space).");
6560 static PyObject *
6561 unicode_rjust(PyUnicodeObject *self, PyObject *args)
6563 Py_ssize_t width;
6564 Py_UNICODE fillchar = ' ';
6566 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6567 return NULL;
6569 if (self->length >= width && PyUnicode_CheckExact(self)) {
6570 Py_INCREF(self);
6571 return (PyObject*) self;
6574 return (PyObject*) pad(self, width - self->length, 0, fillchar);
6577 static PyObject*
6578 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6580 /* standard clamping */
6581 if (start < 0)
6582 start = 0;
6583 if (end < 0)
6584 end = 0;
6585 if (end > self->length)
6586 end = self->length;
6587 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6588 /* full slice, return original string */
6589 Py_INCREF(self);
6590 return (PyObject*) self;
6592 if (start > end)
6593 start = end;
6594 /* copy slice */
6595 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6596 end - start);
6599 PyObject *PyUnicode_Split(PyObject *s,
6600 PyObject *sep,
6601 Py_ssize_t maxsplit)
6603 PyObject *result;
6605 s = PyUnicode_FromObject(s);
6606 if (s == NULL)
6607 return NULL;
6608 if (sep != NULL) {
6609 sep = PyUnicode_FromObject(sep);
6610 if (sep == NULL) {
6611 Py_DECREF(s);
6612 return NULL;
6616 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6618 Py_DECREF(s);
6619 Py_XDECREF(sep);
6620 return result;
6623 PyDoc_STRVAR(split__doc__,
6624 "S.split([sep [,maxsplit]]) -> list of strings\n\
6626 Return a list of the words in S, using sep as the\n\
6627 delimiter string. If maxsplit is given, at most maxsplit\n\
6628 splits are done. If sep is not specified or is None,\n\
6629 any whitespace string is a separator.");
6631 static PyObject*
6632 unicode_split(PyUnicodeObject *self, PyObject *args)
6634 PyObject *substring = Py_None;
6635 Py_ssize_t maxcount = -1;
6637 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6638 return NULL;
6640 if (substring == Py_None)
6641 return split(self, NULL, maxcount);
6642 else if (PyUnicode_Check(substring))
6643 return split(self, (PyUnicodeObject *)substring, maxcount);
6644 else
6645 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6648 PyObject *
6649 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6651 PyObject* str_obj;
6652 PyObject* sep_obj;
6653 PyObject* out;
6655 str_obj = PyUnicode_FromObject(str_in);
6656 if (!str_obj)
6657 return NULL;
6658 sep_obj = PyUnicode_FromObject(sep_in);
6659 if (!sep_obj) {
6660 Py_DECREF(str_obj);
6661 return NULL;
6664 out = stringlib_partition(
6665 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6666 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6669 Py_DECREF(sep_obj);
6670 Py_DECREF(str_obj);
6672 return out;
6676 PyObject *
6677 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6679 PyObject* str_obj;
6680 PyObject* sep_obj;
6681 PyObject* out;
6683 str_obj = PyUnicode_FromObject(str_in);
6684 if (!str_obj)
6685 return NULL;
6686 sep_obj = PyUnicode_FromObject(sep_in);
6687 if (!sep_obj) {
6688 Py_DECREF(str_obj);
6689 return NULL;
6692 out = stringlib_rpartition(
6693 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6694 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6697 Py_DECREF(sep_obj);
6698 Py_DECREF(str_obj);
6700 return out;
6703 PyDoc_STRVAR(partition__doc__,
6704 "S.partition(sep) -> (head, sep, tail)\n\
6706 Searches for the separator sep in S, and returns the part before it,\n\
6707 the separator itself, and the part after it. If the separator is not\n\
6708 found, returns S and two empty strings.");
6710 static PyObject*
6711 unicode_partition(PyUnicodeObject *self, PyObject *separator)
6713 return PyUnicode_Partition((PyObject *)self, separator);
6716 PyDoc_STRVAR(rpartition__doc__,
6717 "S.rpartition(sep) -> (tail, sep, head)\n\
6719 Searches for the separator sep in S, starting at the end of S, and returns\n\
6720 the part before it, the separator itself, and the part after it. If the\n\
6721 separator is not found, returns two empty strings and S.");
6723 static PyObject*
6724 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6726 return PyUnicode_RPartition((PyObject *)self, separator);
6729 PyObject *PyUnicode_RSplit(PyObject *s,
6730 PyObject *sep,
6731 Py_ssize_t maxsplit)
6733 PyObject *result;
6735 s = PyUnicode_FromObject(s);
6736 if (s == NULL)
6737 return NULL;
6738 if (sep != NULL) {
6739 sep = PyUnicode_FromObject(sep);
6740 if (sep == NULL) {
6741 Py_DECREF(s);
6742 return NULL;
6746 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6748 Py_DECREF(s);
6749 Py_XDECREF(sep);
6750 return result;
6753 PyDoc_STRVAR(rsplit__doc__,
6754 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6756 Return a list of the words in S, using sep as the\n\
6757 delimiter string, starting at the end of the string and\n\
6758 working to the front. If maxsplit is given, at most maxsplit\n\
6759 splits are done. If sep is not specified, any whitespace string\n\
6760 is a separator.");
6762 static PyObject*
6763 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6765 PyObject *substring = Py_None;
6766 Py_ssize_t maxcount = -1;
6768 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
6769 return NULL;
6771 if (substring == Py_None)
6772 return rsplit(self, NULL, maxcount);
6773 else if (PyUnicode_Check(substring))
6774 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6775 else
6776 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6779 PyDoc_STRVAR(splitlines__doc__,
6780 "S.splitlines([keepends]]) -> list of strings\n\
6782 Return a list of the lines in S, breaking at line boundaries.\n\
6783 Line breaks are not included in the resulting list unless keepends\n\
6784 is given and true.");
6786 static PyObject*
6787 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6789 int keepends = 0;
6791 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6792 return NULL;
6794 return PyUnicode_Splitlines((PyObject *)self, keepends);
6797 static
6798 PyObject *unicode_str(PyUnicodeObject *self)
6800 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6803 PyDoc_STRVAR(swapcase__doc__,
6804 "S.swapcase() -> unicode\n\
6806 Return a copy of S with uppercase characters converted to lowercase\n\
6807 and vice versa.");
6809 static PyObject*
6810 unicode_swapcase(PyUnicodeObject *self)
6812 return fixup(self, fixswapcase);
6815 PyDoc_STRVAR(translate__doc__,
6816 "S.translate(table) -> unicode\n\
6818 Return a copy of the string S, where all characters have been mapped\n\
6819 through the given translation table, which must be a mapping of\n\
6820 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6821 Unmapped characters are left untouched. Characters mapped to None\n\
6822 are deleted.");
6824 static PyObject*
6825 unicode_translate(PyUnicodeObject *self, PyObject *table)
6827 return PyUnicode_TranslateCharmap(self->str,
6828 self->length,
6829 table,
6830 "ignore");
6833 PyDoc_STRVAR(upper__doc__,
6834 "S.upper() -> unicode\n\
6836 Return a copy of S converted to uppercase.");
6838 static PyObject*
6839 unicode_upper(PyUnicodeObject *self)
6841 return fixup(self, fixupper);
6844 PyDoc_STRVAR(zfill__doc__,
6845 "S.zfill(width) -> unicode\n\
6847 Pad a numeric string x with zeros on the left, to fill a field\n\
6848 of the specified width. The string x is never truncated.");
6850 static PyObject *
6851 unicode_zfill(PyUnicodeObject *self, PyObject *args)
6853 Py_ssize_t fill;
6854 PyUnicodeObject *u;
6856 Py_ssize_t width;
6857 if (!PyArg_ParseTuple(args, "n:zfill", &width))
6858 return NULL;
6860 if (self->length >= width) {
6861 if (PyUnicode_CheckExact(self)) {
6862 Py_INCREF(self);
6863 return (PyObject*) self;
6865 else
6866 return PyUnicode_FromUnicode(
6867 PyUnicode_AS_UNICODE(self),
6868 PyUnicode_GET_SIZE(self)
6872 fill = width - self->length;
6874 u = pad(self, fill, 0, '0');
6876 if (u == NULL)
6877 return NULL;
6879 if (u->str[fill] == '+' || u->str[fill] == '-') {
6880 /* move sign to beginning of string */
6881 u->str[0] = u->str[fill];
6882 u->str[fill] = '0';
6885 return (PyObject*) u;
#if 0
/* Compiled out.  Would return the current value of unicode_freelist_size
   as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
6896 PyDoc_STRVAR(startswith__doc__,
6897 "S.startswith(prefix[, start[, end]]) -> bool\n\
6899 Return True if S starts with the specified prefix, False otherwise.\n\
6900 With optional start, test S beginning at that position.\n\
6901 With optional end, stop comparing S at that position.\n\
6902 prefix can also be a tuple of strings to try.");
6904 static PyObject *
6905 unicode_startswith(PyUnicodeObject *self,
6906 PyObject *args)
6908 PyObject *subobj;
6909 PyUnicodeObject *substring;
6910 Py_ssize_t start = 0;
6911 Py_ssize_t end = PY_SSIZE_T_MAX;
6912 int result;
6914 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
6915 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6916 return NULL;
6917 if (PyTuple_Check(subobj)) {
6918 Py_ssize_t i;
6919 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6920 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6921 PyTuple_GET_ITEM(subobj, i));
6922 if (substring == NULL)
6923 return NULL;
6924 result = tailmatch(self, substring, start, end, -1);
6925 Py_DECREF(substring);
6926 if (result) {
6927 Py_RETURN_TRUE;
6930 /* nothing matched */
6931 Py_RETURN_FALSE;
6933 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6934 if (substring == NULL)
6935 return NULL;
6936 result = tailmatch(self, substring, start, end, -1);
6937 Py_DECREF(substring);
6938 return PyBool_FromLong(result);
6942 PyDoc_STRVAR(endswith__doc__,
6943 "S.endswith(suffix[, start[, end]]) -> bool\n\
6945 Return True if S ends with the specified suffix, False otherwise.\n\
6946 With optional start, test S beginning at that position.\n\
6947 With optional end, stop comparing S at that position.\n\
6948 suffix can also be a tuple of strings to try.");
6950 static PyObject *
6951 unicode_endswith(PyUnicodeObject *self,
6952 PyObject *args)
6954 PyObject *subobj;
6955 PyUnicodeObject *substring;
6956 Py_ssize_t start = 0;
6957 Py_ssize_t end = PY_SSIZE_T_MAX;
6958 int result;
6960 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6961 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6962 return NULL;
6963 if (PyTuple_Check(subobj)) {
6964 Py_ssize_t i;
6965 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6966 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6967 PyTuple_GET_ITEM(subobj, i));
6968 if (substring == NULL)
6969 return NULL;
6970 result = tailmatch(self, substring, start, end, +1);
6971 Py_DECREF(substring);
6972 if (result) {
6973 Py_RETURN_TRUE;
6976 Py_RETURN_FALSE;
6978 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6979 if (substring == NULL)
6980 return NULL;
6982 result = tailmatch(self, substring, start, end, +1);
6983 Py_DECREF(substring);
6984 return PyBool_FromLong(result);
6989 static PyObject *
6990 unicode_getnewargs(PyUnicodeObject *v)
6992 return Py_BuildValue("(u#)", v->str, v->length);
6996 static PyMethodDef unicode_methods[] = {
6998 /* Order is according to common usage: often used methods should
6999 appear first, since lookup is done sequentially. */
7001 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7002 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7003 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7004 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7005 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7006 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7007 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7008 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7009 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7010 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7011 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7012 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7013 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7014 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7015 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7016 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7017 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7018 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7019 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7020 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7021 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7022 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7023 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7024 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7025 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7026 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7027 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7028 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7029 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7030 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7031 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7032 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7033 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7034 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7035 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7036 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7037 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7038 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7039 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7040 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7041 #if 0
7042 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7043 #endif
7045 #if 0
7046 /* This one is just used for debugging the implementation. */
7047 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
7048 #endif
7050 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7051 {NULL, NULL}
7054 static PyObject *
7055 unicode_mod(PyObject *v, PyObject *w)
7057 if (!PyUnicode_Check(v)) {
7058 Py_INCREF(Py_NotImplemented);
7059 return Py_NotImplemented;
7061 return PyUnicode_Format(v, w);
7064 static PyNumberMethods unicode_as_number = {
7065 0, /*nb_add*/
7066 0, /*nb_subtract*/
7067 0, /*nb_multiply*/
7068 0, /*nb_divide*/
7069 unicode_mod, /*nb_remainder*/
7072 static PySequenceMethods unicode_as_sequence = {
7073 (lenfunc) unicode_length, /* sq_length */
7074 PyUnicode_Concat, /* sq_concat */
7075 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7076 (ssizeargfunc) unicode_getitem, /* sq_item */
7077 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7078 0, /* sq_ass_item */
7079 0, /* sq_ass_slice */
7080 PyUnicode_Contains, /* sq_contains */
7083 static PyObject*
7084 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7086 if (PyIndex_Check(item)) {
7087 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7088 if (i == -1 && PyErr_Occurred())
7089 return NULL;
7090 if (i < 0)
7091 i += PyUnicode_GET_SIZE(self);
7092 return unicode_getitem(self, i);
7093 } else if (PySlice_Check(item)) {
7094 Py_ssize_t start, stop, step, slicelength, cur, i;
7095 Py_UNICODE* source_buf;
7096 Py_UNICODE* result_buf;
7097 PyObject* result;
7099 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7100 &start, &stop, &step, &slicelength) < 0) {
7101 return NULL;
7104 if (slicelength <= 0) {
7105 return PyUnicode_FromUnicode(NULL, 0);
7106 } else {
7107 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7108 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7109 sizeof(Py_UNICODE));
7111 if (result_buf == NULL)
7112 return PyErr_NoMemory();
7114 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7115 result_buf[i] = source_buf[cur];
7118 result = PyUnicode_FromUnicode(result_buf, slicelength);
7119 PyMem_FREE(result_buf);
7120 return result;
7122 } else {
7123 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7124 return NULL;
7128 static PyMappingMethods unicode_as_mapping = {
7129 (lenfunc)unicode_length, /* mp_length */
7130 (binaryfunc)unicode_subscript, /* mp_subscript */
7131 (objobjargproc)0, /* mp_ass_subscript */
7134 static Py_ssize_t
7135 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7136 Py_ssize_t index,
7137 const void **ptr)
7139 if (index != 0) {
7140 PyErr_SetString(PyExc_SystemError,
7141 "accessing non-existent unicode segment");
7142 return -1;
7144 *ptr = (void *) self->str;
7145 return PyUnicode_GET_DATA_SIZE(self);
7148 static Py_ssize_t
7149 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7150 const void **ptr)
7152 PyErr_SetString(PyExc_TypeError,
7153 "cannot use unicode as modifiable buffer");
7154 return -1;
7157 static int
7158 unicode_buffer_getsegcount(PyUnicodeObject *self,
7159 Py_ssize_t *lenp)
7161 if (lenp)
7162 *lenp = PyUnicode_GET_DATA_SIZE(self);
7163 return 1;
7166 static Py_ssize_t
7167 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7168 Py_ssize_t index,
7169 const void **ptr)
7171 PyObject *str;
7173 if (index != 0) {
7174 PyErr_SetString(PyExc_SystemError,
7175 "accessing non-existent unicode segment");
7176 return -1;
7178 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7179 if (str == NULL)
7180 return -1;
7181 *ptr = (void *) PyString_AS_STRING(str);
7182 return PyString_GET_SIZE(str);
7185 /* Helpers for PyUnicode_Format() */
7187 static PyObject *
7188 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7190 Py_ssize_t argidx = *p_argidx;
7191 if (argidx < arglen) {
7192 (*p_argidx)++;
7193 if (arglen < 0)
7194 return args;
7195 else
7196 return PyTuple_GetItem(args, argidx);
7198 PyErr_SetString(PyExc_TypeError,
7199 "not enough arguments for format string");
7200 return NULL;
/* Flag bits passed as the `flags` argument of the format helpers; each
   occupies its own bit so they can be OR-ed together.  Within this part of
   the file only F_ALT is tested (by formatfloat() and formatint()).
   NOTE(review): presumably the others are handled by PyUnicode_Format()
   itself - confirm there. */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
7209 static Py_ssize_t
7210 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7212 register Py_ssize_t i;
7213 Py_ssize_t len = strlen(charbuffer);
7214 for (i = len - 1; i >= 0; i--)
7215 buffer[i] = (Py_UNICODE) charbuffer[i];
7217 return len;
7220 static int
7221 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7223 Py_ssize_t result;
7225 PyOS_ascii_formatd((char *)buffer, len, format, x);
7226 result = strtounicode(buffer, (char *)buffer);
7227 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7230 static int
7231 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7233 Py_ssize_t result;
7235 PyOS_snprintf((char *)buffer, len, format, x);
7236 result = strtounicode(buffer, (char *)buffer);
7237 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7240 /* XXX To save some code duplication, formatfloat/long/int could have been
7241 shared with stringobject.c, converting from 8-bit to Unicode after the
7242 formatting is done. */
7244 static int
7245 formatfloat(Py_UNICODE *buf,
7246 size_t buflen,
7247 int flags,
7248 int prec,
7249 int type,
7250 PyObject *v)
7252 /* fmt = '%#.' + `prec` + `type`
7253 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7254 char fmt[20];
7255 double x;
7257 x = PyFloat_AsDouble(v);
7258 if (x == -1.0 && PyErr_Occurred())
7259 return -1;
7260 if (prec < 0)
7261 prec = 6;
7262 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7263 type = 'g';
7264 /* Worst case length calc to ensure no buffer overrun:
7266 'g' formats:
7267 fmt = %#.<prec>g
7268 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7269 for any double rep.)
7270 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7272 'f' formats:
7273 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7274 len = 1 + 50 + 1 + prec = 52 + prec
7276 If prec=0 the effective precision is 1 (the leading digit is
7277 always given), therefore increase the length by one.
7280 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7281 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7282 PyErr_SetString(PyExc_OverflowError,
7283 "formatted float is too long (precision too large?)");
7284 return -1;
7286 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7287 (flags&F_ALT) ? "#" : "",
7288 prec, type);
7289 return doubletounicode(buf, buflen, fmt, x);
7292 static PyObject*
7293 formatlong(PyObject *val, int flags, int prec, int type)
7295 char *buf;
7296 int i, len;
7297 PyObject *str; /* temporary string object. */
7298 PyUnicodeObject *result;
7300 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7301 if (!str)
7302 return NULL;
7303 result = _PyUnicode_New(len);
7304 if (!result) {
7305 Py_DECREF(str);
7306 return NULL;
7308 for (i = 0; i < len; i++)
7309 result->str[i] = buf[i];
7310 result->str[len] = 0;
7311 Py_DECREF(str);
7312 return (PyObject*)result;
7315 static int
7316 formatint(Py_UNICODE *buf,
7317 size_t buflen,
7318 int flags,
7319 int prec,
7320 int type,
7321 PyObject *v)
7323 /* fmt = '%#.' + `prec` + 'l' + `type`
7324 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7325 * + 1 + 1
7326 * = 24
7328 char fmt[64]; /* plenty big enough! */
7329 char *sign;
7330 long x;
7332 x = PyInt_AsLong(v);
7333 if (x == -1 && PyErr_Occurred())
7334 return -1;
7335 if (x < 0 && type == 'u') {
7336 type = 'd';
7338 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7339 sign = "-";
7340 else
7341 sign = "";
7342 if (prec < 0)
7343 prec = 1;
7345 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7346 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7348 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7349 PyErr_SetString(PyExc_OverflowError,
7350 "formatted integer is too long (precision too large?)");
7351 return -1;
7354 if ((flags & F_ALT) &&
7355 (type == 'x' || type == 'X')) {
7356 /* When converting under %#x or %#X, there are a number
7357 * of issues that cause pain:
7358 * - when 0 is being converted, the C standard leaves off
7359 * the '0x' or '0X', which is inconsistent with other
7360 * %#x/%#X conversions and inconsistent with Python's
7361 * hex() function
7362 * - there are platforms that violate the standard and
7363 * convert 0 with the '0x' or '0X'
7364 * (Metrowerks, Compaq Tru64)
7365 * - there are platforms that give '0x' when converting
7366 * under %#X, but convert 0 in accordance with the
7367 * standard (OS/2 EMX)
7369 * We can achieve the desired consistency by inserting our
7370 * own '0x' or '0X' prefix, and substituting %x/%X in place
7371 * of %#x/%#X.
7373 * Note that this is the same approach as used in
7374 * formatint() in stringobject.c
7376 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7377 sign, type, prec, type);
7379 else {
7380 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7381 sign, (flags&F_ALT) ? "#" : "",
7382 prec, type);
7384 if (sign[0])
7385 return longtounicode(buf, buflen, fmt, -x);
7386 else
7387 return longtounicode(buf, buflen, fmt, x);
7390 static int
7391 formatchar(Py_UNICODE *buf,
7392 size_t buflen,
7393 PyObject *v)
7395 /* presume that the buffer is at least 2 characters long */
7396 if (PyUnicode_Check(v)) {
7397 if (PyUnicode_GET_SIZE(v) != 1)
7398 goto onError;
7399 buf[0] = PyUnicode_AS_UNICODE(v)[0];
7402 else if (PyString_Check(v)) {
7403 if (PyString_GET_SIZE(v) != 1)
7404 goto onError;
7405 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7408 else {
7409 /* Integer input truncated to a character */
7410 long x;
7411 x = PyInt_AsLong(v);
7412 if (x == -1 && PyErr_Occurred())
7413 goto onError;
7414 #ifdef Py_UNICODE_WIDE
7415 if (x < 0 || x > 0x10ffff) {
7416 PyErr_SetString(PyExc_OverflowError,
7417 "%c arg not in range(0x110000) "
7418 "(wide Python build)");
7419 return -1;
7421 #else
7422 if (x < 0 || x > 0xffff) {
7423 PyErr_SetString(PyExc_OverflowError,
7424 "%c arg not in range(0x10000) "
7425 "(narrow Python build)");
7426 return -1;
7428 #endif
7429 buf[0] = (Py_UNICODE) x;
7431 buf[1] = '\0';
7432 return 1;
7434 onError:
7435 PyErr_SetString(PyExc_TypeError,
7436 "%c requires int or char");
7437 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE units) of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
7450 PyObject *PyUnicode_Format(PyObject *format,
7451 PyObject *args)
7453 Py_UNICODE *fmt, *res;
7454 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7455 int args_owned = 0;
7456 PyUnicodeObject *result = NULL;
7457 PyObject *dict = NULL;
7458 PyObject *uformat;
7460 if (format == NULL || args == NULL) {
7461 PyErr_BadInternalCall();
7462 return NULL;
7464 uformat = PyUnicode_FromObject(format);
7465 if (uformat == NULL)
7466 return NULL;
7467 fmt = PyUnicode_AS_UNICODE(uformat);
7468 fmtcnt = PyUnicode_GET_SIZE(uformat);
7470 reslen = rescnt = fmtcnt + 100;
7471 result = _PyUnicode_New(reslen);
7472 if (result == NULL)
7473 goto onError;
7474 res = PyUnicode_AS_UNICODE(result);
7476 if (PyTuple_Check(args)) {
7477 arglen = PyTuple_Size(args);
7478 argidx = 0;
7480 else {
7481 arglen = -1;
7482 argidx = -2;
7484 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7485 !PyObject_TypeCheck(args, &PyBaseString_Type))
7486 dict = args;
7488 while (--fmtcnt >= 0) {
7489 if (*fmt != '%') {
7490 if (--rescnt < 0) {
7491 rescnt = fmtcnt + 100;
7492 reslen += rescnt;
7493 if (_PyUnicode_Resize(&result, reslen) < 0)
7494 goto onError;
7495 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7496 --rescnt;
7498 *res++ = *fmt++;
7500 else {
7501 /* Got a format specifier */
7502 int flags = 0;
7503 Py_ssize_t width = -1;
7504 int prec = -1;
7505 Py_UNICODE c = '\0';
7506 Py_UNICODE fill;
7507 PyObject *v = NULL;
7508 PyObject *temp = NULL;
7509 Py_UNICODE *pbuf;
7510 Py_UNICODE sign;
7511 Py_ssize_t len;
7512 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7514 fmt++;
7515 if (*fmt == '(') {
7516 Py_UNICODE *keystart;
7517 Py_ssize_t keylen;
7518 PyObject *key;
7519 int pcount = 1;
7521 if (dict == NULL) {
7522 PyErr_SetString(PyExc_TypeError,
7523 "format requires a mapping");
7524 goto onError;
7526 ++fmt;
7527 --fmtcnt;
7528 keystart = fmt;
7529 /* Skip over balanced parentheses */
7530 while (pcount > 0 && --fmtcnt >= 0) {
7531 if (*fmt == ')')
7532 --pcount;
7533 else if (*fmt == '(')
7534 ++pcount;
7535 fmt++;
7537 keylen = fmt - keystart - 1;
7538 if (fmtcnt < 0 || pcount > 0) {
7539 PyErr_SetString(PyExc_ValueError,
7540 "incomplete format key");
7541 goto onError;
7543 #if 0
7544 /* keys are converted to strings using UTF-8 and
7545 then looked up since Python uses strings to hold
7546 variables names etc. in its namespaces and we
7547 wouldn't want to break common idioms. */
7548 key = PyUnicode_EncodeUTF8(keystart,
7549 keylen,
7550 NULL);
7551 #else
7552 key = PyUnicode_FromUnicode(keystart, keylen);
7553 #endif
7554 if (key == NULL)
7555 goto onError;
7556 if (args_owned) {
7557 Py_DECREF(args);
7558 args_owned = 0;
7560 args = PyObject_GetItem(dict, key);
7561 Py_DECREF(key);
7562 if (args == NULL) {
7563 goto onError;
7565 args_owned = 1;
7566 arglen = -1;
7567 argidx = -2;
7569 while (--fmtcnt >= 0) {
7570 switch (c = *fmt++) {
7571 case '-': flags |= F_LJUST; continue;
7572 case '+': flags |= F_SIGN; continue;
7573 case ' ': flags |= F_BLANK; continue;
7574 case '#': flags |= F_ALT; continue;
7575 case '0': flags |= F_ZERO; continue;
7577 break;
7579 if (c == '*') {
7580 v = getnextarg(args, arglen, &argidx);
7581 if (v == NULL)
7582 goto onError;
7583 if (!PyInt_Check(v)) {
7584 PyErr_SetString(PyExc_TypeError,
7585 "* wants int");
7586 goto onError;
7588 width = PyInt_AsLong(v);
7589 if (width < 0) {
7590 flags |= F_LJUST;
7591 width = -width;
7593 if (--fmtcnt >= 0)
7594 c = *fmt++;
7596 else if (c >= '0' && c <= '9') {
7597 width = c - '0';
7598 while (--fmtcnt >= 0) {
7599 c = *fmt++;
7600 if (c < '0' || c > '9')
7601 break;
7602 if ((width*10) / 10 != width) {
7603 PyErr_SetString(PyExc_ValueError,
7604 "width too big");
7605 goto onError;
7607 width = width*10 + (c - '0');
7610 if (c == '.') {
7611 prec = 0;
7612 if (--fmtcnt >= 0)
7613 c = *fmt++;
7614 if (c == '*') {
7615 v = getnextarg(args, arglen, &argidx);
7616 if (v == NULL)
7617 goto onError;
7618 if (!PyInt_Check(v)) {
7619 PyErr_SetString(PyExc_TypeError,
7620 "* wants int");
7621 goto onError;
7623 prec = PyInt_AsLong(v);
7624 if (prec < 0)
7625 prec = 0;
7626 if (--fmtcnt >= 0)
7627 c = *fmt++;
7629 else if (c >= '0' && c <= '9') {
7630 prec = c - '0';
7631 while (--fmtcnt >= 0) {
7632 c = Py_CHARMASK(*fmt++);
7633 if (c < '0' || c > '9')
7634 break;
7635 if ((prec*10) / 10 != prec) {
7636 PyErr_SetString(PyExc_ValueError,
7637 "prec too big");
7638 goto onError;
7640 prec = prec*10 + (c - '0');
7643 } /* prec */
7644 if (fmtcnt >= 0) {
7645 if (c == 'h' || c == 'l' || c == 'L') {
7646 if (--fmtcnt >= 0)
7647 c = *fmt++;
7650 if (fmtcnt < 0) {
7651 PyErr_SetString(PyExc_ValueError,
7652 "incomplete format");
7653 goto onError;
7655 if (c != '%') {
7656 v = getnextarg(args, arglen, &argidx);
7657 if (v == NULL)
7658 goto onError;
7660 sign = 0;
7661 fill = ' ';
7662 switch (c) {
7664 case '%':
7665 pbuf = formatbuf;
7666 /* presume that buffer length is at least 1 */
7667 pbuf[0] = '%';
7668 len = 1;
7669 break;
7671 case 's':
7672 case 'r':
7673 if (PyUnicode_Check(v) && c == 's') {
7674 temp = v;
7675 Py_INCREF(temp);
7677 else {
7678 PyObject *unicode;
7679 if (c == 's')
7680 temp = PyObject_Unicode(v);
7681 else
7682 temp = PyObject_Repr(v);
7683 if (temp == NULL)
7684 goto onError;
7685 if (PyUnicode_Check(temp))
7686 /* nothing to do */;
7687 else if (PyString_Check(temp)) {
7688 /* convert to string to Unicode */
7689 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7690 PyString_GET_SIZE(temp),
7691 NULL,
7692 "strict");
7693 Py_DECREF(temp);
7694 temp = unicode;
7695 if (temp == NULL)
7696 goto onError;
7698 else {
7699 Py_DECREF(temp);
7700 PyErr_SetString(PyExc_TypeError,
7701 "%s argument has non-string str()");
7702 goto onError;
7705 pbuf = PyUnicode_AS_UNICODE(temp);
7706 len = PyUnicode_GET_SIZE(temp);
7707 if (prec >= 0 && len > prec)
7708 len = prec;
7709 break;
7711 case 'i':
7712 case 'd':
7713 case 'u':
7714 case 'o':
7715 case 'x':
7716 case 'X':
7717 if (c == 'i')
7718 c = 'd';
7719 if (PyLong_Check(v)) {
7720 temp = formatlong(v, flags, prec, c);
7721 if (!temp)
7722 goto onError;
7723 pbuf = PyUnicode_AS_UNICODE(temp);
7724 len = PyUnicode_GET_SIZE(temp);
7725 sign = 1;
7727 else {
7728 pbuf = formatbuf;
7729 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7730 flags, prec, c, v);
7731 if (len < 0)
7732 goto onError;
7733 sign = 1;
7735 if (flags & F_ZERO)
7736 fill = '0';
7737 break;
7739 case 'e':
7740 case 'E':
7741 case 'f':
7742 case 'F':
7743 case 'g':
7744 case 'G':
7745 if (c == 'F')
7746 c = 'f';
7747 pbuf = formatbuf;
7748 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7749 flags, prec, c, v);
7750 if (len < 0)
7751 goto onError;
7752 sign = 1;
7753 if (flags & F_ZERO)
7754 fill = '0';
7755 break;
7757 case 'c':
7758 pbuf = formatbuf;
7759 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7760 if (len < 0)
7761 goto onError;
7762 break;
7764 default:
7765 PyErr_Format(PyExc_ValueError,
7766 "unsupported format character '%c' (0x%x) "
7767 "at index %zd",
7768 (31<=c && c<=126) ? (char)c : '?',
7769 (int)c,
7770 (Py_ssize_t)(fmt - 1 -
7771 PyUnicode_AS_UNICODE(uformat)));
7772 goto onError;
7774 if (sign) {
7775 if (*pbuf == '-' || *pbuf == '+') {
7776 sign = *pbuf++;
7777 len--;
7779 else if (flags & F_SIGN)
7780 sign = '+';
7781 else if (flags & F_BLANK)
7782 sign = ' ';
7783 else
7784 sign = 0;
7786 if (width < len)
7787 width = len;
7788 if (rescnt - (sign != 0) < width) {
7789 reslen -= rescnt;
7790 rescnt = width + fmtcnt + 100;
7791 reslen += rescnt;
7792 if (reslen < 0) {
7793 Py_XDECREF(temp);
7794 PyErr_NoMemory();
7795 goto onError;
7797 if (_PyUnicode_Resize(&result, reslen) < 0) {
7798 Py_XDECREF(temp);
7799 goto onError;
7801 res = PyUnicode_AS_UNICODE(result)
7802 + reslen - rescnt;
7804 if (sign) {
7805 if (fill != ' ')
7806 *res++ = sign;
7807 rescnt--;
7808 if (width > len)
7809 width--;
7811 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7812 assert(pbuf[0] == '0');
7813 assert(pbuf[1] == c);
7814 if (fill != ' ') {
7815 *res++ = *pbuf++;
7816 *res++ = *pbuf++;
7818 rescnt -= 2;
7819 width -= 2;
7820 if (width < 0)
7821 width = 0;
7822 len -= 2;
7824 if (width > len && !(flags & F_LJUST)) {
7825 do {
7826 --rescnt;
7827 *res++ = fill;
7828 } while (--width > len);
7830 if (fill == ' ') {
7831 if (sign)
7832 *res++ = sign;
7833 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7834 assert(pbuf[0] == '0');
7835 assert(pbuf[1] == c);
7836 *res++ = *pbuf++;
7837 *res++ = *pbuf++;
7840 Py_UNICODE_COPY(res, pbuf, len);
7841 res += len;
7842 rescnt -= len;
7843 while (--width >= len) {
7844 --rescnt;
7845 *res++ = ' ';
7847 if (dict && (argidx < arglen) && c != '%') {
7848 PyErr_SetString(PyExc_TypeError,
7849 "not all arguments converted during string formatting");
7850 Py_XDECREF(temp);
7851 goto onError;
7853 Py_XDECREF(temp);
7854 } /* '%' */
7855 } /* until end */
7856 if (argidx < arglen && !dict) {
7857 PyErr_SetString(PyExc_TypeError,
7858 "not all arguments converted during string formatting");
7859 goto onError;
7862 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7863 goto onError;
7864 if (args_owned) {
7865 Py_DECREF(args);
7867 Py_DECREF(uformat);
7868 return (PyObject *)result;
7870 onError:
7871 Py_XDECREF(result);
7872 Py_DECREF(uformat);
7873 if (args_owned) {
7874 Py_DECREF(args);
7876 return NULL;
/* Buffer-interface slot table for the unicode type; PyUnicode_Type
   refers to it through its tp_as_buffer slot. */
7879 static PyBufferProcs unicode_as_buffer = {
7880 (readbufferproc) unicode_buffer_getreadbuf, /* bf_getreadbuffer */
7881 (writebufferproc) unicode_buffer_getwritebuf, /* bf_getwritebuffer */
7882 (segcountproc) unicode_buffer_getsegcount, /* bf_getsegcount */
7883 (charbufferproc) unicode_buffer_getcharbuf, /* bf_getcharbuffer */
7886 static PyObject *
7887 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7889 static PyObject *
7890 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7892 PyObject *x = NULL;
7893 static char *kwlist[] = {"string", "encoding", "errors", 0};
7894 char *encoding = NULL;
7895 char *errors = NULL;
7897 if (type != &PyUnicode_Type)
7898 return unicode_subtype_new(type, args, kwds);
7899 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7900 kwlist, &x, &encoding, &errors))
7901 return NULL;
7902 if (x == NULL)
7903 return (PyObject *)_PyUnicode_New(0);
7904 if (encoding == NULL && errors == NULL)
7905 return PyObject_Unicode(x);
7906 else
7907 return PyUnicode_FromEncodedObject(x, encoding, errors);
7910 static PyObject *
7911 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7913 PyUnicodeObject *tmp, *pnew;
7914 Py_ssize_t n;
7916 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7917 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7918 if (tmp == NULL)
7919 return NULL;
7920 assert(PyUnicode_Check(tmp));
7921 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7922 if (pnew == NULL) {
7923 Py_DECREF(tmp);
7924 return NULL;
7926 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7927 if (pnew->str == NULL) {
7928 _Py_ForgetReference((PyObject *)pnew);
7929 PyObject_Del(pnew);
7930 Py_DECREF(tmp);
7931 return PyErr_NoMemory();
7933 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7934 pnew->length = n;
7935 pnew->hash = tmp->hash;
7936 Py_DECREF(tmp);
7937 return (PyObject *)pnew;
/* Docstring of the unicode type; installed as tp_doc in PyUnicode_Type. */
7940 PyDoc_STRVAR(unicode_doc,
7941 "unicode(string [, encoding[, errors]]) -> object\n\
7943 Create a new Unicode object from the given encoded string.\n\
7944 encoding defaults to the current default string encoding.\n\
7945 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for unicode.  It derives from PyBaseString_Type (tp_base),
   may itself be subclassed (Py_TPFLAGS_BASETYPE), and is constructed
   through unicode_new (tp_new). */
7947 PyTypeObject PyUnicode_Type = {
7948 PyObject_HEAD_INIT(&PyType_Type)
7949 0, /* ob_size */
7950 "unicode", /* tp_name */
7951 sizeof(PyUnicodeObject), /* tp_basicsize */
7952 0, /* tp_itemsize */
7953 /* Slots */
7954 (destructor)unicode_dealloc, /* tp_dealloc */
7955 0, /* tp_print */
7956 0, /* tp_getattr */
7957 0, /* tp_setattr */
7958 0, /* tp_compare */
7959 unicode_repr, /* tp_repr */
7960 &unicode_as_number, /* tp_as_number */
7961 &unicode_as_sequence, /* tp_as_sequence */
7962 &unicode_as_mapping, /* tp_as_mapping */
7963 (hashfunc) unicode_hash, /* tp_hash */
7964 0, /* tp_call */
7965 (reprfunc) unicode_str, /* tp_str */
7966 PyObject_GenericGetAttr, /* tp_getattro */
7967 0, /* tp_setattro */
7968 &unicode_as_buffer, /* tp_as_buffer */
7969 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7970 Py_TPFLAGS_BASETYPE, /* tp_flags */
7971 unicode_doc, /* tp_doc */
7972 0, /* tp_traverse */
7973 0, /* tp_clear */
7974 PyUnicode_RichCompare, /* tp_richcompare */
7975 0, /* tp_weaklistoffset */
7976 0, /* tp_iter */
7977 0, /* tp_iternext */
7978 unicode_methods, /* tp_methods */
7979 0, /* tp_members */
7980 0, /* tp_getset */
7981 &PyBaseString_Type, /* tp_base */
7982 0, /* tp_dict */
7983 0, /* tp_descr_get */
7984 0, /* tp_descr_set */
7985 0, /* tp_dictoffset */
7986 0, /* tp_init */
7987 0, /* tp_alloc */
7988 unicode_new, /* tp_new */
7989 PyObject_Del, /* tp_free */
7992 /* Initialize the Unicode implementation */
7994 void _PyUnicode_Init(void)
7996 int i;
7998 /* XXX - move this array to unicodectype.c ? */
7999 Py_UNICODE linebreak[] = {
8000 0x000A, /* LINE FEED */
8001 0x000D, /* CARRIAGE RETURN */
8002 0x001C, /* FILE SEPARATOR */
8003 0x001D, /* GROUP SEPARATOR */
8004 0x001E, /* RECORD SEPARATOR */
8005 0x0085, /* NEXT LINE */
8006 0x2028, /* LINE SEPARATOR */
8007 0x2029, /* PARAGRAPH SEPARATOR */
8010 /* Init the implementation */
8011 unicode_freelist = NULL;
8012 unicode_freelist_size = 0;
8013 unicode_empty = _PyUnicode_New(0);
8014 if (!unicode_empty)
8015 return;
8017 strcpy(unicode_default_encoding, "ascii");
8018 for (i = 0; i < 256; i++)
8019 unicode_latin1[i] = NULL;
8020 if (PyType_Ready(&PyUnicode_Type) < 0)
8021 Py_FatalError("Can't initialize 'unicode'");
8023 /* initialize the linebreak bloom filter */
8024 bloom_linebreak = make_bloom_mask(
8025 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8028 PyType_Ready(&EncodingMapType);
8031 /* Finalize the Unicode implementation */
8033 void
8034 _PyUnicode_Fini(void)
8036 PyUnicodeObject *u;
8037 int i;
8039 Py_XDECREF(unicode_empty);
8040 unicode_empty = NULL;
8042 for (i = 0; i < 256; i++) {
8043 if (unicode_latin1[i]) {
8044 Py_DECREF(unicode_latin1[i]);
8045 unicode_latin1[i] = NULL;
8049 for (u = unicode_freelist; u != NULL;) {
8050 PyUnicodeObject *v = u;
8051 u = *(PyUnicodeObject **)u;
8052 if (v->str)
8053 PyMem_DEL(v->str);
8054 Py_XDECREF(v->defenc);
8055 PyObject_Del(v);
8057 unicode_freelist = NULL;
8058 unicode_freelist_size = 0;
8061 #ifdef __cplusplus
8063 #endif
8067 Local variables:
8068 c-basic-offset: 4
8069 indent-tabs-mode: nil
8070 End: